### Data crawling
- Crawl title, time and brief information from the following five websites. 
- Write the data into three csv files respectively. 

In [2]:
import pandas as pd
import numpy as np
import requests
import base64

from bs4 import BeautifulSoup

from tqdm import tqdm
from PIL import Image
from io import BytesIO
from IPython.core.display import HTML

from matplotlib.colors import is_color_like

#### 1. proper.hk
- We select proper.hk (宅谷地产) to fetch the news information about property market in the past 10 years. The link is: https://www.property.hk/news_list.php?  author=PHK_NEWSPROP&page=1
- We crawled from page 1 to page 800, about 16,000 texts in total. 

In [10]:
# The source page 
headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Connection': 'close'
}
url0 = 'https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=1'
page_text0 = requests.get(url=url0,
                        headers=headers)
content0 = page_text0.text
soup = BeautifulSoup(content0,'lxml')
Volume_lists = soup.find_all('li', class_ = 'media')

num = np.arange(1, 800)
urls = ['https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=' 
         + str(num[i]) for i in range(0, len(num))]

# We fetched all the urls for the 800 web pages first 
urls[:5]                      

['https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=1',
 'https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=2',
 'https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=3',
 'https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=4',
 'https://www.property.hk/news_list.php?author=PHK_NEWSPROP&page=5']

##### Then we crawled all needed information for each web page: title, content, time and put them into a dataframe   

In [11]:
infor_df = []

for i in range(len(urls)):
    url = urls[i]
    page_text = requests.get(url=url, headers=headers)
    content = page_text.text
    soup = BeautifulSoup(content,'lxml')
    Volume_lists = soup.find_all('li', class_ = 'media')
    titles = []
    contents = []
    times = []

    for volume in Volume_lists:
        titles.append(volume.find('div', class_ = 'bname').text)
        contents.append(volume.find('div', class_ = 'other').text)
        times.append(volume.find('div', class_ = 'col-xs-12 input text-right').text)

    infor_df.append(pd.DataFrame({
      'titles': titles,
      'contents': contents,
      'times':  times
   }))
    
total_titles = []
total_contents = []
total_times = []
for i in range(len(infor_df)):
    for j in range(len(infor_df[i])):
        total_titles.append(infor_df[i]['titles'][j])
        total_contents.append(infor_df[i]['contents'][j])
        total_times.append(infor_df[i]['times'][j])

In [76]:
df = pd.DataFrame(list(zip(total_titles, total_contents, total_times)),
               columns =['title', 'content','time'])
df

Unnamed: 0,title,content,time
0,樓市五節棍之本港樓價短期見頂回軟,第一棍：領展錢多頻掃貨\r\n\r\n經濟放緩，坐擁資金之企業締造入市機會，領展先後出擊...,2021/11/13
1,今日地產新聞摘要,【文匯報】報導， 在香港測量師學會昨日舉行的「優秀發展及保育大獎」頒獎禮上，擔任...,2021/11/13
2,明日大嶼毀生態 不如發展新界北,各位，「明日大嶼」呢4個字，自從2018年經好高騖遠嘅特首林鄭月娥喺《施政報告》...,2021/11/13
3,公屋供不應求 街坊等7年仍未上樓,政府施政不堪，港人輪候公屋時間創1999年以來新高，以逾25萬宗申請計算，至少有...,2021/11/13
4,二手樓價挫 港島重災,業主擴大減幅 太古城則王劈170萬易手\r\n二手業主減價吸客，樓價繼續由高位回落。...,2021/11/13
...,...,...,...
15975,產業署豪宅宿舍收30份標書,樓市暢旺，政府產業署9月推出5個前公務員宿舍招標出售，包括薄扶林碧瑤灣兩個單位、...,2016/10/15
15976,太古城三房套大減8%獲承租,利嘉閣地產黃凱達指出，市場新近錄得太古城海景花園（西）翠榕閣高層C室租賃成交，實...,2016/10/15
15977,屯門兆山苑1.01萬即睇即租,祥益地產潘鈺恒表示，日前該分行促成一宗屯門兆山苑之「即睇即租」成交個案，是次租客...,2016/10/15
15978,今日地產新聞摘要,【蘋果日報】稱，綠置居派表，15%單位不足200呎，公屋聯會：應設底線300呎。...,2016/10/14


In [73]:
df.to_csv('宅谷新闻数据.csv', encoding='utf-8')

#### 2. For online forums (全港屋苑讨论)
- The link is : https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=1

In [103]:
# The source page 
url0 = 'https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=1'
page_text0 = requests.get(url=url0,
                        headers=headers)
content0 = page_text0.text
soup = BeautifulSoup(content0,'lxml')
Volume_lists = soup.find_all('span', class_ = 'tsubject')
print(len(Volume_lists))

titles = []
times = []


for volume in Volume_lists:
    titles.append(volume.find('a').text.strip())
titles

23


['邊個女明星你覺得唔靚，但好多人都話靚？(有圖)',
 '大家中意用咩奶配咖啡？',
 '[轉貼] 原來古羅馬軍人係同性戀!!!!!!!',
 '屯門 瓏門 業主專區',
 '⇧【 留言前看版規，本版嚴禁招攬及查詢入群入組 】⇧',
 '******屯門好友請進(15)******',
 '雲疊花園「死過翻生」管理處經理又有新通告啦',
 '馬鞍山錦豐苑住戶專區(第五版)',
 '太古城專區',
 '沙田某屋苑管理處通告出神入化...  萬聖節整鬼你?',
 '屯門卓爾居住戶專區',
 '新屯門中心住戶專區29',
 '嘉湖山莊住戶專區---3---',
 '屯門雙寓2GETHER業主專區',
 '荃灣愉景新城住戶專區 Part 8',
 '小心大廈安全！發現有人假扮外賣仔派賭場傳單！',
 '愉景灣 Discovery Bay - 交流區',
 '屯門怡峰園 Villa Tiara 專區 (第 3 版)',
 '青衣海欣花園 (Grand Horizon) 專區',
 '麗港城住戶專區 (6)',
 '荃灣海之戀/愛炫美業主專區',
 '藍灣半島住戶專區(第七版)',
 '爾巒討論區（第四版）']

In [None]:
# Also fetched the links first 

In [106]:
num = np.arange(1, 78)
urls = ['https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=' 
         + str(num[i]) for i in range(0, len(num))]
urls[:5]


['https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=1',
 'https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=2',
 'https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=3',
 'https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=4',
 'https://finance.discuss.com.hk/forumdisplay.php?fid=586&filter=type&typeid=1298&ascdesc=DESC&from=2&page=5']

In [129]:
# Then crawled the titles in each page
infor_df = []
titles = []
times = []
for i in range(len(urls)):
    url = urls[i]
    page_text = requests.get(url=url, headers=headers)
    content = page_text.text
    soup = BeautifulSoup(content,'lxml')
    Volume_lists = soup.find_all('span', class_ = 'tsubject')
    for volume in Volume_lists:
        titles.append(volume.find('a').text.strip())
    
titles

['現在我終於明白點解習帝今年要自我引爆......',
 '比特幣明年今日.........',
 '90後，00後是否最有錢💰嘅一群？',
 '屯門 瓏門 業主專區',
 '⇧【 留言前看版規，本版嚴禁招攬及查詢入群入組 】⇧',
 '******屯門好友請進(15)******',
 '雲疊花園「死過翻生」管理處經理又有新通告啦',
 '馬鞍山錦豐苑住戶專區(第五版)',
 '太古城專區',
 '沙田某屋苑管理處通告出神入化...  萬聖節整鬼你?',
 '屯門卓爾居住戶專區',
 '新屯門中心住戶專區29',
 '嘉湖山莊住戶專區---3---',
 '屯門雙寓2GETHER業主專區',
 '荃灣愉景新城住戶專區 Part 8',
 '小心大廈安全！發現有人假扮外賣仔派賭場傳單！',
 '愉景灣 Discovery Bay - 交流區',
 '屯門怡峰園 Villa Tiara 專區 (第 3 版)',
 '青衣海欣花園 (Grand Horizon) 專區',
 '麗港城住戶專區 (6)',
 '荃灣海之戀/愛炫美業主專區',
 '藍灣半島住戶專區(第七版)',
 '爾巒討論區（第四版）',
 'Park Yoho 專區 (第2版) (版主示：留言前留意 P1 #2)',
 '觀塘 凱滙 業主專區',
 '牛頭角淘大花園住戶專區 [第四版] [2014]',
 '粉嶺居民專區(第八版)',
 '啟德 龍譽Vibe Centro 業主專區II',
 '元朗朗庭園住戶專區 （第三章）',
 '[荃灣中心住戶專區] 第十一版',
 '馬鞍山新港城住戶專區',
 '東濤苑住戶專區 (第十二版)',
 '青衣翠怡花園專區',
 '屯門上源業主專區',
 '馬鞍山 雲海 業主專區',
 '藍田匯景花園住戶專區4',
 'TKO[新都城]業主區',
 '青衣美景花園專區（3）',
 '清水灣半島住戶專區 (四)',
 '沙田第一城住戶專區(第5版)',
 '屯門疊茵庭住戶專區 (第2版)',
 '掃管笏星堤討論區',
 '天水圍 Wetland Seasons Park 業主專區',
 '長沙灣 一號九龍道Madison Park 業主專區',
 '紅磡海濱南岸住戶專區',
 '天水圍栢慧豪園 Central Park Towers專區',
 '馬鞍山 峻源 業主專區'

In [132]:
# Crawled the time for each text 
infor_df = []
times = []
for i in range(len(urls)):
    url = urls[i]
    page_text = requests.get(url=url, headers=headers)
    content = page_text.text
    soup = BeautifulSoup(content,'lxml')
    Volume_lists = soup.find_all('td', class_ = 'author')
    for volume in Volume_lists:
        if volume.find('em') != None:
            times.append(volume.find('em').text)

In [142]:
# Did some slicing to filter out the useless information in the first page 

In [143]:
t_titles = titles[3:]
t_times = times[4:]

In [144]:
df = pd.DataFrame(list(zip(t_titles, t_times)),
               columns =['title','time'])
df

Unnamed: 0,title,time
0,屯門 瓏門 業主專區,2018-5-22
1,⇧【 留言前看版規，本版嚴禁招攬及查詢入群入組 】⇧,2019-1-1
2,******屯門好友請進(15)******,2017-6-13
3,雲疊花園「死過翻生」管理處經理又有新通告啦,2021-11-1
4,馬鞍山錦豐苑住戶專區(第五版),2011-3-1
...,...,...
1915,御龍山住戶討論專區(第2版),2009-7-27
1916,元朗加州花園住戶專區,2009-8-20
1917,全港屋苑住戶專區告示版 26 march 2008更新,2008-3-25
1918,住戶專區指南及捷徑,2007-6-28


In [None]:
df_clear = df.drop(df[df['time'] == '\xa0'].index)
df_clear.to_csv('论坛数据.csv')

#### 3. For the financial database (财经数据库)
- The link is: http://www.bjinfobank.com/DataList.do?page=1&db=HK&rl=0&iw=&query=all&pageSize=25&endTime=&metiaName=&typeName=&starTime=&metiaLevel=&method=DataList&className=%E5%9C%B0%E4%BA%A7&areaForArt=%E9%A6%99%E6%B8%AF&myorder=SUTM

In [79]:
headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Connection': 'close'
}
num = np.arange(1, 280)

urls = ['http://www.bjinfobank.com/DataList.do?page=' 
         + str(num[i]) +  '&db=HK&rl=0&iw=&query=all&pageSize=25&endTime=&metiaName=&typeName=&starTime=&metiaLevel=&method=DataList&className=%E5%9C%B0%E4%BA%A7&areaForArt=%E9%A6%99%E6%B8%AF&myorder=SUTM' for i in range(0, len(num)) ]

In [None]:

for i in range(len(urls)):
    url = urls[i]
    page_text = requests.get(url=url, headers=headers)
    content = page_text.text
    soup = BeautifulSoup(content,'lxml')
    Volume_lists = soup.find_all('td', class_ = 'tabListTd1')
    Volume_lists2 = soup.find_all('td', style ='padding-top:7px;color:#206093;width:100px')
    for i in range(len(Volume_lists)):
        titles.append(Volume_lists[i].find('a').text.strip())
        times.append(Volume_lists2[i].text.strip())


In [75]:
dict = {'title': titles, 'time': times}  
df = pd.DataFrame(dict) 
df

Unnamed: 0,title,time
0,香港置地指旗下写字楼的空置率约5.5%(299),2021-11-18
1,团结香港基金对公屋总建屋量预测调整感失望(517),2021-11-16
2,"香港""北部都会区""效应洪水桥住宅项目热销(423)",2021-11-11
3,香港盛洋投资终止出售美国写字楼物业买卖方存在意见分歧(481),2021-11-08
4,香港合能控股赴港递交招股书总土地储备440万平方米(401),2021-10-31


In [78]:
df.to_csv('高校财经数据.csv')

#### 4. Midland
> https://www.midland.com.hk/property-news/category/%E6%A8%93%E5%B7%BF%E6%96%B0%E8%81%9E/page/2/

In [None]:
headers={
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
        'Connection': 'close'
}

# 200 pages news
numpage=615 # in fact 616 pages
weblist=[]
url = 'https://www.midland.com.hk/property-news/category/%E6%A8%93%E5%B7%BF%E6%96%B0%E8%81%9E/'
weblist.append(url)
for i in range(numpage):
    u0='https://www.midland.com.hk/property-news/category/%E6%A8%93%E5%B7%BF%E6%96%B0%E8%81%9E/page/'
    ui=u0+str(i+2)+'/'
    weblist.append(ui)

In [None]:
page_text=[]
content=[]
soup=[]
Title=[]

for j in range(numpage+1):
    page_text.append(requests.get(url=weblist[j],headers=headers))
    content.append(page_text[j].text)
    soup.append(BeautifulSoup(content[j],'lxml'))

In [None]:
for k in range(numpage+1):
    title_list = soup[k].find_all('h3',class_='entry-title')
    time_list=soup[k].find_all('time',class_='entry-date')
    brief_list=soup[k].find_all('div',class_='entry-content')

    for m in range(len(title_list)):
        title=title_list[m].find('a').text.strip()
        time=time_list[m].text.strip()
        brief=brief_list[m].find('p').text.strip()

        Title.append(pd.DataFrame({
            'title':[title],
            'time':[time],
            'brief':[brief]
        }))

Title=pd.concat(Title).reset_index().iloc[:,1:]

In [None]:
Title.to_csv("midland_news.csv",encoding='utf_8',index_label="index")
import pandas as pd
test=pd.read_csv("midland_news.csv")

#### 5. 28hse property company 
- From website "https://www.28hse.com/news/"
- Return all the articles' urls for further crawling. 
- Crawl the data and write all of them to a csv file

In [None]:
headers={
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
        'Connection': 'close'
}

# 200 pages news
numpage=82 # in fact 616 pages
weblist=[]

for i in range(numpage):
    u0='https://www.28hse.com/news/?page='
    ui=u0+str(i+1)+'&keyword=&cat_ids=3'
    weblist.append(ui)

In [None]:
page_text=[]
content=[]
soup=[]


for j in range(len(weblist)):
    page_text.append(requests.get(url=weblist[j],headers=headers))
    content.append(page_text[j].text)
    soup.append(BeautifulSoup(content[j],'lxml'))

In [None]:
urls=[]
# time=[]

for i in range(len(weblist)):
    header=soup[i].find_all('div',class_='item detail_item')
    # web_list=soup[i].find_all('a',rel='bookmark')
    for m in range(len(header)):
        web=header[m]['detail-url']
        time=header[m].find('div',class_='meta').text.strip()
        title=header[m].find('a').text.strip()

        urls.append(pd.DataFrame({
                'time':[time],
                'web':[web],
                'title':[title]
        }))
urls=pd.concat(urls).reset_index().iloc[:,1:]

In [None]:
urls.to_csv("10year_urls.csv",encoding='utf_8')

##### import the urls to crawl articles

In [None]:
urls=pd.read_csv('10year_urls.csv')
webtotal=urls['web'].tolist()
time=urls['time'].tolist()
title=urls['title'].tolist()

number=len(time)

In [None]:
import datetime
starttime = datetime.datetime.now()
#long running
#do something other

num=1000

page_text=[]
content=[]
soup=[]
Time=[]
Title=[]

for j in range(len(time)): # soup all the codes from those websites
    if j%100==0:
        print(j)
    page_text.append(requests.get(url=webtotal[j],headers=headers))
    content.append(page_text[j].text) ##
    soup.append(BeautifulSoup(content[j],'lxml')) ##
    Time.append(time[j])
    Title.append(title[j])

In [None]:
text=[]

for i in range(len(soup)):
    article=[]
    part = soup[i].find_all('div',class_='sixteen wide column')

    if part!=[]:
        paragraph=part[0].find_all('div')
        for k in range(len(paragraph)):
            if (paragraph[k].text.strip() !=('&nbsp;' and '')):
                article.append(paragraph[k].text.strip())

        text.append(pd.DataFrame({
            'time':[Time[i]],
            'text':[article],
            'title':[Title[i]]
        }))

text=pd.concat(text).reset_index().iloc[:,1:]
text.to_csv('10year28Hse.csv',encoding='utf_8')

endtime = datetime.datetime.now()
print ((endtime - starttime).seconds)

### Data process
- Regulate time format
- Concat datesets and sort by time

In [None]:
midland=pd.read_csv("midland_news.csv")
zhaigu=pd.read_csv('宅谷新闻数据.csv')
forum=pd.read_csv('论坛数据.csv')

In [None]:
forum=forum.reindex(columns=['time','title','brief'])
midland=midland.reindex(columns=['time','title','brief'])
zhaigu=zhaigu.reindex(columns=['time','title','content'])
zhaigu=zhaigu.rename(columns={'content':'brief'})
for row_index,row in zhaigu.iterrows():
    rowtime=row['time'].split('/')
    time=[int(i) for i in rowtime]
    row['time']=datetime(time[0],time[1],time[2]).strftime("%Y-%m-%d")

Dataset=pd.concat([forum,zhaigu],keys=['x','y'],ignore_index=True)
Dataset=pd.concat([Dataset,midland],keys=['x','y'],ignore_index=True)

In [None]:
year=[]
month=[]
date=[]

for row_index,row in Dataset.iterrows():
    rowtime=row['time'].split('-')
    rowtime=[int(i) for i in rowtime]
    year.append(rowtime[0])
    month.append(rowtime[1])
    date.append(rowtime[2])

index0=[]
index0.append([i for i in range(len(year))])
index0=index0[0]

Dataset['year']=pd.Series(year,index=index0)
Dataset['month']=pd.Series(month,index=index0)
Dataset['date']=pd.Series(date,index=index0)

In [None]:
Dataset=Dataset[Dataset.year>=2017]
Dataset=Dataset.sort_values(by=['year','month','date'])
Dataset=Dataset.reset_index(drop=True)
Dataset.to_csv('Newsdata.csv',encoding='utf_8')

In [None]:
a=pd.read_csv('midlandnews_total8829.csv')
a