### Sina Finance Web Crawler

This notebook collects the following data from [Sina Finance](https://finance.sina.com.cn/):
+ Conversion prices (from the conversion item), saved in `sina_cvprice.csv`
+ Interest rates (from the bond issuance item), saved in `sina_rate.csv`

Note: The stability of the web crawler depends on network conditions. ***There is no need to run this notebook unless the data files above have been removed.***



In [None]:
import os
import time
import random
import requests
import pandas as pd
from tqdm import tqdm
from lxml import etree

In [None]:
# Read cbond_info.csv from the research data directory and keep the distinct
# bond tickers; the crawler loops further down iterate over `tickers`.
data_path = '../../../../export/scratch/for_yifan/research/'
cbond_info = pd.read_csv(
    os.path.join(data_path, 'cbond_info.csv'),
    index_col=False,
    low_memory=False,
)
unique_tickers = cbond_info['bond_ticker'].unique()
tickers = unique_tickers.tolist()

In [None]:
# Pool of browser User-Agent strings. Each crawler request picks one of these
# at random for its "User-Agent" header.
user_agent_list = [
    # Chrome on Windows 10
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    # Firefox on Windows 10
    "Mozilla/5.0 (Windows NT 10.0) Gecko/20100101 Firefox/61.0",
    # Chrome on Windows 10 / 7 / XP
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
]

#### Convert Price

In [None]:
def sinaCrawlerConv(ticker, timeout=5):
    """Scrape the conversion table of one bond from Sina Finance.

    Parameters
    ----------
    ticker : str
        Bond ticker containing a ``.``; the part after the dot is placed in
        front of the part before it to build the page name
        (presumably ``<code>.<exchange>`` -- verify against `cbond_info`).
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``announce_date``, ``conv_date``, ``conv_price``,
        ``conv_ratio`` and ``bond_ticker``, one row per ``bluecnt`` table row.
        The frame is empty (but still has these columns) when the page
        contains no data rows.

    Raises
    ------
    IndexError
        If `ticker` contains no ``.``.
    requests.RequestException
        Propagated from ``requests.get`` (e.g. connection error or timeout).
    """
    columns = ['announce_date', 'conv_date', 'conv_price', 'conv_ratio', 'bond_ticker']
    header = {"User-Agent": random.choice(user_agent_list)}

    # sample url: http://money.finance.sina.com.cn/bond/conversion/sh113609.html
    parts = ticker.split('.')
    url = 'http://money.finance.sina.com.cn/bond/conversion/{}{}.html'.format(
        parts[1], parts[0])

    response = requests.get(url, headers=header, timeout=timeout)
    html = etree.HTML(response.text)

    # The data sits in the second table of the 'blk02' div: one 'bluetit'
    # header row followed by 'bluecnt' content rows.
    title = html.xpath("//div[@class='blk02']/table[2]/tr[@class='bluetit']/td/text()")
    table = html.xpath("//div[@class='blk02']/table[2]/tr[@class='bluecnt']")

    # NOTE(review): './td/text()' yields only the direct text nodes of each
    # cell, so a cell without text would shift the remaining values left.
    value = [content.xpath("./td/text()") for content in table]

    if not value:
        # No data rows (e.g. the table is missing from the page). Selecting
        # columns by position on a frame without columns would raise
        # IndexError, so return an empty frame with the expected schema; the
        # caller detects this case through len(record) == 0.
        return pd.DataFrame(columns=columns)

    record = pd.DataFrame(data=value, columns=title)

    # Keep the columns at positions 2, 3, 5 and 6 of the scraped table and
    # give them stable English names.
    record = record.iloc[:, [2, 3, 5, 6]].copy()
    record.columns = columns[:-1]
    record['bond_ticker'] = ticker

    return record

In [None]:
# collect convert price from sina finance
cv_record = []

# A single dropped connection or timeout used to abort the whole crawl, so
# each ticker is retried a few times before the error is allowed to propagate.
max_attempts = 3

for ticker in tqdm(tickers):
    for attempt in range(1, max_attempts + 1):
        try:
            record = sinaCrawlerConv(ticker)
            break
        except requests.RequestException:
            if attempt == max_attempts:
                # persistent failure: keep the original behavior and raise
                raise
            # back off a little longer after every failed attempt
            time.sleep(2 * attempt)

    # print tickers for which the page returned no rows
    if len(record) == 0:
        print(ticker)
    cv_record.append(record)
    # pause between tickers to avoid hammering the server
    time.sleep(1)

cv_data = pd.concat(cv_record).reset_index(drop=True)

In [None]:
# Strip the '-' separators from both date columns (presumably turning
# 'YYYY-MM-DD' into 'YYYYMMDD' -- confirm against the scraped values).
# The values stay strings.
for date_col in ('announce_date', 'conv_date'):
    cv_data[date_col] = cv_data[date_col].str.replace('-', '', regex=False)

# The former rename of 'date' / 'cv_price' / 'cv_ratio' was removed: those
# columns never exist here because sinaCrawlerConv already emits the
# 'conv_*' names, so the rename was a silent no-op.
cv_data = cv_data[['bond_ticker', 'announce_date', 'conv_date', 'conv_price', 'conv_ratio']].copy()

In [None]:
# Write the cleaned conversion-price table next to the input data,
# without the DataFrame index.
cvprice_path = os.path.join(data_path, 'sina_cvprice.csv')
cv_data.to_csv(cvprice_path, index=False)

#### Interest Rate

In [None]:
def sinaCrawlerRate(ticker, timeout=5):
    """Scrape the rate-change table of one bond from Sina Finance.

    Parameters
    ----------
    ticker : str
        Bond ticker containing a ``.``; the part after the dot is placed in
        front of the part before it to build the page name
        (presumably ``<code>.<exchange>`` -- verify against `cbond_info`).
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``start_date``, ``end_date``, ``rate`` and ``bond_ticker``,
        one row per ``bluecnt`` table row. The frame is empty (but still has
        these columns) when the page contains no data rows.

    Raises
    ------
    IndexError
        If `ticker` contains no ``.``.
    requests.RequestException
        Propagated from ``requests.get`` (e.g. connection error or timeout).
    """
    columns = ['start_date', 'end_date', 'rate', 'bond_ticker']
    header = {"User-Agent": random.choice(user_agent_list)}

    # sample url: http://money.finance.sina.com.cn/bond/rateChange/sh110030.html
    parts = ticker.split('.')
    url = 'http://money.finance.sina.com.cn/bond/rateChange/{}{}.html'.format(
        parts[1], parts[0])

    response = requests.get(url, headers=header, timeout=timeout)
    html = etree.HTML(response.text)

    # Header ('bluetit') and content ('bluecnt') rows are taken from any
    # table nested inside the 'blk02' div.
    title = html.xpath("//div[@class='blk02']/table//tr[@class='bluetit']/td/text()")
    table = html.xpath("//div[@class='blk02']/table//tr[@class='bluecnt']")

    # NOTE(review): './td/text()' yields only the direct text nodes of each
    # cell, so a cell without text would shift the remaining values left.
    value = [content.xpath("./td/text()") for content in table]

    if not value:
        # No data rows (e.g. the table is missing from the page). Selecting
        # columns by position on a frame without columns would raise
        # IndexError, so return an empty frame with the expected schema; the
        # caller detects this case through len(record) == 0.
        return pd.DataFrame(columns=columns)

    record = pd.DataFrame(data=value, columns=title)

    # Keep the columns at positions 1, 2 and 3 of the scraped table and give
    # them stable English names.
    record = record.iloc[:, [1, 2, 3]].copy()
    record.columns = columns[:-1]
    record['bond_ticker'] = ticker

    return record

In [None]:
# collect interest rate from sina finance
rate_record = []

# A single dropped connection or timeout used to abort the whole crawl, so
# each ticker is retried a few times before the error is allowed to propagate.
max_attempts = 3

for ticker in tqdm(tickers):
    for attempt in range(1, max_attempts + 1):
        try:
            record = sinaCrawlerRate(ticker)
            break
        except requests.RequestException:
            if attempt == max_attempts:
                # persistent failure: keep the original behavior and raise
                raise
            # back off a little longer after every failed attempt
            time.sleep(2 * attempt)

    # print tickers for which the page returned no rows
    if len(record) == 0:
        print(ticker)
    rate_record.append(record)
    # pause between tickers to avoid hammering the server
    time.sleep(1)

rate = pd.concat(rate_record).reset_index(drop=True)

In [None]:
# Drop the '-' separators from both date columns and store the result as
# 64-bit integers, then put the columns into their final order.
for date_col in ('start_date', 'end_date'):
    without_dashes = rate[date_col].map(lambda text: text.replace('-', ''))
    rate[date_col] = without_dashes.astype('int64')

ordered_columns = ['bond_ticker', 'start_date', 'end_date', 'rate']
rate = rate[ordered_columns]
# notebook preview of the first rows
rate.head()

In [None]:
# Write the cleaned interest-rate table next to the input data,
# without the DataFrame index.
rate_path = os.path.join(data_path, 'sina_rate.csv')
rate.to_csv(rate_path, index=False)