In [1]:
import requests as rq
import re
from bs4 import BeautifulSoup as bs

# Experimenting
**P.S. The final code is at the end of this notebook**
### get the stock list

In [2]:
stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
def get_html(url):
    """Download ``url`` and return the decoded response body.

    Args:
        url: address of the page to download.

    Returns:
        The page text on success. If the request fails (connection error,
        timeout or HTTP error status) a short error message containing the
        HTTP status code is returned instead (``None`` is shown as the code
        when no response was received).
    """
    useragent = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}  # mimic a desktop browser
    try:
        rsp = rq.get(url, timeout=10, headers=useragent)
        rsp.raise_for_status()  # raise HTTPError for 4xx/5xx status codes
        rsp.encoding = rsp.apparent_encoding  # decode with the detected charset
        return rsp.text
    except rq.RequestException as exc:
        # exc.response is None when the failure happened before any response
        # arrived (DNS error, timeout, ...), so read the status code defensively.
        status = getattr(exc.response, 'status_code', None)
        return "Exception with status code: {}".format(status)
stock_list_rsp = get_html(stock_list_url)

如下图为网页源代码

<img src='pics\stockls_src.png' style='float: left;'>

在`<a name=''></a>`后面的`<ul>`就是要求的列表

In [3]:
# Parse the downloaded stock-list page with Python's built-in HTML parser.
soup = bs(stock_list_rsp, 'html.parser')

In [4]:
# Locate the <div id="quotesearch"> container; find() returns None if it is absent.
tmp = soup.find('div', attrs={'id':'quotesearch'})

### pick the sh stock as an experiment

In [5]:
uls = tmp.find_all('ul')  # every <ul> inside the container
sh_stocks = uls[0]  # the first list is taken as the "sh" sample for the experiment
print(sh_stocks.prettify()[:500])  # preview only the first 500 characters

<ul>
 <li>
  <a href="http://quote.eastmoney.com/sh201000.html" target="_blank">
   R003(201000)
  </a>
 </li>
 <li>
  <a href="http://quote.eastmoney.com/sh201001.html" target="_blank">
   R007(201001)
  </a>
 </li>
 <li>
  <a href="http://quote.eastmoney.com/sh201002.html" target="_blank">
   R014(201002)
  </a>
 </li>
 <li>
  <a href="http://quote.eastmoney.com/sh201003.html" target="_blank">
   R028(201003)
  </a>
 </li>
 <li>
  <a href="http://quote.eastmoney.com/sh201004.html" target="_bla


### what we want on this page
the detail page of a certain stock follows the pattern of  
`https://gupiao.baidu.com/stock/sh600709.html`  
where the resource locator ends with the stock id  
so the stock id mapped from the stock name is what we need

In [6]:
# Map each link's text (the stock name) to its id: the last path segment of
# the href with the trailing ".html" removed.
sh_stock_maps = {
    link.string: link.attrs['href'][:-len('.html')].rsplit('/', 1)[-1]
    for link in sh_stocks.find_all('a')
}

`a.string`是股票名字  
`a.attrs['href']`可以拿到url，再进一步截取出来id

### obtaining details from gupiao.baidu.com
I'm still using the first stock as the experiment

In [7]:
# Take the id of the first stock in the mapping as the test subject.
one_stock_id = list(sh_stock_maps.values())[0]

In [8]:
# Build the detail-page URL by inserting the stock id into the URL template.
stock_detail_url = r'https://gupiao.baidu.com/stock/{}.html'.format(one_stock_id)
stock_detail_url

'https://gupiao.baidu.com/stock/sh201000.html'

In [9]:
# Download the detail page for the chosen stock.
detail_page_rsp = get_html(stock_detail_url)

先分析一下页面代码，如下图就是所有需要的信息

<img src='pics\panel.png' style='float: left;'>
右图中的两个line对应了上面的两行
<img src='pics\stockdetail.png' style='float: left;'>
<img src='pics\2line.png'>


In [10]:
soup = bs(detail_page_rsp, 'html.parser')
info = []  # not used in this cell
# A multi-word class_ string must match the tag's full class attribute exactly
# (including the trailing space).
title = soup.find('div', class_='price s-up ')
# On this page find() returned None, so the next line raises AttributeError;
# the error is kept on purpose to motivate the looser lookup used afterwards.
print(title.prettify())

AttributeError: 'NoneType' object has no attribute 'prettify'

然而实际上定位不到这个标签，所以就看了下上下文  
发现现在用做实验的页面是没有上面的很多信息的

<img src='pics\error.png' style='float: left;'>

In [11]:
# Inspect the enclosing "stock-bets" panel to see what this page actually contains.
stock_bets = soup.find('div', class_='stock-bets')
print(stock_bets.prettify())

<div class="stock-bets">
 <h1>
  <a class="bets-name" href="/stock/sh201000.html">
   R003 (
   <span>
    201000
   </span>
   )
  </a>
  <span class="state f-up">
   已休市 2018-03-30  15:01:30
  </span>
 </h1>
 <div class="price s-stop ">
  <strong class="_close">
   2.00
  </strong>
  <span>
   --
  </span>
  <span>
   0.00%
  </span>
 </div>
 <div class="bets-content">
  <div class="clear">
  </div>
 </div>
</div>



### 找了一个折中办法：只用一个`class`定位

In [12]:
# Searching for the single class "price" matches the tag whatever its other
# classes are (here "s-stop" rather than "s-up").
soup.find('div', class_='price')

<div class="price s-stop ">
<strong class="_close">2.00</strong>
<span>--</span>
<span>0.00%</span>
</div>

### try another one

In [13]:
# Try a different stock: the 100th entry counted from the end of the mapping.
stock_detail_url = r'https://gupiao.baidu.com/stock/{}.html'.format(list(sh_stock_maps.values())[-100])
stock_detail_url

'https://gupiao.baidu.com/stock/sh603903.html'

In [14]:
# Download and parse the detail page of the newly chosen stock.
soup = bs(get_html(stock_detail_url), 'html.parser')

this should be normal

In [15]:
# The exact class string matches on this page, so the price header is found.
title = soup.find('div', class_='price s-up ')
print(title.prettify())

<div class="price s-up ">
 <strong class="_close">
  33.88
 </strong>
 <span>
  +0.85
 </span>
 <span>
  +2.57%
 </span>
</div>



In [16]:
# Materialise the child iterator to see every node, including the '\n' strings
# that sit between the tags.
list(title.children)

['\n',
 <strong class="_close">33.88</strong>,
 '\n',
 <span>+0.85</span>,
 '\n',
 <span>+2.57%</span>,
 '\n']

In [17]:
# Drop the '\n' separators and keep only the text of each remaining child.
[x.string for x in title.children if x!='\n']

['33.88', '+0.85', '+2.57%']

### 然后解析2line

In [18]:
line1 = soup.find('div', class_='line1')
dls = line1.find_all('dl')
# Collect the text of every child of each <dl>. `.string` is None for a tag
# that has more than one child node, which is what the output below reveals.
[[c.string for c in dl.children] for dl in dls]

[['今开', '33.10'],
 ['成交量', '3.44万手'],
 ['最高', '34.99'],
 ['涨停', '36.33'],
 ['内盘', '1.66万手'],
 ['成交额', '1.17亿'],
 ['委比', '75.19%'],
 ['流通市值', '25.00亿'],
 [None, '56.29'],
 ['每股收益', '0.60'],
 ['总股本', '1.03亿']]

None很麻烦要想办法去掉

In [19]:
tmp = dls[8].children # the ninth dl (index 8), whose title came back as None
tmp = [x for x in tmp] # materialise the iterator so it can be indexed
tmp = tmp[0] # the first child is the one that produced the None
tmp

<dt class="mt-1">市盈率<sup>MRQ</sup></dt>

In [20]:
# List the direct children of the tag: a text node followed by a nested <sup>.
tmp.contents

['市盈率', <sup>MRQ</sup>]

### 由于出现嵌套，用string对第9个dl拿不到文本，所以换一个

In [21]:
line1 = soup.find('div', class_='line1')
dls = line1.find_all('dl')
# Take the first child node of each cell instead of `.string`, so a nested
# tag after the text no longer turns the title into None.
[[c.contents[0] for c in dl.children] for dl in dls]

[['今开', '33.10'],
 ['成交量', '3.44万手'],
 ['最高', '34.99'],
 ['涨停', '36.33'],
 ['内盘', '1.66万手'],
 ['成交额', '1.17亿'],
 ['委比', '75.19%'],
 ['流通市值', '25.00亿'],
 ['市盈率', '56.29'],
 ['每股收益', '0.60'],
 ['总股本', '1.03亿']]

In [22]:
# Collect the [title, value] pair of every <dl>, then transpose the pairs
# into one tuple of titles and one tuple of values.
line1_infos = [[cell.contents[0] for cell in dl.children] for dl in dls]
titles, values = zip(*line1_infos)

In [23]:
# Show the transposed result: column titles first, then the matching values.
print(titles)
print(values)

('今开', '成交量', '最高', '涨停', '内盘', '成交额', '委比', '流通市值', '市盈率', '每股收益', '总股本')
('33.10', '3.44万手', '34.99', '36.33', '1.66万手', '1.17亿', '75.19%', '25.00亿', '56.29', '0.60', '1.03亿')


试一下line2

In [24]:
line2 = soup.find('div', class_='line2')
dls = line2.find_all('dl')
# Same extraction as for line1. It raises AttributeError here because one
# child is a NavigableString, which has no `.contents`; the error is kept as
# the motivation for the type check added below.
[[c.contents[0] for c in dl.children] for dl in dls]

AttributeError: 'NavigableString' object has no attribute 'contents'

上面出错的原因是有个NavigableString

In [25]:
line2 = soup.find('div', class_='line2')
dls = line2.find_all('dl')
# Show the type of every child to find which <dl> contains the stray string.
[[type(c) for c in dl.children] for dl in dls]

[[bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag, bs4.element.NavigableString],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag],
 [bs4.element.Tag, bs4.element.Tag]]

### 为了和line1写法兼容找了这么个折中办法
顺便`strip()`把没用的空格换行去了

In [26]:
line2 = soup.find('div', class_='line2')
dls = line2.find_all('dl')
# type(line2) is the Tag class, so the isinstance check keeps only tag
# children and skips NavigableString nodes; strip('\n ') trims the
# surrounding newlines and spaces from each text.
[[c.contents[0].strip('\n ') for c in dl.children if isinstance(c, type(line2))] for dl in dls]

[['昨收', '33.03'],
 ['换手率', '4.67%'],
 ['最低', '33.10'],
 ['跌停', '29.73'],
 ['外盘', '1.79万手'],
 ['振幅', '5.72%'],
 ['量比', '--'],
 ['总市值', '35.01亿'],
 ['市净率', '4.88'],
 ['每股净资产', '6.94'],
 ['流通股本', '7379.97万']]

# final codes
**P.S. jupyter上多线程可能有点问题，运行的话还是在pycharm跑stock.py，代码是一样的**
### featuring
1. multithreading using `threading`
2. IO using `pandas`: writing csv (convenient for appending data)
3. tradeoff between memory and IO overhead :  
take the approach of periodical IO 

In [27]:
import requests as rq
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

### convenient function for obtaining text of a webpage

In [28]:
def get_html(url):
    """Download ``url`` and return its decoded text, or ``None`` on failure.

    The request is sent with a desktop-browser User-Agent header. Success and
    failure are both reported with ``print``.

    Args:
        url: address of the page to download.

    Returns:
        The response body as ``str``, or ``None`` when the request raised
        (connection error, timeout, HTTP error status, ...).
    """
    useragent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}  # mimic a browser
    try:
        # Without a timeout a stalled server would block the calling thread
        # forever, which in turn blocks the whole batch it belongs to.
        rsp = rq.get(url, headers=useragent, timeout=10)
        rsp.raise_for_status()  # raise HTTPError for 4xx/5xx status codes
        rsp.encoding = rsp.apparent_encoding  # decode with the detected charset
        print('succeed in requesting url {}'.format(url))
        return rsp.text
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        print("Error occurs when requesting url {}\n{}".format(url, repr(e)))
        return None

### the function for extracting stock name with its id from the `eastmoney` website
there are exactly two elements in `mappings`: the shanghai stock and shenzhen stock

In [29]:
def extract_mappings():
    """Scrape the eastmoney stock list into ``{stock name: stock id}`` dicts.

    Returns:
        A list with one dict per ``<ul>`` found inside
        ``<div id="quotesearch">``. Each dict maps the link text (the stock
        name) to the stock id, i.e. the last path segment of the link target
        without its ``.html`` suffix. An empty list is returned when the page
        cannot be downloaded or the container is missing.
    """
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_list_rsp = get_html(stock_list_url)
    if stock_list_rsp is None:
        print('fail to load {}'.format(stock_list_url))
        return []

    soup = bs(stock_list_rsp, 'html.parser')
    container = soup.find('div', attrs={'id': 'quotesearch'})
    if container is None:
        print('no stock list found on {}'.format(stock_list_url))
        return []

    mappings = []  # one dict per <ul>
    for ul in container.find_all('ul'):
        mappings.append({
            # get_text() yields a plain str, so the result neither contains
            # None keys nor keeps the whole parse tree alive through
            # NavigableString objects.
            a.get_text(): a.attrs['href'][:-5].split('/')[-1]
            for a in ul.find_all('a', href=True)  # skip anchors without a target
        })

    return mappings

### the function for extracting details for the given stock
the stock info is given in the form of a tuple, i.e. (name, id)  
two aforementioned steps are merged into this function:
1. extracting the heading 
2. extracting two line details

In [30]:
_HEAD_FIELDS = 3   # close price, change, change ratio
_LINE_FIELDS = 11  # number of <dl> entries expected in each of line1 / line2


def _clean_text(node):
    """Return the text of ``node`` as a stripped plain ``str`` (or ``None``)."""
    if node is None:
        return None
    text = node if isinstance(node, str) else node.string
    # str() detaches the value from the parse tree it came from.
    return None if text is None else str(text).strip()


def _fit(values, width):
    """Pad ``values`` with ``None`` / cut it so that it has exactly ``width`` items."""
    return (list(values) + [None] * width)[:width]


def one_stock_detail(stock):
    """Fetch one stock's detail page and flatten it into a fixed-width row.

    Args:
        stock: ``(name, stock_id)`` pair.

    Returns:
        A list of ``2 + 3 + 11 + 11 = 27`` items: name, id, the three values
        of the price header, then the values of the ``line1`` and ``line2``
        panels. Anything that cannot be found is ``None``, so every row has
        the same length whatever the page looks like.
    """
    name, stock_id = stock
    empty_row = list(stock) + [None] * (_HEAD_FIELDS + 2 * _LINE_FIELDS)

    stock_detail_url = r'https://gupiao.baidu.com/stock/{}.html'.format(stock_id)
    detail_page_rsp = get_html(stock_detail_url)
    if detail_page_rsp is None:
        print('fail to load {}'.format(stock_detail_url))
        return empty_row
    soup = bs(detail_page_rsp, 'html.parser')

    # Match on the single class "price": the companion class varies by page.
    head = soup.find('div', class_='price')
    if head is None:
        print('{} with id={} has no information on gupiao.baidu.com'.format(name, stock_id))
        return empty_row

    head_texts = [_clean_text(x) for x in head.children
                  if not (isinstance(x, str) and not x.strip())]  # skip whitespace nodes
    row = list(stock) + _fit(head_texts, _HEAD_FIELDS)

    for line in ('line1', 'line2'):
        linetag = soup.find('div', class_=line)
        if linetag is None:
            # Pad only this panel; the other one may still be present.
            row += [None] * _LINE_FIELDS
            continue

        values = []
        for dl in linetag.find_all('dl'):
            # Keep tag children only (type(linetag) is the tag class); stray
            # text nodes between the cells are ignored.
            cells = [c for c in dl.children if isinstance(c, type(linetag))]
            value = None
            # A <dl> holds a title cell followed by a value cell; the first
            # child node of the value cell is its text.
            if len(cells) > 1 and cells[1].contents:
                value = _clean_text(cells[1].contents[0])
            values.append(value)
        row += _fit(values, _LINE_FIELDS)

    return row

### the function for multithreading
the crawling of the stocks is organized as such:
1. take a small batch of stock_info
2. concurrently fetching data of this batch using multithreading
3. feed this batch of data to csv file
4. loop   

In [31]:
import threading

import numpy as np  # used below; previously never imported anywhere in the notebook


def threading_data(data=None, fn=None, thread_count=None, **kwargs):
    """Apply ``fn`` to ``data`` concurrently and gather the results in order.

    Args:
        data: sequence of inputs (must support ``len`` and slicing when
            ``thread_count`` is given).
        fn: callable run in the worker threads as ``fn(item_or_chunk, **kwargs)``.
        thread_count: ``None`` starts one thread per item and calls ``fn`` on
            each item. Otherwise ``data`` is split into ``thread_count``
            contiguous chunks of near-equal size and ``fn`` is called once
            per chunk.
        **kwargs: extra keyword arguments forwarded to ``fn``.

    Returns:
        With ``thread_count=None``: ``np.asarray`` of the per-item results, or
        the plain list when they cannot be converted to an array. Otherwise
        the per-chunk results joined with ``np.concatenate``.

    Note:
        An exception raised inside a worker only ends that thread; the
        corresponding result slot keeps its initial value ``None``.
    """
    def apply_fn(results, index, payload, fn_kwargs):
        results[index] = fn(payload, **fn_kwargs)

    if thread_count is None:
        payloads = list(data)  # one job per item
    else:
        # Chunk boundaries: thread_count + 1 evenly spaced indices, rounded.
        divs = np.round(np.linspace(0, len(data), thread_count + 1)).astype(int)
        payloads = [data[divs[i]:divs[i + 1]] for i in range(thread_count)]

    results = [None] * len(payloads)
    threads = [
        threading.Thread(name='threading_and_return', target=apply_fn,
                         args=(results, i, payload, kwargs))
        for i, payload in enumerate(payloads)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    if thread_count is None:
        try:
            return np.asarray(results)
        except Exception:
            # e.g. results of unequal length that cannot form an array
            return results
    return np.concatenate(results)

### put it all together

In [32]:
def main(n_parallel=20, save_path='test.csv'):
    """Crawl every listed stock and write the details to a CSV file.

    Stocks are processed in batches of ``n_parallel``: each batch is fetched
    concurrently and appended to the file straight away, so memory use stays
    bounded and finished batches survive a later failure.

    Args:
        n_parallel: number of stocks fetched concurrently per batch.
        save_path: CSV file to (over)write.
    """
    titles = ['名称', 'id', '今收', '增幅', '增比', '今开', '成交量', '最高', '涨停', '内盘', '成交额', '委比', '流通市值', '市盈率', '每股收益', '总股本',
              '昨收', '换手率', '最低', '跌停', '外盘', '振幅', '量比', '总市值', '市净率', '每股净资产', '流通股本']
    n_columns = len(titles)

    def as_row(stock, result):
        # A worker that raised leaves None behind: keep name/id and blank the
        # rest. Every row is also padded / cut to the column count so that a
        # single odd page cannot abort the whole crawl.
        values = list(stock) if result is None else list(result)
        return (values + [None] * n_columns)[:n_columns]

    mappings = extract_mappings()

    # Tracked across ALL mappings: only the very first batch creates the file
    # and writes the header. Resetting this per mapping would make the second
    # market overwrite everything written for the first one.
    header_written = False
    for mapping in mappings:  # sh & sz
        stocks = list(mapping.items())
        for begin in range(0, len(stocks), n_parallel):
            batch = stocks[begin:begin + n_parallel]  # slicing clamps at the end

            med_results = threading_data(batch, one_stock_detail)
            rows = [as_row(stock, result) for stock, result in zip(batch, med_results)]
            df = pd.DataFrame(rows, columns=titles)

            if header_written:
                df.to_csv(save_path, mode='a', index=False, header=False)
            else:
                df.to_csv(save_path, index=False)
                header_written = True