In [1]:
import numpy as np
import pandas as pd

In [2]:
import requests
import re
from bs4 import BeautifulSoup

In [3]:
# Maps a ranking column label (as used in COLS) to the value substituted
# into the `sc` query parameter of the fund-ranking URL.
COLUMN_DICT = {
    '日增长率': 'rzdf',  # daily growth rate
    '近一周': 'zzf',  # last week
    '近一月': '1yzf',  # last month
    '近3月': '3yzf',  # last 3 months
    '近6月': '6yzf',  # last 6 months
    '近1年': '1nzf',  # last year
    '近2年': '2nzf',  # last 2 years
    '近3年': '3nzf',  # last 3 years
    '今年以来': 'jnzf',  # year to date
    '成立以来': 'lnzf',  # since inception
}

# Column labels for the first 17 comma-separated fields of a ranking record,
# in the order the fields appear in the record.
COLS = [
    '代码',  # fund code
    '名称',  # fund name
    '简拼',  # pinyin abbreviation
    '更新日期',  # update date
    '单位净值',  # unit net value
    '累积净值',  # cumulative net value
    '日增长率',
    '近一周',
    '近一月',
    '近3月',
    '近6月',
    '近1年',
    '近2年',
    '近3年',
    '今年以来',
    '成立以来',
    '成立日期',  # inception date
]

In [11]:
def make_funds_rank_url(order_item):
    '''Build the URL that returns fund ranking data sorted by one column.

    Args:
        order_item: Ranking column label; must be a key of COLUMN_DICT
            (a KeyError is raised otherwise).

    Returns:
        The ranking URL with the matching sort code in its `sc` parameter,
        sorted in descending order (`st=desc`).
    '''
    # NOTE(review): the original author notes that a `pi=1,2,...` parameter
    # selects the page, with 50 records per page; this URL does not set it.
    url_template = 'http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&sc={}&st=desc'
    sort_code = COLUMN_DICT[order_item]
    return url_template.format(sort_code)


def get_funds_rank(url, n):
    '''Download fund ranking data and return it as a list of record strings.

    Args:
        url: Ranking URL, e.g. as produced by make_funds_rank_url.
        n: Maximum number of records to keep; 0 keeps every record returned.

    Returns:
        The list parsed from the first ``[...]`` span found in the response
        body, truncated to the first ``n`` entries when ``n`` is not 0.

    Raises:
        IndexError: If the response body contains no ``[...]`` span.
        ValueError, SyntaxError: If the span is not a plain literal.
    '''
    import ast  # local import: only this function needs it

    r = requests.get(url)
    r.encoding = 'utf-8'

    match = re.search(r'\[.*\]', r.text)
    if match is None:
        # Same exception type the previous `findall(...)[0]` produced.
        raise IndexError('no rank data array found in response')

    # The payload comes from the network, so it is parsed as a literal only;
    # eval() here would execute whatever code the response contained.
    datas = ast.literal_eval(match.group(0))
    if n != 0:
        datas = datas[:n]
    return datas


def make_funds_df(datas):
    '''Turn raw ranking records into a DataFrame labelled with COLS.

    Args:
        datas: Iterable of comma-separated record strings.

    Returns:
        A DataFrame with one row per record, built from the first 17
        comma-separated fields of each record.
    '''
    rows = [record.split(',')[:17] for record in datas]
    return pd.DataFrame(rows, columns=COLS)


def get_funds_df(order_item, n):
    '''Fetch the fund ranking for one ranking column as a DataFrame.

    Args:
        order_item: Ranking column label passed to make_funds_rank_url.
        n: Record limit passed to get_funds_rank.

    Returns:
        The DataFrame produced by make_funds_df from the fetched records.
    '''
    return make_funds_df(get_funds_rank(make_funds_rank_url(order_item), n))


def get_fund_size(soup):
    '''Extract the fund size figure from a parsed fund detail page.

    Args:
        soup: Parsed page offering ``select_one(css_selector)``.

    Returns:
        The number that precedes '亿元' after the '基金规模' label, as a
        string (e.g. '46.41', '46' or '5').

    Raises:
        IndexError: If the selected cell contains no such figure.
    '''
    tag = soup.select_one('div.infoOfFund tr:nth-child(1) td:nth-child(2)')
    # Match an explicit number (digits, optional thousands commas, optional
    # decimals). The previous pattern `\d.+\d` needed at least three
    # characters, so values such as '46亿元' or '5亿元' were never matched.
    match = re.search(r'基金规模\D*(\d[\d,]*(?:\.\d+)?)亿元', str(tag))
    if match is None:
        # Same exception type the previous `findall(...)[0]` produced.
        raise IndexError('fund size not found in fund info table')
    return match.group(1)


def get_stocks_by_fund(fund):
    '''Scrape the stock holdings listed on a single fund's detail page.

    Args:
        fund: Mapping-like record (e.g. a DataFrame row) providing the
            '代码' (fund code) and '名称' (fund name) entries.

    Returns:
        A list of rows ``[rank, fund_code, fund_name, fund_size, stock_code,
        stock_name, stock_percent, stock_url]``. ``rank`` is the 1-based
        position of the row in the holdings table, and ``stock_code`` is
        the digits contained in the stock link.
    '''
    url = 'http://fund.eastmoney.com/{}.html'.format(fund['代码'])
    r = requests.get(url)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')

    # Fund size is the same for every holding of this fund.
    fund_size = get_fund_size(soup)

    table_rows = soup.select(
        '#position_shares > div.poptableWrap > table > tr')

    stocks = []
    # The first table row is skipped (presumably the header row).
    for rank, item in enumerate(table_rows[1:], start=1):
        elements = item.get_text().split()
        # A row with fewer than two tokens carries no name/percent pair
        # (e.g. a single-token placeholder row or a blank row): stop here.
        if len(elements) < 2:
            break
        link = item.a
        if link is None:
            # No stock link in this row, so no code/URL can be derived.
            continue
        href = link['href']
        stocks.append([
            rank, fund['代码'], fund['名称'], fund_size,
            re.sub(r'\D', '', href), elements[0], elements[1], href
        ])

    return stocks


def get_stocks_df(order_item, n=0):
    '''Collect the stock holdings of every ranked fund into one DataFrame.

    Args:
        order_item: Ranking column label used to order the funds.
        n: Record limit forwarded to get_funds_df (default 0).

    Returns:
        A DataFrame with one row per holding returned by get_stocks_by_fund,
        in fund ranking order.
    '''
    columns = [
        'rank', 'fund_code', 'fund_name', 'fund_size',
        'stock_code', 'stock_name', 'stock_percent',
        'stock_url'
    ]
    ranked_funds = get_funds_df(order_item, n)

    # Flatten the per-fund holding lists into a single list of rows.
    holdings = [
        holding
        for _, fund in ranked_funds.iterrows()
        for holding in get_stocks_by_fund(fund)
    ]
    return pd.DataFrame(holdings, columns=columns)

In [5]:
# Ranking column used to order the funds; must be a key of COLUMN_DICT.
order_item = '近一周'

In [12]:
# TODO: support fetching more than 50 records.
# Collect the holdings of the first 5 funds in the ranking.
df=get_stocks_df(order_item,5)

In [17]:
# Display the collected holdings table.
df

Unnamed: 0,rank,fund_code,fund_name,fund_size,stock_code,stock_name,stock_percent,stock_url
0,1,162703,广发小盘成长混合(LOF),46.41,300601,康泰生物,9.55%,http://quote.eastmoney.com/sz300601.html
1,2,162703,广发小盘成长混合(LOF),46.41,600703,三安光电,9.22%,http://quote.eastmoney.com/sh600703.html
2,3,162703,广发小盘成长混合(LOF),46.41,300661,圣邦股份,8.76%,http://quote.eastmoney.com/sz300661.html
3,4,162703,广发小盘成长混合(LOF),46.41,300014,亿纬锂能,7.73%,http://quote.eastmoney.com/sz300014.html
4,5,162703,广发小盘成长混合(LOF),46.41,600536,中国软件,6.89%,http://quote.eastmoney.com/sh600536.html
5,6,162703,广发小盘成长混合(LOF),46.41,603986,兆易创新,6.34%,http://quote.eastmoney.com/sh603986.html
6,7,162703,广发小盘成长混合(LOF),46.41,63,中兴通讯,5.88%,http://quote.eastmoney.com/sz000063.html
7,8,162703,广发小盘成长混合(LOF),46.41,2156,通富微电,5.57%,http://quote.eastmoney.com/sz002156.html
8,9,162703,广发小盘成长混合(LOF),46.41,300529,健帆生物,5.14%,http://quote.eastmoney.com/sz300529.html
9,10,162703,广发小盘成长混合(LOF),46.41,300457,赢合科技,5.00%,http://quote.eastmoney.com/sz300457.html


In [38]:
# Amount each fund holds in each stock: fund size multiplied by the
# holding percentage (the '%' sign is stripped before conversion).
df['stock_money'] = (
    df['fund_size'].astype(float)
    .mul(df['stock_percent'].str.strip('%').astype(float))
    .div(100)
)

In [48]:
# 根据基金去除重复数据，保留排名第一的股票
# df.drop_duplicates(['fund_code','fund_name'])

In [49]:
# 统计每个股票有多少基金持仓
# df.groupby(['stock_code','stock_name','stock_url']).count().sort_values('rank',ascending=False)