In [1]:
import numpy as np
import pandas as pd

In [2]:
import ast
import re

import requests
from bs4 import BeautifulSoup

In [3]:
# Maps a ranking-period label to the code that make_funds_rank_url places in
# the `sc` query parameter of the ranking URL.
COLUMN_DICT = {
    '日增长率': 'rzdf',  # daily growth rate
    '近一周': 'zzf',  # past week
    '近一月': '1yzf',  # past month
    '近3月': '3yzf',  # past 3 months
    '近6月': '6yzf',  # past 6 months
    '近1年': '1nzf',  # past year
    '近2年': '2nzf',  # past 2 years
    '近3年': '3nzf',  # past 3 years
    '今年以来': 'jnzf',  # year to date
    '成立以来': 'lnzf'  # since inception
}

# Column names, in order, that make_funds_df assigns to the first 17
# comma-separated fields of each ranking record.
COLS = [
    '代码', '名称', '简拼', '更新日期', '单位净值', '累积净值', '日增长率', '近一周', '近一月', '近3月',
    '近6月', '近1年', '近2年', '近3年', '今年以来', '成立以来', '成立日期'
]

In [53]:
def make_funds_rank_url(order_item):
    '''Build the URL of the fund ranking data sorted by the given ranking label.

    ``order_item`` is looked up in ``COLUMN_DICT`` and the resulting code is
    substituted into the ``sc`` query parameter; the ``st=desc`` parameter is
    fixed. A label that is not a key of ``COLUMN_DICT`` raises ``KeyError``.
    '''
    sort_code = COLUMN_DICT[order_item]
    template = 'http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&sc={}&st=desc'
    return template.format(sort_code)


def get_funds_rank(url, n):
    '''Download the fund ranking payload and return its rows.

    Args:
        url: Ranking endpoint URL, e.g. one built by ``make_funds_rank_url``.
        n: Maximum number of rows to keep; ``0`` keeps every row returned.

    Returns:
        The value of the first ``[...]`` section found in the response body,
        truncated to its first ``n`` entries when ``n`` is non-zero.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response has no ``[...]`` section, or that section
            is not a plain Python literal.
    '''
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'

    # Greedy match: from the first '[' to the last ']' on the same line.
    match = re.search(r'\[.*\]', r.text)
    if match is None:
        raise ValueError('no [...] section found in response from {}'.format(url))

    # The payload comes from the network, so it must never be passed to
    # eval(); literal_eval only accepts literals and cannot execute code.
    try:
        datas = ast.literal_eval(match.group())
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            'ranking payload from {} is not a plain literal'.format(url)) from exc

    if n != 0:
        datas = datas[:n]
    return datas


def make_funds_df(datas):
    '''Turn raw ranking records into a DataFrame.

    Each record in ``datas`` is a comma-separated string; only its first 17
    fields are kept, and they are labelled with the names listed in ``COLS``.
    '''
    rows = [record.split(',')[:17] for record in datas]
    return pd.DataFrame(rows, columns=COLS)


def get_funds_df(order_item, n):
    '''Fetch the fund ranking for ``order_item`` and return it as a DataFrame.

    Chains the three steps defined in this notebook: build the URL with
    ``make_funds_rank_url``, download up to ``n`` records with
    ``get_funds_rank``, and tabulate them with ``make_funds_df``.
    '''
    rank_url = make_funds_rank_url(order_item)
    return make_funds_df(get_funds_rank(rank_url, n))


def get_fund_size(soup):
    '''Extract the fund size from a parsed fund detail page.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a ``BeautifulSoup``).

    Returns:
        The number that appears between the '基金规模' (fund size) label and
        the '亿元' (hundred million yuan) unit, as the string written on the
        page, or ``None`` when the cell is missing or holds no such number.
    '''
    tag = soup.select_one('div.infoOfFund tr:nth-child(1) td:nth-child(2)')
    if tag is None:
        return None

    # Accept integers as well as decimals (optionally with thousands commas);
    # the former pattern needed at least three characters and so rejected
    # short values such as "5" or "12".
    match = re.search(r'基金规模\D*(\d[\d,]*(?:\.\d+)?)亿元', str(tag))
    if match is None:
        return None
    return match.group(1)


def get_stocks_by_fund(fund):
    '''Fetch one fund's detail page and list the stocks linked in its holdings.

    Args:
        fund: Record supporting ``fund['代码']`` (fund code) and
            ``fund['名称']`` (fund name), e.g. a row of the ranking DataFrame.

    Returns:
        A list with one row per stock link, in page order. Each row is
        ``[rank, fund_code, fund_name, fund_size, stock_code, stock_name,
        stock_url]`` where ``rank`` counts from 1. ``stock_code`` is the first
        run of two or more digits in the link target, or ``None`` when the
        link contains no such run.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    url = 'http://fund.eastmoney.com/{}.html'.format(fund['代码'])
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')

    fund_size = get_fund_size(soup)

    # TODO: also capture each holding's position percentage (candidate rows:
    # '#position_shares > div.poptableWrap > table > tr', skipping the header).

    stocks = []
    # Each tag is the <a> element linking to one held stock.
    for rank, tag in enumerate(
            soup.select('#position_shares td.alignLeft a'), start=1):
        href = tag['href']
        # Kept as a string so leading zeros in the code are preserved.
        code_match = re.search(r'\d{2,}', href)
        stock_code = code_match.group() if code_match else None
        stocks.append([
            rank, fund['代码'], fund['名称'], fund_size,
            stock_code, tag['title'], href
        ])
    return stocks


def get_stocks_df(order_item, n=0):
    '''Collect the stock holdings of every fund in a ranking into one DataFrame.

    The ranking is obtained from ``get_funds_df(order_item, n)``; each of its
    rows is passed to ``get_stocks_by_fund`` and the returned holding rows are
    concatenated in ranking order.
    '''
    column_names = [
        'rank', 'fund_code', 'fund_name', 'fund_size',
        'stock_code', 'stock_name', 'stock_url'
    ]
    ranked_funds = get_funds_df(order_item, n)

    holding_rows = [
        row
        for _, fund in ranked_funds.iterrows()
        for row in get_stocks_by_fund(fund)
    ]
    return pd.DataFrame(holding_rows, columns=column_names)

In [5]:
# Ranking label used to order the funds ('近一周' = past week); it must be one
# of the keys of COLUMN_DICT.
order_item = '近一周'

In [54]:
# Collect the holdings of the first 5 funds returned by the ranking.
df = get_stocks_df(order_item, n=5)

In [55]:
# Display the collected holdings table.
df

Unnamed: 0,rank,fund_code,fund_name,fund_size,stock_code,stock_name,stock_url
0,1,519674,银河创新成长混合,19.86,2241,歌尔股份,http://quote.eastmoney.com/sz002241.html
1,2,519674,银河创新成长混合,19.86,600745,闻泰科技,http://quote.eastmoney.com/sh600745.html
2,3,519674,银河创新成长混合,19.86,603005,晶方科技,http://quote.eastmoney.com/sh603005.html
3,4,519674,银河创新成长混合,19.86,2156,通富微电,http://quote.eastmoney.com/sz002156.html
4,5,519674,银河创新成长混合,19.86,600584,长电科技,http://quote.eastmoney.com/sh600584.html
5,6,519674,银河创新成长混合,19.86,725,京东方Ａ,http://quote.eastmoney.com/sz000725.html
6,7,519674,银河创新成长混合,19.86,600703,三安光电,http://quote.eastmoney.com/sh600703.html
7,8,519674,银河创新成长混合,19.86,2371,北方华创,http://quote.eastmoney.com/sz002371.html
8,9,519674,银河创新成长混合,19.86,300136,信维通信,http://quote.eastmoney.com/sz300136.html
9,10,519674,银河创新成长混合,19.86,300115,长盈精密,http://quote.eastmoney.com/sz300115.html


In [48]:
# 根据基金去除重复数据，保留排名第一的股票
# df.drop_duplicates(['fund_code','fund_name'])

In [49]:
# 统计每个股票有多少基金持仓
# df.groupby(['stock_code','stock_name','stock_url']).count().sort_values('rank',ascending=False)