In [1]:
import ast
import math
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Maps each ranking-period label to the value sent as the `sc` (sort column)
# query parameter of FUNDS_RANK_URL.
COLUMN_DICT = {
    '日增长率': 'rzdf',  # daily growth rate
    '近一周': 'zzf',     # past week
    '近一月': '1yzf',    # past month
    '近3月': '3yzf',     # past 3 months
    '近6月': '6yzf',     # past 6 months
    '近1年': '1nzf',     # past year
    '近2年': '2nzf',     # past 2 years
    '近3年': '3nzf',     # past 3 years
    '今年以来': 'jnzf',  # year to date
    '成立以来': 'lnzf',  # since inception
}

# Column labels applied, in order, to the first 17 comma-separated fields of
# each ranking record (see make_funds_df).
COLS = [
    # identification
    '代码', '名称', '简拼', '更新日期',
    # net asset values
    '单位净值', '累积净值',
    # growth over each period
    '日增长率', '近一周', '近一月', '近3月', '近6月',
    '近1年', '近2年', '近3年', '今年以来', '成立以来',
    # inception date
    '成立日期',
]

# Ranking endpoint template. The two `{}` placeholders are filled in order:
#   sc: sort column, one of the COLUMN_DICT values
#   pn: number of records returned per page
# pi (page number) is fixed at 1 in this template.
FUNDS_RANK_URL = 'http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&sc={}&pi=1&st=desc&pn={}&dx=1'

In [20]:
def get_funds_rank(order_item, n, timeout=10):
    '''Fetch the raw fund ranking records from the ranking endpoint.

    Args:
        order_item: Key of COLUMN_DICT naming the column to sort by.
        n: Number of records to request (fills the `pn` query parameter).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list: The first bracketed list literal found in the response body,
        parsed into Python objects. make_funds_df treats every element as
        one comma-separated string.

    Raises:
        KeyError: If order_item is not a key of COLUMN_DICT.
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response contains no bracketed list, or the list
            is not a plain literal.
    '''
    url = FUNDS_RANK_URL.format(COLUMN_DICT[order_item], n)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    r.encoding = 'utf-8'

    match = re.search(r'\[.*\]', r.text)
    if match is None:
        raise ValueError(
            'no ranking list found in response from {}'.format(url))

    # The payload comes from the network, so it is parsed as a literal
    # instead of being executed with eval().
    return ast.literal_eval(match.group(0))


def make_funds_df(datas):
    '''Turn raw ranking records into a DataFrame labelled with COLS.

    Args:
        datas: Iterable of comma-separated record strings.

    Returns:
        pandas.DataFrame: One row per record, built from its first 17
        fields (COLS has 17 labels).
    '''
    rows = [record.split(',')[:17] for record in datas]
    return pd.DataFrame(rows, columns=COLS)


def get_funds_df(order_item, n):
    '''Fetch the fund ranking sorted by order_item and return it as a DataFrame.

    Args:
        order_item: Key of COLUMN_DICT naming the column to sort by.
        n: Number of ranking records to request.

    Returns:
        pandas.DataFrame: The result of make_funds_df on the fetched records.
    '''
    return make_funds_df(get_funds_rank(order_item, n))


def get_fund_size(soup):
    '''Extract the fund size from a parsed fund page.

    Args:
        soup: Parsed page exposing select_one(css_selector), e.g. a
            BeautifulSoup object.

    Returns:
        str or None: The number that directly precedes "亿元" after the
        "基金规模" label (e.g. '12.34' or '5'), or None when the selected
        cell is missing or shows no numeric size.
    '''
    tag = soup.select_one('div.infoOfFund tr:nth-of-type(1) td:nth-of-type(2)')
    # Match an integer or decimal number only. A looser pattern such as
    # `\d.+\d` rejects sizes shorter than three characters and can run on
    # to a later "亿元" in the same cell.
    match = re.search(r'基金规模\D*(\d+(?:\.\d+)?)亿元', str(tag))
    if match is None:
        return None
    return match.group(1)


def get_stocks_by_fund(fund, timeout=10):
    '''Scrape the stock holdings listed on a single fund's page.

    Args:
        fund: Mapping-like row exposing '代码' (fund code) and '名称'
            (fund name), e.g. a row yielded by DataFrame.iterrows().
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list: One entry per holding row, each of the form
        [rank, fund_code, fund_name, fund_size, stock_code, stock_name,
        stock_percent, stock_url], where rank is the 1-based row position
        and fund_size is whatever get_fund_size returns for the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    url = 'http://fund.eastmoney.com/{}.html'.format(fund['代码'])
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')

    fund_size = get_fund_size(soup)

    # [1:11]: index 1 skips the header row. The cap at 11 is needed because
    # some funds show fund holdings in this spot with the stock holdings
    # beside it; both tables match the same selector, which would otherwise
    # yield more than 10 rows.
    rows = soup.select(
        '#position_shares > div.poptableWrap > table > tr')[1:11]

    stocks = []
    for rank, row in enumerate(rows, start=1):
        elements = row.get_text().split()
        # A row with fewer than two text tokens carries no name/percent
        # pair, so the holdings list ends here.
        if len(elements) < 2:
            break
        href = row.a['href']
        stocks.append([
            rank, fund['代码'], fund['名称'], fund_size,
            re.sub(r'\D', '', href),  # stock code = digits of the link
            elements[0], elements[1],
            href
        ])

    return stocks


def get_stocks_df(order_item, n=50):
    '''Collect the stock holdings of every fund in the ranking.

    Args:
        order_item: Key of COLUMN_DICT naming the column to sort by.
        n: Number of ranking records (funds) to request.

    Returns:
        pandas.DataFrame: One row per holding returned by
        get_stocks_by_fund, in ranking order.
    '''
    column_names = [
        'rank', 'fund_code', 'fund_name', 'fund_size', 'stock_code',
        'stock_name', 'stock_percent', 'stock_url'
    ]
    funds_df = get_funds_df(order_item, n)

    # Flatten the per-fund holding lists into a single list of rows.
    holdings = [
        holding
        for _, fund in funds_df.iterrows()
        for holding in get_stocks_by_fund(fund)
    ]
    return pd.DataFrame(holdings, columns=column_names)

In [4]:
# Ranking column used to sort the funds; must be a key of COLUMN_DICT.
order_item = '近一周'

In [21]:
# Scrape the holdings of the first 75 funds in the ranking sorted by
# order_item. This issues one HTTP request for the ranking plus one per fund.
df = get_stocks_df(order_item, 75)

In [None]:
# Display the scraped holdings table.
df

In [38]:
# 计算各股票持仓金额
# Amount each fund holds in each stock: fund size multiplied by the holding
# percentage. fund_size is parsed from text ending in "亿元", so the result
# is expressed in that same unit.
df['stock_money'] = (
    df['fund_size'].astype(float)
    .mul(df['stock_percent'].str.strip('%').astype(float))
    .div(100)
)

In [29]:
# 根据基金去除重复数据，保留排名第一的股票
# df.drop_duplicates(['fund_code','fund_name'])

In [49]:
# 统计每个股票有多少基金持仓
# df.groupby(['stock_code','stock_name','stock_url']).count().sort_values('rank',ascending=False)