In [4]:
import os
import time

import numpy as np
import pandas as pd
import tushare as ts
from tqdm import tqdm

# Initialise the Tushare pro client.
# The API token is read from the TUSHARE_TOKEN environment variable so that the
# credential is never stored in the notebook. When the variable is unset an
# empty string is passed; presumably Tushare then falls back to a token saved
# earlier with ts.set_token() -- TODO confirm against the installed version.
# NOTE(review): the token that used to be hardcoded on this line should be
# treated as leaked and rotated.
pro = ts.pro_api(os.environ.get("TUSHARE_TOKEN", ""))

def fetch_all_daily_basic(start_date, end_date, max_retries=3, retry_delay=1.0):
    """
    Fetch daily basic indicators for all stocks, one trading day per request.

    The SSE trading calendar between ``start_date`` and ``end_date`` is queried
    first; ``pro.daily_basic`` is then called once for every open day and the
    per-day frames are concatenated into a single DataFrame.

    A request that raises is retried, pausing ``retry_delay * attempt`` seconds
    between attempts, so that a transient failure (for example rate limiting)
    does not silently drop a whole trading day. Days that still fail after the
    last attempt are skipped and reported in a summary message.

    Args:
        start_date: First calendar date to request, as a 'YYYYMMDD' string.
        end_date: Last calendar date to request, as a 'YYYYMMDD' string.
        max_retries: Total number of attempts per trading day (at least 1).
        retry_delay: Base pause in seconds between attempts; multiplied by the
            attempt number to back off progressively.

    Returns:
        The concatenated DataFrame, or None when the trading calendar request
        fails or when no day produced any data.
    """

    # 1. Collect every open trading day in the requested range.
    try:
        trade_dates_df = pro.trade_cal(exchange='SSE', start_date=start_date, end_date=end_date)
        trade_dates_list = trade_dates_df[trade_dates_df['is_open'] == 1]['cal_date'].tolist()
        print(f"获取到从 {start_date} 到 {end_date} 共有 {len(trade_dates_list)} 个交易日。")
    except Exception as e:
        print(f"获取交易日历失败: {e}")
        return None

    # Columns requested from the daily_basic endpoint (hoisted out of the loop).
    daily_fields = [
        "ts_code", "trade_date",
        "close", "turnover_rate", "turnover_rate_f", "volume_ratio",
        "pe", "pe_ttm", "pb", "ps",
        "ps_ttm", "dv_ratio", "dv_ttm", "total_share",
        "float_share", "free_share", "total_mv", "circ_mv"
    ]
    attempts_allowed = max(1, int(max_retries))

    # 2. Request the data day by day, with a progress bar.
    all_data_list = []
    failed_dates = []
    for date in tqdm(trade_dates_list, desc="正在获取每日基本面数据"):
        for attempt in range(1, attempts_allowed + 1):
            try:
                df_daily = pro.daily_basic(trade_date=date, fields=daily_fields)
            except Exception as e:
                if attempt < attempts_allowed:
                    # Back off a little longer after every failed attempt.
                    time.sleep(retry_delay * attempt)
                    continue
                # Out of attempts: record the gap and move on to the next day.
                print(f"获取 {date} 的数据时发生错误: {e}")
                failed_dates.append(date)
            else:
                all_data_list.append(df_daily)
                break

    # 3. Make remaining gaps visible instead of leaving them buried in the log.
    if failed_dates:
        print(f"共有 {len(failed_dates)} 个交易日在重试后仍获取失败: {failed_dates}")

    # 4. Combine everything that was fetched into one DataFrame.
    if not all_data_list:
        print("未能获取到任何数据。")
        return None

    final_df = pd.concat(all_data_list, ignore_index=True)
    return final_df

if __name__ == "__main__":
    start_date = '20150101'
    # Tushare data usually lags by one or two days, so when this runs "today"
    # the newest rows may be from yesterday or the day before.
    end_date = "20251017"

    daily_basic_data = fetch_all_daily_basic(start_date, end_date)

    if daily_basic_data is not None and not daily_basic_data.empty:
        print("\n数据获取成功！")
        print(f"总共获取了 {len(daily_basic_data)} 条数据。")
        print("数据预览:")
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        daily_basic_data.info()

        # The download is slow; persist the frame to disk afterwards so it
        # does not have to be fetched again.

获取到从 20150101 到 20251017 共有 2621 个交易日。


正在获取每日基本面数据: 100%|██████████| 2621/2621 [24:06<00:00,  1.81it/s]  



数据获取成功！
总共获取了 10417239 条数据。
数据预览:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10417239 entries, 0 to 10417238
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   ts_code          object 
 1   trade_date       object 
 2   close            float64
 3   turnover_rate    float64
 4   turnover_rate_f  float64
 5   volume_ratio     float64
 6   pe               float64
 7   pe_ttm           float64
 8   pb               float64
 9   ps               float64
 10  ps_ttm           float64
 11  dv_ratio         float64
 12  dv_ttm           float64
 13  total_share      float64
 14  float_share      float64
 15  free_share       float64
 16  total_mv         float64
 17  circ_mv          float64
dtypes: float64(16), object(2)
memory usage: 1.4+ GB
None


In [6]:
# Create the output directory first so the export cannot fail on a fresh
# checkout where data/ does not exist yet.
os.makedirs('data', exist_ok=True)
daily_basic_data.to_csv('data/daily_basic_data_20150101_20251017.csv', index=False)

In [17]:
# 获取申万行业分类数据
def get_sw_industry(offset):
    """
    Request one page of Shenwan industry constituents from Tushare.

    Args:
        offset: Row offset forwarded to ``pro.index_member_all``.

    Returns:
        Whatever the API call returns, or None when the request (or the
        row-count report) raises.
    """
    # Every filter is sent as an empty string; only the paging offset varies.
    request_params = dict.fromkeys(
        ("l1_code", "l2_code", "l3_code", "is_new", "ts_code", "src", "limit"), ""
    )
    request_params["offset"] = offset

    wanted_fields = [
        "l1_code", "l1_name",
        "l2_code", "l2_name",
        "l3_code", "l3_name",
        "ts_code", "name",
        "in_date", "out_date",
        "is_new",
    ]

    try:
        members = pro.index_member_all(**request_params, fields=wanted_fields)
        print(f"获取到 {len(members)} 条申万行业成分股数据。")
    except Exception as e:
        print(f"获取申万行业分类数据失败: {e}")
        return None
    return members

# Page through the endpoint until it returns an empty page, instead of assuming
# the table fits in exactly two pages; the offset advances by the number of
# rows actually received.
sw_pages = []
sw_offset = 0
while True:
    sw_page = get_sw_industry(sw_offset)
    if sw_page is None:
        # get_sw_industry returns None on failure; stop here rather than
        # saving a file that silently misses a page.
        raise RuntimeError(f"申万行业分类数据在 offset={sw_offset} 处获取失败，已中止以避免保存不完整的数据。")
    if len(sw_page) == 0:
        break
    sw_pages.append(sw_page)
    sw_offset += len(sw_page)

if not sw_pages:
    raise RuntimeError("未获取到任何申万行业分类数据。")

sw_ind_df = pd.concat(sw_pages, ignore_index=True)
os.makedirs('data', exist_ok=True)
sw_ind_df.to_csv('data/sw_industry_components.csv', index=False)
print("申万行业分类数据已保存到 sw_industry_components.csv")

获取到 3000 条申万行业成分股数据。
获取到 2847 条申万行业成分股数据。
申万行业分类数据已保存到 sw_industry_components.csv


In [14]:
# Display the combined Shenwan constituent table for a visual check.
sw_ind_df

Unnamed: 0,l1_code,l1_name,l2_code,l2_name,l3_code,l3_name,ts_code,name,in_date,out_date,is_new
0,801880.SI,汽车,801881.SI,摩托车及其他,858811.SI,其他运输设备,600679.SH,上海凤凰,19931008,,Y
1,801180.SI,房地产,801181.SI,房地产开发,851811.SI,住宅开发,600683.SH,京投发展,19931025,,Y
2,801740.SI,国防军工,801744.SI,航海装备Ⅱ,850935.SI,航海装备Ⅲ,600685.SH,中船防务,19931028,,Y
3,801960.SI,石油石化,801963.SI,炼化及贸易,859631.SI,炼油化工,600688.SH,上海石化,19931108,,Y
4,801200.SI,商贸零售,801203.SI,一般零售,852031.SI,百货,600693.SH,东百集团,19931122,,Y
...,...,...,...,...,...,...,...,...,...,...,...
5842,801130.SI,纺织服饰,801131.SI,纺织制造,851316.SI,其他纺织,603238.SH,诺邦股份,20250811,,Y
5843,801010.SI,农林牧渔,801017.SI,养殖业,850172.SI,生猪养殖,603717.SH,天域生物,20250811,,Y
5844,801080.SI,电子,801081.SI,半导体,850818.SI,半导体设备,603991.SH,至正股份,20250811,,Y
5845,801180.SI,房地产,801183.SI,房地产服务,851831.SI,物业管理,600136.SH,ST明诚,20250811,,Y


In [None]:
# Attach the Shenwan level-3 industry name to every daily row.
# Keep a single row per ts_code (the one with the latest in_date) before
# merging: a stock that appears more than once in sw_ind_df would otherwise
# duplicate each of its daily rows in the left join.
# NOTE(review): this maps one industry onto the whole history of a stock; a
# point-in-time mapping would need in_date/out_date -- confirm whether needed.
sw_l3_by_stock = (
    sw_ind_df[['ts_code', 'l3_name', 'in_date']]
    .sort_values('in_date', kind='stable', na_position='first')
    .drop_duplicates(subset='ts_code', keep='last')
    .loc[:, ['ts_code', 'l3_name']]
)
# validate='many_to_one' makes pandas raise if the right side is not unique.
stk_ind_df = pd.merge(
    daily_basic_data,
    sw_l3_by_stock,
    on='ts_code',
    how='left',
    validate='many_to_one',
)

In [18]:
# Create the output directory first so the export cannot fail when data/ is
# missing.
os.makedirs('data', exist_ok=True)
stk_ind_df.to_csv('data/stk_ind_data_20150101_20251017.csv', index=False)
print("合并后的股票日线与行业数据已保存到 stk_ind_data_20150101_20251017.csv")

合并后的股票日线与行业数据已保存到 stk_ind_data_20150101_20251017.csv


In [None]:
# 下面获取季度财务指标数据

# 连接Mongodb数据库
from pymongo import MongoClient, DESCENDING


# Database connection settings
MONGO_CONNECTION_STRING = "mongodb://localhost:27017/"
DB_NAME = "barra_financial_data"

client = MongoClient(MONGO_CONNECTION_STRING)
# MongoClient connects lazily, so constructing it proves nothing. Run a cheap
# command to make sure the server is actually reachable before reporting
# success; this raises if it is not.
client.admin.command("ping")
db = client[DB_NAME]
print("成功连接到 MongoDB。")


def load_collection_to_df(db, collection_name: str, query: dict, projection: dict) -> pd.DataFrame:
    """
    Run a filtered, projected ``find`` on one collection and return a DataFrame.

    Args:
        db: Object that yields a collection when indexed by name.
        collection_name: Name of the collection to read.
        query: Filter document passed to ``find``.
        projection: Projection document passed to ``find``.

    Returns:
        A DataFrame with one row per document returned by the cursor.
    """
    print(f"正在从 '{collection_name}' 加载数据...")
    # Drain the cursor into a list before handing the records to pandas.
    records = [document for document in db[collection_name].find(query, projection)]
    frame = pd.DataFrame(records)
    print(f"-> 成功加载 {len(frame):,} 行数据。")
    return frame


# Load only documents whose end_date string is >= "20150101".
financial_indicators_query = {
    "end_date": {"$gte": "20150101"}
}

# Exclude Mongo's _id field from the result.
financial_indicators_projection = {
    "_id": 0,
}

# Release the connection even when the load raises; everything after this
# point only works on the in-memory frame.
try:
    financial_indicators_df = load_collection_to_df(
        db,
        collection_name="financial_indicators",
        query=financial_indicators_query,
        projection=financial_indicators_projection
    )
finally:
    client.close()

# An empty result has no columns, so fail with a clear message here instead of
# an opaque KeyError on 'ann_date' below.
if financial_indicators_df.empty:
    raise ValueError("financial_indicators 集合中没有 end_date >= 20150101 的数据。")

financial_indicators_processed_df = financial_indicators_df.copy()

# Parse the YYYYMMDD date columns into datetimes.
financial_indicators_processed_df['ann_date'] = pd.to_datetime(financial_indicators_processed_df['ann_date'], format='%Y%m%d')
financial_indicators_processed_df['end_date'] = pd.to_datetime(financial_indicators_processed_df['end_date'], format='%Y%m%d')

# Keep one row per (ts_code, ann_date): end_date is sorted in descending order
# so the row with the latest end_date is the one kept.
financial_indicators_processed_df = financial_indicators_processed_df.sort_values(
    ['ts_code', 'ann_date', 'end_date'],
    ascending=[True, True, False]
).drop_duplicates(subset=['ts_code', 'ann_date'], keep='first')

print("数据加载和预处理全部完成。")

成功连接到 MongoDB。
正在从 'financial_indicators' 加载数据...
-> 成功加载 194,742 行数据。
数据加载和预处理全部完成。
财务指标数据已保存到 financial_indicators_2015q1_2025q2.csv


In [37]:
# Attach the Shenwan level-3 industry name to the quarterly financial indicators.
# Keep a single row per ts_code (the one with the latest in_date) before
# merging: a stock that appears more than once in sw_ind_df would otherwise
# duplicate each of its financial rows in the left join.
sw_l3_for_financials = (
    sw_ind_df[['ts_code', 'l3_name', 'in_date']]
    .sort_values('in_date', kind='stable', na_position='first')
    .drop_duplicates(subset='ts_code', keep='last')
    .loc[:, ['ts_code', 'l3_name']]
)
# validate='many_to_one' makes pandas raise if the right side is not unique.
stk_fin_df = pd.merge(
    financial_indicators_processed_df,
    sw_l3_for_financials,
    on='ts_code',
    how='left',
    validate='many_to_one',
)
# Create the output directory first so the export cannot fail when data/ is
# missing.
os.makedirs('data', exist_ok=True)
stk_fin_df.to_csv('data/ind_financial_indicators_2015q1_2025q2.csv', index=False)
print("财务指标数据与行业信息已保存到 ind_financial_indicators_2015q1_2025q2.csv")

财务指标数据与行业信息已保存到 ind_financial_indicators_2015q1_2025q2.csv


In [45]:
# 沪深300日线数据

def get_index_daily(ts_code, start_date='', end_date=''):
    """
    Request the daily bars of one index from Tushare.

    Args:
        ts_code: Index code forwarded to ``pro.index_daily``.
        start_date: Start of the requested range; empty string by default.
        end_date: End of the requested range; empty string by default.

    Returns:
        Whatever the API call returns, or None when the request (or the
        row-count report) raises.
    """
    # trade_date, limit and offset are always sent as empty strings.
    request_params = {
        "ts_code": ts_code,
        "trade_date": "",
        "start_date": start_date,
        "end_date": end_date,
        "limit": "",
        "offset": "",
    }
    wanted_fields = [
        "ts_code", "trade_date",
        "close", "open", "high", "low",
        "pre_close", "change", "pct_chg",
        "vol", "amount",
    ]

    try:
        bars = pro.index_daily(**request_params, fields=wanted_fields)
        print(f"获取到 {len(bars)} 条 {ts_code} 的日线数据。")
    except Exception as e:
        print(f"获取 {ts_code} 日线数据失败: {e}")
        return None
    return bars

csi300_daily_df = get_index_daily(ts_code='000300.SH', start_date='20150101', end_date='20251017')
# get_index_daily returns None on failure; stop with a clear message instead of
# an AttributeError on None.to_csv.
if csi300_daily_df is None:
    raise RuntimeError("沪深300日线数据获取失败，未生成 CSV 文件。")
# Create the output directory first so the export cannot fail when data/ is
# missing.
os.makedirs('data', exist_ok=True)
csi300_daily_df.to_csv('data/csi300_daily_20150101_20251017.csv', index=False)
print("沪深300日线数据已保存到 csi300_daily_20150101_20251017.csv")


获取到 2621 条 000300.SH 的日线数据。
沪深300日线数据已保存到 csi300_daily_20150101_20251017.csv


In [44]:
# Display the downloaded 000300.SH daily frame for a visual check.
csi300_daily_df

Unnamed: 0,ts_code,trade_date,close,open,high,low,pre_close,change,pct_chg,vol,amount
0,000300.SH,20251017,4514.2345,4610.6967,4618.5621,4509.3289,4618.4218,-104.1873,-2.2559,256905617.0,5.590860e+08
1,000300.SH,20251016,4618.4218,4588.9206,4645.0959,4586.2801,4606.2879,12.1339,0.2634,248235518.0,5.605672e+08
2,000300.SH,20251015,4606.2879,4544.2431,4608.8239,4526.8340,4539.0644,67.2235,1.4810,284771855.0,6.073263e+08
3,000300.SH,20251014,4539.0644,4634.0543,4641.4492,4520.9258,4593.9785,-54.9141,-1.1953,372954030.0,8.089590e+08
4,000300.SH,20251013,4593.9785,4490.8661,4601.8072,4490.8661,4616.8341,-22.8556,-0.4950,315010354.0,7.140016e+08
...,...,...,...,...,...,...,...,...,...,...,...
2616,000300.SH,20150109,3546.7230,3547.5740,3689.7530,3536.3950,3559.2590,-12.5360,-0.3522,349982672.0,4.302106e+08
2617,000300.SH,20150108,3559.2590,3650.0730,3659.9450,3552.1000,3643.7900,-84.5310,-2.3199,295003045.0,3.558320e+08
2618,000300.SH,20150107,3643.7900,3620.9240,3671.1900,3601.6980,3641.0590,2.7310,0.0750,320191232.0,3.987317e+08
2619,000300.SH,20150106,3641.0590,3608.4280,3683.2260,3587.2310,3641.5410,-0.4820,-0.0132,420962185.0,4.985296e+08
