## Instruction

In this notebook, I will download and process stock data like P4 project. It is preparing for backtesting.

In [3]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

import tushare as ts

In [4]:
# register token
token = 'f3d9169b387b6f38d4b4455e62a8cb84fa2d56ece1e42d0709cc8c57' # your token
ts.set_token(token)
pro = ts.pro_api()

## Pick Stock List
We got stock list by each day, and pick up stock exist in that day with some ohter custom constraints.

In [5]:
# trade calendar
start_date, end_date = '20170101' , '20230317'
calendar = pro.trade_cal(exchange='SSE', is_open='1', 
                            start_date=start_date, 
                            end_date=end_date, 
                            fields='cal_date')
calendar = calendar.values.flatten()[::-1]
print((calendar[0], calendar[-1]), len(calendar))

('20170103', '20230317') 1508


In [4]:
# get stock list info
def get_basic_from_Tushare(trade_date):
    # column: list_date(exist on )
    basic_fields = 'trade_date,ts_code,name,industry,list_date'
    df = pro.bak_basic(trade_date=trade_date, fields=basic_fields)
    return df
# test function
df = get_basic_from_Tushare(calendar[0])
df.head()

Unnamed: 0,trade_date,ts_code,name,industry,list_date
0,20170103,601375.SH,N中原,证券,20170103
1,20170103,603186.SH,N华正,元器件,20170103
2,20170103,600249.SH,两面针,日用化工,20040130
3,20170103,002277.SZ,友阿股份,百货,20090717
4,20170103,000547.SZ,航天发展,通信设备,19931130


In [5]:
# filter stock df
my_indusry = {'医药商业':1,'中成药':2,'生物制药':3,'化学制药':4,'互联网':5,'软件服务':6}
def filter_basic_stockDf(df):
    # got sotck exist on that day
    df = df.loc[df.list_date!='0']
    # filter custom industry
    df = df.loc[df.industry.isin(my_indusry.keys())]
    return df
# test function
df = filter_basic_stockDf(df)
print(df.shape)
df.head()

(373, 5)


Unnamed: 0,trade_date,ts_code,name,industry,list_date
9,20170103,300573.SZ,兴齐眼药,化学制药,20161208
15,20170103,300571.SZ,平治信息,互联网,20161213
17,20170103,300579.SZ,数字认证,软件服务,20161223
19,20170103,300561.SZ,汇金科技,软件服务,20161117
43,20170103,603716.SH,塞力斯,医药商业,20161031


## Download Data by Tickers

In [8]:
def get_daydata_from_Tushare(fundamental_df):
    
    def get_daydata_by_ticker_(ts_code):
        # indicator check: https://tushare.pro/document/2?doc_id=32
        my_fields = 'ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv'
        df = pro.query('daily_basic', ts_code=ts_code, start_date=calendar[0], end_date=calendar[-1], fields=my_fields)
        return df
    
    tickers = fundamental_df['ts_code'].values
    df = pd.DataFrame()
    for ts_code in tickers:
        ticker_df = get_daydata_by_ticker_(ts_code)
        tmp = fundamental_df.loc[fundamental_df.ts_code==ts_code]
        ticker_df['name'] = tmp['name'].unique()[0]
        ticker_df['industry'] = tmp['industry'].unique()[0]
        ticker_df['list_date'] = tmp['list_date'].unique()[0]
        df = df.append(ticker_df)
    return df
    
# test function 
df_20170103 = get_daydata_from_Tushare(df.iloc[:2,])
print(df_20170103.shape)
df_20170103.head()

(3011, 13)


Unnamed: 0,ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv,name,industry,list_date
0,300573.SZ,20230317,3.1363,1.1,60.1276,7.6579,8855.5082,6173.5715,1170698.0,899351.1798,兴齐眼药,化学制药,20161208
1,300573.SZ,20230316,1.9747,0.59,59.4908,7.5768,8855.5082,6173.5715,1158300.0,889827.0372,兴齐眼药,化学制药,20161208
2,300573.SZ,20230315,2.4161,0.6,59.1406,7.5322,8855.5082,6173.5715,1151482.0,884588.7588,兴齐眼药,化学制药,20161208
3,300573.SZ,20230314,3.5934,0.8,60.6734,7.7274,8855.5082,6173.5715,1181325.0,907514.7306,兴齐眼药,化学制药,20161208
4,300573.SZ,20230313,3.2828,0.72,59.3316,7.5565,8855.5082,6173.5715,1155201.0,887446.0016,兴齐眼药,化学制药,20161208


In [9]:
def get_dayIndicator_from_Tushare(fundamental_df):
    
    def get_daydata_by_ticker_(ts_code):
        # inidcator check: https://tushare.pro/document/2?doc_id=296
        my_fields = 'ts_code,trade_date,close_qfq,open_qfq,high_qfq,low_qfq,pct_change,amount,adj_factor,\
                macd,rsi_6,rsi_12,rsi_24,boll_upper,boll_mid,boll_lower,cci'
        df = pro.stk_factor(ts_code=ts_code, start_date=calendar[0], end_date=calendar[-1], fields=my_fields)
        df.rename(columns = {'close_qfq':'close', 'open_qfq':'open', 'high_qfq':'high', 'low_qfq':'low'}, inplace = True)
        return df
    
    tickers = fundamental_df['ts_code'].values
    df = pd.DataFrame()
    for ts_code in tickers:
        ticker_df = get_daydata_by_ticker_(ts_code)
        df = df.append(ticker_df)
    return df
        
df_20170103_ = get_dayIndicator_from_Tushare(df.iloc[:2,])
df_20170103 = df_20170103.merge(df_20170103_, on=['trade_date','ts_code'], how='left')
print(df_20170103.shape)
df_20170103.head()

(3011, 28)


Unnamed: 0,ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv,...,high,low,macd,rsi_6,rsi_12,rsi_24,boll_upper,boll_mid,boll_lower,cci
0,300573.SZ,20230317,3.1363,1.1,60.1276,7.6579,8855.5082,6173.5715,1170698.0,899351.1798,...,132.63,126.0,0.203,48.675,45.01,47.131,151.388,134.789,118.191,-0.293
1,300573.SZ,20230316,1.9747,0.59,59.4908,7.5768,8855.5082,6173.5715,1158300.0,889827.0372,...,132.49,129.15,-0.179,44.293,43.016,46.276,154.155,135.827,117.5,-3.724
2,300573.SZ,20230315,2.4161,0.6,59.1406,7.5322,8855.5082,6173.5715,1151482.0,884588.7588,...,133.97,129.6,-0.383,42.024,41.955,45.813,156.614,136.951,117.288,-9.687
3,300573.SZ,20230314,3.5934,0.8,60.6734,7.7274,8855.5082,6173.5715,1181325.0,907514.7306,...,137.77,131.28,-0.413,49.356,45.343,47.529,158.722,138.112,117.502,10.383
4,300573.SZ,20230313,3.2828,0.72,59.3316,7.5565,8855.5082,6173.5715,1155201.0,887446.0016,...,136.49,129.3,-1.007,41.971,41.556,45.827,162.308,139.506,116.704,-17.575


## Download ALL

In [20]:
# download start day
win_len = 5
fundamental_df = get_basic_from_Tushare(calendar[0])
fundamental_df = filter_basic_stockDf(df)
universe_raw = pd.DataFrame()
for i in tqdm(range(0,fundamental_df.shape[0]-1,win_len), desc='downloaing'):
    tmp = fundamental_df.iloc[i:i+win_len,]
    universe = get_daydata_from_Tushare(tmp)
    tmp = get_dayIndicator_from_Tushare(tmp)
    universe = universe.merge(tmp, on=['trade_date','ts_code'], how='left')
    universe_raw = universe_raw.append(universe)

if (i+win_len) <= (fundamental_df.shape[0]-1):
    tmp = fundamental_df.iloc[i+win_len:]
    universe = get_daydata_from_Tushare(tmp)
    tmp = get_dayIndicator_from_Tushare(tmp)
    universe = universe.merge(tmp, on=['trade_date','ts_code'], how='left')
    universe_raw = universe_raw.append(universe)
    
print(universe_raw.shape)
universe_raw.head()

downloaing: 100%|███████████████████████████████| 75/75 [05:53<00:00,  4.71s/it]

(548349, 28)





Unnamed: 0,ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv,...,high,low,macd,rsi_6,rsi_12,rsi_24,boll_upper,boll_mid,boll_lower,cci
0,300573.SZ,20230317,3.1363,1.1,60.1276,7.6579,8855.5082,6173.5715,1170698.0,899351.1798,...,132.63,126.0,0.203,48.675,45.01,47.131,151.388,134.789,118.191,-0.293
1,300573.SZ,20230316,1.9747,0.59,59.4908,7.5768,8855.5082,6173.5715,1158300.0,889827.0372,...,132.49,129.15,-0.179,44.293,43.016,46.276,154.155,135.827,117.5,-3.724
2,300573.SZ,20230315,2.4161,0.6,59.1406,7.5322,8855.5082,6173.5715,1151482.0,884588.7588,...,133.97,129.6,-0.383,42.024,41.955,45.813,156.614,136.951,117.288,-9.687
3,300573.SZ,20230314,3.5934,0.8,60.6734,7.7274,8855.5082,6173.5715,1181325.0,907514.7306,...,137.77,131.28,-0.413,49.356,45.343,47.529,158.722,138.112,117.502,10.383
4,300573.SZ,20230313,3.2828,0.72,59.3316,7.5565,8855.5082,6173.5715,1155201.0,887446.0016,...,136.49,129.3,-1.007,41.971,41.556,45.827,162.308,139.506,116.704,-17.575


In [27]:
# download other day
for trade_date in tqdm(calendar[1:], desc='funmental_df downloading'):
    df = get_basic_from_Tushare(trade_date)
    df = filter_basic_stockDf(df)
    df = df.loc[df.ts_code.isin(fundamental_df.ts_code)==False]
    fundamental_df = fundamental_df.append(df)
    if df.empty != True:
        universe = get_daydata_from_Tushare(df)
        tmp = get_dayIndicator_from_Tushare(df)
        universe = universe.merge(tmp, on=['trade_date','ts_code'], how='left')
        universe_raw = universe_raw.append(universe)
        
print(universe_raw.shape)
universe_raw.head()

funmental_df downloading: 100%|█████████████| 1507/1507 [19:00<00:00,  1.32it/s]

(887573, 28)





Unnamed: 0,ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv,...,high,low,macd,rsi_6,rsi_12,rsi_24,boll_upper,boll_mid,boll_lower,cci
0,300573.SZ,20230317,3.1363,1.1,60.1276,7.6579,8855.5082,6173.5715,1170698.0,899351.1798,...,132.63,126.0,0.203,48.675,45.01,47.131,151.388,134.789,118.191,-0.293
1,300573.SZ,20230316,1.9747,0.59,59.4908,7.5768,8855.5082,6173.5715,1158300.0,889827.0372,...,132.49,129.15,-0.179,44.293,43.016,46.276,154.155,135.827,117.5,-3.724
2,300573.SZ,20230315,2.4161,0.6,59.1406,7.5322,8855.5082,6173.5715,1151482.0,884588.7588,...,133.97,129.6,-0.383,42.024,41.955,45.813,156.614,136.951,117.288,-9.687
3,300573.SZ,20230314,3.5934,0.8,60.6734,7.7274,8855.5082,6173.5715,1181325.0,907514.7306,...,137.77,131.28,-0.413,49.356,45.343,47.529,158.722,138.112,117.502,10.383
4,300573.SZ,20230313,3.2828,0.72,59.3316,7.5565,8855.5082,6173.5715,1155201.0,887446.0016,...,136.49,129.3,-1.007,41.971,41.556,45.827,162.308,139.506,116.704,-17.575


## Clean Data

In [40]:
# check exist stock list
data_L = pro.query('stock_basic',list_status='L', 
                   fields='ts_code,symbol,name,area,industry,list_date,delist_date,market')
data_L = data_L.loc[data_L.industry.isin(my_indusry)]
data_L

In [68]:
# remove close is null
universe_raw = universe_raw.loc[universe_raw.close.isnull()==False]
fundamental_df = fundamental_df.loc[fundamental_df.ts_code.isin(universe_raw.ts_code)]

In [119]:
# remove pct_change not in range
universe_raw = universe_raw.loc[(universe_raw['pct_change']<=25.) & (universe_raw['pct_change']>=-25.)]

In [120]:
# remove amount < 0.5 million yuan
universe_raw = universe.loc[universe_raw['amount']>=500]

In [132]:
# removw ST
def column_from_names(n, name):
    return list(filter(lambda x: name in x, n))
ST_list = column_from_names(universe_raw['name'].unique(), 'ST')
universe_raw = universe_raw.loc[universe.name.isin(ST_list)==False]

In [149]:
# update fundamental df
fundamental_df = fundamental_df.loc[fundamental_df.ts_code.isin(universe_raw.ts_code.unique())]

In [None]:
# As we delete some rows, the pct_change isn't correct, recalculate pct_change
# universe_raw.drop(columns='pct_change', inplace=True)
universe = pd.DataFrame()
for group_ticker in universe_raw.groupby('ts_code'):
    ticker = group_ticker[1]
    ticker['date'] = pd.to_datetime(ticker['trade_date'], format='%Y%m%d')
    ticker = ticker.sort_values(by=['date']).reset_index(drop=True)
    ticker['pct_change'] = ticker['close'].pct_change().fillna(method='bfill')
    universe = universe.append(ticker)
universe[view_columns]
universe_raw = universe

In [184]:
# reset pe is null to 1.e3
universe_raw['pe'] = np.where(universe_raw['pe'].isnull(), 1.*1e3, universe_raw['pe'])

## Download Factor from Tushare

In [23]:
def find_next_close_date(date):
    if date in calendar:
        return date
    int_date = int(date)
    min_interval = np.inf
    idx = 0
    for ii,dt in enumerate(calendar):
        if int(dt) > int_date and int(dt) - int_date < min_interval:
            min_interval = int(dt) - int_date
            idx = ii
    return calendar[idx]
        
# download profit notice
def get_profit_notice_from_Tushare(ts_code):
    start_date='20161201'
    my_fields = 'ts_code,ann_date,first_ann_date,type,p_change_min,p_change_max'
    df = pro.forecast_vip(ts_code=ts_code ,start_date=start_date, end_date=calendar[-1] ,fields=my_fields)
    df['first_ann_date'] = np.where(df['first_ann_date'].isnull(), df['ann_date'], df['first_ann_date'])
    df.fillna(method='bfill', inplace=True)
    return df

# test function
get_profit_notice_from_Tushare('603538.SH')

Unnamed: 0,ts_code,ann_date,type,p_change_min,p_change_max,first_ann_date
0,603538.SH,20230131,预增,126.76,166.78,20230131
1,603538.SH,20220705,预增,54.7,71.89,20220705
2,603538.SH,20200122,预增,45.0,60.0,20200122
3,603538.SH,20190122,预增,116.0,131.0,20190122
4,603538.SH,20171018,略减,-45.0,-35.0,20171018
5,603538.SH,20170316,略增,5.0,20.0,20170316


In [267]:
# download all profit notice
ticker_list = fundamental_df.ts_code.unique()
notice_df_all = pd.DataFrame()
for ts_code in tqdm(ticker_list, desc='download profit notice'):
    notice_df = get_profit_notice_from_Tushare(ts_code)
    for ii, dt in enumerate(notice_df['first_ann_date']):
        dt = find_next_close_date(dt)
        notice_df.at[ii,'first_ann_date'] = dt
    notice_df['trade_date'] = notice_df['first_ann_date'].apply(np.int64)
    notice_df = notice_df.drop_duplicates(subset=['trade_date'])
    notice_df_all = notice_df_all.append(notice_df)
    
universe = universe_raw.copy(deep=True)
universe = universe.merge(notice_df_all[['ts_code','trade_date','type','p_change_min','p_change_max']], 
                        on=['ts_code','trade_date'], how='left')

download profit notice: 100%|█████████████████| 746/746 [01:57<00:00,  6.36it/s]


In [290]:
# fillna use ffill then fillna(0)
universe_raw = pd.DataFrame()
for group_ticker in tqdm(universe.groupby('ts_code'), desc='aggregeate df'):
    ticker = group_ticker[1]
    ticker[['type', 'p_change_min', 'p_change_max']] = ticker[['type', 'p_change_min', 'p_change_max']].fillna(method='ffill')
    ticker['type'].fillna('不确定',inplace=True)
    ticker[['p_change_min', 'p_change_max']] = ticker[['p_change_min', 'p_change_max']].fillna(0.)
    universe_raw = universe_raw.append(ticker)

aggregeate df: 100%|██████████████████████████| 746/746 [01:11<00:00, 10.37it/s]


In [339]:
# add type value
type_dict = {'不确定':-1, '预增':2, '首亏':-2, '预减':-2, '扭亏':0, '续亏':-3, '略增':1, '续盈':3, '略减':-1}
for key in type_dict.keys():
    universe_raw['type_value'] = np.where(universe_raw['type']==key, type_dict[key], universe_raw['type_value'])

## Download IPO info 

In [306]:
df1 = pro.new_share(start_date='20140101', end_date='20200101')
time.sleep(1)
df2 = pro.new_share(start_date='20200102', end_date='20230318')
df = df1.append(df2)
print(df.shape)
df.head()

(2765, 12)


Unnamed: 0,ts_code,sub_code,name,ipo_date,issue_date,amount,market_amount,price,pe,limit_amount,funds,ballot
0,002973.SZ,2973,侨银股份,20191225,20200106,4089.0,3680.0,5.74,22.99,1.2,2.347,0.04
1,688181.SH,787181,八亿时空,20191225,20200106,2412.0,828.0,43.98,37.35,0.6,3.532,0.04
2,300812.SZ,300812,易天股份,20191225,20200109,1938.0,1938.0,21.46,22.99,1.9,4.159,0.02
3,688081.SH,787081,兴图新科,20191224,20200106,1840.0,699.0,28.21,51.7,0.5,4.558,0.04
4,300811.SZ,300811,铂科新材,20191219,20191230,1440.0,1440.0,26.22,22.99,1.4,3.776,0.01


In [323]:
universe = pd.DataFrame()
for group_ticker in tqdm(universe_raw.groupby('ts_code'), desc='aggregeate df'):
    ticker = group_ticker[1]
    if group_ticker[0] in df.ts_code.values:
        tmp = df.loc[df.ts_code==group_ticker[0]]
        ticker['list_date'] = tmp['issue_date'].values[0]
        ticker['issue_price'] = tmp['price'].values[0]
        ticker['issue_amount'] = tmp['amount'].values[0] # issue stcke unite is 10000 share
    else:
        ticker['issue_price'] = ticker.iloc[0,:]['open']
        ticker['issue_amount'] = ticker.iloc[0,:]['free_share']
    universe = universe.append(ticker)
universe_raw = universe      
#ticker[['trade_date','ts_code','name','list_date','issue_price','issue_amount','free_share']]

aggregeate df: 100%|██████████████████████████| 746/746 [01:18<00:00,  9.55it/s]


## Download Finance Info

In [26]:
def get_finace_reports_from_Tushre(ts_code):
    # https://tushare.pro/document/2?doc_id=79
    start_date='20161201'
    my_fields = [
                  'ts_code','ann_date','cfps','revenue_ps', 'quick_ratio', # 每股现金流，每股营业收入，速冻比率
                  'dt_eps','basic_eps_yoy','dt_eps_yoy', # 每股收益
                  'bps','bps_yoy', # 每股净资产
                  'extra_item','profit_dedt', # 扣非，扣非净利润，扣费净利润单季度 
                  'roe_dt','q_dt_roe','roe_yoy', # 净资产收益
                  'capital_rese_ps','surplus_rese_ps', # 每股资本公积，每股公积盈余 
                  'gross_margin','interestdebt','ca_to_assets',# 毛利，速冻比率，带息债务
                  'ebt_yoy','roe_yoy','or_yoy','equity_yoy' # 总利润增长，净资产收益增长，营业收入增长，净资产增长
                 ]
    df = pro.fina_indicator(ts_code=ts_code, start_date=start_date, end_date=calendar[-1])[my_fields]
    df = df.fillna(method='bfill').dropna()
    return df

# function test
get_finace_reports_from_Tushre('603538.SH').head()

Unnamed: 0,ts_code,ann_date,cfps,revenue_ps,quick_ratio,dt_eps,basic_eps_yoy,dt_eps_yoy,bps,bps_yoy,...,roe_yoy,capital_rese_ps,surplus_rese_ps,gross_margin,interestdebt,ca_to_assets,ebt_yoy,roe_yoy.1,or_yoy,equity_yoy
0,603538.SH,20221028,0.1674,5.7783,0.8542,0.95,54.5455,53.2258,8.8382,-20.289,...,48.4919,2.5581,0.1163,502780400.0,1424953000.0,43.3206,60.0877,48.4919,33.9594,6.4431
1,603538.SH,20220816,0.2172,4.2131,0.924,0.82,52.6316,54.717,8.6753,-21.7581,...,49.0908,2.5336,0.1163,379191300.0,1409538000.0,43.8655,66.9877,49.0908,29.738,6.4495
2,603538.SH,20220426,0.0479,3.2814,1.11,0.7,123.5294,105.8824,12.0966,9.0983,...,106.922,4.1745,0.1626,202535400.0,1295483000.0,42.7028,130.5781,106.922,47.8641,10.643
3,603538.SH,20220426,1.3101,8.3147,0.979,0.92,-9.434,-13.2075,11.0878,6.9033,...,-18.3608,3.8807,0.1638,469219900.0,1260414000.0,42.544,-12.017,-18.3608,5.4278,11.9375
4,603538.SH,20211028,0.49,6.1415,1.1275,0.87,-5.1546,-9.375,11.2975,8.9252,...,-20.269,3.7641,0.1656,339250200.0,1148648000.0,43.1607,-7.8135,-20.269,-3.1838,20.4111


In [28]:
# download all profit notice
ticker_list = fundamental_df.ts_code.unique()
finance_df_all = pd.DataFrame()
for ts_code in tqdm(ticker_list, desc='finance info download...'):
    finance_df = get_finace_reports_from_Tushre(ts_code)
    for ii, dt in enumerate(finance_df['ann_date']):
        dt = find_next_close_date(dt)
        finance_df.at[ii,'ann_date'] = dt
    finance_df['trade_date'] = finance_df['ann_date'].apply(np.int64)
    finance_df = finance_df.drop_duplicates(subset=['trade_date']).drop(columns=['ann_date'])
    finance_df_all = finance_df_all.append(finance_df)

print(finance_df_all.shape)
finance_df_all.head()

finance info download...: 100%|███████████████| 746/746 [02:10<00:00,  5.71it/s]

(12742, 24)





Unnamed: 0,ts_code,cfps,revenue_ps,quick_ratio,dt_eps,basic_eps_yoy,dt_eps_yoy,bps,bps_yoy,extra_item,...,capital_rese_ps,surplus_rese_ps,gross_margin,interestdebt,ca_to_assets,ebt_yoy,roe_yoy,or_yoy,equity_yoy,trade_date
0,300573.SZ,-1.9341,11.3999,4.0629,2.3,21.0526,21.0526,17.0405,4.3509,-4488617.89,...,9.2007,0.5,804688600.0,18654660.0,50.4077,23.3388,-27.1227,30.3662,78.3302,20221027
1,300573.SZ,-2.4958,6.8173,4.7004,1.37,38.3838,38.3838,16.6485,1.9504,-8187669.17,...,9.2419,0.5,479443000.0,17101080.0,47.6592,40.1626,-21.9269,29.8916,92.8076,20220829
2,300573.SZ,-0.5873,3.4276,4.5492,0.84,110.0,110.0,16.1573,-1.0576,-1093518.42,...,8.9381,0.5,242191900.0,11260110.0,56.2816,107.1915,12.3208,45.7332,98.8565,20220427
3,300573.SZ,7.8091,12.4849,3.3433,2.38,119.2661,120.3704,16.33,97.1627,9881472.83,...,9.5063,0.535,802668900.0,111279600.0,58.3588,137.5056,12.2897,49.2582,97.0852,20220420
5,300573.SZ,1.5298,9.3565,1.6025,1.9,192.3077,192.3077,10.2244,23.4458,935852.45,...,3.9434,0.5002,604187900.0,101210600.0,40.8004,212.1494,114.3444,64.1312,35.5078,20211028


In [35]:
# merage all
universe = universe_raw.copy(deep=True)
universe = universe.merge(finance_df_all, on=['ts_code','trade_date'], how='left')
# fillna use ffill then fillna(0)
universe_raw = pd.DataFrame()
for group_ticker in tqdm(universe.groupby('ts_code'), desc='aggregeate df'):
    ticker = group_ticker[1]
    ticker.fillna(method='ffill', inplace=True)
    universe_raw = universe_raw.append(ticker)
universe_raw.reset_index(drop=True, inplace=True)

aggregeate df: 100%|██████████████████████████| 746/746 [01:51<00:00,  6.69it/s]


In [36]:
view_columns = ['trade_date','ts_code','name','open','close','pct_change','amount','volume_ratio','pe',
                'type','p_change_min','p_change_max','list_date']
#universe_raw.loc[universe_raw.ts_code=='603538.SH'][view_columns]
print(universe_raw.shape)
universe_raw.loc[universe_raw.ts_code=='603538.SH']

(867437, 57)


Unnamed: 0,ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv,...,roe_yoy,capital_rese_ps,surplus_rese_ps,gross_margin,interestdebt,ca_to_assets,ebt_yoy,roe_yoy.1,or_yoy,equity_yoy
759397,603538.SH,20170414,0.0595,2.33,49.5108,3.6430,12000.0000,3000.0000,390360.0000,97590.0000,...,,,,,,,,,,
759398,603538.SH,20170417,1.6947,58.10,54.4573,4.0070,12000.0000,3000.0000,429360.0000,107340.0000,...,,,,,,,,,,
759399,603538.SH,20170418,49.6959,136.06,57.4861,4.2299,12000.0000,3000.0000,453240.0000,113310.0000,...,,,,,,,,,,
759400,603538.SH,20170419,43.8395,4.26,59.9061,4.4079,12000.0000,3000.0000,472320.0000,118080.0000,...,,,,,,,,,,
759401,603538.SH,20170420,30.0098,1.57,56.4816,4.1559,12000.0000,3000.0000,445320.0000,111330.0000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760761,603538.SH,20230313,1.4098,1.04,37.9850,2.8709,21339.6218,15379.6469,541386.2051,535693.5957,...,48.4919,2.5581,0.1163,5.027804e+08,1.424953e+09,43.3206,60.0877,48.4919,33.9594,6.4431
760762,603538.SH,20230314,1.4092,1.05,37.5807,2.8403,21339.6218,15379.6469,535624.5072,529992.4813,...,48.4919,2.5581,0.1163,5.027804e+08,1.424953e+09,43.3206,60.0877,48.4919,33.9594,6.4431
760763,603538.SH,20230315,1.3603,1.10,38.0149,2.8731,21339.6218,15379.6469,541812.9975,536115.9004,...,48.4919,2.5581,0.1163,5.027804e+08,1.424953e+09,43.3206,60.0877,48.4919,33.9594,6.4431
760764,603538.SH,20230316,1.3649,1.03,37.4909,2.8335,21339.6218,15379.6469,534344.1299,528725.5670,...,48.4919,2.5581,0.1163,5.027804e+08,1.424953e+09,43.3206,60.0877,48.4919,33.9594,6.4431


## Save Raw Data

In [37]:
#'20170103', '20230317'
universe_raw = universe_raw.drop_duplicates(keep='first').reset_index(drop=True)
fundamental_df = fundamental_df.drop_duplicates(keep='first').reset_index(drop=True)
universe_raw.to_csv('raw_20170103_20230317.csv')
fundamental_df.to_csv('fundamental_20170103_20230317.csv')

## Load Data

In [33]:
universe_raw = pd.read_csv('raw_20170103_20230317.csv').iloc[:,1:]
fundamental_df = pd.read_csv('fundamental_20170103_20230317.csv').iloc[:,1:]