## Instruction

In this notebook, I will download and process stock data like P4 project. It is preparing for backtesting.

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

import tushare as ts

In [4]:
# register token
token = '' # your token
ts.set_token(token)
pro = ts.pro_api()

## Pick Stock List
We got stock list by each day, and pick up stock exist in that day with some ohter custom constraints.

In [5]:
# trade calendar
start_date, end_date = '20180101' , '20230327'
calendar = pro.trade_cal(exchange='SSE', is_open='1', 
                            start_date=start_date, 
                            end_date=end_date, 
                            fields='cal_date')
calendar = calendar.values.flatten()[::-1]
print((calendar[0], calendar[-1]), len(calendar))

('20180102', '20230327') 1270


### Load Data

In [62]:
# get stock list info
def get_basic_from_Tushare(trade_date):
    # column: list_date(exist on )
    basic_fields = 'trade_date,ts_code,name,industry,list_date'
    df = pro.bak_basic(trade_date=trade_date, fields=basic_fields)
    return df
# test function
#fundamental_df = get_basic_from_Tushare(calendar[0])
#print(fundamental_df.shape)
#fundamental_df.head()

(3477, 5)


Unnamed: 0,trade_date,ts_code,name,industry,list_date
0,20180102,600802.SH,福建水泥,水泥,19940103
1,20180102,600727.SH,鲁北化工,农药化肥,19960702
2,20180102,002793.SZ,东音股份,专用机械,20160415
3,20180102,600903.SH,贵州燃气,供气供热,20171107
4,20180102,002864.SZ,盘龙药业,中成药,20171116


In [63]:
# removw ST
def column_from_names(n, name):
    return list(filter(lambda x: name in x, n))

# got ticker exist exist over 3 month
def exist_time_filter(df):
    condition = (pd.to_datetime(df['trade_date'],format='%Y%m%d')\
                -pd.to_datetime(df['list_date'],format='%Y%m%d')).dt.days > 90
    df = df.loc[condition==True]
    return df

# filter stock df
my_indusry = {'医药商业':1,'中成药':2,'生物制药':3,'化学制药':4}
def filter_basic_stockDf(df):
    # got ticker exist on that day
    df = df.loc[df.list_date!='0']
    # filter custom industry
    df = df.loc[df.industry.isin(my_indusry.keys())]
    ST_list = column_from_names(df['name'].unique(), 'ST')
    df = df.loc[df.name.isin(ST_list)==False]
    if df.empty == False:
        df = exist_time_filter(df)
    return df
# test function
#fundamental_df = filter_basic_stockDf(fundamental_df)
#print(fundamental_df.shape)
#fundamental_df.head()

(216, 5)


Unnamed: 0,trade_date,ts_code,name,industry,list_date
60,20180102,603963.SH,大理药业,化学制药,20170922
70,20180102,000813.SZ,德展健康,化学制药,19980519
74,20180102,603566.SH,普莱柯,生物制药,20150518
91,20180102,002412.SZ,汉森制药,中成药,20100525
115,20180102,002001.SZ,新 和 成,化学制药,20040625


## Download Data by Tickers

In [31]:
def get_daydata_from_Tushare(fundamental_df):
    
    def get_daydata_by_ticker_(ts_code):
        # indicator check: https://tushare.pro/document/2?doc_id=32
        my_fields = 'ts_code,trade_date,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv'
        df = pro.query('daily_basic', ts_code=ts_code, start_date=calendar[0], end_date=calendar[-1], fields=my_fields)
        return df
    
    tickers = fundamental_df['ts_code'].values
    df = pd.DataFrame()
    for ts_code in tickers:
        ticker_df = get_daydata_by_ticker_(ts_code)
        tmp = fundamental_df.loc[fundamental_df.ts_code==ts_code]
        ticker_df['name'] = tmp['name'].unique()[0]
        ticker_df['industry'] = tmp['industry'].unique()[0]
        ticker_df['list_date'] = tmp['list_date'].unique()[0]
        df = df.append(ticker_df)
    return df
    
# test function 
df_base = get_daydata_from_Tushare(fundamental_df.iloc[:1,])
print(df_base.shape)
df_base.head()

(1270, 12)


Unnamed: 0,ts_code,trade_date,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv,name,industry,list_date
0,603963.SH,20230327,0.4707,,6.5741,21970.0,6147.75,267814.3,267814.3,大理药业,化学制药,20170922
1,603963.SH,20230324,0.6047,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922
2,603963.SH,20230323,0.6656,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922
3,603963.SH,20230322,0.7367,,6.6388,21970.0,6147.75,270450.7,270450.7,大理药业,化学制药,20170922
4,603963.SH,20230321,0.8041,,6.6226,21970.0,6147.75,269791.6,269791.6,大理药业,化学制药,20170922


In [32]:
def get_dayIndicator_from_Tushare(fundamental_df):
    
    def get_daydata_by_ticker_(ts_code):
        # inidcator check: https://tushare.pro/document/2?doc_id=296
        my_fields = 'ts_code,trade_date,close_qfq,open_qfq,high_qfq,low_qfq,amount,vol,rsi_6,rsi_12'
        df = pro.stk_factor(ts_code=ts_code, start_date=calendar[0], end_date=calendar[-1], fields=my_fields)
        df.rename(columns = {'close_qfq':'close', 'open_qfq':'open', 'high_qfq':'high', 'low_qfq':'low'}, inplace = True)
        return df
    
    tickers = fundamental_df['ts_code'].values
    df = pd.DataFrame()
    for ts_code in tickers:
        ticker_df = get_daydata_by_ticker_(ts_code)
        df = df.append(ticker_df)
    return df
        
df_day = get_dayIndicator_from_Tushare(fundamental_df.iloc[:1,])
df_day = df_day.merge(df_base, on=['trade_date','ts_code'], how='left')
print(df_day.shape)
df_day.head()

(1270, 20)


Unnamed: 0,ts_code,trade_date,vol,amount,open,close,high,low,rsi_6,rsi_12,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv,name,industry,list_date
0,603963.SH,20230327,10342.27,12581.84,12.18,12.19,12.27,12.09,29.712,35.818,0.4707,,6.5741,21970.0,6147.75,267814.3,267814.3,大理药业,化学制药,20170922
1,603963.SH,20230324,13285.79,16230.46,12.22,12.21,12.39,12.15,30.529,36.179,0.6047,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922
2,603963.SH,20230323,14624.08,17828.54,12.31,12.21,12.31,12.11,30.529,36.179,0.6656,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922
3,603963.SH,20230322,16184.38,19947.3,12.28,12.31,12.46,12.25,33.751,37.781,0.7367,,6.6388,21970.0,6147.75,270450.7,270450.7,大理药业,化学制药,20170922
4,603963.SH,20230321,17665.3,21646.27,12.32,12.28,12.32,12.19,31.956,37.014,0.8041,,6.6226,21970.0,6147.75,269791.6,269791.6,大理药业,化学制药,20170922


## Download ALL

In [33]:
# download start day
win_len = 5

fundamental_df = get_basic_from_Tushare(calendar[0])
fundamental_df = filter_basic_stockDf(fundamental_df)

universe_raw = pd.DataFrame()
for i in tqdm(range(0,fundamental_df.shape[0]-1,win_len), desc='downloaing'):
    fundamental_i = fundamental_df.iloc[i:i+win_len,]
    universe = get_daydata_from_Tushare(fundamental_i)
    universe_i = get_dayIndicator_from_Tushare(fundamental_i)
    universe = universe.merge(universe_i, on=['trade_date','ts_code'], how='left')
    universe_raw = universe_raw.append(universe)

if (i+win_len) <= (fundamental_df.shape[0]-1):
    fundamental_i = fundamental_df.iloc[i+win_len:]
    universe = get_daydata_from_Tushare(fundamental_i)
    universe_i = get_dayIndicator_from_Tushare(fundamental_i)
    universe = universe.merge(universe_i, on=['trade_date','ts_code'], how='left')
    universe_raw = universe_raw.append(universe)
    
print(universe_raw.shape)
universe_raw.head()

downloaing: 100%|███████████████████████████████| 43/43 [02:58<00:00,  4.16s/it]


(270407, 20)


Unnamed: 0,ts_code,trade_date,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv,name,industry,list_date,vol,amount,open,close,high,low,rsi_6,rsi_12
0,603963.SH,20230327,0.4707,,6.5741,21970.0,6147.75,267814.3,267814.3,大理药业,化学制药,20170922,10342.27,12581.84,12.18,12.19,12.27,12.09,29.712,35.818
1,603963.SH,20230324,0.6047,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922,13285.79,16230.46,12.22,12.21,12.39,12.15,30.529,36.179
2,603963.SH,20230323,0.6656,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922,14624.08,17828.54,12.31,12.21,12.31,12.11,30.529,36.179
3,603963.SH,20230322,0.7367,,6.6388,21970.0,6147.75,270450.7,270450.7,大理药业,化学制药,20170922,16184.38,19947.3,12.28,12.31,12.46,12.25,33.751,37.781
4,603963.SH,20230321,0.8041,,6.6226,21970.0,6147.75,269791.6,269791.6,大理药业,化学制药,20170922,17665.3,21646.27,12.32,12.28,12.32,12.19,31.956,37.014


In [65]:
# download other days
for trade_date in tqdm(calendar[1:], desc='funmental_df downloading'):
    df = get_basic_from_Tushare(trade_date)
    df = filter_basic_stockDf(df)
    df = df.loc[df.ts_code.isin(fundamental_df.ts_code)==False]
    fundamental_df = fundamental_df.append(df)
    if df.empty != True:
        universe = get_daydata_from_Tushare(df)
        tmp = get_dayIndicator_from_Tushare(df)
        universe = universe.merge(tmp, on=['trade_date','ts_code'], how='left')
        universe_raw = universe_raw.append(universe)
        
print(universe_raw.shape)
universe_raw.head()

funmental_df downloading: 100%|█████████████| 1269/1269 [13:19<00:00,  1.59it/s]

(372907, 20)





Unnamed: 0,ts_code,trade_date,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv,name,industry,list_date,vol,amount,open,close,high,low,rsi_6,rsi_12
0,603963.SH,20230327,0.4707,,6.5741,21970.0,6147.75,267814.3,267814.3,大理药业,化学制药,20170922,10342.27,12581.84,12.18,12.19,12.27,12.09,29.712,35.818
1,603963.SH,20230324,0.6047,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922,13285.79,16230.46,12.22,12.21,12.39,12.15,30.529,36.179
2,603963.SH,20230323,0.6656,,6.5848,21970.0,6147.75,268253.7,268253.7,大理药业,化学制药,20170922,14624.08,17828.54,12.31,12.21,12.31,12.11,30.529,36.179
3,603963.SH,20230322,0.7367,,6.6388,21970.0,6147.75,270450.7,270450.7,大理药业,化学制药,20170922,16184.38,19947.3,12.28,12.31,12.46,12.25,33.751,37.781
4,603963.SH,20230321,0.8041,,6.6226,21970.0,6147.75,269791.6,269791.6,大理药业,化学制药,20170922,17665.3,21646.27,12.32,12.28,12.32,12.19,31.956,37.014


## Clean Data
Here we just clean some error values data. pick tikers process in the next notebokk.

In [67]:
# remove close is null
universe_raw = universe_raw.loc[universe_raw.close.isnull()==False]

In [68]:
# replace all inf values to nan
universe_raw = universe_raw.replace([np.inf, -np.inf], np.nan)

In [75]:
# check null columns
for col in universe_raw.columns:
    if universe_raw[col].isnull().any():
        print(col)

In [76]:
# replace free_share to total share
universe_raw['free_share'] = np.where(universe_raw['free_share'].isnull(), 
                                      universe_raw['total_share'], universe_raw['free_share'])
# reset pe is null to 1.e3
universe_raw['pe'] = np.where(universe_raw['pe'].isnull(), 1.*1e3, universe_raw['pe'])
# fill pb null and infinite values
universe_raw['pb'] = np.where(universe_raw['pb'].isnull(), 1.*1e2, universe_raw['pb'])
# rest null value is rsi_6 rsi_12 foward fill tail vales
universe_raw = universe_raw.fillna(method='ffill')
# drop duplicates
universe_raw = universe_raw.drop_duplicates(['trade_date','ts_code'])

In [77]:
universe_raw.shape

(371624, 20)

## Download PnL Procast from Tushare

In [82]:
def find_next_close_date(date, calendar):
    if date in calendar:
        return date
    int_date = int(date)
    min_interval = np.inf
    idx = 0
    for ii,dt in enumerate(calendar):
        if int(dt) > int_date and int(dt) - int_date < min_interval:
            min_interval = int(dt) - int_date
            idx = ii
    return calendar[idx]
        
# download profit notice
def get_profit_notice_from_Tushare(ts_code, start_date):
    my_fields = 'ts_code,ann_date,type,p_change_min,p_change_max'
    df = pro.forecast_vip(ts_code=ts_code ,start_date=start_date, end_date=calendar[-1] ,fields=my_fields)
    df.fillna(method='bfill', inplace=True)
    return df

# test function
get_profit_notice_from_Tushare('603538.SH', '20170101')

Unnamed: 0,ts_code,ann_date,type,p_change_min,p_change_max
0,603538.SH,20230131,预增,126.76,166.78
1,603538.SH,20220705,预增,54.7,71.89
2,603538.SH,20200122,预增,45.0,60.0
3,603538.SH,20190122,预增,116.0,131.0
4,603538.SH,20171018,略减,-45.0,-35.0
5,603538.SH,20170316,略增,5.0,20.0


In [83]:
# download all profit notice
# set least time of report. eg data date from 2018, we set report start time is 20170101
def dowanload_procastInfo_from_Tushare(df):
    start_date = '20170101'
    calendar_procast = pro.trade_cal(exchange='SSE', is_open='1', 
                                start_date=start_date, 
                                end_date=calendar[-1], 
                                fields='cal_date').values.flatten()[::-1]

    ticker_list = universe_raw.ts_code.unique()
    procast_df = pd.DataFrame()
    for ts_code in tqdm(ticker_list, desc='download profit notice'):
        tmp = get_profit_notice_from_Tushare(ts_code, start_date)
        for ii, dt in enumerate(tmp.ann_date):
            latest_dt = find_next_close_date(dt, calendar_procast)
            tmp.at[ii,'ann_date'] = dt
        tmp['trade_date'] = tmp['ann_date'].apply(np.int64)
        tmp = tmp.drop_duplicates(subset=['trade_date'])
        procast_df = procast_df.append(tmp, ignore_index=True)
        
    return procast_df

procast_df = dowanload_procastInfo_from_Tushare(universe_raw)
procast_df['trade_date'] = procast_df['trade_date'].astype(int)
universe_raw['trade_date'] = universe_raw['trade_date'].astype(int)
universe = universe_raw.merge(procast_df, on=['ts_code','trade_date'], how='left')
universe_raw = universe.sort_values(by=['trade_date'], ascending=True).reset_index(drop=True)

download profit notice: 100%|█████████████████| 358/358 [00:53<00:00,  6.70it/s]


### Wait Here
If we download new data, we should append data to old sheet then excute this section

In [131]:
# fill procast nan forward then history nan to others
universe_raw[['ann_date', 'type','p_change_min','p_change_max']] = universe_raw.groupby('ts_code')\
                                        [['ann_date', 'type','p_change_min','p_change_max']].fillna(method='ffill')
#universe_raw['ann_date'] = universe_raw['ann_date'].astype(int)
universe_raw['type'] = np.where(universe_raw['type'].isnull(), '不确定', universe_raw['type'])
universe_raw['ann_date'] = np.where(universe_raw['ann_date'].isnull(), universe_raw['trade_date'], universe_raw['ann_date'])
universe_raw['p_change_min'] = np.where(universe_raw['p_change_min'].isnull(), 0, universe_raw['p_change_min'])
universe_raw['p_change_max'] = np.where(universe_raw['p_change_max'].isnull(), 0, universe_raw['p_change_max'])
universe_raw['p_change_max'] = np.where(universe['p_change_max']>300, 300., np.where(universe['p_change_max']<-300, -300, universe['p_change_max']))
universe_raw['p_change_min'] = np.where(universe['p_change_min']>300, 300., np.where(universe['p_change_min']<-300, -300, universe['p_change_min']))
#tmp_bak = universe_raw.copy(deep=True)

## Download IPO info 
If add new data, end_date should reset

In [143]:
df1 = pro.new_share(start_date='20100101', end_date='20180101')
time.sleep(1)
df2 = pro.new_share(start_date='20180102', end_date='20230318')
df = df1.append(df2)
print(df.shape)
df.head()

(3535, 12)


Unnamed: 0,ts_code,sub_code,name,ipo_date,issue_date,amount,market_amount,price,pe,limit_amount,funds,ballot
0,300736.SZ,300736,百邦科技,20171228,20180109,1358.0,1358.0,19.18,22.99,1.35,2.604,0.01
1,603161.SH,732161,科华控股,20171226,20180105,3340.0,3006.0,16.75,22.98,1.3,5.595,0.03
2,300664.SZ,300664,鹏鹞环保,20171226,20180105,8000.0,7200.0,8.88,22.98,2.4,7.104,0.05
3,002923.SZ,2923,润都股份,20171226,20180105,2500.0,2250.0,17.01,22.82,1.0,4.253,0.02
4,603080.SH,732080,新疆火炬,20171221,20180103,3550.0,3195.0,13.6,22.99,1.4,4.828,0.03


In [144]:
universe = pd.DataFrame()
for group_ticker in tqdm(universe_raw.groupby('ts_code'), desc='aggregeate df'):
    ticker = group_ticker[1]
    if group_ticker[0] in df.ts_code.values:
        tmp = df.loc[df.ts_code==group_ticker[0]]
        ticker['list_date'] = tmp['issue_date'].values[0]
        ticker['issue_price'] = tmp['price'].values[0]
        ticker['issue_amount'] = tmp['amount'].values[0] # issue stcke unite is 10000 share
    else:
        ticker['issue_price'] = ticker.iloc[0,:]['open']
        ticker['issue_amount'] = ticker.iloc[0,:]['free_share']
    universe = universe.append(ticker)
universe_raw = universe      
#ticker[['trade_date','ts_code','name','list_date','issue_price','issue_amount','free_share']]

aggregeate df: 100%|██████████████████████████| 358/358 [00:14<00:00, 24.40it/s]


In [149]:
universe_raw.head()

Unnamed: 0,ts_code,trade_date,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv,name,...,high,low,rsi_6,rsi_12,ann_date,type,p_change_min,p_change_max,issue_price,issue_amount
132,000004.SZ,20180102,0.7548,47.7372,16.2585,8397.6684,4927.8141,187603.9121,185340.8701,国农科技,...,22.49,22.0,40.83,39.538,20180102,不确定,0.0,0.0,22.29,4927.8141
418,000004.SZ,20180103,2.2655,50.857,17.321,8397.6684,4927.8141,199864.5079,197453.5679,国农科技,...,23.89,22.27,75.02,55.772,20180103,不确定,0.0,0.0,22.29,4927.8141
476,000004.SZ,20180104,1.7527,49.6604,16.9135,8397.6684,4927.8141,195161.8136,192807.6016,国农科技,...,23.83,23.12,59.259,50.139,20180104,不确定,0.0,0.0,22.29,4927.8141
912,000004.SZ,20180105,1.2589,49.5322,16.8698,8397.6684,4927.8141,194657.9535,192309.8195,国农科技,...,23.47,22.85,57.701,49.554,20180105,不确定,0.0,0.0,22.29,4927.8141
1003,000004.SZ,20180108,0.8649,48.8056,16.6224,8397.6684,4927.8141,191802.7463,189489.0543,国农科技,...,23.11,22.73,48.947,46.22,20180108,不确定,0.0,0.0,22.29,4927.8141


## Download Finance Info

In [176]:
def find_next_close_date(date, calendar):
    if date in calendar:
        return date
    int_date = int(date)
    min_interval = np.inf
    idx = 0
    for ii,dt in enumerate(calendar):
        if int(dt) > int_date and int(dt) - int_date < min_interval:
            min_interval = int(dt) - int_date
            idx = ii
    return calendar[idx]

def get_finace_reports_from_Tushre(ts_code):
    # https://tushare.pro/document/2?doc_id=79
    #start_date=calendar[0]
    my_fields = [
                  'ts_code','ann_date','cfps','revenue_ps','gross_margin', # 每股现金流，每股营业收入,
                  'dt_eps','dt_eps_yoy', # 每股收益， 每股收同比益增长率
                  'bps','bps_yoy',  #每股净资产，每股净资产同比增长率
                  'extra_item','profit_dedt', # 扣非，扣非净利润
                  'roe_dt','roe_yoy', #净资产收益，净资产收益同比增长率
                  'ebt_yoy','or_yoy' # 总利润增长率，营业收入增长率
                 ]
    df = pro.fina_indicator(ts_code=ts_code, start_date='20170901', end_date=calendar[-1])[my_fields]
    df = df.fillna(method='bfill').dropna()
    return df

# function test
get_finace_reports_from_Tushre('600085.SH').tail()

Unnamed: 0,ts_code,ann_date,cfps,revenue_ps,gross_margin,dt_eps,dt_eps_yoy,bps,bps_yoy,extra_item,profit_dedt,roe_dt,roe_yoy,ebt_yoy,or_yoy


In [171]:
# download all profit notice
def download_finreport(tickser_list):
    start_date = '20170901'
    calendar_procast = pro.trade_cal(exchange='SSE', is_open='1', 
                                start_date=start_date, 
                                end_date=calendar[-1], 
                                fields='cal_date').values.flatten()[::-1]

    finance_df_all = pd.DataFrame()
    for ts_code in tqdm(ticker_list, desc='finance info download...'):
        finance_df = get_finace_reports_from_Tushre(ts_code)
        for ii, dt in enumerate(finance_df['ann_date']):
            dt = find_next_close_date(dt, calendar_procast)
            finance_df.at[ii,'ann_date'] = dt
        finance_df['trade_date'] = finance_df['ann_date'].apply(np.int64)
        finance_df = finance_df.drop_duplicates(subset=['trade_date']).drop(columns=['ann_date'])
        finance_df_all = finance_df_all.append(finance_df)
    return finance_df_all

finance_df_all = download_finreport(universe_raw.ts_code.unique())
# merge all finacial fundamental values into one dataframe
universe_raw = universe_raw.merge(finance_df_all, on=['ts_code','trade_date'], how='left')
print(universe_raw.shape)
universe_raw.head()

finance info download...: 100%|███████████████| 396/396 [01:12<00:00,  5.45it/s]

(5700, 16)





Unnamed: 0,ts_code,cfps,revenue_ps,gross_margin,dt_eps,dt_eps_yoy,bps,bps_yoy,extra_item,profit_dedt,roe_dt,roe_yoy,ebt_yoy,or_yoy,equity_yoy,trade_date
0,603963.SH,-0.6163,0.4572,68281671.49,-0.0522,-92.6199,1.8543,-2.738,4208000.44,-15685162.19,-3.7968,-115.2169,-77.0878,-22.2137,-10.4254,20221028
1,603963.SH,-0.5892,0.3172,47521341.7,-0.0386,-628.3019,1.8679,-2.0247,2774383.15,-11265117.72,-2.717,-710.6136,-510.514,-24.2977,-10.7073,20220826
3,603963.SH,-0.5235,0.1812,26969938.07,-0.0146,-1722.2222,1.8919,-0.7658,1374652.35,-4582155.66,-1.0982,-2017.1109,-1082.2479,-18.7403,-11.7632,20220428
7,603963.SH,-0.0601,0.5878,97089975.64,-0.0271,-223.1818,2.0701,-3.4108,3558554.2,-9512067.86,-2.0552,-227.9112,-206.5187,-20.3012,-3.7386,20211029
9,603963.SH,-0.0968,0.419,69858610.08,-0.0053,-312.0,2.0918,-2.3983,3147207.99,-4320255.38,-0.9287,-319.0112,-319.8879,-11.3693,-1.8355,20210825


In [180]:
# check some stock don't have fundamental values
print(len(universe_raw.ts_code.unique()), len(finance_df_all.ts_code.unique()))
universe_raw.loc[universe_raw['ts_code'].isin(finance_df_all.ts_code.unique())==False].ts_code.unique()

358 355


array(['600085.SH', '688176.SH', '688222.SH', '688302.SH', '688382.SH'],
      dtype=object)

## Wait Here
If we download new data, we should append data to old sheet then excute this section

In [198]:
# fill procast nan forward then history nan to others
universe_raw = universe_raw.sort_values(by=['trade_date'], ascending=True).reset_index(drop=True)
universe_raw = universe_raw.replace([np.inf, -np.inf], np.nan)
universe_raw[universe_raw.columns] = universe_raw.groupby('ts_code')[universe_raw.columns].fillna(method='ffill')
universe_raw.fillna(-0.,inplace=True)

# cut off large value
universe_raw['bps_yoy'] = np.where(universe_raw['bps_yoy']>300, 300.,
                             np.where(universe_raw['bps_yoy']<-300, -300., universe_raw['bps_yoy']))
universe_raw['dt_eps_yoy'] = np.where(universe_raw['dt_eps_yoy']>300, 300., 
                             np.where(universe_raw['dt_eps_yoy']<-300, -300., universe_raw['dt_eps_yoy']))
universe_raw['roe_yoy'] = np.where(universe_raw['roe_yoy']>300, 300., 
                             np.where(universe_raw['roe_yoy']<-300, -300., universe_raw['roe_yoy']))

In [216]:
#finance_df_all.loc[finance_df_all.ts_code.isin(universe_raw.ts_code.unique())].shape
universe_raw.loc[universe_raw.ts_code=='603538.SH'].head()

Unnamed: 0,ts_code,trade_date,turnover_rate,pe,pb,total_share,free_share,total_mv,circ_mv,name,...,dt_eps,dt_eps_yoy,bps,bps_yoy,extra_item,profit_dedt,roe_dt,roe_yoy,ebt_yoy,or_yoy
220,603538.SH,20180102,1.7829,41.3072,2.9414,12000.0,3000.0,325680.0,81420.0,美诺华,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
471,603538.SH,20180103,1.6366,41.3833,2.9468,12000.0,3000.0,326280.0,81570.0,美诺华,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
572,603538.SH,20180104,1.5866,40.9723,2.9176,12000.0,3000.0,323040.0,80760.0,美诺华,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
907,603538.SH,20180105,1.2232,41.0028,2.9197,12000.0,3000.0,323280.0,80820.0,美诺华,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
1104,603538.SH,20180108,1.9092,40.8658,2.91,12000.0,3000.0,322200.0,80550.0,美诺华,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


In [217]:
# update fundamental df
fundamental_df = fundamental_df.loc[fundamental_df.ts_code.isin(universe_raw.ts_code.unique())]

## Save Raw Data

In [218]:
#'20180102', '20230327'
universe_raw = universe_raw.drop_duplicates(keep='first').reset_index(drop=True)
fundamental_df = fundamental_df.drop_duplicates(keep='first').reset_index(drop=True)
universe_raw.to_csv('raw_20180102_20230327.csv')
fundamental_df.to_csv('fundamental_20180103_20230327.csv')

## Append Data of New days

In [None]:
# load old data
fundamental_df = pd.read_csv('fundamental_20180103_20230327.csv').iloc[:,1:]
universe_raw_old = pd.read_csv('raw_20180102_20230327.csv').iloc[:,1:]

# calendar list
start_date, end_date = '20230328' , '20230330'
calendar = pro.trade_cal(exchange='SSE', is_open='1', 
                            start_date=start_date, 
                            end_date=end_date, 
                            fields='cal_date')
calendar = calendar.values.flatten()[::-1]

# get all stock exists in tha day
fundamental_new = get_basic_from_Tushare(calendar[0])
# get all 
fundamental_new = filter_basic_stockDf(fundamental_new)
all_ticker_list = list(set(fundamental_df.ts_code.unique()).union(fundamental_new.ts_code.unique()))

'''
Here we can jump to titile `Load Data` to run rest section
'''
# note: If some tickers we have and change to ST some day, we dont have that data.

In [None]:
'''
After Download finished, we can append new data to old
'''

universe_raw = universe_raw_old.append(universe_raw, ignore_index=True)
universe_raw = universe_raw.sort_values(by['trade_date'], ascending=True).reset_index(drop=True)
# update fundamental df
fundamental_df = fundamental_df.loc[fundamental_df.ts_code.isin(universe_raw.ts_code.unique())]
'''
After we finish above steps, we can jump to title `Wait Here` to run some clean steps.
Remember chack and save all data
'''