# Load Data From Tushare
We load data from 2018 to 2021. Due to platform constraints, we download the data year by year and save each year separately. After that, we process the data.

In [1]:
import tushare as ts
import pandas as pd
import numpy as np

# show the installed tushare version
print(ts.__version__)

1.2.89


In [2]:
# register token
# The token is read from the TUSHARE_TOKEN environment variable so that it is
# never stored in (or committed with) the notebook. When the variable is not
# set, the value falls back to '' exactly like the original placeholder.
import os

token = os.environ.get('TUSHARE_TOKEN', '')  # your token, e.g. `export TUSHARE_TOKEN=...`
ts.set_token(token)
pro = ts.pro_api()

In [3]:
# trading calendar for the download window (only rows with is_open='1')
start_date = '20180101'
end_date = '20210101'
calendar = pro.trade_cal(
    exchange='SSE',
    is_open='1',
    start_date=start_date,
    end_date=end_date,
    fields='cal_date',
)

# main-board stocks that are still in the market today (list_status='L')
# other market values: 主板/创业板/科创板/CDR/北交所
stocks = pro.query('stock_basic', exchange='', list_status='L', market='主板')
# comma-separated string of every ts_code returned above
ts_code_list = ','.join(stocks['ts_code'].values)

print(calendar.shape, stocks.shape)
calendar.tail()

(730, 1) (2186, 7)


Unnamed: 0,cal_date
725,20180108
726,20180105
727,20180104
728,20180103
729,20180102


In [4]:
# get fundamental data
# Pull the 20180102 snapshot, then keep stocks whose total market cap is within
# [5, 30] billion, expressed as 50..300 in total_mv (both bounds inclusive).
# NOTE(review): this implies total_mv is in units of 0.1 billion — confirm
# against the data source.
base_universe = pro.bak_daily(
    trade_date='20180102',
    fields='trade_date, ts_code, name, float_mv, total_mv, pe, turn_over, industry',
)
filte_stock = base_universe.loc[base_universe['total_mv'].between(50, 300)]
filte_stock

Unnamed: 0,ts_code,trade_date,name,turn_over,pe,industry,float_mv,total_mv
3,600903.SH,20180102,贵州燃气,35.95,110.50,供气供热,22.62,150.81
9,000885.SZ,20180102,同力水泥,4.98,10.71,水泥,72.68,84.53
12,002372.SZ,20180102,伟星新材,1.75,27.46,其他建材,176.55,199.46
13,600507.SH,20180102,方大特钢,4.05,9.02,特种钢,185.12,185.12
14,300735.SZ,20180102,光弘科技,0.02,29.91,通信设备,14.04,56.15
...,...,...,...,...,...,...,...,...
3454,600074.SH,20180102,ST保千里,0.00,42.50,电脑设备,95.52,228.67
3455,600289.SH,20180102,ST信通,0.00,0.00,软件服务,53.42,59.57
3456,300630.SZ,20180102,普利制药,5.14,109.15,化学制药,22.02,88.08
3457,002127.SZ,20180102,南极电商,3.07,57.60,互联网,110.37,186.57


In [5]:
from helper import download_helper

In [8]:
# load daily data for all filtered stocks between start_date and end_date
# Note: ts_code_list is rebound here to the array of codes in filte_stock,
# replacing the comma-joined string assigned in the calendar cell.
ts_code_list = filte_stock.ts_code.values
# NOTE(review): get_Daily_All lives in the project helper module; judging by the
# output it returns one frame with all tickers stacked — confirm in helper.
all_stocks = download_helper.get_Daily_All(ts, ts_code_list, start_date, end_date)
print(all_stocks.shape)
all_stocks

ticker/tickers: 100%|███████████████████████| 1857/1857 [17:19<00:00,  1.79it/s]

(1319793, 17)





Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount,ma10,ma_v_10,ma60,ma_v_60,ma120,ma_v_120
0,600903.SH,20201231,12.8300,13.0500,12.6500,12.7000,13.0000,-0.3000,-2.3077,298018.48,380610.620,13.026,428080.779,12.1060,199368.8255,11.7891,148799.6242
1,600903.SH,20201230,12.9100,13.3500,12.9100,13.0000,13.2200,-0.2200,-1.6641,335380.15,439698.264,12.913,408187.524,12.0788,195089.4188,11.7739,147809.3177
2,600903.SH,20201229,13.5500,13.7000,13.0100,13.2200,13.7700,-0.5500,-3.9942,403904.90,539356.202,12.748,387981.164,12.0478,190053.8642,11.7531,146556.4926
3,600903.SH,20201228,13.9300,14.2100,13.3900,13.7700,13.6500,0.1200,0.8791,518263.18,713652.186,12.607,361548.627,12.0128,184151.5455,11.7313,144766.4967
4,600903.SH,20201225,13.1800,14.4800,13.0700,13.6500,13.3500,0.3000,2.2472,584433.16,807266.469,12.450,321017.163,11.9723,176815.1513,11.7012,141314.9518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,300639.SZ,20180108,27.0020,27.0020,25.2016,25.5438,27.0183,-1.4745,-5.4574,44458.13,279841.559,,,,,,
726,300639.SZ,20180105,27.2342,27.4705,26.6069,27.0183,26.9654,0.0529,0.1962,34242.50,227133.793,,,,,,
727,300639.SZ,20180104,27.3157,27.6415,26.5092,26.9654,28.3462,-1.3808,-4.8712,52177.37,346749.657,,,,,,
728,300639.SZ,20180103,27.6130,28.5866,27.2098,28.3462,29.1772,-0.8310,-2.8481,66187.57,450061.438,,,,,,


In [10]:
# save data
# Drop exact duplicate rows, then write both tables to CSV. File names are
# derived from the download window: '<start>-<end>.csv' for prices and
# 'fundamental_<start>.csv' for the fundamentals snapshot.
universe = all_stocks.drop_duplicates()
universe.to_csv(f'{start_date}-{end_date}.csv')
filte_stock.drop_duplicates().to_csv(f'fundamental_{start_date}.csv')

# Load Data by File
Use this section to load the data from the saved files instead of downloading it again.

In [2]:
# load data from csv
# .iloc[:,1:] discards the first column of each file
# (presumably the index column written by to_csv in the save cell — confirm).
universe = pd.read_csv('20180101-20210101.csv').iloc[:,1:]
fundamental = pd.read_csv('fundamental_20180101.csv').iloc[:,1:]

# Process Data
1. Keep the top 500 stocks by ma_v_120
2. Add a 'date' column of datetime type and sort the rows by time (ascending in the output below)
3. Add industry information and Bollinger band indicators to each stock

In [3]:
from helper.factor_helper import IndicatorHelper, AverageByWindow
# pick the top 500 stocks ranked on the 'ma_v_120' column
# (the earlier note said "average amount", but the column passed is ma_v_120)
# NOTE(review): the exact ranking/windowing rule is defined in AverageByWindow.top — confirm there
universe = AverageByWindow(universe).top(500, index='trade_date', ticker_column='ts_code', value_column='ma_v_120')

# add bollinger as indicator which will be used as a custom factor later
ind_helper = IndicatorHelper(universe)
# upper and lower band columns; they show up as boll_ub / boll_lb in the result
tech_indicator_list = ['boll_ub','boll_lb']
universe = ind_helper.add_technical_indicator(tech_indicator_list)
# add industry and stock name, looked up from the fundamental table by 'ts_code'
universe = ind_helper.add_by_basetable('ts_code', fundamental, ['industry', 'name'])

  self.use_column = self.columns
  self.stocks = stockstats.StockDataFrame.retype(self.copy())
  self.df = self.copy()
add tech indicators: 100%|████████████████████| 500/500 [00:24<00:00, 20.71it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
add fundamental info: 100%|███████████████████| 500/500 [00:20<00:00, 23.97it/s]


In [7]:
# preview the processed universe (the notebook shows a truncated view)
universe

Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,...,ma_v_10,ma60,ma_v_60,ma120,ma_v_120,date,boll_ub,boll_lb,industry,name
0,000009.SZ,20180702,4.0470,4.0635,3.8983,3.9479,4.0470,-0.0991,-2.4487,94709.98,...,125922.272,4.7691,1.352710e+05,5.0153,1.759280e+05,2018-07-02,4.106014,3.872386,综合类,中国宝安
1,000012.SZ,20180702,4.4065,4.4243,4.3090,4.3356,4.3888,-0.0532,-1.2122,73861.08,...,84886.852,5.2401,6.831003e+04,5.7780,1.397237e+05,2018-07-02,4.454473,4.278827,玻璃,南 玻Ａ
2,000016.SZ,20180702,5.0745,5.2879,5.0260,5.1036,5.0939,0.0097,0.1904,202694.27,...,166471.324,5.5048,1.671701e+05,5.7798,1.984626e+05,2018-07-02,5.174543,4.955057,家用电器,深康佳Ａ
3,000021.SZ,20180702,6.7204,6.7401,6.4839,6.5627,6.7204,-0.1577,-2.3466,78621.11,...,112309.271,8.0763,2.115724e+05,8.4805,2.819460e+05,2018-07-02,6.939991,6.382509,电脑设备,深科技
4,000040.SZ,20180702,9.9984,9.9984,9.4842,9.8006,9.9885,-0.1879,-1.8812,70863.95,...,74596.047,11.6338,7.014094e+04,11.9974,5.012325e+04,2018-07-02,9.800600,9.800600,新型电力,东旭蓝天
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293963,603000.SH,20201231,16.6600,16.9600,16.6200,16.9000,16.6100,0.2900,1.7459,92420.20,...,60877.710,17.3523,6.030390e+04,18.7319,1.110187e+05,2020-12-31,17.676380,16.267620,互联网,人民网
293964,603019.SH,20201231,33.8600,34.5500,33.5800,34.2300,33.7100,0.5200,1.5426,263529.72,...,195620.958,35.4893,1.928680e+05,39.6270,2.683894e+05,2020-12-31,35.328468,30.764532,电脑设备,中科曙光
293965,603077.SH,20201231,1.4100,1.4400,1.4100,1.4200,1.4100,0.0100,0.7092,2063731.75,...,1791702.858,1.4210,1.239141e+06,1.4297,1.614093e+06,2020-12-31,1.483045,1.323955,化工原料,和邦生物
293966,603128.SH,20201231,9.3000,9.4100,9.0300,9.2900,9.2500,0.0400,0.4324,309135.28,...,425958.744,7.9493,3.798198e+05,7.2817,4.590436e+05,2020-12-31,9.614855,7.502145,仓储物流,华贸物流


In [17]:
# tmp save
# Keep only the rows dated after 2020-01-01 as a smaller intermediate snapshot.
save_data = universe.loc[universe.date > pd.to_datetime('2020-01-01')]
# Writing to disk stays disabled, as in the original cell. The "tmp load" cell
# reads 'tmp_result.csv', so enable this line when that file is needed.
#universe.to_csv('tmp_result.csv')
# Bare expression so the notebook renders the filtered frame. The original line
# was the incomplete attribute access `save_data.`, which is a SyntaxError.
save_data

Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,...,ma_v_10,ma60,ma_v_60,ma120,ma_v_120,date,boll_ub,boll_lb,industry,name
174607,000008.SZ,20200102,3.6477,3.6976,3.6278,3.6976,3.6179,0.0797,2.2029,377467.90,...,267414.987,3.5245,2.435912e+05,3.5524,2.349738e+05,2020-01-02,3.749506,3.536044,运输设备,神州高铁
174608,000009.SZ,20200102,6.2644,6.7732,6.1747,6.6435,6.1747,0.4688,7.5923,2406170.23,...,1493652.872,4.7577,3.836671e+05,4.6913,3.077963e+05,2020-01-02,6.528136,3.672544,综合类,中国宝安
174609,000012.SZ,20200102,4.9402,5.0092,4.9205,4.9698,4.9402,0.0296,0.5992,332464.01,...,272282.672,4.4482,1.284885e+05,4.2998,1.183312e+05,2020-01-02,5.015841,4.269009,玻璃,南 玻Ａ
174610,000016.SZ,20200102,4.3722,4.5014,4.3622,4.4318,4.3424,0.0894,2.0588,275211.92,...,255275.749,4.1854,1.678056e+05,4.1934,1.923976e+05,2020-01-02,4.630850,4.044940,家用电器,深康佳Ａ
174611,000021.SZ,20200102,12.3673,13.1353,12.1978,12.8461,12.1479,0.6982,5.7475,822238.58,...,676423.489,10.6389,4.248515e+05,10.3856,5.768221e+05,2020-01-02,13.018872,9.799828,电脑设备,深科技
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293963,603000.SH,20201231,16.6600,16.9600,16.6200,16.9000,16.6100,0.2900,1.7459,92420.20,...,60877.710,17.3523,6.030390e+04,18.7319,1.110187e+05,2020-12-31,17.676380,16.267620,互联网,人民网
293964,603019.SH,20201231,33.8600,34.5500,33.5800,34.2300,33.7100,0.5200,1.5426,263529.72,...,195620.958,35.4893,1.928680e+05,39.6270,2.683894e+05,2020-12-31,35.328468,30.764532,电脑设备,中科曙光
293965,603077.SH,20201231,1.4100,1.4400,1.4100,1.4200,1.4100,0.0100,0.7092,2063731.75,...,1791702.858,1.4210,1.239141e+06,1.4297,1.614093e+06,2020-12-31,1.483045,1.323955,化工原料,和邦生物
293966,603128.SH,20201231,9.3000,9.4100,9.0300,9.2900,9.2500,0.0400,0.4324,309135.28,...,425958.744,7.9493,3.798198e+05,7.2817,4.590436e+05,2020-12-31,9.614855,7.502145,仓储物流,华贸物流


In [13]:
# tmp load
# Entry point for resuming from the saved snapshot on a fresh kernel, hence the
# repeated imports.
import pandas as pd
import numpy as np
# `keep_date_col` was removed from this call: it only has an effect when
# `parse_dates` combines several columns, which is not done here, and the
# keyword is deprecated in recent pandas releases.
# .iloc[:, 1:] discards the first column (presumably the saved index — confirm).
universe = pd.read_csv('tmp_result.csv').iloc[:, 1:]
# Bracket assignment makes it explicit that the existing 'date' column is replaced.
universe['date'] = pd.to_datetime(universe['date'])