下载数据的命令

python collector.py download_data --source_dir ~/.qlib/fund_data/source/cn_1d --region CN  --delay 0.1 --interval 1d

# download from eastmoney.com
python collector.py download_data --source_dir ~/.qlib/fund_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d

# normalize
python collector.py normalize_data --source_dir ~/.qlib/fund_data/source/cn_1d --normalize_dir ~/.qlib/fund_data/source/cn_1d_nor --region CN --interval 1d --date_field_name FSRQ

# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/fund_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_fund_data --freq day --date_field_name FSRQ --include_fields DWJZ,LJJZ

In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Step1： 读出来所有的数据

In [2]:
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_fund_data")
df = D.features(D.instruments(market="all"), ["$DWJZ", "$LJJZ"], freq="day")

[23992:MainThread](2021-06-28 23:16:05,753) INFO - qlib.Initialization - [config.py:276] - default_conf: client.
[23992:MainThread](2021-06-28 23:16:05,866) INFO - qlib.Initialization - [__init__.py:46] - qlib successfully initialized based on client settings.
[23992:MainThread](2021-06-28 23:16:05,866) INFO - qlib.Initialization - [__init__.py:47] - data_path=/Users/wangfan/.qlib/qlib_data/cn_fund_data


In [3]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,$DWJZ,$LJJZ
instrument,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2005-01-04,0.995,1.115
1,2005-01-05,0.998,1.118
1,2005-01-06,0.991,1.111
1,2005-01-07,0.989,1.109
1,2005-01-10,0.992,1.112


In [4]:
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,$DWJZ,$LJJZ
instrument,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
980003,2021-06-22,1.4718,1.4718
980003,2021-06-23,1.4721,1.4721
980003,2021-06-24,1.4725,1.4725
980003,2021-06-25,1.4727,1.4727
980003,2021-06-28,1.4734,1.4734


In [5]:
df = df.reset_index()

In [6]:
df_new = df.copy()

# Step2 数据加工

In [7]:
df_new["LJJZ_1d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-1)
df_new["LJJZ_5d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-5)
df_new["LJJZ_20d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-20)
df_new["LJJZ_60d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-60)
df_new["LJJZ_120d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-120)
df_new["LJJZ_240d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-240)
df_new["LJJZ_480d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-480)
df_new["LJJZ_720d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-720)
df_new.index = range(len(df_new))
df_new["y_1d"]=100 * (df_new["LJJZ_1d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_5d"]=100 * (df_new["LJJZ_5d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_20d"]=100 * (df_new["LJJZ_20d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_60d"]=100 * (df_new["LJJZ_60d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_120d"]=100 * (df_new["LJJZ_120d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_240d"]=100 * (df_new["LJJZ_240d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_480d"]=100 * (df_new["LJJZ_480d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_720d"]=100 * (df_new["LJJZ_720d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new = df_new.drop(columns = ['LJJZ_1d','LJJZ_5d','LJJZ_20d','LJJZ_60d','LJJZ_120d','LJJZ_240d','LJJZ_480d','LJJZ_720d'])

In [8]:
df_new["LJJZ_20d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(20)
df_new["LJJZ_60d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(60)
df_new["LJJZ_120d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(120)
df_new["LJJZ_240d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(240)
df_new["LJJZ_480d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(480)
df_new["LJJZ_720d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(720)
df_new.index = range(len(df_new))


df_new["his_20d"]=-100 * (df_new["LJJZ_20d"]- df_new["$LJJZ"])/df_new["LJJZ_20d"]
df_new["his_60d"]=-100 * (df_new["LJJZ_60d"]- df_new["$LJJZ"])/df_new["LJJZ_60d"]
df_new["his_120d"]=-100 * (df_new["LJJZ_120d"]- df_new["$LJJZ"])/df_new["LJJZ_120d"]
df_new["his_240d"]=-100 * (df_new["LJJZ_240d"]- df_new["$LJJZ"])/df_new["LJJZ_240d"]
df_new["his_480d"]=-100 * (df_new["LJJZ_480d"]- df_new["$LJJZ"])/df_new["LJJZ_480d"]
df_new["his_720d"]=-100 * (df_new["LJJZ_720d"]- df_new["$LJJZ"])/df_new["LJJZ_720d"]
df_new = df_new.drop(columns = ['LJJZ_20d','LJJZ_60d','LJJZ_120d','LJJZ_240d','LJJZ_480d','LJJZ_720d'])

In [9]:
df_train = df_new[df_new['y_720d'].notnull()]
df_test = df_new[df_new['y_720d'].isnull()]

In [10]:
df_train = df_train.dropna()

In [11]:
df_train["rank_20d"] = df_train.groupby(['datetime'])["his_20d"].rank(method='min',ascending=False)
df_train["rank_60d"] = df_train.groupby(['datetime'])["his_60d"].rank(method='min',ascending=False)
df_train["rank_120d"] = df_train.groupby(['datetime'])["his_120d"].rank(method='min',ascending=False)
df_train["rank_240d"] = df_train.groupby(['datetime'])["his_240d"].rank(method='min',ascending=False)
df_train["rank_480d"] = df_train.groupby(['datetime'])["his_480d"].rank(method='min',ascending=False)
df_train["rank_720d"] = df_train.groupby(['datetime'])["his_720d"].rank(method='min',ascending=False)
df_train.index = range(len(df_train))
# rank越小涨幅越大

In [12]:
#df_train["average_rank"] = df_train["rank_60d"] + df_train["rank_120d"] + df_train["rank_240d"] + df_train["rank_480d"] + df_train["rank_720d"]
df_train["average_rank"] = df_train["rank_120d"] + df_train["rank_240d"] + df_train["rank_480d"] + df_train["rank_720d"]
df_train["rank_of_average_rank"] = df_train.groupby(['datetime'])["average_rank"].rank(method='min')
df_train.index = range(len(df_train))
# rank越小，平均排名越高

In [13]:
df_train[(df_train['rank_of_average_rank']==1)&(df_train['datetime']=='2016-05-11')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
1747772,762001,2016-05-11,32.0,1.0


In [14]:
df_train[(df_train['rank_of_average_rank']==10)&(df_train['datetime']=='2016-05-11')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
240880,40035,2016-05-11,566.0,10.0


In [15]:
df_test["rank_20d"] = df_test.groupby(['datetime'])["his_20d"].rank(method='min',ascending=False)
df_test["rank_60d"] = df_test.groupby(['datetime'])["his_60d"].rank(method='min',ascending=False)
df_test["rank_120d"] = df_test.groupby(['datetime'])["his_120d"].rank(method='min',ascending=False)
df_test["rank_240d"] = df_test.groupby(['datetime'])["his_240d"].rank(method='min',ascending=False)
df_test["rank_480d"] = df_test.groupby(['datetime'])["his_480d"].rank(method='min',ascending=False)
df_test["rank_720d"] = df_test.groupby(['datetime'])["his_720d"].rank(method='min',ascending=False)
df_test.index = range(len(df_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["rank_20d"] = df_test.groupby(['datetime'])["his_20d"].rank(method='min',ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["rank_60d"] = df_test.groupby(['datetime'])["his_60d"].rank(method='min',ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["

In [16]:
df_test["average_rank"] = df_test["rank_60d"] + df_test["rank_120d"] + df_test["rank_240d"] + df_test["rank_480d"] + df_test["rank_720d"]
df_test["rank_of_average_rank"] = df_test.groupby(['datetime'])["average_rank"].rank(method='min')
df_test.index = range(len(df_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["average_rank"] = df_test["rank_60d"] + df_test["rank_120d"] + df_test["rank_240d"] + df_test["rank_480d"] + df_test["rank_720d"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["rank_of_average_rank"] = df_test.groupby(['datetime'])["average_rank"].rank(method='min')


# Step 3 模拟回测

In [17]:
df_train.columns

Index(['instrument', 'datetime', '$DWJZ', '$LJJZ', 'y_1d', 'y_5d', 'y_20d',
       'y_60d', 'y_120d', 'y_240d', 'y_480d', 'y_720d', 'his_20d', 'his_60d',
       'his_120d', 'his_240d', 'his_480d', 'his_720d', 'rank_20d', 'rank_60d',
       'rank_120d', 'rank_240d', 'rank_480d', 'rank_720d', 'average_rank',
       'rank_of_average_rank'],
      dtype='object')

In [18]:
def simulation_240d(start_rank,end_rank,start_date='1900-01-01',end_date='2021-04-29',dingtou=False,frequency=5):
    if not dingtou:
        return np.mean(df_train[(df_train['rank_of_average_rank']>=start_rank)
                &(df_train['rank_of_average_rank']<=end_rank)&(df_train['datetime']>start_date)&(df_train['datetime']<end_date)]['y_240d'])

In [19]:
start_rank=11
end_rank=15
start_date='1900-01-01'
end_date='2021-04-29'
df_train[(df_train['rank_of_average_rank']>=start_rank)
                &(df_train['rank_of_average_rank']<=end_rank)
                 &(df_train['datetime']>start_date)
                 &(df_train['datetime']<end_date)]

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,y_1d,y_5d,y_20d,y_60d,y_120d,y_240d,...,his_480d,his_720d,rank_20d,rank_60d,rank_120d,rank_240d,rank_480d,rank_720d,average_rank,rank_of_average_rank
316,000001,2009-04-14,1.2360,2.9170,0.000000,-0.274259,1.954060,9.496056,6.067879,11.347269,...,13.413687,112.609329,50.0,58.0,46.0,43.0,34.0,59.0,182.0,15.0
4605,000011,2008-07-17,5.3480,5.6280,1.279318,4.761907,-5.152807,-20.131485,-6.645338,48.081020,...,203.232758,479.608582,4.0,51.0,97.0,29.0,1.0,1.0,128.0,14.0
4606,000011,2008-07-18,5.4200,5.7000,2.140355,3.035094,-6.438590,-19.614037,-4.999998,50.263153,...,219.148926,482.226746,3.0,26.0,98.0,30.0,1.0,1.0,130.0,14.0
4608,000011,2008-07-22,5.5550,5.8350,-0.222794,0.582689,-10.814054,-21.782347,-5.329904,49.374454,...,230.407715,497.236420,3.0,76.0,93.0,31.0,1.0,1.0,126.0,11.0
4609,000011,2008-07-23,5.5420,5.8220,1.271039,0.652699,-6.630026,-20.731712,-4.998281,50.498112,...,230.983536,486.304138,2.0,86.0,97.0,31.0,1.0,1.0,130.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1747754,762001,2016-04-14,1.7773,2.4073,0.170319,-0.926349,-1.129896,4.062644,4.818674,8.827321,...,128.765564,134.309891,537.0,291.0,66.0,7.0,11.0,27.0,111.0,11.0
1747756,762001,2016-04-18,1.7776,2.4076,0.054000,-1.204521,-0.440266,4.319658,4.224126,8.282114,...,125.557419,127.884529,436.0,225.0,80.0,2.0,9.0,23.0,114.0,11.0
1747818,762001,2016-07-18,1.8743,2.5043,-0.187685,-0.259554,1.748987,0.618929,-0.023963,3.098661,...,124.681519,138.028732,397.0,395.0,168.0,65.0,8.0,12.0,253.0,13.0
1747833,762001,2016-08-08,1.8719,2.5019,0.871339,1.846598,0.683484,-0.259803,1.251052,3.337469,...,112.909538,133.691376,186.0,521.0,224.0,36.0,8.0,14.0,282.0,14.0


In [20]:
simulation_240d(1,10,'2012-01-01')

7.8717585

In [21]:
simulation_240d(2,11,'2012-01-01')

8.031313

In [22]:
simulation_240d(1,5,'2012-01-01')

7.5532804

In [23]:
simulation_240d(11,20,'2012-01-01')

8.78526

In [24]:
simulation_240d(11,15,'2012-01-01')

8.890366

In [25]:
simulation_240d(21,30,'2012-01-01')

8.599791

# 2013以后

In [26]:
simulation_240d(1,10,'2013-01-01')

8.421692

In [27]:
simulation_240d(1,10,'2014-01-01')

9.003691

In [28]:
simulation_240d(1,10,'2015-01-01')

-1.4595478

In [29]:
simulation_240d(1,10,'2016-01-01')

1.7541798

In [30]:
simulation_240d(1,10,'2017-01-01')

-2.0217662

In [31]:
simulation_240d(1,10,'2018-01-01')

-4.2754984

In [32]:
simulation_240d(1,10,'2019-01-01')

nan

# 定投

# Step 4 预测未来最优潜力组合

In [33]:
df_test[(df_test['instrument'] == '002943')&(df_test['datetime'] == '2021-05-25')]

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,y_1d,y_5d,y_20d,y_60d,y_120d,y_240d,...,his_480d,his_720d,rank_20d,rank_60d,rank_120d,rank_240d,rank_480d,rank_720d,average_rank,rank_of_average_rank
1489970,2943,2021-05-25,2.8372,3.0775,0.854588,5.0593,4.532897,,,,...,163.169159,177.903214,947.0,377.0,35.0,67.0,175.0,54.0,708.0,23.0


In [34]:
df_test[(df_test['rank_of_average_rank']==10)&(df_test['datetime']=='2021-04-29')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
826174,1691,2021-04-29,658.0,10.0


In [35]:
df_test[(df_test['rank_of_average_rank']==1)&(df_test['datetime']=='2021-04-29')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
5123309,168203,2021-04-29,235.0,1.0


In [36]:
def get_list(start_rank,end_rank,date='2021-04-29'): 
    return df_test[(df_test['rank_of_average_rank']>=start_rank)&(df_test['rank_of_average_rank']<=end_rank)&(df_test['datetime']==date)][['instrument','datetime','average_rank','rank_of_average_rank']].sort_values(by=['rank_of_average_rank'])

In [37]:
get_list(11,20,'2021-04-29')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
616920,1300,2021-04-29,673.0,11.0
1988451,4040,2021-04-29,725.0,12.0
2420345,5004,2021-04-29,725.0,12.0
541403,1171,2021-04-29,753.0,14.0
617658,1301,2021-04-29,758.0,15.0
1989188,4041,2021-04-29,776.0,16.0
4667611,90020,2021-04-29,905.0,17.0
933696,1887,2021-04-29,941.0,18.0
5091700,166019,2021-04-29,951.0,19.0
749699,1543,2021-04-29,954.0,20.0


In [38]:
get_list(11,20,'2021-04-29')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
616920,1300,2021-04-29,673.0,11.0
1988451,4040,2021-04-29,725.0,12.0
2420345,5004,2021-04-29,725.0,12.0
541403,1171,2021-04-29,753.0,14.0
617658,1301,2021-04-29,758.0,15.0
1989188,4041,2021-04-29,776.0,16.0
4667611,90020,2021-04-29,905.0,17.0
933696,1887,2021-04-29,941.0,18.0
5091700,166019,2021-04-29,951.0,19.0
749699,1543,2021-04-29,954.0,20.0


In [39]:
get_list(11,20,'2021-05-25')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
1659350,3291,2021-05-25,498.0,11.0
2350173,4868,2021-05-25,510.0,12.0
1988466,4040,2021-05-25,514.0,13.0
1989203,4041,2021-05-25,555.0,14.0
4667626,90020,2021-05-25,567.0,15.0
2003824,4075,2021-05-25,614.0,16.0
541418,1171,2021-05-25,626.0,17.0
2420360,5004,2021-05-25,626.0,17.0
382185,831,2021-05-25,665.0,19.0
2497374,5164,2021-05-25,673.0,20.0


In [40]:
get_list(1,10,'2021-05-25')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
125012,209,2021-05-25,285.0,1.0
841271,1717,2021-05-25,304.0,2.0
2572810,5296,2021-05-25,316.0,3.0
2573546,5297,2021-05-25,361.0,4.0
5123324,168203,2021-05-25,372.0,5.0
1202515,2408,2021-05-25,400.0,6.0
1905704,3834,2021-05-25,419.0,7.0
615472,1298,2021-05-25,441.0,8.0
742330,1532,2021-05-25,472.0,9.0
1368644,2708,2021-05-25,478.0,10.0


In [41]:
get_list(11,20,'2021-05-25')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
1659350,3291,2021-05-25,498.0,11.0
2350173,4868,2021-05-25,510.0,12.0
1988466,4040,2021-05-25,514.0,13.0
1989203,4041,2021-05-25,555.0,14.0
4667626,90020,2021-05-25,567.0,15.0
2003824,4075,2021-05-25,614.0,16.0
541418,1171,2021-05-25,626.0,17.0
2420360,5004,2021-05-25,626.0,17.0
382185,831,2021-05-25,665.0,19.0
2497374,5164,2021-05-25,673.0,20.0


In [42]:
get_list(21,30,'2021-05-25')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
2498094,5165,2021-05-25,695.0,21.0
1035571,2083,2021-05-25,701.0,22.0
1489970,2943,2021-05-25,708.0,23.0
1036310,2084,2021-05-25,711.0,24.0
327083,684,2021-05-25,720.0,25.0
1557263,3095,2021-05-25,730.0,26.0
2536722,5235,2021-05-25,787.0,27.0
487047,1054,2021-05-25,789.0,28.0
1144279,2300,2021-05-25,791.0,29.0
1557995,3096,2021-05-25,802.0,30.0


In [43]:
get_list(1,30,'2021-06-25')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
615494,1298,2021-06-25,102.0,1.0
1905726,3834,2021-06-25,175.0,2.0
966245,1951,2021-06-25,183.0,3.0
1035593,2083,2021-06-25,193.0,4.0
125034,209,2021-06-25,194.0,5.0
1036332,2084,2021-06-25,199.0,6.0
2574316,5299,2021-06-25,296.0,7.0
381471,828,2021-06-25,304.0,8.0
2575064,5300,2021-06-25,323.0,9.0
1089987,2190,2021-06-25,353.0,10.0
