In [21]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Step1： 读出来所有的数据

In [136]:
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_fund_data")
df = D.features(D.instruments(market="all"), ["$DWJZ", "$LJJZ"], freq="day")

[34150:MainThread](2021-05-26 14:36:31,686) INFO - qlib.Initialization - [config.py:276] - default_conf: client.
[34150:MainThread](2021-05-26 14:36:31,691) INFO - qlib.Initialization - [__init__.py:46] - qlib successfully initialized based on client settings.
[34150:MainThread](2021-05-26 14:36:31,692) INFO - qlib.Initialization - [__init__.py:47] - data_path=/Users/wangfan/.qlib/qlib_data/cn_fund_data


In [137]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,$DWJZ,$LJJZ
instrument,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2005-01-04,0.995,1.115
1,2005-01-05,0.998,1.118
1,2005-01-06,0.991,1.111
1,2005-01-07,0.989,1.109
1,2005-01-10,0.992,1.112


In [29]:
df = df.reset_index()

In [30]:
df_new = df.copy()

# Step2 数据加工

In [32]:
df_new["LJJZ_1d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-1)
df_new["LJJZ_5d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-5)
df_new["LJJZ_20d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-20)
df_new["LJJZ_60d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-60)
df_new["LJJZ_120d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-120)
df_new["LJJZ_240d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-240)
df_new["LJJZ_480d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-480)
df_new.index = range(len(df_new))
df_new["y_1d"]=100 * (df_new["LJJZ_1d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_5d"]=100 * (df_new["LJJZ_5d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_20d"]=100 * (df_new["LJJZ_20d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_60d"]=100 * (df_new["LJJZ_60d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_120d"]=100 * (df_new["LJJZ_120d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_240d"]=100 * (df_new["LJJZ_240d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_480d"]=100 * (df_new["LJJZ_480d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new = df_new.drop(columns = ['LJJZ_1d','LJJZ_5d','LJJZ_20d','LJJZ_60d','LJJZ_120d','LJJZ_240d','LJJZ_480d'])

In [33]:
df_new["LJJZ_20d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(20)
df_new["LJJZ_60d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(60)
df_new["LJJZ_120d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(120)
df_new["LJJZ_240d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(240)
df_new["LJJZ_480d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(480)
df_new.index = range(len(df_new))


df_new["his_20d"]=-100 * (df_new["LJJZ_20d"]- df_new["$LJJZ"])/df_new["LJJZ_20d"]
df_new["his_60d"]=-100 * (df_new["LJJZ_60d"]- df_new["$LJJZ"])/df_new["LJJZ_60d"]
df_new["his_120d"]=-100 * (df_new["LJJZ_120d"]- df_new["$LJJZ"])/df_new["LJJZ_120d"]
df_new["his_240d"]=-100 * (df_new["LJJZ_240d"]- df_new["$LJJZ"])/df_new["LJJZ_240d"]
df_new["his_480d"]=-100 * (df_new["LJJZ_480d"]- df_new["$LJJZ"])/df_new["LJJZ_480d"]
df_new = df_new.drop(columns = ['LJJZ_20d','LJJZ_60d','LJJZ_120d','LJJZ_240d','LJJZ_480d'])

In [35]:
df_train = df_new[df_new['y_480d'].notnull()]
df_test = df_new[df_new['y_480d'].isnull()]

In [39]:
df_train = df_train.dropna()

In [41]:
df_train["rank_20d"] = df_train.groupby(['datetime'])["his_20d"].rank(method='min',ascending=False)
df_train["rank_60d"] = df_train.groupby(['datetime'])["his_60d"].rank(method='min',ascending=False)
df_train["rank_120d"] = df_train.groupby(['datetime'])["his_120d"].rank(method='min',ascending=False)
df_train["rank_240d"] = df_train.groupby(['datetime'])["his_240d"].rank(method='min',ascending=False)
df_train["rank_480d"] = df_train.groupby(['datetime'])["his_480d"].rank(method='min',ascending=False)
df_train.index = range(len(df_train))
# rank越小涨幅越大

In [54]:
df_train["average_rank"] = df_train["rank_20d"] + df_train["rank_60d"] + df_train["rank_120d"] + df_train["rank_240d"] + df_train["rank_480d"]
df_train["rank_of_average_rank"] = df_train.groupby(['datetime'])["average_rank"].rank(method='min')
df_train.index = range(len(df_train))
# rank越小，平均排名越高

In [61]:
df_train[(df_train['rank_of_average_rank']==1)&(df_train['datetime']=='2016-05-11')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
2724916,762001,2016-05-11,483.0,1.0


In [62]:
df_train[(df_train['rank_of_average_rank']==10)&(df_train['datetime']=='2016-05-11')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
145555,290,2016-05-11,1238.0,10.0


In [53]:
df_test["rank_20d"] = df_test.groupby(['datetime'])["his_20d"].rank(method='min',ascending=False)
df_test["rank_60d"] = df_test.groupby(['datetime'])["his_60d"].rank(method='min',ascending=False)
df_test["rank_120d"] = df_test.groupby(['datetime'])["his_120d"].rank(method='min',ascending=False)
df_test["rank_240d"] = df_test.groupby(['datetime'])["his_240d"].rank(method='min',ascending=False)
df_test["rank_480d"] = df_test.groupby(['datetime'])["his_480d"].rank(method='min',ascending=False)
df_test.index = range(len(df_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [63]:
df_test["average_rank"] = df_test["rank_20d"] + df_test["rank_60d"] + df_test["rank_120d"] + df_test["rank_240d"] + df_test["rank_480d"]
df_test["rank_of_average_rank"] = df_test.groupby(['datetime'])["average_rank"].rank(method='min')
df_test.index = range(len(df_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Step 3 模拟回测

In [71]:
df_train.columns

Index(['instrument', 'datetime', '$DWJZ', '$LJJZ', 'y_1d', 'y_5d', 'y_20d',
       'y_60d', 'y_120d', 'y_240d', 'y_480d', 'his_20d', 'his_60d', 'his_120d',
       'his_240d', 'his_480d', 'rank_20d', 'rank_60d', 'rank_120d',
       'rank_240d', 'rank_480d', 'average_rank', 'rank_of_average_rank'],
      dtype='object')

In [104]:
def simulation_240d(start_rank,end_rank,start_date='1900-01-01',end_date='2021-04-29',dingtou=False,frequency=5):
    if not dingtou:
        return np.mean(df_train[(df_train['rank_of_average_rank']>=start_rank)
                &(df_train['rank_of_average_rank']<=end_rank)&(df_train['datetime']>start_date)&(df_train['datetime']<end_date)]['y_240d'])
    else:
        

In [126]:
start_rank=11
end_rank=15
start_date='1900-01-01'
end_date='2021-04-29'
df_train[(df_train['rank_of_average_rank']>=start_rank)
                &(df_train['rank_of_average_rank']<=end_rank)
                 &(df_train['datetime']>start_date)
                 &(df_train['datetime']<end_date)]

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,y_1d,y_5d,y_20d,y_60d,y_120d,y_240d,...,his_120d,his_240d,his_480d,rank_20d,rank_60d,rank_120d,rank_240d,rank_480d,average_rank,rank_of_average_rank
14026,000020,2018-03-02,2.3160,2.3160,0.431779,2.849740,0.129535,-2.763381,-18.652851,-13.816922,...,11.560694,34.183086,57.123474,729.0,75.0,136.0,53.0,89.0,1082.0,12.0
14029,000020,2018-03-07,2.3280,2.3280,0.257734,1.632297,-2.448460,-2.792099,-17.010309,-11.640899,...,11.281074,32.197617,49.903419,139.0,137.0,143.0,57.0,104.0,580.0,11.0
20802,000030,2015-08-25,1.7820,1.7820,0.000000,2.132442,2.974192,10.437716,2.413025,5.050508,...,18.641809,38.677040,78.021965,360.0,376.0,23.0,152.0,31.0,942.0,15.0
20804,000030,2015-08-27,1.7990,1.7990,0.944967,0.944967,2.001112,7.726511,0.166760,3.946636,...,19.693945,39.241482,79.183281,329.0,393.0,27.0,145.0,32.0,926.0,12.0
20811,000030,2015-09-09,1.8360,1.8360,-0.326793,-0.708062,1.688454,4.956431,-2.178647,1.851854,...,18.911913,40.474361,83.233536,314.0,389.0,16.0,161.0,33.0,913.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2724910,762001,2016-05-03,1.7934,2.4234,0.107295,-2.492367,-0.433274,2.290172,3.495094,4.654623,...,5.820699,21.778891,131.993103,297.0,518.0,33.0,4.0,13.0,865.0,13.0
2724911,762001,2016-05-04,1.7960,2.4260,0.086562,-1.463316,-0.511136,2.366034,3.825222,4.311622,...,5.892633,21.799387,132.509109,417.0,507.0,26.0,4.0,14.0,968.0,12.0
2724914,762001,2016-05-09,1.7119,2.3419,0.900970,3.176905,4.252950,6.089065,6.870485,7.767189,...,2.123674,17.594782,123.442429,886.0,233.0,127.0,2.0,8.0,1256.0,11.0
2724922,762001,2016-05-19,1.7281,2.3581,0.330779,0.339262,1.904083,8.057337,7.251601,6.276239,...,2.767364,5.164336,122.988174,682.0,199.0,75.0,126.0,8.0,1090.0,11.0


In [111]:
simulation_240d(1,10,'2012-01-01')

8.651797

In [112]:
simulation_240d(2,11,'2012-01-01')

8.87317

In [113]:
simulation_240d(1,5,'2012-01-01')

8.393454

In [114]:
simulation_240d(11,20,'2012-01-01')

8.842889

In [115]:
simulation_240d(11,15,'2012-01-01')

9.055062

In [116]:
simulation_240d(21,30,'2012-01-01')

8.630151

# 2013以后

In [117]:
simulation_240d(11,15,'2013-01-01')

9.478833

In [118]:
simulation_240d(11,15,'2014-01-01')

9.816602

In [119]:
simulation_240d(11,15,'2015-01-01')

2.7627668

In [120]:
simulation_240d(11,15,'2016-01-01')

5.759387

In [121]:
simulation_240d(11,15,'2017-01-01')

4.541946

In [123]:
simulation_240d(11,15,'2018-01-01')

5.1824164

# 定投

# Step 4 预测未来最优潜力组合

In [66]:
df_test.tail()

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,y_1d,y_5d,y_20d,y_60d,y_120d,y_240d,...,his_120d,his_240d,his_480d,rank_20d,rank_60d,rank_120d,rank_240d,rank_480d,average_rank,rank_of_average_rank
4592256,980003,2021-04-23,1.4565,1.4565,0.04806,,,,,,...,1.988661,4.244207,,6944.0,823.0,5798.0,5216.0,,,
4592257,980003,2021-04-26,1.4572,1.4572,0.020583,,,,,,...,1.952011,4.294307,,5946.0,739.0,5596.0,5181.0,,,
4592258,980003,2021-04-27,1.4575,1.4575,0.006862,,,,,,...,1.972996,4.750611,,5560.0,692.0,6079.0,5254.0,,,
4592259,980003,2021-04-28,1.4576,1.4576,0.013723,,,,,,...,1.951457,4.802991,,5442.0,968.0,5884.0,5108.0,,,
4592260,980003,2021-04-29,1.4578,1.4578,,,,,,,...,1.958316,4.847524,,5968.0,999.0,5615.0,5051.0,,,


In [67]:
df_test[(df_test['rank_of_average_rank']==10)&(df_test['datetime']=='2021-04-29')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
249317,684,2021-04-29,792.0,10.0


In [68]:
df_test[(df_test['rank_of_average_rank']==1)&(df_test['datetime']=='2021-04-29')][['instrument','datetime','average_rank','rank_of_average_rank']]

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
1857204,5296,2021-04-29,431.0,1.0


In [130]:
def get_list(start_rank,end_rank,date='2021-04-29'): 
    return df_test[(df_test['rank_of_average_rank']>=start_rank)&(df_test['rank_of_average_rank']<=end_rank)&(df_test['datetime']==date)][['instrument','datetime','average_rank','rank_of_average_rank']]

In [132]:
get_list(11,20,'2021-04-29')

Unnamed: 0,instrument,datetime,average_rank,rank_of_average_rank
290406,831,2021-04-29,861.0,11.0
481267,1365,2021-04-29,863.0,12.0
541734,1532,2021-04-29,1415.0,19.0
546718,1543,2021-04-29,899.0,13.0
983857,2708,2021-04-29,1295.0,16.0
1198252,3291,2021-04-29,1279.0,15.0
1697167,4868,2021-04-29,1370.0,17.0
1748588,5004,2021-04-29,1388.0,18.0
4136281,501038,2021-04-29,1448.0,20.0
4333707,519171,2021-04-29,1210.0,14.0
