In [139]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Step1： 读出来所有基金的数据

In [140]:
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_fund_data")
df = D.features(D.instruments(market="all"), ["$DWJZ", "$LJJZ"], freq="day")

[52270:MainThread](2021-05-13 23:42:25,669) INFO - qlib.Initialization - [config.py:276] - default_conf: client.
[52270:MainThread](2021-05-13 23:42:25,700) INFO - qlib.Initialization - [__init__.py:46] - qlib successfully initialized based on client settings.
[52270:MainThread](2021-05-13 23:42:25,701) INFO - qlib.Initialization - [__init__.py:47] - data_path=/Users/wangfan/.qlib/qlib_data/cn_fund_data


In [141]:
df.shape

(9720744, 2)

In [142]:
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,$DWJZ,$LJJZ
instrument,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
980003,2021-04-23,1.4565,1.4565
980003,2021-04-26,1.4572,1.4572
980003,2021-04-27,1.4575,1.4575
980003,2021-04-28,1.4576,1.4576
980003,2021-04-29,1.4578,1.4578


In [143]:
df.columns

Index(['$DWJZ', '$LJJZ'], dtype='object')

In [144]:
df = df.reset_index()

In [145]:
df.head()

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ
0,1,2005-01-04,0.995,1.115
1,1,2005-01-05,0.998,1.118
2,1,2005-01-06,0.991,1.111
3,1,2005-01-07,0.989,1.109
4,1,2005-01-10,0.992,1.112


LJJZ:因此累计净值更能准确地体现一只基金的赚钱能力。

# 收益和时间的关系

## 策略1:收益预测，随机选择1个基金10000元，拿在手上200天，涨幅多少？亏损多少？

In [146]:
df.shape[0]

9720744

In [147]:
np.mean([1,2,3,4,5])

3.0

In [148]:
import random
import datetime
import numpy as np

In [149]:
st = '2020-04-29 00:00:00'
dt = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")

In [65]:
count = 0
sum_ins = 0
list1 = []
for i in range(10000):
    index = random.randint(1,df.shape[0])
    if (df.iloc[index]['datetime'] < dt) & (df.iloc[index]['$LJJZ'] > 0):
        item = df.iloc[index+200]['$LJJZ']/df.iloc[index]['$LJJZ']
        if not(np.isnan(item)):
            count += 1
            sum_ins += item
            list1.append(item)
print(np.mean(list1))
print(np.std(list1))

1.0893517
0.19377252


## 策略2:收益预测，随机选择1个基金10000元，拿在手上100天，涨幅多少？亏损多少？

In [66]:
st = '2020-04-29 00:00:00'
dt = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")

In [67]:
count = 0
sum_ins = 0
list1 = []
for i in range(10000):
    index = random.randint(1,df.shape[0])
    if (df.iloc[index]['datetime'] < dt) & (df.iloc[index]['$LJJZ'] > 0):
        item = df.iloc[index+100]['$LJJZ']/df.iloc[index]['$LJJZ']
        if not(np.isnan(item)):
            count += 1
            sum_ins += item
            list1.append(item)
print(np.mean(list1))
print(np.std(list1))

1.0404161
0.11959813


## 策略3:收益预测，随机选择1个基金10000元，拿在手上400天，涨幅多少？亏损多少？

In [68]:
st = '2019-04-29 00:00:00'
dt = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")
count = 0
sum_ins = 0
list1 = []
for i in range(10000):
    index = random.randint(1,df.shape[0])
    if (df.iloc[index]['datetime'] < dt) & (df.iloc[index]['$LJJZ'] > 0):
        item = df.iloc[index+400]['$LJJZ']/df.iloc[index]['$LJJZ']
        if not(np.isnan(item)):
            count += 1
            sum_ins += item
            list1.append(item)
print(np.mean(list1))
print(np.std(list1))

1.1310375
0.26123962


# Step2: 数据加工

In [150]:
df_new = df.copy()

In [151]:
df_new.shape

(9720744, 4)

创建特征

In [152]:
df_temp = df_new.groupby(['instrument'])["$LJJZ"].rolling(5).agg({'max_5d':'max','min_5d':'min','mean_5d':'mean','std_5d':'std','sum_5d':'sum','median_5d':'median'})
df_temp = df_temp.reset_index()
df_temp = df_temp.drop(['instrument','level_1'],axis = 1)
df_new = pd.merge(df_new, df_temp, how = 'inner', left_index=True, right_index=True) 

df_temp = df_new.groupby(['instrument'])["$LJJZ"].rolling(20).agg({'max_20d':'max','min_20d':'min','mean_20d':'mean','std_20d':'std','sum_20d':'sum','median_20d':'median'})
df_temp = df_temp.reset_index()
df_temp = df_temp.drop(['instrument','level_1'],axis = 1)
df_new = pd.merge(df_new, df_temp, how = 'inner', left_index=True, right_index=True) 

df_temp = df_new.groupby(['instrument'])["$LJJZ"].rolling(60).agg({'max_60d':'max','min_60d':'min','mean_60d':'mean','std_60d':'std','sum_60d':'sum','median_60d':'median'})
df_temp = df_temp.reset_index()
df_temp = df_temp.drop(['instrument','level_1'],axis = 1)
df_new = pd.merge(df_new, df_temp, how = 'inner', left_index=True, right_index=True) 

df_temp = df_new.groupby(['instrument'])["$LJJZ"].rolling(120).agg({'max_120d':'max','min_120d':'min','mean_120d':'mean','std_120d':'std','sum_120d':'sum','median_120d':'median'})
df_temp = df_temp.reset_index()
df_temp = df_temp.drop(['instrument','level_1'],axis = 1)
df_new = pd.merge(df_new, df_temp, how = 'inner', left_index=True, right_index=True) 

df_temp = df_new.groupby(['instrument'])["$LJJZ"].rolling(240).agg({'max_240d':'max','min_240d':'min','mean_240d':'mean','std_240d':'std','sum_240d':'sum','median_240d':'median'})
df_temp = df_temp.reset_index()
df_temp = df_temp.drop(['instrument','level_1'],axis = 1)
df_new = pd.merge(df_new, df_temp, how = 'inner', left_index=True, right_index=True) 

df_temp = df_new.groupby(['instrument'])["$LJJZ"].rolling(480).agg({'max_480d':'max','min_480d':'min','mean_480d':'mean','std_480d':'std','sum_480d':'sum','median_480d':'median'})
df_temp = df_temp.reset_index()
df_temp = df_temp.drop(['instrument','level_1'],axis = 1)
df_new = pd.merge(df_new, df_temp, how = 'inner', left_index=True, right_index=True) 

In [153]:
df_new = df_new.dropna()

In [154]:
df_no_label = df_new.copy()

In [155]:
df_no_label.shape

(4871908, 40)

In [156]:
df_no_label.head()

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,max_5d,min_5d,mean_5d,std_5d,sum_5d,median_5d,...,mean_240d,std_240d,sum_240d,median_240d,max_480d,min_480d,mean_480d,std_480d,sum_480d,median_480d
479,1,2006-12-26,1.69,2.04,2.053,2.018,2.0372,0.012755,10.186,2.04,...,1.504504,0.249264,361.081,1.576,2.053,0.979,1.289679,0.27953,619.046001,1.138
480,1,2006-12-27,1.719,2.069,2.069,2.018,2.0404,0.018447,10.202,2.04,...,1.5086,0.250432,362.064,1.579,2.069,0.979,1.291667,0.281669,620.000001,1.1385
481,1,2006-12-28,1.711,2.061,2.069,2.018,2.0458,0.019993,10.229,2.041,...,1.512692,0.251404,363.046,1.58,2.069,0.979,1.293631,0.283736,620.943001,1.1395
482,1,2006-12-29,1.753,2.103,2.103,2.04,2.0628,0.025753,10.314,2.061,...,1.516888,0.25282,364.053,1.5815,2.103,0.979,1.295698,0.286007,621.935001,1.1405
483,1,2007-01-04,1.753,2.103,2.103,2.04,2.0752,0.027499,10.376,2.069,...,1.521017,0.254268,365.044,1.583,2.103,0.979,1.297769,0.288242,622.929001,1.141


创建label

In [157]:
df_new["LJJZ_1d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-1)
df_new["LJJZ_5d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-5)
df_new["LJJZ_20d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-20)
df_new["LJJZ_60d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-60)
df_new["LJJZ_120d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-120)
df_new["LJJZ_240d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-240)
df_new["LJJZ_480d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(-480)
df_new = df_new.dropna()
df_new.index = range(len(df_new))
df_new["y_1d"]=100 * (df_new["LJJZ_1d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_5d"]=100 * (df_new["LJJZ_5d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_20d"]=100 * (df_new["LJJZ_20d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_60d"]=100 * (df_new["LJJZ_60d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_120d"]=100 * (df_new["LJJZ_120d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_240d"]=100 * (df_new["LJJZ_240d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new["y_480d"]=100 * (df_new["LJJZ_480d"]- df_new["$LJJZ"])/df_new["$LJJZ"]
df_new = df_new.drop(columns = ['LJJZ_1d','LJJZ_5d','LJJZ_20d','LJJZ_60d','LJJZ_120d','LJJZ_240d','LJJZ_480d'])

In [158]:
df_new["LJJZ_20d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(20)
df_new["LJJZ_60d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(60)
df_new["LJJZ_120d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(120)
df_new["LJJZ_240d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(240)
df_new["LJJZ_480d"] = df_new.groupby(['instrument'])["$LJJZ"].shift(240)
df_new = df_new.dropna()
df_new.index = range(len(df_new))


df_new["his_20d"]=-100 * (df_new["LJJZ_20d"]- df_new["$LJJZ"])/df_new["LJJZ_20d"]
df_new["his_60d"]=-100 * (df_new["LJJZ_60d"]- df_new["$LJJZ"])/df_new["LJJZ_60d"]
df_new["his_120d"]=-100 * (df_new["LJJZ_120d"]- df_new["$LJJZ"])/df_new["LJJZ_120d"]
df_new["his_240d"]=-100 * (df_new["LJJZ_240d"]- df_new["$LJJZ"])/df_new["LJJZ_240d"]
df_new["his_480d"]=-100 * (df_new["LJJZ_480d"]- df_new["$LJJZ"])/df_new["LJJZ_480d"]
df_new = df_new.drop(columns = ['LJJZ_20d','LJJZ_60d','LJJZ_120d','LJJZ_240d','LJJZ_480d'])

In [159]:
df_new.shape

(1858429, 52)

In [160]:
df_new.head()

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,max_5d,min_5d,mean_5d,std_5d,sum_5d,median_5d,...,y_20d,y_60d,y_120d,y_240d,y_480d,his_20d,his_60d,his_120d,his_240d,his_480d
0,1,2007-12-21,2.11,3.291,3.291,3.213,3.2452,0.03072,16.226,3.234,...,-0.39501,-5.226373,-12.792464,-19.598907,-0.577327,3.9482,-2.053572,18.338728,61.323524,61.323524
1,1,2007-12-24,2.143,3.324,3.324,3.213,3.2644,0.044286,16.322,3.26,...,1.293629,-6.046931,-12.966306,-20.186516,-1.534292,5.123338,-1.100868,19.740629,60.657314,60.657314
2,1,2007-12-25,2.152,3.333,3.333,3.234,3.2884,0.04192,16.442,3.291,...,1.830186,-7.140712,-12.061206,-20.102005,-1.890188,6.146492,-0.68534,18.99321,61.717602,61.717602
3,1,2007-12-26,2.174,3.355,3.355,3.26,3.3126,0.03734,16.563,3.324,...,1.75857,-6.497763,-12.339791,-20.417286,-2.742176,7.670093,0.569546,20.423546,59.534008,59.534008
4,1,2007-12-27,2.211,3.392,3.392,3.291,3.339,0.037517,16.695,3.333,...,-1.562499,-8.549527,-14.327832,-21.521227,-3.891509,7.070706,0.952383,24.477066,61.293396,61.293396


In [161]:
df_new.columns

Index(['instrument', 'datetime', '$DWJZ', '$LJJZ', 'max_5d', 'min_5d',
       'mean_5d', 'std_5d', 'sum_5d', 'median_5d', 'max_20d', 'min_20d',
       'mean_20d', 'std_20d', 'sum_20d', 'median_20d', 'max_60d', 'min_60d',
       'mean_60d', 'std_60d', 'sum_60d', 'median_60d', 'max_120d', 'min_120d',
       'mean_120d', 'std_120d', 'sum_120d', 'median_120d', 'max_240d',
       'min_240d', 'mean_240d', 'std_240d', 'sum_240d', 'median_240d',
       'max_480d', 'min_480d', 'mean_480d', 'std_480d', 'sum_480d',
       'median_480d', 'y_1d', 'y_5d', 'y_20d', 'y_60d', 'y_120d', 'y_240d',
       'y_480d', 'his_20d', 'his_60d', 'his_120d', 'his_240d', 'his_480d'],
      dtype='object')

In [200]:
df_new["rank_20d"] = df_new.groupby(['datetime'])["his_20d"].rank(method='min',ascending=False)
df_new["rank_60d"] = df_new.groupby(['datetime'])["his_60d"].rank(method='min',ascending=False)
df_new["rank_120d"] = df_new.groupby(['datetime'])["his_120d"].rank(method='min',ascending=False)
df_new["rank_240d"] = df_new.groupby(['datetime'])["his_240d"].rank(method='min',ascending=False)
df_new["rank_480d"] = df_new.groupby(['datetime'])["his_480d"].rank(method='min',ascending=False)
df_new = df_new.dropna()
df_new.index = range(len(df_new))

In [212]:
st = '2019-04-29 00:00:00'
dt = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")
df_new[df_new['datetime']==dt][['instrument','datetime','rank_20d','his_20d']]

Unnamed: 0,instrument,datetime,rank_20d,his_20d
2760,000001,2019-04-29,680.0,0.028199
3506,000003,2019-04-29,2365.0,-4.387759
4252,000004,2019-04-29,2365.0,-4.387759
4997,000008,2019-04-29,2392.0,-4.565740
7766,000011,2019-04-29,664.0,0.055028
...,...,...,...,...
1854960,750002,2019-04-29,1089.0,-0.352360
1855834,750003,2019-04-29,1184.0,-0.433520
1856661,750005,2019-04-29,369.0,0.844952
1857548,762001,2019-04-29,355.0,0.924260


# 历史业绩和未来业绩的关系

## 最近60天最好的基金，未来240天涨幅平均为

In [226]:
st = '2016-01-01 00:00:00'
dt = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")

In [227]:
np.mean(df_new[(df_new['rank_60d']==1.0)&(df_new['datetime']>dt)]['y_240d'])

4.7507873

In [228]:
np.mean(df_new[(df_new['rank_60d']==1.0)&(df_new['datetime']>dt)]['y_120d'])

1.5368642

In [229]:
np.mean(df_new[(df_new['rank_60d']==1.0)&(df_new['datetime']>dt)]['y_60d'])

-0.5927418

### 最近120天最好的基金，未来240天涨幅平均为

In [230]:
np.mean(df_new[(df_new['rank_120d']==1.0)&(df_new['datetime']>dt)]['y_240d'])

-0.1344907

In [231]:
np.mean(df_new[(df_new['rank_120d']==1.0)&(df_new['datetime']>dt)]['y_120d'])

-0.5726641

In [232]:
np.mean(df_new[(df_new['rank_120d']==1.0)&(df_new['datetime']>dt)]['y_60d'])

-1.3292818

## 最近240天最好的基金，未来240天涨幅平均为


In [233]:
np.mean(df_new[df_new['rank_240d']==1.0]['y_240d'])

8.646881

In [234]:
np.mean(df_new[df_new['rank_240d']==2.0]['y_240d'])

7.904129

In [235]:
np.mean(df_new[df_new['rank_240d']==3.0]['y_240d'])

4.881589

In [236]:
np.mean(df_new[df_new['rank_240d']==1500.0]['y_240d'])

7.839954

In [237]:
np.mean(df_new[df_new['rank_240d']==60.0]['y_240d'])

9.515042

# 把几个时期综合排名最低的基金挑选出来

In [238]:
df_new["average_rank"] = df_new["rank_20d"] + df_new["rank_60d"] + df_new["rank_120d"] + df_new["rank_240d"] + df_new["rank_480d"]
df_new["rank_of_average_rank"] = df_new.groupby(['datetime'])["average_rank"].rank(method='min')
df_new = df_new.dropna()
df_new.index = range(len(df_new))

In [239]:
np.mean(df_new[df_new['rank_of_average_rank']==1.0]['y_240d'])

9.577209

In [240]:
np.mean(df_new[df_new['rank_of_average_rank']==100.0]['y_240d'])

7.4145055

In [241]:
np.mean(df_new[df_new['rank_of_average_rank']==101.0]['y_240d'])

7.502192

In [242]:
np.mean(df_new[df_new['rank_of_average_rank']==1000.0]['y_240d'])

4.5252967

In [245]:
st = '2020-04-29 00:00:00'
dt = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")


In [248]:
df_new.head()

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,max_5d,min_5d,mean_5d,std_5d,sum_5d,median_5d,...,his_120d,his_240d,his_480d,rank_20d,rank_60d,rank_120d,rank_240d,rank_480d,average_rank,rank_of_average_rank
0,1,2007-12-21,2.11,3.291,3.291,3.213,3.2452,0.03072,16.226,3.234,...,18.338728,61.323524,61.323524,2.0,2.0,2.0,2.0,2.0,10.0,2.0
1,1,2007-12-24,2.143,3.324,3.324,3.213,3.2644,0.044286,16.322,3.26,...,19.740629,60.657314,60.657314,2.0,2.0,2.0,2.0,2.0,10.0,2.0
2,1,2007-12-25,2.152,3.333,3.333,3.234,3.2884,0.04192,16.442,3.291,...,18.99321,61.717602,61.717602,2.0,2.0,2.0,2.0,2.0,10.0,2.0
3,1,2007-12-26,2.174,3.355,3.355,3.26,3.3126,0.03734,16.563,3.324,...,20.423546,59.534008,59.534008,2.0,2.0,2.0,2.0,2.0,10.0,2.0
4,1,2007-12-27,2.211,3.392,3.392,3.291,3.339,0.037517,16.695,3.333,...,24.477066,61.293396,61.293396,2.0,2.0,2.0,2.0,2.0,10.0,2.0


In [250]:
df_new.tail()

Unnamed: 0,instrument,datetime,$DWJZ,$LJJZ,max_5d,min_5d,mean_5d,std_5d,sum_5d,median_5d,...,his_120d,his_240d,his_480d,rank_20d,rank_60d,rank_120d,rank_240d,rank_480d,average_rank,rank_of_average_rank
1858424,770001,2019-05-08,0.968,1.768,1.8193,1.768,1.7909,0.023821,8.9545,1.7814,...,6.976467,-2.428258,-2.428258,1093.0,1293.0,1070.0,1397.0,1397.0,6250.0,1214.0
1858425,770001,2019-05-09,0.9445,1.7445,1.8193,1.7445,1.77712,0.027235,8.8856,1.7724,...,5.791388,-4.285085,-4.285085,1201.0,1368.0,1155.0,1479.0,1479.0,6682.0,1454.0
1858426,770001,2019-05-10,0.979,1.779,1.7814,1.7445,1.76906,0.014719,8.8453,1.7724,...,8.113042,-2.611262,-2.611262,1285.0,1358.0,1178.0,1468.0,1468.0,6757.0,1472.0
1858427,770001,2019-05-13,0.9676,1.7676,1.7814,1.7445,1.7681,0.014602,8.8405,1.768,...,8.09026,-3.261821,-3.261821,1343.0,1340.0,1151.0,1515.0,1515.0,6864.0,1553.0
1858428,770001,2019-05-14,0.9631,1.7631,1.779,1.7445,1.76444,0.01259,8.8222,1.7676,...,7.335929,-2.70941,-2.70941,1319.0,1315.0,1059.0,1451.0,1451.0,6595.0,1435.0


In [244]:

df_new[(df_new['rank_of_average_rank']==1.0) & (df_new['datetime']==dt)][['instrument','rank_of_average_rank']]

Unnamed: 0,instrument,rank_of_average_rank


训练集和验证集分离