In [1]:
import pandas as pd

# Empty placeholder so that `data` is always bound before the magic below runs
# (presumably also to keep linters from flagging `data` as undefined — TODO confirm).
data = pd.DataFrame()
# Restore the variable `data` from IPython's persistent store (written by an
# earlier `%store data`, presumably in another notebook — verify the producer).
# On success this replaces the empty placeholder above.
%store -r data
# Bare last expression: render the restored frame as the cell output.
data

Unnamed: 0,净值日期,基金代码,单位净值,日增长率
0,2012-08-17,270023,0.8870,0.1129
1,2012-08-17,050025,1.0456,0.0766
2,2012-08-17,519185,0.8146,-0.6101
3,2012-08-17,320013,1.0650,0.0000
4,2012-08-17,163813,0.8280,0.0000
...,...,...,...,...
25845,2023-12-31,270042,5.0879,0.0000
25846,2023-12-31,539001,2.1043,0.0000
25847,2023-12-31,270023,3.0990,0.0000
25848,2023-12-31,160719,1.0510,0.0000


In [2]:
# Keep only the date, fund-code and unit-NAV columns (the daily growth rate is
# dropped) and relabel them positionally with short ASCII names.
data = data[["净值日期", "基金代码", "单位净值"]].set_axis(
    ["date", "fund", "nav"], axis="columns"
)
data

Unnamed: 0,date,fund,nav
0,2012-08-17,270023,0.8870
1,2012-08-17,050025,1.0456
2,2012-08-17,519185,0.8146
3,2012-08-17,320013,1.0650
4,2012-08-17,163813,0.8280
...,...,...,...
25845,2023-12-31,270042,5.0879
25846,2023-12-31,539001,2.1043
25847,2023-12-31,270023,3.0990
25848,2023-12-31,160719,1.0510


In [3]:
from config import config

# Order rows chronologically (ties broken by fund code), then label every row
# with the ordinal position of its date so that a label slice `a:b` on the
# index selects a contiguous window of dates.
data = data.sort_values(by=["date", "fund"], ignore_index=True)
data.index = pd.factorize(data["date"])[0]

lookback = config.APPROX_BDAYS_PER_YEAR  # window length, in distinct dates
n_dates = len(data.index.unique())

cov_list = []  # one scaled covariance matrix per date that has a full look-back window
for i in range(lookback, n_dates):
    # `.loc` label slices include both end points: `lookback + 1` dates ending at date i.
    data_lookback = data.loc[i - lookback : i, :]

    # Wide table for the window: one row per date, one column per fund, NAV as value.
    price_lookback = pd.pivot_table(
        data_lookback, index="date", columns="fund", values="nav"
    )

    # Drop dates on which any fund in the window lacks a NAV, then take
    # period-over-period returns of what remains.
    return_lookback = price_lookback.dropna().pct_change()

    # Fund-by-fund covariance of those returns, divided by its largest entry.
    # NOTE(review): a zero or NaN maximum yields an inf/NaN matrix here — confirm
    # that cannot happen for dates on or after config.START_DATE.
    covs = return_lookback.cov().values
    covs = covs / covs.max()
    cov_list.append(covs)

# The dates from position `lookback` onward line up one-to-one with `cov_list`.
df_cov = pd.DataFrame({"date": data["date"].unique()[lookback:], "cov": cov_list})

# Default (inner) merge: every row receives the matrix of its date, and dates
# without a full look-back window are dropped. Re-sort and renumber afterwards.
data_with_cov = (
    data.merge(df_cov, on="date")
    .sort_values(["date", "fund"])
    .reset_index(drop=True)
)

# Discard everything before the configured start date.
data_with_cov = data_with_cov.loc[data_with_cov["date"] >= config.START_DATE]

data_with_cov

Unnamed: 0,date,fund,nav,cov
590,2014-01-02,050025,1.2765,"[[0.33468638608556567, 0.06816936360615063, 0...."
591,2014-01-02,160719,0.6600,"[[0.33468638608556567, 0.06816936360615063, 0...."
592,2014-01-02,163813,0.8520,"[[0.33468638608556567, 0.06816936360615063, 0...."
593,2014-01-02,164701,0.6410,"[[0.33468638608556567, 0.06816936360615063, 0...."
594,2014-01-02,270023,1.3690,"[[0.33468638608556567, 0.06816936360615063, 0...."
...,...,...,...,...
23545,2023-12-31,270042,5.0879,"[[0.14336567609794507, 0.0027795933198097628, ..."
23546,2023-12-31,290012,1.8280,"[[0.14336567609794507, 0.0027795933198097628, ..."
23547,2023-12-31,320013,1.1870,"[[0.14336567609794507, 0.0027795933198097628, ..."
23548,2023-12-31,519185,1.7079,"[[0.14336567609794507, 0.0027795933198097628, ..."


In [5]:
data_dir = ""
%store -r data_dir

print("Training Data: ", "from ", config.START_DATE, " to ", config.TRAIN_DATE)
print("Testing Data: ", "from ", config.TRAIN_DATE, " to ", config.END_DATE)
choose_data = pd.read_csv(f"{data_dir}/fund_choose_data.csv")

train_data = data_with_cov[data_with_cov["date"] < config.TRAIN_DATE]
test_data = data_with_cov[data_with_cov["date"] >= config.TRAIN_DATE]
train_choose = choose_data[choose_data["date"] < config.TRAIN_DATE]
test_choose = choose_data[choose_data["date"] >= config.TRAIN_DATE]

train_data.index = train_data["date"].factorize()[0]
test_data.index = test_data["date"].factorize()[0]
train_choose = train_choose.reset_index(drop=True).set_index(["date"])
test_choose = test_choose.reset_index(drop=True).set_index(["date"])

%store train_data
%store test_data
%store train_choose
%store test_choose

test_choose


Training Data:  from  2014-01-01  to  2022-01-01
Testing Data:  from  2022-01-01  to  2024-01-01
Stored 'train_data' (DataFrame)
Stored 'test_data' (DataFrame)
Stored 'train_choose' (DataFrame)
Stored 'test_choose' (DataFrame)


Unnamed: 0_level_0,270023,539001,270042,163813,519185,050025,290012,320013,160719,164701
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-04,2.999,1.8323,4.5985,0.712,1.0899,3.4292,1.576,1.000,0.873,0.749
2022-01-05,2.839,1.7764,4.4525,0.697,1.0962,3.3655,1.567,1.002,0.873,0.748
2022-01-06,2.848,1.7711,4.4416,0.697,1.0875,3.3600,1.550,0.986,0.860,0.738
2022-01-07,2.804,1.7510,4.3926,0.693,1.1147,3.3487,1.515,0.988,0.862,0.740
2022-01-10,2.822,1.7524,4.3951,0.692,1.1377,3.3395,1.526,0.989,0.863,0.740
...,...,...,...,...,...,...,...,...,...,...
2023-12-26,3.099,2.1151,5.1134,0.779,1.6959,3.7479,1.755,1.192,1.055,0.898
2023-12-27,3.127,2.1184,5.1236,0.781,1.7171,3.7546,1.753,1.198,1.061,0.903
2023-12-28,3.128,2.1168,5.1190,0.780,1.6930,3.7545,1.785,1.194,1.056,0.897
2023-12-29,3.099,2.1043,5.0879,0.775,1.7080,3.7371,1.828,1.187,1.051,0.894
