In [73]:
import os 
import pandas as pd
import numpy as np

In [74]:
# Resolve every data location relative to the directory the notebook is run from.
crt_dir = os.path.abspath(os.curdir)
data_dir = os.path.abspath(os.path.join(os.curdir, "data"))
# Folder holding the index-000300 constituent-weight files.
idx000300_dir = os.path.join(data_dir, "000300Weight_of_Constituent_Stock")

# Data Preparation

**TODO:** Describe our index data: why did we choose index 000300, and what preprocessing did we apply?

The historical daily constituent data of index 000300, from 2009-12 to the present, were downloaded from CSMAR in 3 separate files because CSMAR limits each query to at most 5 years of data. The following cells combine all the data files and read them into RAM:

In [75]:
def genIDX_all(dir, namelist, output_name=None, force=False):
    """Combine several constituent-weight CSV files into one DataFrame.

    Each file in ``namelist`` is read with its second column as a
    date-parsed index, and the pieces are concatenated in the order given.
    When ``output_name`` is supplied, the combined table is cached as a CSV
    inside ``dir`` and later calls read that cache instead of rebuilding it.

    Parameters
    ----------
    dir : str
        Directory holding the input files (and the cache file, if any).
    namelist : iterable of str
        File names, relative to ``dir``, to concatenate in order.
    output_name : str, optional
        File name of the combined-CSV cache inside ``dir``. ``None`` (the
        default) disables caching: the data is always rebuilt from the
        input files and nothing is written to disk.
    force : bool, default False
        Rebuild from the input files and overwrite the cache even if the
        cache file already exists.

    Returns
    -------
    pandas.DataFrame
        The combined table, indexed by the (date-parsed) second column of
        the input files.

    Raises
    ------
    ValueError
        If the table has to be rebuilt but ``namelist`` is empty.
    """
    # The declared default output_name=None must not reach os.path.join,
    # which raises TypeError on None; treat it as "no cache" instead.
    cache_path = None if output_name is None else os.path.join(dir, output_name)

    # Fast path: reuse the cache. The cache was written with the index as
    # its first column, hence index_col=0 here versus index_col=1 below.
    if cache_path is not None and not force and os.path.lexists(cache_path):
        return pd.read_csv(cache_path, header=0, index_col=0, parse_dates=True)

    names = list(namelist)
    if not names:
        raise ValueError("namelist is empty: no input files to combine")

    frames = [
        pd.read_csv(os.path.join(dir, name), header=0, index_col=1, parse_dates=True)
        for name in names
    ]
    combined = pd.concat(frames)

    if cache_path is not None:
        combined.to_csv(cache_path)
    return combined

In [76]:
# os.listdir returns names in arbitrary order; sort them so the raw files are
# always concatenated (and cached) in the same order on every machine.
# NOTE: the sort is lexicographic, so "IDX_Smprat_10.csv" would come before
# "IDX_Smprat_2.csv" if the number of files ever reaches ten.
idx_filelist = sorted(
    name for name in os.listdir(idx000300_dir) if name.startswith("IDX_Smprat_")
)
idx_filelist

['IDX_Smprat_1.csv', 'IDX_Smprat_2.csv', 'IDX_Smprat_3.csv']

In [77]:
# Load the combined constituent table: built from the raw files on the first
# run, read back from the cached CSV afterwards.
df = genIDX_all(
    dir=idx000300_dir,
    namelist=idx_filelist,
    output_name='All_IDX_Smprat.csv',
)
df

Unnamed: 0_level_0,Indexcd,Stkcd,Constdnme_en,Weight
Enddt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-01,300,600547,SDHJ,0.540
2009-12-01,300,600548,SGS,0.030
2009-12-01,300,600549,XMWY,0.080
2009-12-01,300,600550,TWBB,0.360
2009-12-01,300,600569,AYGT,0.100
...,...,...,...,...
2022-12-08,300,601186,ZGTJ,0.220
2022-12-08,300,2555,SQHY,0.125
2022-12-08,300,600919,JSYH,0.478
2022-12-08,300,600010,BGGF,0.246


**Enable the following cell if you need to regenerate the combined dataset:**

In [78]:
# df = genIDX_all(idx000300_dir, idx_filelist, 'All_IDX_Smprat.csv', force=True)

**Select the stocks that were constituents of index 000300 on every date from 2009-12-01 to 2022-12-08 (keeping only stock codes ≥ 600000):**

In [79]:
# Number of distinct dates present in the combined constituent table.
numOfDays = df.index.nunique()

# A stock qualifies only if it appears on every one of those dates. Count the
# distinct dates per stock rather than the rows per stock: a row count is only
# equivalent when each (stock, date) pair occurs exactly once, which would not
# hold if the raw files overlapped on a date.
dates_per_stock = (
    df.index.to_series()
    .groupby(df["Stkcd"].to_numpy())
    .nunique()
    .rename_axis("Stkcd")
)
stock_mask = dates_per_stock == numOfDays

# Of the always-present stocks, keep only codes at or above this threshold.
MIN_STOCK_CODE = 600000
stock_list = stock_mask.index[stock_mask]
stock_list = stock_list[stock_list >= MIN_STOCK_CODE].values
print(stock_list.size, stock_list, sep='\n')

52
[600000 600009 600010 600015 600016 600019 600028 600029 600030 600031
 600036 600048 600050 600085 600104 600111 600196 600309 600352 600362
 600383 600489 600519 600547 600585 600588 600660 600690 600741 600795
 600837 600900 601006 601009 601088 601111 601166 601169 601186 601318
 601328 601390 601398 601600 601601 601628 601766 601857 601899 601939
 601988 601998]


**Now we have obtained the list of stocks of interest. The next step is to obtain the monthly return rates of these stocks:**

In [80]:
# Monthly trading data, indexed by its second column (parsed as dates).
TRD_df = pd.read_csv(os.path.join(data_dir, "TRD_Mnth.csv"),
                     header=0, index_col=1, parse_dates=True)
# Kept for any later cell that looks codes up by dictionary.
stock_dict = stock_mask.to_dict()
# Keep only rows whose stock code is flagged True in stock_mask. A vectorised
# isin gives the same result as a per-row dictionary lookup without calling a
# Python function for every row.
selected_codes = stock_mask.index[stock_mask]
TRD_df = TRD_df[TRD_df['Stkcd'].isin(selected_codes)]


(184780, 2)


**Store the monthly return rates of our selected stocks:**

In [81]:
# One single-column frame per selected stock: the stock's 'Mretwd' values,
# indexed like TRD_df, with the stock code as the column label.
R_df_list = [
    TRD_df.loc[TRD_df['Stkcd'] == code, 'Mretwd'].to_frame(name=code)
    for code in stock_list
]


In [82]:
# Put the per-stock columns side by side (aligned on the index), then save
# the resulting table next to the other data files.
monthly_returns_path = os.path.join(data_dir, "Monthly_Return_Rates.csv")
R_df = pd.concat(R_df_list, axis="columns")
R_df.to_csv(monthly_returns_path)
R_df

Unnamed: 0_level_0,600000,600009,600010,600015,600016,600019,600028,600029,600030,600031,...,601398,601600,601601,601628,601766,601857,601899,601939,601988,601998
Trdmnt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-12-01,-0.014986,0.139307,-0.027254,0.098143,0.027273,0.182375,0.157765,0.011686,0.088386,0.014609,...,0.040153,-0.006181,0.042735,0.036298,-0.029010,0.025984,-0.036000,0.043845,0.050971,0.270062
2010-01-01,-0.095436,0.064868,-0.131466,-0.132850,-0.085967,-0.215321,-0.189496,0.004950,-0.109852,-0.162184,...,-0.108456,-0.123013,-0.137393,-0.146418,-0.042179,-0.053546,-0.139004,-0.087237,-0.053118,-0.157959
2010-02-01,0.057085,0.031806,0.029777,0.018570,0.013831,0.060686,-0.011384,0.087028,-0.033946,0.060311,...,0.004124,-0.000788,0.074661,-0.002588,0.020183,-0.006116,0.026506,-0.003540,0.009756,0.002886
2010-03-01,0.098361,-0.031870,0.019277,0.168642,0.049113,-0.019900,0.038973,0.140483,0.040264,0.037003,...,0.022587,0.012618,0.136842,0.055597,0.000000,-0.012308,-0.037559,0.007105,0.036232,0.057554
2010-04-01,-0.096137,-0.134377,-0.073286,-0.007800,-0.087126,-0.125635,-0.140665,0.117881,0.018297,-0.176644,...,-0.088353,-0.103583,-0.061852,-0.067065,-0.043165,-0.057632,-0.023171,-0.074074,-0.023310,-0.175510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-01,-0.042936,-0.090300,-0.038298,-0.017275,-0.021505,-0.058140,0.007353,-0.124487,-0.095106,-0.105981,...,-0.022046,-0.016842,-0.102406,-0.082802,-0.009616,-0.018868,-0.042872,-0.022266,0.007797,-0.018817
2022-08-01,0.001377,0.093835,-0.110619,0.003906,-0.010989,-0.059965,0.038930,0.006250,0.023639,-0.075704,...,0.000000,-0.027422,0.038633,0.101543,0.007977,0.026923,-0.014558,-0.008977,-0.006536,0.027523
2022-09-01,-0.031637,0.024105,-0.084577,-0.021401,-0.058333,-0.013133,0.043140,0.031056,-0.107582,-0.118730,...,-0.006849,-0.084257,-0.030520,0.030293,-0.063872,-0.001606,-0.109091,0.000000,0.016447,0.017857
2022-10-01,-0.056818,-0.076151,-0.043478,-0.047714,-0.026549,-0.053466,-0.051282,-0.027109,-0.012055,-0.027378,...,-0.045977,-0.065375,-0.063945,-0.159026,0.012793,-0.066277,0.010204,-0.032609,-0.025890,-0.050439
