In [15]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import os
import typing
import glob
import matplotlib.pyplot as plt
import seaborn
import yfinance as yf

In [41]:
# Directory that get_files() is pointed at to find the per-symbol price files.
data_dir = 'data/kaggle'
# Training window bounds; passed to get_df() and fetch_spy() further down.
start_train = '2010-01-01'
end_train = '2015-12-31'
# Test window bounds; get_df() uses end_test when checking how far a file's
# history must extend.
start_test = '2016-01-01'
end_test = '2017-01-01'
# Passed as random_state to the PCA estimators below.
seed = 2024

In [60]:
def get_files(path: str, extension: str) -> typing.Dict[str, str]:
    """Map the base name of every file in `path` ending in `extension` to its path.

    Args:
        path: Directory to search (non-recursive).
        extension: Suffix appended to the ``*`` wildcard, e.g. ``'.txt'``.

    Returns:
        Dict of ``{basename: path as returned by glob}``; empty if nothing matches.
    """
    pattern = f"{path}/*{extension}"
    by_name = {}
    for match in glob.glob(pattern):
        by_name[os.path.basename(match)] = match
    return by_name


def get_df(start_train: str, end_train: str, start_test: str, end_test: str, files: typing.Dict[str, str], log: bool) -> typing.Tuple[typing.List[pd.Series], typing.List[str], typing.List[str]]:
    """Load per-symbol price CSVs and compute their returns over the training window.

    Each file must have ``Date`` and ``Close`` columns. A file is used only if
    its first row is dated on or before ``start_train`` and its last row on or
    after ``end_test``; otherwise it is recorded as skipped.

    Args:
        start_train: First date of the training window (inclusive).
        end_train: Last date of the training window (inclusive).
        start_test: Unused by this function; kept so existing callers keep working.
        end_test: Date the file's history must reach to be accepted.
        files: Mapping of symbol label -> CSV path. The label becomes the
            name of the returned Series.
        log: If True compute log returns ``log(C_t / C_{t-1})``, otherwise
            simple returns via ``pct_change``.

    Returns:
        A tuple ``(df_list, err_files, skipped_files)``:
        ``df_list`` holds one date-indexed return Series per accepted file
        (NaNs dropped), ``err_files`` the labels of files with no data, and
        ``skipped_files`` the labels of files lacking the required coverage.
    """
    # Compare real timestamps: the original compared raw strings, which is
    # only chronological for zero-padded ISO dates.
    coverage_start = pd.Timestamp(start_train)
    coverage_end = pd.Timestamp(end_test)

    df_list = []
    err_files = []
    skipped_files = []
    for symbol, file in files.items():
        try:
            df = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            err_files.append(symbol)
            print(symbol)
            continue
        if df.empty:
            # Header-only file: report it like an empty file instead of
            # crashing on .iloc[0] below.
            err_files.append(symbol)
            print(symbol)
            continue

        df['Date'] = pd.to_datetime(df['Date'])
        # Relies on rows being in chronological order (first/last row = range).
        if df['Date'].iloc[0] > coverage_start or df['Date'].iloc[-1] < coverage_end:
            skipped_files.append(symbol)
            continue

        close = df.set_index('Date')['Close']
        if log:
            returns = np.log(close / close.shift(1))
        else:
            returns = close.pct_change()
        # String bounds on a DatetimeIndex select whole days, both ends inclusive.
        returns = returns.rename(symbol).loc[start_train:end_train].dropna()
        df_list.append(returns)
    return df_list, err_files, skipped_files

def combine_df(df_list: typing.List[pd.core.series.Series], threshold: float) -> pd.DataFrame:
    """Join per-symbol return Series into one gap-free DataFrame.

    Steps: align all Series column-wise on their index, drop columns with
    fewer than ``int(n_rows * threshold)`` non-missing values, forward-fill
    the remaining gaps, then drop rows that still contain NaN (leading rows
    that forward-fill cannot reach).

    Args:
        df_list: Series to combine; each Series' name becomes a column label.
        threshold: Fraction of rows (0-1) that must be non-missing for a
            column to be kept.

    Returns:
        A new DataFrame with no missing values. An empty DataFrame is
        returned when ``df_list`` is empty.
    """
    if not df_list:
        # pd.concat raises on an empty list; an empty frame is easier for callers.
        return pd.DataFrame()

    combined = pd.concat(df_list, axis=1)
    min_observations = int(len(combined) * threshold)
    return (
        combined
        .dropna(axis=1, thresh=min_observations)
        # .ffill() replaces the deprecated fillna(method='ffill').
        .ffill(axis=0)
        .dropna(axis=0)
    )

def scale_and_pca(df: pd.DataFrame, n_components: float, seed: int):
    """Fit a StandardScaler -> PCA pipeline on `df` and project the data.

    Args:
        df: Input frame; its underlying array (``df.values``) is what gets fitted.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.
        seed: Forwarded to ``PCA(random_state=...)``.

    Returns:
        ``(pipeline, transformed_df)``: the fitted Pipeline (steps named
        ``'scaler'`` and ``'pca'``) and the array returned by ``fit_transform``.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components, random_state=seed)),
    ]
    fitted = Pipeline(steps)
    projected = fitted.fit_transform(df.values)
    return fitted, projected

def fetch_spy(start, end):
    """Download "^GSPC" prices with yfinance and return their daily log returns.

    Note that, despite the function name, the ticker requested is ``^GSPC``.

    Args:
        start: Passed to ``yf.download`` as ``start``.
        end: Passed to ``yf.download`` as ``end``.

    Returns:
        The ``'SP500'`` column of the downloaded frame: ``log(Close_t / Close_{t-1})``
        with missing values (e.g. the first row, which has no predecessor) removed.
    """
    prices = yf.download("^GSPC", start=start, end=end)
    close = prices['Close']
    prices['SP500'] = np.log(close / close.shift(1))
    log_returns = prices['SP500']
    log_returns.dropna(axis=0, inplace=True)
    return log_returns




In [64]:
# Benchmark equity curve: exponentiating the cumulative sum of log returns
# gives the cumulative gross return, scaled here to a 100000 starting balance.
np.exp(fetch_spy(start_train, end_train).cumsum()) * 100000

[*********************100%%**********************]  1 of 1 completed


Date
2010-01-05    100311.567563
2010-01-06    100366.289592
2010-01-07    100767.875378
2010-01-08    101058.260915
2010-01-11    101234.784981
                  ...      
2015-12-23    182198.435719
2015-12-24    181907.166700
2015-12-28    181510.871034
2015-12-29    183440.288558
2015-12-30    182116.358062
Name: SP500, Length: 1508, dtype: float64

In [6]:
# Load log returns for every '.txt' file under data_dir. Despite its name,
# `df` is a list of per-symbol Series (the first item get_df returns).
df, err_files, skipped_files = get_df(
    start_train,
    end_train,
    start_test,
    end_test,
    get_files(data_dir, '.txt'),
    log=True,
)

accp.us.txt
amrh.us.txt
amrhw.us.txt
asns.us.txt
bbrx.us.txt
bolt.us.txt
boxl.us.txt
bxg.us.txt
ehr.us.txt
fmax.us.txt
gnst.us.txt
hayu.us.txt
jt.us.txt
mapi.us.txt
molc.us.txt
otg.us.txt
pbio.us.txt
pxus.us.txt
rbio.us.txt
sail.us.txt
sbt.us.txt
scci.us.txt
scph.us.txt
send.us.txt
sfix.us.txt
srva.us.txt
stnl.us.txt
vist.us.txt
vmet.us.txt
wnfm.us.txt
wspt.us.txt
znwaa.us.txt


In [10]:
# Build the return matrix, keeping only symbols that have data for at least
# 75% of the rows (combine_df's column threshold).
df_combine = combine_df(df, threshold=0.75)

  df.fillna(method='ffill', axis = 0, inplace = True)


In [12]:
# Inspect the combined return matrix (one column per symbol file, indexed by Date).
df_combine

Unnamed: 0_level_0,a.us.txt,aa.us.txt,aaba.us.txt,aame.us.txt,aan.us.txt,aaon.us.txt,aap.us.txt,aapl.us.txt,aau.us.txt,aav.us.txt,...,zbra.us.txt,zeus.us.txt,zf.us.txt,zion.us.txt,ziop.us.txt,zixi.us.txt,zn.us.txt,znh.us.txt,ztr.us.txt,zumz.us.txt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-11-24,0.025349,0.013949,0.013497,0.031775,0.022083,0.024671,0.010966,0.019462,0.276490,0.013524,...,0.009407,0.022668,0.012478,-0.005114,0.028848,0.029141,0.012579,0.040262,-0.021652,0.007976
2010-11-26,0.000285,-0.010780,-0.011646,0.030744,-0.016177,0.000376,0.001637,0.000620,-0.005208,-0.012012,...,-0.001606,-0.021770,0.003065,0.002507,0.018780,0.000000,-0.012579,0.001413,-0.002837,-0.007643
2010-11-29,-0.019465,0.009231,0.009816,-0.030744,-0.011950,-0.028679,-0.002973,0.005883,-0.010499,0.004521,...,-0.010776,-0.015421,0.000000,-0.012868,-0.009346,-0.013141,0.012579,0.014620,-0.005249,0.021123
2010-11-30,-0.017819,-0.012911,-0.035418,-0.010498,-0.002142,0.020839,-0.013872,-0.018129,0.162783,-0.010582,...,-0.013086,-0.018004,-0.008818,0.010254,-0.035846,0.015748,0.008299,0.016041,0.001666,0.038127
2010-12-01,0.031029,0.033843,0.021277,0.015625,0.018962,0.039125,0.002430,0.016699,0.008929,0.015083,...,0.027607,0.024553,0.020942,0.021349,0.024040,-0.034440,0.004124,0.005561,0.014712,-0.007573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-24,0.008806,-0.006981,-0.009918,0.002131,-0.000439,0.020107,0.000471,-0.005274,-0.005988,-0.008081,...,-0.002114,-0.016612,-0.004435,0.003997,-0.004751,0.001910,0.005602,-0.022421,-0.003574,-0.022771
2015-12-28,-0.008561,-0.013974,-0.015065,-0.004246,-0.016282,-0.006815,0.009439,-0.011410,0.077962,0.002026,...,-0.011067,-0.048450,-0.007783,-0.005100,-0.038840,-0.001910,0.000000,0.003364,-0.002713,-0.012231
2015-12-29,0.013755,0.014961,0.013010,0.053634,0.014965,0.000000,0.010933,0.017831,-0.028171,0.029912,...,0.002565,0.050240,0.008816,0.006197,0.030472,0.003817,0.011111,-0.002071,0.011830,0.038124
2015-12-30,-0.004467,-0.017012,-0.019879,-0.022356,-0.015411,0.006388,-0.005286,-0.013077,-0.043803,-0.031939,...,-0.011738,0.012175,0.014455,-0.012693,0.017847,-0.013423,0.000000,-0.005779,0.005973,-0.032312


In [16]:
# The underlying NumPy array; this is what the scaler/PCA cells below are fitted on.
df_combine.values

array([[ 0.02534917,  0.01394874,  0.01349714, ...,  0.04026237,
        -0.02165184,  0.00797611],
       [ 0.00028532, -0.01077963, -0.01164586, ...,  0.00141319,
        -0.00283709, -0.0076425 ],
       [-0.01946523,  0.00923134,  0.00981603, ...,  0.01461975,
        -0.0052489 ,  0.0211229 ],
       ...,
       [ 0.01375484,  0.01496107,  0.01301024, ..., -0.00207115,
         0.01182995,  0.03812406],
       [-0.00446715, -0.01701245, -0.01987901, ..., -0.0057788 ,
         0.0059728 , -0.03231197],
       [-0.0058738 , -0.00884617, -0.00330182, ..., -0.00024377,
         0.00334344, -0.02675527]])

In [48]:
# Standardise the return matrix and project it onto 15 principal components,
# using the scale_and_pca helper defined earlier (same scaler -> PCA pipeline).
pipeline, df_pipe = scale_and_pca(df_combine, n_components=15, seed=seed)


In [53]:
# First row of the fitted PCA step's components_, i.e. the weights of the
# first principal axis over the input columns.
# NOTE(review): the recorded output below looks like a shape tuple rather than
# the array itself, presumably left over from a `.shape` version of this cell; confirm.
pipeline.named_steps['pca'].components_[0]

(3414,)

In [44]:
# Same scale -> PCA steps done by hand (without a Pipeline), here with 2
# components, so the result can be compared against the pipeline output.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df_combine.values)
pca = PCA(n_components=2, random_state=seed)
df_pca = pca.fit_transform(scaled_df)

In [45]:
# Projection of each row of the scaled return matrix onto the 2 fitted components.
df_pca

array([[ 49.32461774,  -5.82275075],
       [-16.4862511 ,   9.79006283],
       [ -4.14358916,  -2.91483092],
       ...,
       [ 22.43809992,  -4.9147267 ],
       [-23.19481   ,   5.36836876],
       [-21.06213646,  13.55110456]])

In [46]:
# Check that the Pipeline and the manual scale -> PCA route agree.
# In file order df_pipe has 15 columns and df_pca has 2, so comparing them
# directly raises a broadcasting error on a fresh top-to-bottom run. Each array
# is therefore trimmed to the other's column count (i.e. the shared leading
# components). With equal n_components this is the original comparison.
# NOTE(review): agreement of leading components across different n_components
# depends on the SVD solver PCA selects; confirm the intended n_components.
are_same = np.allclose(df_pipe[:, :df_pca.shape[1]], df_pca[:, :df_pipe.shape[1]])

In [47]:
# Display the result of the pipeline-vs-manual comparison.
are_same

True