# Compiling the Stock Data

### Import Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import yfinance as yf
from datetime import timedelta, date

from statsmodels.tsa.arima_model import ARIMA

### Load Raw Data

In [13]:
# Yesterday's date rendered as a "YYYY-MM-DD" string.
(date.today() - timedelta(days=1)).isoformat()

'2020-10-28'

In [2]:
# Relative locations of the project's data directories (trailing slash kept
# so file names can be appended by plain string concatenation).
raw_path, inter_path, final_path = (
    "../data/raw/",
    "../data/interim/",
    "../data/processed/",
)

In [3]:
# Tickers to pull from Yahoo Finance. `target` gets extra derived columns
# (Change/Gain/Loss); every other ticker contributes only its 'Adj Close'.
stocks = ['GOOGL', 'AMZN', 'CL=F', 'EURUSD=X', 'FB', 'GC=F', 'GOOG', 'NDAQ', 'NFLX', 'QQQ', 'TSLA', 'YELP', '^GSPC', '^N225', '^NYA', '^VIX', 'MMM', 'AMD', 'INTC', 'ICE', 'IBM', 'JNPR', 'KEYS', 'LRCX', 'LDOS', 'MCHP', 'MU', 'MSFT', 'SNE', 'MSI', 'SEKJPY=X', 'HYG', 'JNK']
target = 'GOOGL'

# Download window. The target uses a fixed end date; the other tickers are
# requested up to yesterday (computed once instead of on every iteration).
start_date = '2004-08-18'
target_end = '2020-04-29'
feature_end = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")

# Fetch the target first, explicitly, so that its dates define the frame's
# index regardless of where (or whether) it appears in `stocks`.
vals = yf.download(
    target,
    start=start_date,
    end=target_end,
    progress=False,
)

df = pd.DataFrame(index=vals.index)

# NOTE(review): Change is Open minus Adj Close, so a positive value (stored
# under 'Gain') means the adjusted close was BELOW the open. Confirm this sign
# convention, and the mix of 'Open' with 'Adj Close', is intended.
change = vals['Open'] - vals['Adj Close']
df['Change'] = change
# Vectorized split of Change into its positive part and the magnitude of its
# negative part; rows failing the comparison (including NaN) become 0.
df['Gain'] = change.where(change > 0, 0)
df['Loss'] = change.abs().where(change < 0, 0)
df[target] = vals['Adj Close']

# Column assignment aligns on df's index, so any dates a ticker has outside
# the target's dates are dropped and dates it lacks become NaN.
for stock in stocks:
    if stock == target:
        continue
    df[stock] = yf.download(
        stock,
        start=start_date,
        end=feature_end,
        progress=False,
    )['Adj Close']

df.describe()

Unnamed: 0,Change,Gain,Loss,GOOGL,AMZN,CL=F,EURUSD=X,FB,GC=F,GOOG,...,IBM,JNPR,KEYS,LRCX,LDOS,MCHP,MU,MSFT,SNE,MSI
count,3950.0,3950.0,3950.0,3950.0,3950.0,3915.0,3921.0,1998.0,3910.0,3950.0,...,3950.0,3950.0,1390.0,3950.0,3405.0,3950.0,3950.0,3950.0,3950.0,3950.0
mean,0.055919,2.438443,2.382523,512.332451,491.806441,70.895571,1.265579,112.881251,1130.850409,506.832962,...,108.952062,22.188067,51.600446,73.654731,35.675903,40.044446,18.80822,43.485181,32.065585,64.289802
std,7.982372,5.316641,4.881532,359.600161,589.922627,22.26802,0.123882,58.184759,365.053745,356.167944,...,33.79854,4.687278,24.481605,63.647531,20.207088,25.104693,14.207973,36.570297,13.053942,36.253195
min,-64.939941,0.0,0.0,50.055054,26.07,-2.72,1.039047,17.73,397.399994,49.818268,...,47.92794,10.82263,21.18,13.60589,14.604857,11.477574,1.69,11.69924,9.167341,10.548212
25%,-2.907913,0.0,0.0,238.861359,77.285,52.820002,1.143851,65.310001,882.275009,237.731426,...,75.623285,19.014679,32.66,34.307253,23.798365,21.291088,8.2525,20.451392,21.314043,39.470354
50%,0.040039,0.040039,0.0,332.082077,226.535004,66.459999,1.276699,114.494999,1224.049988,330.511169,...,122.40852,22.204136,40.095001,44.876772,26.839727,28.889428,12.695,24.761961,30.480731,57.76589
75%,2.787804,2.787804,2.907913,761.212479,708.249985,90.470001,1.352997,168.805,1337.499969,742.594986,...,136.929794,25.288671,64.944998,76.868959,46.516842,46.494232,27.790001,50.422002,41.107726,75.808786
max,62.690063,62.690063,64.939941,1524.869995,2410.219971,145.179993,1.598798,223.229996,1888.699951,1526.689941,...,163.324295,39.440506,109.080002,339.54306,122.73172,110.838348,62.619999,188.185989,73.0,185.381104


In [4]:
# Persist the combined, un-interpolated frame to the interim data directory.
raw_assets_file = inter_path + 'correlated_assets.csv'
df.to_csv(raw_assets_file, sep=',')

In [6]:
# Write one gap-filled copy of `df` per interpolation scheme: spline orders
# 1 through 5, then pandas' default (linear) interpolation. After
# interpolating, `.bfill()` back-fills whatever is still missing (e.g. rows
# before a column's first observation). `.bfill()` replaces the deprecated
# `fillna(method='bfill')` spelling and behaves the same.
for i in range(1, 6):
    print(i)  # progress marker: spline order currently being written
    spline_filled = df.interpolate(method='spline', order=i).bfill()
    spline_filled.to_csv(
        inter_path + f'correlated_assets_interpolated_order{i}.csv',
        sep=',',
    )
print('linear')
linear_filled = df.interpolate().bfill()
linear_filled.to_csv(
    inter_path + 'correlated_assets_interpolated_linear.csv',
    sep=',',
)

1
2
3
4
5
linear
