# Compiling the Stock Data

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import yfinance as yf
from datetime import timedelta, date

from statsmodels.tsa.arima_model import ARIMA

### Load Raw Data

In [2]:
# Yesterday's calendar date rendered as YYYY-MM-DD (shown as the cell output).
(date.today() - timedelta(days=1)).isoformat()

'2020-12-10'

In [3]:
# Relative locations of the data folders used by this notebook; each one keeps
# its trailing slash because file names are appended by plain concatenation.
raw_path, inter_path, final_path = (
    "../data/raw/",
    "../data/interim/",
    "../data/processed/",
)

In [4]:
# Yahoo Finance tickers to collect. `target` is the ticker whose own
# Open/High/Low/Volume and daily change columns are stored; every other ticker
# contributes only its adjusted close.
stocks = ['GOOGL', 'AMZN', 'CL=F', 'EURUSD=X', 'FB', 'GC=F', 'GOOG', 'NDAQ', 'NFLX', 'QQQ', 'TSLA', 'YELP', '^GSPC', '^N225', '^NYA', '^VIX', 'MMM', 'AMD', 'INTC', 'ICE', 'IBM', 'JNPR', 'KEYS', 'LRCX', 'LDOS', 'MCHP', 'MU', 'MSFT', 'SNE', 'MSI', 'SEKJPY=X', 'HYG', 'JNK']
target = 'GOOGL'

start_date = '2004-08-18'
# The target history stops at a fixed date, while the other assets are requested
# up to yesterday. Their extra rows are discarded when each column is aligned
# to the target's index below.
target_end_date = '2020-04-29'
assets_end_date = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")


def download_prices(ticker, end):
    """Download daily price history for `ticker` from `start_date` up to `end`.

    `auto_adjust=False` is passed explicitly because the code below reads the
    'Adj Close' column, which is only returned when prices are not auto-adjusted.
    """
    return yf.download(
        ticker,
        start=start_date,
        end=end,
        auto_adjust=False,
        progress=False,
    )


# Download the target first so that the frame's index is always the target's
# trading calendar, regardless of where the target sits inside `stocks`.
vals = download_prices(target, target_end_date)

df = pd.DataFrame(index=vals.index)
# NOTE(review): Change is Open minus Adj Close, so 'Gain' is positive on days
# the price FELL from the open. This looks inverted -- confirm it is intended
# before relying on Gain/Loss downstream.
change = vals['Open'] - vals['Adj Close']
df['Change'] = change
# Keep the positive part / magnitude of the negative part; everything else
# (including missing values) becomes 0.
df['Gain'] = change.where(change > 0, 0)
df['Loss'] = (-change).where(change < 0, 0)
df[target] = vals['Adj Close']
df[['Open', 'High', 'Low', 'Volume']] = vals[['Open', 'High', 'Low', 'Volume']]

for stock in stocks:
    if stock == target:
        continue
    # Column assignment aligns on the date index: dates missing for this asset
    # become NaN and dates outside the target's index are dropped.
    df[stock] = download_prices(stock, assets_end_date)['Adj Close']

df.describe()

Unnamed: 0,Change,Gain,Loss,GOOGL,Open,High,Low,Volume,AMZN,CL=F,...,LRCX,LDOS,MCHP,MU,MSFT,SNE,MSI,SEKJPY=X,HYG,JNK
count,3950.0,3950.0,3950.0,3950.0,3950.0,3950.0,3950.0,3950.0,3950.0,3915.0,...,3950.0,3405.0,3950.0,3950.0,3950.0,3950.0,3950.0,3680.0,3286.0,3121.0
mean,0.055919,2.438443,2.382523,512.332451,512.388371,517.08756,507.312864,7026804.0,491.806441,70.877553,...,72.893698,34.950111,39.624068,18.80822,43.146875,31.925301,63.22124,13.732795,60.104731,76.537806
std,7.982372,5.316641,4.881532,359.600161,359.443463,362.50813,356.283275,7947507.0,589.922627,22.320203,...,62.989896,20.327988,24.84115,14.207973,36.285787,12.996656,35.942296,1.946092,13.607086,16.881357
min,-64.939941,0.0,0.0,50.055054,49.644646,50.920921,48.028027,520600.0,26.07,-37.630001,...,13.465309,13.978044,11.357085,1.69,11.608222,9.132853,10.454931,10.428,29.304523,34.071911
25%,-2.907913,0.0,0.0,238.861359,240.075073,242.142139,237.053299,1903900.0,77.285,52.820002,...,33.952774,23.006577,21.06758,8.2525,20.292284,21.233859,39.111342,12.10275,47.789689,64.792046
50%,0.040039,0.040039,0.0,332.082077,331.611603,335.310303,327.960449,4319500.0,226.535004,66.459999,...,44.413082,25.700678,28.586155,12.695,24.56932,30.345119,56.919054,13.26515,62.649965,79.95816
75%,2.787804,2.787804,2.907913,761.212479,762.335007,767.167496,755.837494,8878900.0,708.249985,90.470001,...,76.074703,46.178448,46.006148,27.790001,50.029731,40.866085,73.529991,15.12985,71.236797,90.127342
max,62.690063,62.690063,64.939941,1524.869995,1527.199951,1530.73999,1520.97998,82151100.0,2410.219971,145.179993,...,336.03476,121.838882,109.674797,62.619999,186.721939,72.725372,183.74176,18.433001,84.716263,105.290581


In [5]:
# Save the assembled (not yet interpolated) frame as a comma-separated file
# in the interim data folder.
df.to_csv(path_or_buf=inter_path + 'correlated_assets.csv')

In [6]:
# Write one CSV per interpolation scheme: spline orders 1-5, then pandas'
# default (linear) interpolation. Values still missing after interpolation are
# back-filled from the next available row.
# NOTE(review): back-filling copies later observations into earlier rows --
# confirm this is acceptable for columns whose data starts later than the index.
for i in range(1, 6):
    print(i)
    (
        df.interpolate(method='spline', order=i)
        .bfill()  # replaces the deprecated fillna(method='bfill')
        .to_csv(inter_path + f'correlated_assets_interpolated_order{i}.csv', sep=',')
    )
print('linear')
df.interpolate().bfill().to_csv(inter_path + 'correlated_assets_interpolated_linear.csv', sep=',')

1
2


The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.


3
4
5
linear
