In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import math
import sklearn
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read both CSV inputs; the paths are relative to the notebook's working directory.
dataset, fourday = (
    pd.read_csv(csv_path)
    for csv_path in ('data/all_exchanges_data_cleaned.csv', 'data/fourday.csv')
)

In [3]:
# Summary statistics (count/mean/std/quantiles) for the columns of the full dataset.
dataset.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Percent Change
count,19958.0,19958.0,19958.0,19958.0,19958.0,19958.0,19958.0
mean,12359.002648,12432.552746,12275.562052,12356.404874,12356.404874,446385700.0,0.018235
std,8513.927306,8558.398208,8460.504323,8510.861717,8510.861717,722900800.0,1.368694
min,427.600006,436.519989,419.48999,419.950012,419.950012,0.0,-12.926546
25%,1085.022461,1090.065003,1081.955017,1086.654968,1086.654968,134700.0,-0.597822
50%,11933.995117,12011.040039,11834.734863,11932.549805,11932.549805,204860400.0,0.049819
75%,19283.262696,19401.442383,19161.534668,19287.947266,19287.947266,360370600.0,0.66537
max,33335.480469,33484.078125,32897.039063,33154.121094,33154.121094,9799120000.0,14.347069


In [4]:
# Inspect which columns fourday carries before trimming it down.
fourday.columns

Index(['index', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close',
       'Percent Change', 'Index'],
      dtype='object')

In [5]:
# Remove the leftover 'index' column and 'Date'. The columns are named via the
# `columns=` keyword: passing the axis positionally (`drop(labels, 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
# Rebinding the name (instead of inplace=True) has the same effect on `fourday`.
fourday = fourday.drop(columns=['index', 'Date'])

In [6]:
# Carve fourday into one frame per exchange, selecting rows by their 'Index' label.
DJI = fourday[fourday['Index'].eq('Dow Jones Industrial')]
N100 = fourday[fourday['Index'].eq('Euronext100')]
HSI = fourday[fourday['Index'].eq('Hang Seng')]
N225 = fourday[fourday['Index'].eq('Nikkei 225')]

In [7]:
# Renumber every exchange's rows from 0 and discard the old row labels, so the
# four frames share the same positional index (the later merges join on it).
DJI, HSI, N100, N225 = (
    exchange.reset_index(drop=True) for exchange in (DJI, HSI, N100, N225)
)

In [8]:
# Each per-exchange frame was filtered on a single 'Index' value, so the column
# is constant within it and can go. The column is named via `columns=` because
# the positional axis form (`drop('Index', 1)`) was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards. Rebinding replaces inplace=True.
DJI = DJI.drop(columns='Index')
HSI = HSI.drop(columns='Index')
N100 = N100.drop(columns='Index')
N225 = N225.drop(columns='Index')

In [9]:
# Display the Dow Jones frame to confirm only the price/percent-change columns remain.
DJI

Unnamed: 0,Open,High,Low,Close,Adj Close,Percent Change
0,10521.070313,10566.580078,10350.889648,10413.790039,10413.790039,-1.050990
1,10424.139648,10460.400391,10023.490234,10034.580078,10034.580078,-3.641421
2,10031.620117,10208.089844,10014.240234,10192.179688,10192.179688,1.570565
3,10184.780273,10272.089844,10177.759766,10238.799805,10238.799805,0.457411
4,10242.870117,10293.919922,10026.450195,10089.709961,10089.709961,-1.456126
...,...,...,...,...,...,...
4510,27560.240234,27605.599609,27338.089844,27452.660156,27452.660156,-0.476363
4511,27514.640625,28026.330078,27511.060547,27781.699219,27781.699219,1.198569
4512,27825.419922,28162.640625,27825.419922,28148.640625,28148.640625,1.682741
4513,28214.240234,28354.480469,27728.029297,27772.759766,27772.759766,-1.335343


In [10]:
# Check the dtype of every remaining Dow Jones column.
DJI.dtypes

Open              float64
High              float64
Low               float64
Close             float64
Adj Close         float64
Percent Change    float64
dtype: object

In [11]:
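# NOTE: left disabled - 'Date' is dropped from fourday before the per-exchange
# frames are built, so DJI/N100/HSI/N225 have no 'Date' column to parse.
#DJI['Date'] = pd.to_datetime(DJI['Date'], format='%Y-%m-%d')
#N100['Date'] = pd.to_datetime(N100['Date'], format='%Y-%m-%d')
#HSI['Date'] = pd.to_datetime(HSI['Date'], format='%Y-%m-%d')
#N225['Date'] = pd.to_datetime(N225['Date'], format='%Y-%m-%d')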

In [12]:
# Print the (rows, columns) shape of each exchange frame on a single line.
print(*(exchange.shape for exchange in (DJI, HSI, N100, N225)))

(4515, 6) (4515, 6) (4515, 6) (4515, 6)


In [13]:
###
###
###
### Merge the four exchange frames, suffixing column names so each exchange stays distinguishable
###
###
###

In [14]:
# Despite the name, this is a plain list of the four per-exchange frames, not a
# merged DataFrame. Later cells address it by position:
# 0 = Dow Jones, 1 = Euronext 100, 2 = Hang Seng, 3 = Nikkei 225.
Merged_df = [DJI, N100, HSI, N225]

In [15]:
# Column names of the Dow Jones frame, as they look before any suffixing.
DJI.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Percent Change'], dtype='object')

In [16]:
# Position 2 of the list holds the Hang Seng frame.
Merged_df[2]

Unnamed: 0,Open,High,Low,Close,Adj Close,Percent Change
0,15376.620117,15376.620117,15073.950195,15127.000000,15127.000000,-2.745965
1,15071.919922,15244.650391,14883.320313,15074.799805,15074.799805,-0.345080
2,14679.990234,14786.040039,14494.379883,14680.500000,14680.500000,-2.615622
3,15185.679688,15284.650391,14956.879883,14973.400391,14973.400391,1.995166
4,15081.759766,15134.530273,14794.809570,14873.429688,14873.429688,-0.667655
...,...,...,...,...,...,...
4510,23584.609375,23601.400391,23256.919922,23275.529297,23275.529297,-0.854153
4511,23548.890625,23780.869141,23368.490234,23459.050781,23459.050781,0.788474
4512,24039.390625,24039.390625,23674.519531,23767.779297,23767.779297,1.316032
4513,23895.210938,24005.029297,23842.250000,23980.650391,23980.650391,0.895629


In [17]:
# Join Dow Jones (left) and Euronext (right) row-by-row on the positional index.
# Every column name collides between the two frames, so the suffixes tag each side.
# NOTE(review): joining on the reset index assumes row i is the same trading day
# in every exchange frame - confirm against how fourday.csv was built.
All_exchanges_mg = pd.merge(
    Merged_df[0],
    Merged_df[1],
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_Dow', '_Eur'),
)

In [18]:
# Tag every Hang Seng column with '_HSI' in a single call. The previous
# per-column rename loop rebuilt the frame once per column and left a stray
# `column` variable in the notebook namespace.
# NOTE: not idempotent - re-running this cell appends the suffix again.
Merged_df[2] = Merged_df[2].add_suffix('_HSI')

In [19]:
# Tag every Nikkei 225 column with '_NIK' in a single call. The previous
# per-column rename loop rebuilt the frame once per column and left a stray
# `column` variable in the notebook namespace.
# NOTE: not idempotent - re-running this cell appends the suffix again.
Merged_df[3] = Merged_df[3].add_suffix('_NIK')

In [20]:
# Append the Hang Seng frame (list position 2), matching rows on the index.
All_exchanges_mg = pd.merge(
    All_exchanges_mg,
    Merged_df[2],
    how='inner',
    left_index=True,
    right_index=True,
)

In [21]:
# Append the Nikkei 225 frame (list position 3), matching rows on the index.
All_exchanges_mg = pd.merge(
    All_exchanges_mg,
    Merged_df[3],
    how='inner',
    left_index=True,
    right_index=True,
)

In [22]:
# Verify that every column of the merged frame carries its exchange suffix.
All_exchanges_mg.columns

Index(['Open_Dow', 'High_Dow', 'Low_Dow', 'Close_Dow', 'Adj Close_Dow',
       'Percent Change_Dow', 'Open_Eur', 'High_Eur', 'Low_Eur', 'Close_Eur',
       'Adj Close_Eur', 'Percent Change_Eur', 'Open_HSI', 'High_HSI',
       'Low_HSI', 'Close_HSI', 'Adj Close_HSI', 'Percent Change_HSI',
       'Open_NIK', 'High_NIK', 'Low_NIK', 'Close_NIK', 'Adj Close_NIK',
       'Percent Change_NIK'],
      dtype='object')

In [23]:
# Shape of the merged frame (rows, columns).
All_exchanges_mg.shape

(4515, 24)

In [24]:
# Column names of a single exchange frame and how many there are; the count is
# the width of each exchange's slice in the merged frame.
print(DJI.columns, len(DJI.columns))

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Percent Change'], dtype='object') 6


In [25]:
### Trading hours, all in Greenwich Mean Time (GMT)

### Both a day ahead
### Japan Exchange Group / N225 =  12am to 6am
### Hong Kong / Hang Seng =        1:30am to 8am

### Both lag a day behind locally
### Euronext / N100 =              8am to 4:30pm
### NYSE / DJI =                   2:30pm to 9pm

### Observations
### Japan and Hong Kong close before Euronext opens on the SAME DAY
### Hong Kong closes RIGHT as Euronext opens on the SAME DAY
### New York and Euronext close before Japan and Hong Kong open, i.e. THE DAY BEFORE
### NYSE is roughly mid-session when Euronext closes

### For the test case, leave NYSE out of the predictors in the train/test split and begin running models on that

In [26]:
# Time-ordered cross-validation splitter configured for 10 splits.
# NOTE(review): 10 is a hard-coded choice - consider a named constant in a config cell.
tscv = TimeSeriesSplit(n_splits=10)

In [27]:
# Preview the first six columns of the merged frame (the '_Dow' columns).
All_exchanges_mg[All_exchanges_mg.columns[:6]]

Unnamed: 0,Open_Dow,High_Dow,Low_Dow,Close_Dow,Adj Close_Dow,Percent Change_Dow
0,10521.070313,10566.580078,10350.889648,10413.790039,10413.790039,-1.050990
1,10424.139648,10460.400391,10023.490234,10034.580078,10034.580078,-3.641421
2,10031.620117,10208.089844,10014.240234,10192.179688,10192.179688,1.570565
3,10184.780273,10272.089844,10177.759766,10238.799805,10238.799805,0.457411
4,10242.870117,10293.919922,10026.450195,10089.709961,10089.709961,-1.456126
...,...,...,...,...,...,...
4510,27560.240234,27605.599609,27338.089844,27452.660156,27452.660156,-0.476363
4511,27514.640625,28026.330078,27511.060547,27781.699219,27781.699219,1.198569
4512,27825.419922,28162.640625,27825.419922,28148.640625,28148.640625,1.682741
4513,28214.240234,28354.480469,27728.029297,27772.759766,27772.759766,-1.335343


In [28]:
# Targets: the first six columns of the merged frame, i.e. the '_Dow' columns.
y = All_exchanges_mg.iloc[:, :6]

In [29]:
# Features: everything after the six Dow Jones columns
# (the '_Eur', '_HSI' and '_NIK' columns).
X = All_exchanges_mg.iloc[:, 6:]

In [30]:
# Print the feature and target shapes; the row counts must agree.
print(*(frame.shape for frame in (X, y)))

(4515, 18) (4515, 6)


In [31]:
# Show the splitter's configuration, including the defaults that were not set explicitly.
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)


In [32]:
# split() returns a lazy generator, so this prints only the generator object's
# repr, not the train/test indices; iterate over it to see the actual folds.
print(tscv.split(X))

<generator object TimeSeriesSplit.split at 0x7f51fc60d120>


In [33]:
# Keep only the final fold produced by the splitter: earlier folds are skipped
# by the starred unpacking, which leaves the same train/test indices that a
# loop overwriting its variables on every iteration would end with.
*_, (train_index, test_index) = tscv.split(X)
X_train, X_test = X.values[train_index], X.values[test_index]
y_train, y_test = y.values[train_index], y.values[test_index]

In [34]:
# Print the shapes of the final fold's train and test arrays.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(4105, 18) (4105, 6) (410, 18) (410, 6)
