In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import math
import sklearn
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV

In [2]:
dataset = pd.read_csv('data/all_exchanges_data_cleaned.csv')
fourday = pd.read_csv('data/fourday.csv')

In [3]:
dataset.info

<bound method DataFrame.info of              Date          Open          High           Low         Close  \
0      2000-10-10  10569.169922  10623.549805  10488.889648  10524.400391   
1      2000-10-10   1036.500000   1036.500000   1036.500000   1036.500000   
2      2000-10-10  15739.389648  15739.389648  15434.740234  15554.110352   
3      2000-10-11  10521.070313  10566.580078  10350.889648  10413.790039   
4      2000-10-11   1009.489990   1009.489990   1009.489990   1009.489990   
...           ...           ...           ...           ...           ...   
19953  2020-10-07  27971.359375  28369.660156  27971.359375  28303.460938   
19954  2020-10-07    979.710022    983.119995    974.849976    978.059998   
19955  2020-10-07  23999.789063  24243.910156  23905.419922  24242.859375   
19956  2020-10-07  23272.449219  23432.730469  23272.449219  23422.820313   
19957  2020-10-08  23272.449219  23701.769531  23477.730469  23647.070313   

          Adj Close        Volume  Percent 

In [4]:
dataset.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Percent Change
count,19958.0,19958.0,19958.0,19958.0,19958.0,19958.0,19958.0
mean,12359.002648,12432.552746,12275.562052,12356.404874,12356.404874,446385700.0,0.018235
std,8513.927306,8558.398208,8460.504323,8510.861717,8510.861717,722900800.0,1.368694
min,427.600006,436.519989,419.48999,419.950012,419.950012,0.0,-12.926546
25%,1085.022461,1090.065003,1081.955017,1086.654968,1086.654968,134700.0,-0.597822
50%,11933.995117,12011.040039,11834.734863,11932.549805,11932.549805,204860400.0,0.049819
75%,19283.262696,19401.442383,19161.534668,19287.947266,19287.947266,360370600.0,0.66537
max,33335.480469,33484.078125,32897.039063,33154.121094,33154.121094,9799120000.0,14.347069


In [5]:
fourday.columns

Index(['index', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close',
       'Percent Change', 'Index'],
      dtype='object')

In [6]:
fourday.drop('index', 1, inplace=True)

In [7]:
DJI = fourday.loc[fourday['Index'] == 'Dow Jones Industrial']
N100 = fourday.loc[fourday['Index'] == 'Euronext100']
HSI = fourday.loc[fourday['Index'] == 'Hang Seng']
N225 = fourday.loc[fourday['Index'] == 'Nikkei 225']

In [8]:
DJI = DJI.reset_index(drop=True)
HSI = HSI.reset_index(drop=True)
N100 = N100.reset_index(drop=True)
N225 = N225.reset_index(drop=True)

In [9]:
print(DJI.shape, HSI.shape, N100.shape, N225.shape)

(4515, 8) (4515, 8) (4515, 8) (4515, 8)


In [10]:
###
###
###
###MERGE WITH DIFFERENTIATING COLUMN NAMES
###
###
###

In [11]:
Merged_df = [DJI, N100, HSI, N225]

In [12]:
DJI.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Percent Change',
       'Index'],
      dtype='object')

In [13]:
Merged_df[2]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Percent Change,Index
0,2000-10-11,15376.620117,15376.620117,15073.950195,15127.000000,15127.000000,-2.745965,Hang Seng
1,2000-10-12,15071.919922,15244.650391,14883.320313,15074.799805,15074.799805,-0.345080,Hang Seng
2,2000-10-13,14679.990234,14786.040039,14494.379883,14680.500000,14680.500000,-2.615622,Hang Seng
3,2000-10-16,15185.679688,15284.650391,14956.879883,14973.400391,14973.400391,1.995166,Hang Seng
4,2000-10-17,15081.759766,15134.530273,14794.809570,14873.429688,14873.429688,-0.667655,Hang Seng
...,...,...,...,...,...,...,...,...
4510,2020-09-29,23584.609375,23601.400391,23256.919922,23275.529297,23275.529297,-0.854153,Hang Seng
4511,2020-09-30,23548.890625,23780.869141,23368.490234,23459.050781,23459.050781,0.788474,Hang Seng
4512,2020-10-05,24039.390625,24039.390625,23674.519531,23767.779297,23767.779297,1.316032,Hang Seng
4513,2020-10-06,23895.210938,24005.029297,23842.250000,23980.650391,23980.650391,0.895629,Hang Seng


In [14]:
All_exchanges_mg = Merged_df[0].merge(Merged_df[1], how='inner', suffixes=('_Dow', '_Eur'), left_index=True, right_index=True)

In [15]:
for column in (Merged_df[2].columns.values):
    Merged_df[2] = Merged_df[2].rename(columns={column: column + '_HSI'})

In [16]:
for column in (Merged_df[3].columns.values):
    Merged_df[3] = Merged_df[3].rename(columns={column: column + '_NIK'})

In [17]:
All_exchanges_mg = All_exchanges_mg.merge(Merged_df[2], how='inner', left_index=True, right_index=True)

In [18]:
All_exchanges_mg = All_exchanges_mg.merge(Merged_df[3], how='inner', left_index=True, right_index=True)

In [19]:
All_exchanges_mg.columns

Index(['Date_Dow', 'Open_Dow', 'High_Dow', 'Low_Dow', 'Close_Dow',
       'Adj Close_Dow', 'Percent Change_Dow', 'Index_Dow', 'Date_Eur',
       'Open_Eur', 'High_Eur', 'Low_Eur', 'Close_Eur', 'Adj Close_Eur',
       'Percent Change_Eur', 'Index_Eur', 'Date_HSI', 'Open_HSI', 'High_HSI',
       'Low_HSI', 'Close_HSI', 'Adj Close_HSI', 'Percent Change_HSI',
       'Index_HSI', 'Date_NIK', 'Open_NIK', 'High_NIK', 'Low_NIK', 'Close_NIK',
       'Adj Close_NIK', 'Percent Change_NIK', 'Index_NIK'],
      dtype='object')

In [20]:
All_exchanges_mg.shape

(4515, 32)

In [21]:
print(DJI.columns, len(DJI.columns))

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Percent Change',
       'Index'],
      dtype='object') 8


In [22]:
### Times in Greenwich Mean Time

###Both day ahead
### Japan Exchange Group / N225    12am to 6am
### Hong Kong / Hang Seng =        1:30am to 8am

###Both lag day behind locally
### EuroNext / N100 =              8am to 4:30pm
### NYSE / DJI =                   2:30pm to 9pm

###Observations
### Jap and Hong Kong close before EuroNext opens SAME DAY
### Hong Kong closes RIGHT as Euronext Opens SAME DAY
###New York and EuroNext close before Jap and Hong Kong open THE DAY BEFORE
###NYSE is mid-dayish when EURO closes

###For test case leave out NYSE in test train split and begin running models on that

In [23]:
tscv = TimeSeriesSplit(n_splits=5)

In [24]:
All_exchanges_mg[All_exchanges_mg.columns[:8]]

Unnamed: 0,Date_Dow,Open_Dow,High_Dow,Low_Dow,Close_Dow,Adj Close_Dow,Percent Change_Dow,Index_Dow
0,2000-10-11,10521.070313,10566.580078,10350.889648,10413.790039,10413.790039,-1.050990,Dow Jones Industrial
1,2000-10-12,10424.139648,10460.400391,10023.490234,10034.580078,10034.580078,-3.641421,Dow Jones Industrial
2,2000-10-13,10031.620117,10208.089844,10014.240234,10192.179688,10192.179688,1.570565,Dow Jones Industrial
3,2000-10-16,10184.780273,10272.089844,10177.759766,10238.799805,10238.799805,0.457411,Dow Jones Industrial
4,2000-10-17,10242.870117,10293.919922,10026.450195,10089.709961,10089.709961,-1.456126,Dow Jones Industrial
...,...,...,...,...,...,...,...,...
4510,2020-09-29,27560.240234,27605.599609,27338.089844,27452.660156,27452.660156,-0.476363,Dow Jones Industrial
4511,2020-09-30,27514.640625,28026.330078,27511.060547,27781.699219,27781.699219,1.198569,Dow Jones Industrial
4512,2020-10-05,27825.419922,28162.640625,27825.419922,28148.640625,28148.640625,1.682741,Dow Jones Industrial
4513,2020-10-06,28214.240234,28354.480469,27728.029297,27772.759766,27772.759766,-1.335343,Dow Jones Industrial


In [25]:
y = All_exchanges_mg[All_exchanges_mg.columns[:8]]

In [26]:
X = All_exchanges_mg[All_exchanges_mg.columns[8:]]

In [27]:
print(X.shape, y.shape)

(4515, 24) (4515, 8)


In [28]:
pd.Series.shift 

##set import above

<function pandas.core.series.Series.shift(self, periods=1, freq=None, axis=0, fill_value=None) -> 'Series'>

In [29]:
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


In [30]:
print(tscv.split(X))

<generator object TimeSeriesSplit.split at 0x7f2efa2b8e40>


In [31]:
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]

In [32]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(3763, 24) (3763, 8) (752, 24) (752, 8)


In [33]:
#############
    ####    TODO LIST
    ####    Re-organize merge into acceptable renamed columns
    ####    Redo y potentially? investitage
    ####    Run model with test tran split to see if it works, no need to re-do y

In [34]:
pd.DataFrame(X_train).to_csv('data/X_train.csv', index=False)
pd.DataFrame(X_test).to_csv('data/X_test.csv', index=False)
pd.DataFrame(y_train).to_csv('data/y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('data/y_test.csv', index=False)