In [75]:
import pandas as pd
import numpy as np

In [76]:
# Read the index prices from disk and keep that frame untouched;
# all later cleaning steps operate on the separate copy `df_comp`.
raw_csv_data = pd.read_csv("../Data/index2018.csv")
df_comp = raw_csv_data.copy()
# Preview the first rows.
df_comp.head()

Unnamed: 0,date,spx,dax,ftse,nikkei
0,07/01/1994,469.9,2224.95,3445.98,18124.01
1,10/01/1994,475.27,2225.0,3440.58,18443.44
2,11/01/1994,474.13,2228.1,3413.77,18485.25
3,12/01/1994,474.17,2182.06,3372.02,18793.88
4,13/01/1994,472.47,2142.37,3360.01,18577.26


### Length of the time period

In [77]:
# Summary of the date column before any conversion (count, unique, top, freq).
df_comp["date"].describe()

count           6269
unique          6269
top       07/01/1994
freq               1
Name: date, dtype: object

In [78]:
# Parse the date column into timestamps; dayfirst=True makes pandas read
# the values as day/month/year rather than month/day/year.
df_comp["date"] = pd.to_datetime(df_comp["date"], dayfirst=True)
# Show the resulting column dtypes and non-null counts.
df_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6269 entries, 0 to 6268
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    6269 non-null   datetime64[ns]
 1   spx     6269 non-null   float64       
 2   dax     6269 non-null   float64       
 3   ftse    6269 non-null   float64       
 4   nikkei  6269 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 245.0 KB


In [79]:
# Summary of the date column once it holds timestamps: min and max give
# the start and end of the observed period.
df_comp["date"].describe()

count                             6269
mean     2006-01-14 19:36:59.492742144
min                1994-01-07 00:00:00
25%                2000-01-11 00:00:00
50%                2006-01-12 00:00:00
75%                2012-01-19 00:00:00
max                2018-01-29 00:00:00
Name: date, dtype: object

### Setting the index
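**Important:** the dates must become the index so that pandas treats the data as a time series (the frequency step below relies on it).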

In [80]:
# Use the dates as the row index. Reassigning the result (rather than
# passing inplace=True) avoids in-place mutation and keeps the call chainable.
df_comp = df_comp.set_index("date")
df_comp.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


### Setting the desired frequency
Cela nous permet de mieux naviguer dans nos données.

In [81]:
# Align the index to a business-day frequency ("B"). A calendar-day
# frequency ("D") was tried first, but market indexes are only available
# on business days. The upper-case alias is the documented spelling.
df_comp = df_comp.asfreq("B")
df_comp.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


### Handling missing values

In [82]:
# Number of missing values in each column.
df_comp.isna().sum()

spx       8
dax       8
ftse      8
nikkei    8
dtype: int64

Rappelons qu'il n'y avait pas de valeurs manquantes dans les données de base ; mais en imposant une fréquence, nous nous rendons compte qu'il y en a désormais : les dates ajoutées par la fréquence n'ont aucune valeur associée.

Il existe plusieurs manières de gérer ces valeurs manquantes ; l'une d'elles est `fillna()` (et ses raccourcis `ffill()` / `bfill()`) :
- forward filling: assigns the value of the previous period
- back filling: assigns the value of the next period
- assigning the same value to every gap, such as the average (a bad approach for time series, because a single constant ignores how the values evolve over time)

In [83]:
# Forward fill: each missing spx value takes the value of the previous period.
# Series.ffill() replaces fillna(method="ffill"), which is deprecated in pandas.
df_comp["spx"] = df_comp["spx"].ffill()

  df_comp.spx = df_comp.spx.fillna(method="ffill")


In [84]:
# Back fill: each missing ftse / dax value takes the value of the next period.
df_comp[["ftse", "dax"]] = df_comp[["ftse", "dax"]].bfill()

In [85]:
# Re-count the missing values per column after the fills above
# (nikkei was not filled by either of the preceding cells).
df_comp.isna().sum()

spx       0
dax       0
ftse      0
nikkei    8
dtype: int64

### Simplifying the dataset

In [86]:
# Keep only the spx series and expose it under the name "market_value";
# the ftse, dax and nikkei columns are dropped.
df_comp = df_comp[["spx"]].rename(columns={"spx": "market_value"})
df_comp.head()

Unnamed: 0_level_0,market_value
date,Unnamed: 1_level_1
1994-01-07,469.9
1994-01-10,475.27
1994-01-11,474.13
1994-01-12,474.17
1994-01-13,472.47


### Splitting the data
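The first 80% of the rows form the training set and the last 20% the test set (the split keeps the chronological order; nothing is shuffled).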

In [89]:
# Position of the cut: the first 80% of the rows (rounded down).
size = int(0.8 * len(df_comp))

# Positional slices keep the row order: earlier rows train, later rows test.
df_train, df_test = df_comp.iloc[:size], df_comp.iloc[size:]