# Imports

In [40]:
import numpy as np
import pandas as pd
import prep

# Dataset

In [41]:
df = pd.read_csv('../data/dataset.csv')
df.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,value,value_classification
0,2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9959400448,30,Fear
1,2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,12726899712,15,Extreme Fear
2,2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,7263790080,40,Fear


In [42]:
df.set_index('Date', inplace=True)

# Feature Engineering

## Checkpoint 1

In [43]:
df_feat = df.copy()

## Adding window

In [44]:
WINDOW = 37
HORIZON = 1
for i in range(WINDOW):
    df_feat[f'Close + {i+1}'] = df_feat['Close'].shift(i+1)
df_feat.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,value,value_classification,Close + 1,Close + 2,Close + 3,...,Close + 28,Close + 29,Close + 30,Close + 31,Close + 32,Close + 33,Close + 34,Close + 35,Close + 36,Close + 37
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9959400448,30,Fear,,,,...,,,,,,,,,,
2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,12726899712,15,Extreme Fear,9170.540039,,,...,,,,,,,,,,
2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,7263790080,40,Fear,8830.75,9170.540039,,...,,,,,,,,,,


## Adding target

In [45]:
# df_feat['target'] = (df_feat['Close'] > df_feat['Close + 1']).astype(int)
# df_feat.head(3)

## Adding technical indicators

Adding MA 13 and 21 commonly used by many professional traders

In [46]:
df_feat['MA_13'] = df_feat['Close'].rolling(window=13).mean()
df_feat['MA_21'] = df_feat['Close'].rolling(window=21).mean()

Adding RSI of 3 with K of 5 and D of 3

In [47]:
df_feat['RSI_3'] = prep.calculate_rsi(df_feat['Close'], 3)

In [48]:
df_feat['%K'], df_feat['%D'] = prep.calculate_stochastic_oscillator(df_feat['Close'], 5, 3)

In [49]:
df_feat.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,value,value_classification,Close + 1,Close + 2,Close + 3,...,Close + 33,Close + 34,Close + 35,Close + 36,Close + 37,MA_13,MA_21,RSI_3,%K,%D
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9959400448,30,Fear,,,,...,,,,,,,,,,
2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,12726899712,15,Extreme Fear,9170.540039,,,...,,,,,,,,,,
2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,7263790080,40,Fear,8830.75,9170.540039,,...,,,,,,,,50.319476,,


## Encoding

In [50]:
df['value_classification'].unique()

array(['Fear', 'Extreme Fear', 'Neutral', 'Greed', 'Extreme Greed'],
      dtype=object)

In [51]:
df_feat['value_classification'].replace({'Fear':1, 'Extreme Fear':2, 'Neutral':3, 'Greed':4, 'Extreme Greed':5}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_feat['value_classification'].replace({'Fear':1, 'Extreme Fear':2, 'Neutral':3, 'Greed':4, 'Extreme Greed':5}, inplace=True)
  df_feat['value_classification'].replace({'Fear':1, 'Extreme Fear':2, 'Neutral':3, 'Greed':4, 'Extreme Greed':5}, inplace=True)


dropping na values

In [52]:
df_feat.dropna(inplace=True)

In [53]:
df_feat

Unnamed: 0_level_0,Open,High,Low,Close,Volume,value,value_classification,Close + 1,Close + 2,Close + 3,...,Close + 33,Close + 34,Close + 35,Close + 36,Close + 37,MA_13,MA_21,RSI_3,%K,%D
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-10,9350.589844,9531.320312,8828.469727,8866.000000,5386319872,39,1,9337.549805,9395.009766,9965.570312,...,6955.270020,8277.009766,9174.910156,8830.750000,9170.540039,10495.940805,10481.063384,0.000000,0.000000,0.000000
2018-03-11,8852.780273,9711.889648,8607.120117,9578.629883,6296370176,40,1,8866.000000,9337.549805,9395.009766,...,7754.000000,6955.270020,8277.009766,9174.910156,8830.750000,10435.320012,10434.721959,57.394260,64.809851,21.603284
2018-03-12,9602.929688,9937.500000,8956.429688,9205.120117,6457399808,41,1,9578.629883,8866.000000,9337.549805,...,7621.299805,7754.000000,6955.270020,8277.009766,9174.910156,10318.360051,10338.522926,45.749163,47.587131,37.465661
2018-03-13,9173.040039,9470.379883,8958.190430,9194.849609,5991139840,41,1,9205.120117,9578.629883,8866.000000,...,8265.589844,7621.299805,7754.000000,6955.270020,8277.009766,10225.817683,10233.339565,64.996651,46.145919,52.847634
2018-03-14,9214.650391,9355.849609,8068.589844,8269.809570,6438230016,40,1,9194.849609,9205.120117,9578.629883,...,8736.980469,8265.589844,7621.299805,7754.000000,6955.270020,10019.572266,10118.073335,0.000000,0.000000,31.244350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-16,66256.109375,66712.429688,64613.054688,65231.582031,31573077994,70,4,66267.492188,61552.789062,62901.449219,...,63821.472656,67195.867188,70060.609375,70587.882812,69139.015625,62818.882212,62460.877046,66.411065,78.504064,71.515421
2024-05-17,65231.296875,67459.460938,65119.316406,67051.875000,28031279310,74,4,65231.582031,66267.492188,61552.789062,...,65738.726562,63821.472656,67195.867188,70060.609375,70587.882812,63061.990084,62617.855841,86.317224,100.000000,92.834688
2024-05-18,67066.210938,67387.328125,66663.500000,66940.804688,16712277406,73,4,67051.875000,65231.582031,66267.492188,...,63426.210938,65738.726562,63821.472656,67195.867188,70060.609375,63285.810998,62785.554129,61.345643,97.980204,92.161423
2024-05-19,66937.929688,67694.296875,65937.179688,66278.367188,19249094538,72,4,66940.804688,67051.875000,65231.582031,...,63811.863281,63426.210938,65738.726562,63821.472656,67195.867188,63525.535457,62936.274926,70.178596,57.506411,85.162205


# Rearranging columns

In [54]:
df_feat.keys()

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'value',
       'value_classification', 'Close + 1', 'Close + 2', 'Close + 3',
       'Close + 4', 'Close + 5', 'Close + 6', 'Close + 7', 'Close + 8',
       'Close + 9', 'Close + 10', 'Close + 11', 'Close + 12', 'Close + 13',
       'Close + 14', 'Close + 15', 'Close + 16', 'Close + 17', 'Close + 18',
       'Close + 19', 'Close + 20', 'Close + 21', 'Close + 22', 'Close + 23',
       'Close + 24', 'Close + 25', 'Close + 26', 'Close + 27', 'Close + 28',
       'Close + 29', 'Close + 30', 'Close + 31', 'Close + 32', 'Close + 33',
       'Close + 34', 'Close + 35', 'Close + 36', 'Close + 37', 'MA_13',
       'MA_21', 'RSI_3', '%K', '%D'],
      dtype='object')

In [55]:
df_feat = df_feat[['Open', 'High', 'Low', 'Volume', 'value',
       'value_classification', 
        'Close + 7', 'Close + 8',
       'Close + 9', 'Close + 10', 'Close + 11', 'Close + 12', 'Close + 13',
       'Close + 14', 'Close + 15', 'Close + 16', 'Close + 17', 'Close + 18',
       'Close + 19', 'Close + 20', 'Close + 21', 'Close + 22', 'Close + 23',
       'Close + 24', 'Close + 25', 'Close + 26', 'Close + 27', 'Close + 28',
       'Close + 29', 'Close + 30', 'Close + 31', 'Close + 32', 'Close + 33',
       'Close + 34', 'Close + 35', 'Close + 36', 'Close + 37', 'MA_13', 'MA_21', 'RSI_3', '%K',
       '%D', 'Close + 6', 'Close + 5', 'Close + 4' ,  'Close + 3', 'Close + 2', 'Close + 1', 'Close',]]

# Splitting data for train, valid, test

In [56]:
train_split = int(0.6 * len(df_feat))
valid_split = train_split + int(0.2 * len(df_feat))

train_df = df_feat.iloc[:train_split]
valid_df = df_feat.iloc[train_split:valid_split]
test_df = df_feat.iloc[valid_split:]

In [57]:
len(train_df) + len(valid_df) + len(test_df)

2261

# Exporting data

In [58]:
df_feat.to_csv('../data/prepared_data.csv')
train_df.to_csv('../data/train_dataset.csv')
valid_df.to_csv('../data/valid_dataset.csv')
test_df.to_csv('../data/test_dataset.csv')