# Imports

In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from torch.utils.data import DataLoader
from sklearn.preprocessing import RobustScaler, MinMaxScaler

# Loading data

In [86]:
PATH = "../datasets/final_data.csv"
df = pd.read_csv(PATH)
df.head(3)

Unnamed: 0,Date,open,high,low,close,close t-3,close t-5,close t-7,close t-12,close t-30,...,close t+12,close t+30,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification
0,2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,,,,,,...,8598.30957,11489.700195,9959400448,11.098439,11.401036,27.762554,10516.12793,11085.433969,30.0,Fear
1,2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,,,,,,...,9494.629883,11512.599609,12726899712,25.355039,18.686779,18.442888,9925.017969,10772.476262,15.0,Extreme Fear
2,2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,,,,,,...,10166.400391,11573.299805,7263790080,39.263281,25.23892,18.442245,9500.719922,10585.923227,40.0,Fear


In [87]:
# df = df.iloc[1034:]

In [88]:
len(df)

2257

# Preparation for training and test data

Check for balance
The data is fairly balanced, seems we don't need to resample the data

In [89]:
df['target'] = (df['close'].shift(-3) > df['close']).astype(int)

In [90]:
df['target'].value_counts(normalize=True)

target
1    0.525476
0    0.474524
Name: proportion, dtype: float64

Check for missing data

In [91]:
df.isna().sum()

Date                   0
open                   0
high                   0
low                    0
close                  0
close t-3              3
close t-5              5
close t-7              7
close t-12            12
close t-30            30
close t+3              3
close t+5              5
close t+7              7
close t+12            12
close t+30            30
volume                 0
stoch_k                0
stoch_k_smooth         0
stoch_d                0
ma_5                   0
ma_13                  0
 fng_value             0
fng_classification     0
target                 0
dtype: int64

dropping missing data

In [92]:
df.dropna(inplace=True)
df.isna().sum()

Date                  0
open                  0
high                  0
low                   0
close                 0
close t-3             0
close t-5             0
close t-7             0
close t-12            0
close t-30            0
close t+3             0
close t+5             0
close t+7             0
close t+12            0
close t+30            0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
target                0
dtype: int64

Mapping data

In [93]:
fng_classes = df['fng_classification'].unique()
fng_classes

array(['Greed', 'Fear', 'Extreme Fear', 'Neutral', 'Extreme Greed'],
      dtype=object)

In [94]:
df['fng_classification'] = df['fng_classification'].map({'Extreme Fear': 1, 'Fear': 2, 'Neutral': 3, 'Greed':4, 'Extreme Greed': 5})

In [95]:
df['fng_classification']

30      4
31      2
32      4
33      4
34      2
       ..
2222    5
2223    5
2224    5
2225    4
2226    5
Name: fng_classification, Length: 2197, dtype: int64

Scaling the data

In [96]:
df.columns.values

array(['Date', 'open', 'high', 'low', 'close', 'close t-3', 'close t-5',
       'close t-7', 'close t-12', 'close t-30', 'close t+3', 'close t+5',
       'close t+7', 'close t+12', 'close t+30', 'volume', 'stoch_k',
       'stoch_k_smooth', 'stoch_d', 'ma_5', 'ma_13', ' fng_value',
       'fng_classification', 'target'], dtype=object)

In [97]:
columns_to_scale = ['open', 'high', 'low', 'close', 'close t-3', 'close t-5',
                     'close t-7','close t-12', 'close t-30', 'close t+3', 'close t+5', 'close t+7',
                     'close t+12', 'close t+30',
                     'volume', 'stoch_k', 'stoch_k_smooth', 'stoch_d', 'ma_5', 'ma_13',
                     ' fng_value']

In [98]:
scaler = MinMaxScaler()
scaler.fit(df[columns_to_scale])

In [99]:
scaled_data = scaler.transform(df[columns_to_scale])
df[columns_to_scale] = scaled_data

In [100]:
df

Unnamed: 0,Date,open,high,low,close,close t-3,close t-5,close t-7,close t-12,close t-30,...,close t+30,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification,target
30,2018-03-03,0.120815,0.125242,0.123611,0.126786,0.110013,0.110834,0.102228,0.124180,0.092240,...,0.055078,0.010823,0.969971,0.954253,0.870176,0.122131,0.118541,0.566667,4,0
31,2018-03-04,0.126890,0.125005,0.125727,0.127137,0.118510,0.116413,0.099922,0.126954,0.086958,...,0.060409,0.009081,0.986729,0.978202,0.928659,0.124652,0.118906,0.433333,2,0
32,2018-03-05,0.127427,0.127911,0.130598,0.128070,0.120590,0.111319,0.110834,0.115866,0.092308,...,0.051786,0.010185,0.903212,0.965752,0.977770,0.128418,0.119121,0.555556,4,0
33,2018-03-06,0.126931,0.124815,0.118736,0.115881,0.126786,0.119917,0.116413,0.105211,0.078350,...,0.051179,0.011230,0.084770,0.659448,0.876798,0.127870,0.119235,0.600000,4,0
34,2018-03-07,0.116238,0.116156,0.102876,0.103371,0.127137,0.122021,0.111319,0.109814,0.057804,...,0.048672,0.016878,0.135911,0.365041,0.666787,0.124279,0.119185,0.355556,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,2024-03-03,0.903087,0.909851,0.921713,0.920683,0.890434,0.837068,0.753870,0.762445,0.621002,...,0.890667,0.067032,0.896057,0.869897,0.865171,0.944574,0.872909,0.866667,5,1
2223,2024-03-04,0.920067,0.990385,0.936770,1.000000,0.909518,0.921311,0.797227,0.755516,0.617992,...,0.898310,0.194650,0.974296,0.900251,0.876884,0.963240,0.893277,0.855556,5,0
2224,2024-03-05,1.000000,1.000000,0.888304,0.930420,0.903208,0.901004,0.837068,0.747212,0.611640,...,0.934504,0.286973,0.454699,0.780676,0.858790,0.971579,0.908461,0.944444,5,1
2225,2024-03-06,0.929882,0.976740,0.944084,0.965840,0.920683,0.920314,0.921311,0.738305,0.612807,...,0.924895,0.189133,0.688848,0.708976,0.803673,0.983326,0.927250,0.777778,4,1


Splitting data into train, valid, and test

In [101]:
train_size = int(0.8 * len(df))
valid_size = int(0.1 * len(df))
train_df = df.iloc[:train_size, :]
valid_df = df.iloc[train_size:(train_size + valid_size), :]
test_df = df.iloc[(train_size + valid_size):, :]

Checking for imbalance

In [102]:
print(train_df['target'].value_counts(normalize=True),
valid_df['target'].value_counts(normalize=True),
test_df['target'].value_counts(normalize=True))

target
1    0.515083
0    0.484917
Name: proportion, dtype: float64 target
1    0.511416
0    0.488584
Name: proportion, dtype: float64 target
1    0.615385
0    0.384615
Name: proportion, dtype: float64


Exporting into csv

In [103]:
train_df.to_csv("../datasets/train_dataset.csv", index=False)
valid_df.to_csv("../datasets/valid_dataset.csv", index=False)
test_df.to_csv("../datasets/test_dataset.csv", index=False)