# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler

# Loading data

In [2]:
PATH = "../datasets/final_data.csv"
df = pd.read_csv(PATH)
df.head(3)

Unnamed: 0,Date,open,high,low,close,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification,tomorrow,target
0,2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9959400448,11.098439,11.401036,27.762554,10516.12793,11085.433969,30.0,Fear,8830.75,0
1,2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,12726899712,25.355039,18.686779,18.442888,9925.017969,10772.476262,15.0,Extreme Fear,9174.910156,1
2,2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,7263790080,39.263281,25.23892,18.442245,9500.719922,10585.923227,40.0,Fear,8277.009766,0


In [3]:
# df = df.iloc[1034:]

In [4]:
len(df)

2257

# Preparation for training and test data

Check for balance
The data is fairly balanced, seems we don't need to resample the data

In [5]:
df['target'].value_counts(normalize=True)

target
1    0.513957
0    0.486043
Name: proportion, dtype: float64

Check for missing data

In [6]:
df.isna().sum()

Date                  0
open                  0
high                  0
low                   0
close                 0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
tomorrow              1
target                0
dtype: int64

dropping missing data

In [7]:
df.dropna(inplace=True)
df.isna().sum()

Date                  0
open                  0
high                  0
low                   0
close                 0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
tomorrow              0
target                0
dtype: int64

Mapping data

In [8]:
fng_classes = df['fng_classification'].unique()
fng_classes

array(['Fear', 'Extreme Fear', 'Neutral', 'Greed', 'Extreme Greed'],
      dtype=object)

In [9]:
df['fng_classification'] = df['fng_classification'].map({'Extreme Fear': 0, 'Fear': 1, 'Neutral': 2, 'Greed':3, 'Extreme Greed': 4})

In [10]:
df['fng_classification']

0       1
1       0
2       1
3       0
4       0
       ..
2251    4
2252    4
2253    3
2254    3
2255    4
Name: fng_classification, Length: 2256, dtype: int64

Scaling the data

In [11]:
df.columns.values

array(['Date', 'open', 'high', 'low', 'close', 'volume', 'stoch_k',
       'stoch_k_smooth', 'stoch_d', 'ma_5', 'ma_13', ' fng_value',
       'fng_classification', 'tomorrow', 'target'], dtype=object)

In [12]:
columns_to_scale = ['open', 'high', 'low', 'close', 'volume', 'stoch_k', 'stoch_k_smooth', 'stoch_d', 'ma_5', 'ma_13', ' fng_value', 'tomorrow']

In [13]:
scaler = MinMaxScaler()
scaler.fit(df[columns_to_scale])

In [14]:
scaled_data = scaler.transform(df[columns_to_scale])
df[columns_to_scale] = scaled_data

In [15]:
df

Unnamed: 0,Date,open,high,low,close,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification,tomorrow,target
0,2018-02-01,0.100239,0.099517,0.082488,0.084954,0.020215,0.110984,0.094495,0.256735,0.105726,0.116522,0.277778,1,0.080089,0
1,2018-02-02,0.084561,0.083248,0.067581,0.080089,0.028167,0.253550,0.170127,0.157328,0.097058,0.111752,0.111111,0,0.085017,1
2,2018-02-03,0.080407,0.087342,0.074261,0.085017,0.012470,0.392633,0.238144,0.157321,0.090835,0.108908,0.388889,1,0.072162,0
3,2018-02-04,0.085040,0.085981,0.071026,0.072162,0.011923,0.185880,0.264060,0.215397,0.085470,0.105796,0.211111,0,0.053238,0
4,2018-02-05,0.072080,0.072217,0.052322,0.053238,0.018278,0.056224,0.195779,0.224183,0.075892,0.101208,0.066667,0,0.064674,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2251,2024-04-01,0.975003,0.965832,0.952696,0.951589,0.091798,0.463261,0.572381,0.617831,0.981903,0.990159,0.822222,4,0.890667,0
2252,2024-04-02,0.951687,0.942651,0.900980,0.890667,0.137286,0.126690,0.485558,0.566194,0.966365,0.987266,0.822222,4,0.898310,1
2253,2024-04-03,0.890716,0.903004,0.900588,0.898310,0.090691,0.208410,0.252398,0.433917,0.954891,0.987840,0.733333,3,0.934504,1
2254,2024-04-04,0.898291,0.936732,0.908717,0.934504,0.090551,0.579204,0.292518,0.338063,0.951558,0.993386,0.722222,3,0.924895,0


Splitting data into train, valid, and test

In [16]:
train_size = int(0.8 * len(df))
valid_size = int(0.1 * len(df))
train_df = df.iloc[:train_size, :]
valid_df = df.iloc[train_size:(train_size + valid_size), :]
test_df = df.iloc[(train_size + valid_size):, :]

Checking for imbalance

In [17]:
print(train_df['target'].value_counts(normalize=True),
valid_df['target'].value_counts(normalize=True),
test_df['target'].value_counts(normalize=True))

target
1    0.51663
0    0.48337
Name: proportion, dtype: float64 target
0    0.528889
1    0.471111
Name: proportion, dtype: float64 target
1    0.537445
0    0.462555
Name: proportion, dtype: float64


Exporting into csv

In [18]:
train_df.to_csv("../datasets/train_dataset.csv", index=False)
valid_df.to_csv("../datasets/valid_dataset.csv", index=False)
test_df.to_csv("../datasets/test_dataset.csv", index=False)