# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler

# Loading data

In [2]:
PATH = "../datasets/final_data.csv"
df = pd.read_csv(PATH)
df.head(3)

Unnamed: 0,Date,open,high,low,close,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification,tomorrow,target
0,2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9959400448,11.098439,11.401036,27.762554,10516.12793,11085.433969,30.0,Fear,8830.75,0
1,2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,12726899712,25.355039,18.686779,18.442888,9925.017969,10772.476262,15.0,Extreme Fear,9174.910156,1
2,2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,7263790080,39.263281,25.23892,18.442245,9500.719922,10585.923227,40.0,Fear,8277.009766,0


In [3]:
df = df.iloc[1034:]

In [4]:
len(df)

1223

# Preparation for training and test data

Check for balance
The data is fairly balanced, seems we don't need to resample the data

In [5]:
df['target'].value_counts(normalize=True)

target
1    0.502044
0    0.497956
Name: proportion, dtype: float64

Check for missing data

In [6]:
df.isna().sum()

Date                  0
open                  0
high                  0
low                   0
close                 0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
tomorrow              1
target                0
dtype: int64

dropping missing data

In [7]:
df.dropna(inplace=True)
df.isna().sum()

Date                  0
open                  0
high                  0
low                   0
close                 0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
tomorrow              0
target                0
dtype: int64

Mapping data

In [8]:
fng_classes = df['fng_classification'].unique()
fng_classes

array(['Extreme Greed', 'Greed', 'Fear', 'Neutral', 'Extreme Fear'],
      dtype=object)

In [9]:
df['fng_classification'] = df['fng_classification'].map({'Extreme Fear': 0, 'Fear': 1, 'Neutral': 2, 'Greed':3, 'Extreme Greed': 4})

In [10]:
df['fng_classification']

1034    4
1035    4
1036    4
1037    4
1038    4
       ..
2251    4
2252    4
2253    3
2254    3
2255    4
Name: fng_classification, Length: 1222, dtype: int64

Scaling the data

In [11]:
df.columns.values

array(['Date', 'open', 'high', 'low', 'close', 'volume', 'stoch_k',
       'stoch_k_smooth', 'stoch_d', 'ma_5', 'ma_13', ' fng_value',
       'fng_classification', 'tomorrow', 'target'], dtype=object)

In [12]:
columns_to_scale = ['open', 'high', 'low', 'close', 'volume', 'stoch_k', 'stoch_k_smooth', 'stoch_d', 'ma_5', 'ma_13', ' fng_value', 'tomorrow']

In [13]:
scaler = MinMaxScaler()
scaler.fit(df[columns_to_scale])

In [14]:
scaled_data = scaler.transform(df[columns_to_scale])
df[columns_to_scale] = scaled_data

In [15]:
df

Unnamed: 0,Date,open,high,low,close,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification,tomorrow,target
1034,2020-12-01,0.067219,0.062489,0.048854,0.052634,0.128176,0.685808,0.754812,0.626099,0.036043,0.035915,1.000000,4,0.059582,1
1035,2020-12-02,0.052698,0.053138,0.049317,0.059582,0.092746,0.780303,0.816918,0.761383,0.043625,0.037939,0.966292,4,0.063846,1
1036,2020-12-03,0.059752,0.057623,0.059688,0.063846,0.076957,0.824836,0.768874,0.794531,0.049886,0.039144,0.966292,4,0.050832,0
1037,2020-12-04,0.063959,0.056670,0.055587,0.050832,0.082576,0.312681,0.639762,0.754738,0.051778,0.039228,0.966292,4,0.058764,1
1038,2020-12-05,0.050894,0.050566,0.053667,0.058764,0.063394,0.546115,0.558727,0.665436,0.050069,0.040375,0.977528,4,0.062095,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2251,2024-04-01,0.969529,0.958120,0.942166,0.940985,0.085472,0.463261,0.572381,0.623911,0.977645,0.987728,0.820225,4,0.866718,0
2252,2024-04-02,0.941108,0.929706,0.878936,0.866718,0.131277,0.126690,0.485558,0.571766,0.958449,0.984121,0.820225,4,0.876036,1
2253,2024-04-03,0.866787,0.881111,0.878457,0.876036,0.084357,0.208410,0.252398,0.438188,0.944276,0.984836,0.730337,3,0.920158,1
2254,2024-04-04,0.876020,0.922451,0.888395,0.920158,0.084217,0.579204,0.292518,0.341390,0.940158,0.991753,0.719101,3,0.908443,0


Splitting data into train, valid, and test

In [16]:
train_size = int(0.8 * len(df))
valid_size = int(0.1 * len(df))
train_df = df.iloc[:train_size, :]
valid_df = df.iloc[train_size:(train_size + valid_size), :]
test_df = df.iloc[(train_size + valid_size):, :]

Checking for imbalance

In [17]:
print(train_df['target'].value_counts(normalize=True),
valid_df['target'].value_counts(normalize=True),
test_df['target'].value_counts(normalize=True))

target
0    0.503582
1    0.496418
Name: proportion, dtype: float64 target
0    0.5
1    0.5
Name: proportion, dtype: float64 target
1    0.552846
0    0.447154
Name: proportion, dtype: float64


Exporting into csv

In [18]:
train_df.to_csv("../datasets/train_dataset.csv", index=False)
valid_df.to_csv("../datasets/valid_dataset.csv", index=False)
test_df.to_csv("../datasets/test_dataset.csv", index=False)