# Imports

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from torch.utils.data import DataLoader

# Loading data

In [2]:
# Relative path of the CSV file that this notebook works on.
PATH = "../datasets/final_data.csv"

# Read the file into a DataFrame, then preview its first three rows.
df = pd.read_csv(filepath_or_buffer=PATH)
df.iloc[:3]

Unnamed: 0,Date,open,high,low,close,volume,stoch_k,stoch_k_smooth,stoch_d,ma_5,ma_13,fng_value,fng_classification,tomorrow,target
0,2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9959400448,11.098439,11.401036,27.762554,10516.12793,11085.433969,30.0,Fear,8830.75,0
1,2018-02-02,9142.280273,9142.280273,7796.490234,8830.75,12726899712,25.355039,18.686779,18.442888,9925.017969,10772.476262,15.0,Extreme Fear,9174.910156,1
2,2018-02-03,8852.120117,9430.75,8251.629883,9174.910156,7263790080,39.263281,25.23892,18.442245,9500.719922,10585.923227,40.0,Fear,8277.009766,0


In [11]:
# Number of rows currently held in df.
df.shape[0]

2256

# Preparation for training and test data

Check for balance
The target classes are fairly balanced (about 51% vs. 49%), so resampling does not seem necessary.

In [3]:
# Relative frequency of each value in the `target` column (shares sum to 1).
df.loc[:, 'target'].value_counts(normalize=True)

target
1    0.513957
0    0.486043
Name: proportion, dtype: float64

Check for missing data

In [4]:
# Count the missing entries in every column.
df.isnull().sum(axis=0)

Date                  0
open                  0
high                  0
low                   0
close                 0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
tomorrow              1
target                0
dtype: int64

Drop the rows that contain missing data (only `tomorrow` has a missing value)

In [5]:
# Remove, in place, every row that has at least one missing value
# (axis=0 and how="any" are the pandas defaults, spelled out here).
df.dropna(axis=0, how="any", inplace=True)

# Re-count the missing entries per column to confirm none are left.
df.isnull().sum(axis=0)

Date                  0
open                  0
high                  0
low                   0
close                 0
volume                0
stoch_k               0
stoch_k_smooth        0
stoch_d               0
ma_5                  0
ma_13                 0
 fng_value            0
fng_classification    0
tomorrow              0
target                0
dtype: int64

Map the `fng_classification` labels to integer codes

In [6]:
# Distinct labels found in the `fng_classification` column, in order of first appearance.
fng_classes = pd.unique(df['fng_classification'])
fng_classes

array(['Fear', 'Extreme Fear', 'Neutral', 'Greed', 'Extreme Greed'],
      dtype=object)

In [7]:
# Integer code for each sentiment label, running from 'Extreme Fear' (0)
# up to 'Extreme Greed' (4).
FNG_CLASS_CODES = {
    'Extreme Fear': 0,
    'Fear': 1,
    'Neutral': 2,
    'Greed': 3,
    'Extreme Greed': 4,
}

# Look each value up with a fallback to the value itself. A plain
# `.map(dict)` turns anything that is not a key into NaN, so re-running this
# cell on the already-encoded column would wipe it out; with the fallback the
# cell can be re-run safely.
fng_encoded = df['fng_classification'].map(
    lambda label: FNG_CLASS_CODES.get(label, label)
)

# Fail loudly on labels the mapping does not know (including NaN) instead of
# letting them slip through unnoticed.
unexpected_labels = set(fng_encoded.unique()) - set(FNG_CLASS_CODES.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected fng_classification values: {sorted(unexpected_labels, key=str)}"
    )

df['fng_classification'] = fng_encoded.astype('int64')

In [8]:
# Display the `fng_classification` column to inspect the encoded values.
df.loc[:, 'fng_classification']

0       1
1       0
2       1
3       0
4       0
       ..
2251    4
2252    4
2253    3
2254    3
2255    4
Name: fng_classification, Length: 2256, dtype: int64

In [15]:
# Share of the rows assigned to the training and validation sets; whatever
# remains after those two becomes the test set.
TRAIN_FRACTION = 0.8
VALID_FRACTION = 0.1

# Split by row position, without shuffling: the three sets are consecutive,
# non-overlapping slices of df.
# NOTE(review): this only yields a past/future split if df is sorted by Date - confirm.
n_rows = len(df)
train_size = int(TRAIN_FRACTION * n_rows)
valid_size = int(VALID_FRACTION * n_rows)
valid_end = train_size + valid_size

train_df = df.iloc[:train_size, :]
valid_df = df.iloc[train_size:valid_end, :]
test_df = df.iloc[valid_end:, :]

# Every row must land in exactly one of the three sets.
assert len(train_df) + len(valid_df) + len(test_df) == n_rows, "split does not cover df"

In [20]:
# Destination file for each split, written in the order listed.
split_outputs = {
    "../datasets/train_dataset.csv": train_df,
    "../datasets/valid_dataset.csv": valid_df,
    "../datasets/test_dataset.csv": test_df,
}

# Write every split to its CSV file with the default to_csv options.
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path)