## Process different datasets into NTC format
Since the purpose of this research is to compare the results of a model trained on different features, the dataset is now separated into several feature subsets, each of which is converted to NTC format.


In [1]:
import pandas as pd
import numpy as np

# Read the cleaned dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    "clean_all.csv",
    index_col=0,
)
# Bare trailing expression so the notebook renders the frame.
data

FileNotFoundError: [Errno 2] File b'clean_all.csv' does not exist: b'clean_all.csv'

In [10]:
# Show the column labels of the loaded frame (used below to pick feature subsets).
data.columns

Index(['timestamp', 'unix', 'close', 'volume', 'trades', 'volatility',
       'return', 'EWM72', 'EWM12', 'MACD', 'CMF', 'RSI72', 'RSI12',
       'tweets_vol', 'reddit_vol', 'reddit_score', 'tweets_replies',
       'tweets_retweets', 'tweets_likes', 't_compound', 't_CumPositive',
       't_CumNegative', 't_CumCompound', 'r_compound', 'r_CumPositive',
       'r_CumNegative', 'r_CumCompound'],
      dtype='object')

In [11]:
# Feature subsets to compare. "close" is always listed first because
# process_data_NTC uses column 0 as the prediction target.
#
# "t_compound" and "r_compound" are included here so that this cell selects
# exactly the same columns as the later selection cells in this notebook;
# otherwise the seq_len = 5 dataset would differ from the others in its
# features as well as in its window length.
univeriate = data[["close"]]
hist_all = data[[
    "close", "volume", "volatility", "trades", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
]]
twitter = data[[
    "close", "tweets_vol", "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
]]
reddit = data[[
    "close", "reddit_vol", "reddit_score",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]
all_feat = data[[
    "close", "volume", "trades", "volatility", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
    "tweets_vol", "reddit_vol", "reddit_score",
    "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]

In [12]:
def process_data_NTC(data, seq_len = 1, val_split = 0.15, test_split = 0.00001, pred_steps = 1):
    """Min-max scale `data`, cut it into sliding windows and split into train/val/test.

    Parameters
    ----------
    data : 2-D array-like, shape (n_rows, n_features)
        Input series, one row per time step. Column 0 is the prediction target.
    seq_len : int
        Number of consecutive rows in each input window.
    val_split, test_split : float
        Fractions of the samples assigned to validation and test. The training
        fraction is ``1 - (val_split + test_split)``. Splits are contiguous and
        ordered train -> val -> test; nothing is shuffled.
    pred_steps : int
        Offset used to align targets with windows (see the note in the body).

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test : numpy.ndarray
        Each ``X_*`` has shape (n_samples, seq_len, n_features) -- presumably
        the "NTC" (samples, time steps, channels) layout -- and each ``y_*``
        has shape (n_samples,). ``X_*`` and ``y_*`` of the same split always
        have the same length, including when a split is empty.

    Raises
    ------
    ValueError
        If ``seq_len`` or ``pred_steps`` is < 1, if the split fractions are
        negative or sum to more than 1, or if `data` has too few rows to
        produce a single sample.
    """
    # Kept as a function-local import so this cell does not depend on a
    # separate import cell having been run first.
    from sklearn.preprocessing import MinMaxScaler

    if seq_len < 1 or pred_steps < 1:
        raise ValueError("seq_len and pred_steps must both be >= 1")
    if val_split < 0 or test_split < 0 or val_split + test_split > 1:
        raise ValueError("val_split and test_split must be >= 0 and sum to at most 1")

    train_split = 1 - (val_split + test_split)

    # NOTE(review): the scaler is fitted on the whole series *before* the
    # split, so validation/test extremes influence the scaled training data.
    # Left unchanged so that previously generated datasets stay reproducible.
    norm = MinMaxScaler().fit_transform(data)

    # NOTE(review): the target of the window starting at row i is row
    # i + seq_len + pred_steps, i.e. pred_steps + 1 rows after the window's
    # last row. Left unchanged; confirm this offset is intended.
    y = norm[seq_len + pred_steps:, 0]
    X = norm[:-pred_steps]

    n_samples = len(y)
    if n_samples == 0:
        raise ValueError(
            "data has too few rows for seq_len={} and pred_steps={}".format(seq_len, pred_steps)
        )

    VAL_INDEX = int(n_samples * train_split)
    TEST_INDEX = int(n_samples * (train_split + val_split))

    # Build every window in one gather instead of growing three arrays with
    # np.concatenate inside a loop (which re-copies the data on every step).
    # row_idx[i] holds the row numbers i, i + 1, ..., i + seq_len - 1.
    row_idx = np.arange(n_samples)[:, None] + np.arange(seq_len)[None, :]
    windows = X[row_idx]

    # Slicing windows and targets with the same indices keeps each X/y pair
    # aligned and equally long.
    X_train, y_train = windows[:VAL_INDEX], y[:VAL_INDEX]
    X_val, y_val = windows[VAL_INDEX:TEST_INDEX], y[VAL_INDEX:TEST_INDEX]
    X_test, y_test = windows[TEST_INDEX:], y[TEST_INDEX:]

    return X_train, y_train, X_val, y_val, X_test, y_test


# Convert every feature subset to windows of 5 time steps with a
# 70 / 10 / 20 train / val / test split. Each name is rebound from a
# DataFrame to the 6-tuple returned by process_data_NTC, so the selection
# cell has to be re-run before this one can be executed again.
univeriate, hist_all, twitter, reddit, all_feat = (
    process_data_NTC(frame, seq_len=5, val_split=0.1, test_split=0.2)
    for frame in (univeriate, hist_all, twitter, reddit, all_feat)
)

In [13]:
# Bundle the processed datasets in a fixed order:
# univariate, historical, twitter, reddit, all features.
data_ = [univeriate, hist_all, twitter, reddit, all_feat]

# The splits have different shapes, so they cannot form a regular ndarray.
# Passing the nested list straight to np.savez relies on implicit ragged ->
# object-array conversion, which raises ValueError on NumPy >= 1.24. Build the
# (datasets, splits) object array explicitly instead.
# NOTE(review): this is the layout the implicit conversion is expected to have
# produced on older NumPy -- confirm against the code that loads the file.
packed = np.empty((len(data_), len(data_[0])), dtype=object)
for row, splits in enumerate(data_):
    for col, split in enumerate(splits):
        packed[row, col] = split

# Written to "len_5.npz" under the default key "arr_0". Object arrays are
# pickled, so the file must be read with np.load(..., allow_pickle=True).
np.savez("len_5", packed)

In [14]:
# Re-select the raw feature subsets: the names were rebound to processed
# tuples by the previous process_data_NTC calls. "close" stays first because
# process_data_NTC uses column 0 as the target.
univeriate = data[["close"]]
hist_all = data[[
    "close", "volume", "volatility", "trades", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
]]
twitter = data[[
    "close", "tweets_vol", "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
]]
reddit = data[[
    "close", "reddit_vol", "reddit_score",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]
all_feat = data[[
    "close", "volume", "trades", "volatility", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
    "tweets_vol", "reddit_vol", "reddit_score",
    "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]

In [15]:
# Convert every feature subset to windows of 10 time steps with a
# 70 / 10 / 20 train / val / test split. Each name is rebound from a
# DataFrame to the 6-tuple returned by process_data_NTC.
univeriate = process_data_NTC(univeriate, seq_len = 10, val_split = 0.1, test_split = 0.2)
hist_all = process_data_NTC(hist_all, seq_len = 10, val_split = 0.1, test_split = 0.2)
twitter = process_data_NTC(twitter, seq_len = 10, val_split = 0.1, test_split = 0.2)
reddit = process_data_NTC(reddit, seq_len = 10, val_split = 0.1, test_split = 0.2)
all_feat = process_data_NTC(all_feat, seq_len = 10, val_split = 0.1, test_split = 0.2)

data_ = [univeriate, hist_all, twitter, reddit, all_feat]

# The splits have different shapes, so they cannot form a regular ndarray.
# Passing the nested list straight to np.savez relies on implicit ragged ->
# object-array conversion, which raises ValueError on NumPy >= 1.24. Build the
# (datasets, splits) object array explicitly instead.
# NOTE(review): this is the layout the implicit conversion is expected to have
# produced on older NumPy -- confirm against the code that loads the file.
packed = np.empty((len(data_), len(data_[0])), dtype=object)
for row, splits in enumerate(data_):
    for col, split in enumerate(splits):
        packed[row, col] = split

# Written to "len_10.npz" under the default key "arr_0". Object arrays are
# pickled, so the file must be read with np.load(..., allow_pickle=True).
np.savez("len_10", packed)

In [16]:
# Re-select the raw feature subsets: the names were rebound to processed
# tuples by the previous process_data_NTC calls. "close" stays first because
# process_data_NTC uses column 0 as the target.
univeriate = data[["close"]]
hist_all = data[[
    "close", "volume", "volatility", "trades", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
]]
twitter = data[[
    "close", "tweets_vol", "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
]]
reddit = data[[
    "close", "reddit_vol", "reddit_score",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]
all_feat = data[[
    "close", "volume", "trades", "volatility", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
    "tweets_vol", "reddit_vol", "reddit_score",
    "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]

In [17]:
# Convert every feature subset to windows of 15 time steps with a
# 70 / 10 / 20 train / val / test split. Each name is rebound from a
# DataFrame to the 6-tuple returned by process_data_NTC.
univeriate = process_data_NTC(univeriate, seq_len = 15, val_split = 0.1, test_split = 0.2)
hist_all = process_data_NTC(hist_all, seq_len = 15, val_split = 0.1, test_split = 0.2)
twitter = process_data_NTC(twitter, seq_len = 15, val_split = 0.1, test_split = 0.2)
reddit = process_data_NTC(reddit, seq_len = 15, val_split = 0.1, test_split = 0.2)
all_feat = process_data_NTC(all_feat, seq_len = 15, val_split = 0.1, test_split = 0.2)

data_ = [univeriate, hist_all, twitter, reddit, all_feat]

# The splits have different shapes, so they cannot form a regular ndarray.
# Passing the nested list straight to np.savez relies on implicit ragged ->
# object-array conversion, which raises ValueError on NumPy >= 1.24. Build the
# (datasets, splits) object array explicitly instead.
# NOTE(review): this is the layout the implicit conversion is expected to have
# produced on older NumPy -- confirm against the code that loads the file.
packed = np.empty((len(data_), len(data_[0])), dtype=object)
for row, splits in enumerate(data_):
    for col, split in enumerate(splits):
        packed[row, col] = split

# Written to "len_15.npz" under the default key "arr_0". Object arrays are
# pickled, so the file must be read with np.load(..., allow_pickle=True).
np.savez("len_15", packed)

In [18]:
# Re-select the raw feature subsets: the names were rebound to processed
# tuples by the previous process_data_NTC calls. "close" stays first because
# process_data_NTC uses column 0 as the target.
univeriate = data[["close"]]
hist_all = data[[
    "close", "volume", "volatility", "trades", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
]]
twitter = data[[
    "close", "tweets_vol", "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
]]
reddit = data[[
    "close", "reddit_vol", "reddit_score",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]
all_feat = data[[
    "close", "volume", "trades", "volatility", "return",
    "EWM72", "EWM12", "MACD", "CMF", "RSI72", "RSI12",
    "tweets_vol", "reddit_vol", "reddit_score",
    "tweets_replies", "tweets_retweets", "tweets_likes",
    "t_compound", "t_CumPositive", "t_CumNegative", "t_CumCompound",
    "r_compound", "r_CumPositive", "r_CumNegative", "r_CumCompound",
]]

In [19]:
# Convert every feature subset to windows of 20 time steps with a
# 70 / 10 / 20 train / val / test split. Each name is rebound from a
# DataFrame to the 6-tuple returned by process_data_NTC.
univeriate = process_data_NTC(univeriate, seq_len = 20, val_split = 0.1, test_split = 0.2)
hist_all = process_data_NTC(hist_all, seq_len = 20, val_split = 0.1, test_split = 0.2)
twitter = process_data_NTC(twitter, seq_len = 20, val_split = 0.1, test_split = 0.2)
reddit = process_data_NTC(reddit, seq_len = 20, val_split = 0.1, test_split = 0.2)
all_feat = process_data_NTC(all_feat, seq_len = 20, val_split = 0.1, test_split = 0.2)

data_ = [univeriate, hist_all, twitter, reddit, all_feat]

# The splits have different shapes, so they cannot form a regular ndarray.
# Passing the nested list straight to np.savez relies on implicit ragged ->
# object-array conversion, which raises ValueError on NumPy >= 1.24. Build the
# (datasets, splits) object array explicitly instead.
# NOTE(review): this is the layout the implicit conversion is expected to have
# produced on older NumPy -- confirm against the code that loads the file.
packed = np.empty((len(data_), len(data_[0])), dtype=object)
for row, splits in enumerate(data_):
    for col, split in enumerate(splits):
        packed[row, col] = split

# Written to "len_20.npz" under the default key "arr_0". Object arrays are
# pickled, so the file must be read with np.load(..., allow_pickle=True).
np.savez("len_20", packed)