In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

def addlag(data, col, maxlag):
    """Return a copy of ``data`` with lagged versions of ``col`` appended.

    For each lag ``k`` in ``1..maxlag`` a column named ``col + "_L" + str(k)``
    is added, holding ``data[col]`` shifted down by ``k`` rows (the first
    ``k`` entries of that column are therefore NaN). The input frame is left
    untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to lag.
    col : str
        Name of the column to lag.
    maxlag : int
        Largest lag to generate; values below 1 add no columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the lag columns,
        in increasing lag order.
    """
    # Build every lag column first (dicts keep insertion order, so the
    # columns end up ordered L1, L2, ...), then attach them in one call.
    lag_columns = {
        col + "_L" + str(lag): data[col].shift(lag)
        for lag in range(1, maxlag + 1)
    }
    return data.copy().assign(**lag_columns)

def read_and_clean_data(filename, maxlag=4):
    """Load a raw price/sentiment CSV and build the modelling table.

    The CSV must contain the columns ``time``, ``pos``, ``neg``, ``dom``,
    ``labels``, ``total`` and ``XRP_price``, ``BTC_price``, ``SPY_price``,
    ``VXX_price``, ``XAU_price``.

    Steps:
      1. ``pospercent`` = ``pos / total``.
      2. For BTC, SPY, VXX and XAU: the per-row percentage change of the
         price plus ``maxlag`` lags of that change.
      3. ``change``: the XRP percentage change shifted one row *up*, i.e. the
         change that occurs in the following row, plus ``maxlag`` lags of it
         (``change_L1`` is therefore the change of the current row).
      4. Raw price columns, ``time`` and the raw sentiment columns are
         dropped, rows containing any NaN are removed and the index is reset.
      5. ``change`` is moved to the last column.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    maxlag : int, default 4
        Number of lags generated for every change column.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with ``change`` as the last column.
    """
    df = pd.read_csv(filename, parse_dates=['time'])
    df["pospercent"] = df["pos"] / df["total"]

    # Percentage changes (and their lags) of the explanatory assets. The
    # tuple order fixes the column order of the output.
    for asset in ("BTC", "SPY", "VXX", "XAU"):
        change_col = asset + "_change"
        df[change_col] = df[asset + "_price"].pct_change()
        df = addlag(df, change_col, maxlag)

    # shift(-1) pulls the next row's XRP change onto the current row.
    df["change"] = df["XRP_price"].pct_change().shift(-1)
    df = addlag(df, "change", maxlag)

    # NOTE(review): only the un-lagged VXX_change is dropped here while its
    # lags are kept; the other assets keep their un-lagged change. Confirm
    # this asymmetry is intentional.
    df = df.drop(
        columns=['time', 'XRP_price', 'BTC_price', 'SPY_price', 'VXX_price',
                 'XAU_price', 'VXX_change', 'labels', 'pos', 'neg', 'dom']
    )
    df = df.dropna().reset_index(drop=True)

    # Cast only after dropna(): astype(int) raises on NaN, and any row with a
    # missing ``total`` has a NaN ``pospercent`` and is removed above anyway.
    df["total"] = df["total"].astype(int)

    new_cols = [col for col in df.columns if col != 'change'] + ['change']
    return df[new_cols]

def split_and_save(filename):
    """Clean ``filename`` and write ordered train/valid/test CSV splits.

    The cleaned frame is split without shuffling, so row order is kept: the
    first 70% of rows form the training pool and the remaining 30% the test
    set; the training pool is then split 80/20 into train and validation.
    The three parts are written to ``data/<granularity>_train.csv``,
    ``data/<granularity>_valid.csv`` and ``data/<granularity>_test.csv``,
    where ``<granularity>`` is ``day``, ``hour`` or ``minute`` depending on
    which of those words occurs in ``filename``.

    Parameters
    ----------
    filename : str
        Path of the raw CSV; must contain "day", "hour" or "minute".

    Raises
    ------
    ValueError
        If ``filename`` contains none of "day", "hour", "minute".
    """
    # Resolve the output prefix before doing any work so that a bad filename
    # fails immediately with a clear message instead of an unbound-variable
    # error after loading and splitting. Probing in this order keeps the
    # precedence of the previous if-chain, where the last match won.
    add = next(
        (name for name in ("minute", "hour", "day") if name in filename),
        None,
    )
    if add is None:
        raise ValueError(
            "Cannot infer granularity from filename " + repr(filename)
            + "; expected it to contain 'day', 'hour' or 'minute'."
        )

    df = read_and_clean_data(filename)
    train, test = train_test_split(df, train_size = 0.7, shuffle = False)
    train, valid = train_test_split(train, train_size = 0.8, shuffle = False)
    train = train.reset_index(drop = True)
    valid = valid.reset_index(drop = True)
    test = test.reset_index(drop = True)
    print("Train: ", train.shape, " Valid: ", valid.shape, "Test: ", test.shape)

    train.to_csv("data/" + add + "_train.csv", index = False)
    valid.to_csv("data/" + add + "_valid.csv", index = False)
    test.to_csv("data/" + add + "_test.csv", index = False)

## Day

In [2]:
# Writes data/day_train.csv, data/day_valid.csv and data/day_test.csv.
split_and_save("data/full_day.csv")

Train:  (27, 26)  Valid:  (7, 26) Test:  (15, 26)


## Hour

In [3]:
# Writes data/hour_train.csv, data/hour_valid.csv and data/hour_test.csv.
split_and_save("data/full_hour.csv")

Train:  (688, 26)  Valid:  (173, 26) Test:  (370, 26)


## Minute

In [4]:
# Writes data/minute_train.csv, data/minute_valid.csv and data/minute_test.csv.
split_and_save("data/full_minute.csv")

Train:  (5137, 26)  Valid:  (1285, 26) Test:  (2753, 26)
