## Modules and data preparation

In [1]:
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import *
from sklearn.metrics import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import Binarizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.calibration import calibration_curve

In [2]:
from pandas import DataFrame

def data_normalization(df: DataFrame, days_range: int=24, include_cur_row: bool=False):
    '''
    Return the normalized data:
    + df: the dataframe to be normalized
    + range: the number of previous rows (or including the current row) to be considered in the normalization
    + include_cur_row: True if we consider the current row in the normalization process (calculate mean and std
    using the current row and (range-1) previous rows), False if we want to use all the passed data for normalization 
    processing ((calculate mean and std using (range) previous rows))
    '''
    
    df_roll = None

    if include_cur_row == False:
        df_roll = df.rolling(days_range, closed='left')
    else:
        df_roll = df.rolling(days_range)
        
    res_df = (df - df_roll.mean()) / df_roll.std()
    res_df.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
    return res_df[days_range:] #

In [3]:
path = "../../data/test_df_with_cryptoquant_data.csv" 
df = pd.read_csv(path) 
ori_df = df.copy()

## Ridge Regression

In [4]:
df.ffill(inplace=True)
df["diff"] = (df["close"] - df["close"].shift(1)).shift(-1)
df.drop(columns=["Unnamed: 0", "time_open", "label"], inplace=True)
df.dropna(inplace=True)
df

Unnamed: 0,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,exchange_reserve,...,coinbase_premium_index_WMA96,coinbase_premium_index_EMA96,coinbase_premium_index_MeanDev96,coinbase_premium_index_StdDev96,exchange_to_exchange_flow_SMA96,exchange_to_exchange_flow_WMA96,exchange_to_exchange_flow_EMA96,exchange_to_exchange_flow_MeanDev96,exchange_to_exchange_flow_StdDev96,diff
0,12650.00,12828.00,12459.00,12542.42,466.369907,5.880672e+06,5346.0,254.863724,3.214652e+06,1.357020e+09,...,2.527943,2.553510,1.519123,1.918975,1210.662282,1203.763811,1125.168402,612.771940,1242.314372,255.04
1,12544.99,12831.31,12493.66,12797.46,377.641478,4.789354e+06,4652.0,235.156423,2.983848e+06,1.391056e+09,...,2.489823,2.506743,1.518544,1.918193,1208.799321,1195.387326,1118.554678,613.858667,1242.788011,-245.45
2,12797.44,12955.00,12512.00,12552.01,355.128100,4.522712e+06,4523.0,181.167520,2.309072e+06,1.373858e+09,...,2.471935,2.481189,1.517938,1.917799,1215.700789,1197.734794,1122.762860,612.060951,1241.560873,-32.37
3,12552.01,12700.00,12425.98,12519.64,345.358570,4.332844e+06,4427.0,150.026889,1.881453e+06,1.450609e+09,...,2.453826,2.455958,1.519845,1.918824,1214.496966,1201.820761,1128.765073,610.380616,1241.312091,369.24
4,12519.64,12888.88,12483.65,12888.88,328.946751,4.176715e+06,4895.0,163.014102,2.070963e+06,1.445915e+09,...,2.442354,2.437831,1.519098,1.918500,1212.431534,1192.744942,1121.456921,611.628481,1241.877698,101.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39089,20972.91,21003.70,20741.03,20897.00,2945.616500,6.135372e+07,41992.0,1567.759890,3.265308e+07,1.278101e+10,...,-0.115172,-0.118030,0.029279,0.042228,452.041358,419.537049,491.800255,458.145798,1044.554324,-173.48
39090,20897.00,20943.17,20551.00,20723.52,2613.774410,5.412753e+07,42193.0,1245.791750,2.580072e+07,1.261532e+10,...,-0.115141,-0.118159,0.029238,0.042224,451.726412,412.244009,483.687444,458.283587,1044.656362,-136.51
39091,20723.51,20736.68,20421.38,20587.01,4195.854000,8.638902e+07,58827.0,1911.811000,3.937325e+07,1.258804e+10,...,-0.114620,-0.117796,0.029200,0.042245,447.378679,403.618116,474.402560,460.185721,1045.519557,-175.77
39092,20587.01,20719.21,20388.01,20411.24,3271.915890,6.732365e+07,53095.0,1526.045120,3.140632e+07,1.268025e+10,...,-0.114529,-0.117861,0.029174,0.042244,444.463703,397.208931,467.436182,461.521762,1045.991941,55.23


In [5]:
ridge = Ridge(alpha=0.1)

In [6]:
X = df.drop(columns=["diff"])
y = df["diff"]

In [7]:
# X = data_normalization(X)
# y = data_normalization(y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)

In [9]:
ridge.fit(X_train, y_train)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Ridge(alpha=0.1)

In [10]:
ridge.predict(X_test)

array([460.07329777, 384.09104656, 442.56641454, ..., 268.92790551,
       411.6064772 , 277.60960918])

In [11]:
mean_absolute_percentage_error(ridge.predict(X_train), y_train)

4.2453427149157354

In [12]:
mean_absolute_percentage_error(ridge.predict(X_test), y_test)

2.812078454717799