## Modules and data preparation

In [21]:
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import *
from sklearn.metrics import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import Binarizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.calibration import calibration_curve
from sklearn.neighbors import KNeighborsClassifier

In [22]:
from pandas import DataFrame

def data_normalization(df: DataFrame, days_range: int=24, include_cur_row: bool=False):
    '''
    Rolling z-score: subtract the rolling mean from every value of df and divide by the rolling std.
    + df: the dataframe to be normalized
    + days_range: the number of rows in the rolling window used to compute the mean and std
    + include_cur_row: True to build the window from the current row plus the (days_range-1) rows
    before it, False to build it only from the (days_range) rows that precede the current row

    Values that come out as inf, -inf or NaN (incomplete window, zero std) are set to 0, and the
    first (days_range) rows are removed from the returned dataframe.
    '''
    # closed='left' keeps the current row out of its own window.
    window_opts = {'closed': 'left'} if include_cur_row == False else {}
    window = df.rolling(days_range, **window_opts)

    centered = df - window.mean()
    scaled = centered / window.std()
    cleaned = scaled.replace([np.inf, -np.inf, np.nan], 0)

    # The leading rows never had a full window of history behind them.
    return cleaned[days_range:]

In [23]:
# Read the feature table once and keep two independent frames:
# `ori_df` is the backup copy, `df` is the working frame used by the cells below.
path = "../../data/norm_features_test.csv"
ori_df = pd.read_csv(path)
df = ori_df.copy()

## KNN

In [35]:
# Uncomment to reset `df` to the backup copy of the loaded data:
# df = ori_df.copy()

In [36]:
# Derive the working frame from the backup copy instead of mutating `df` in place.
# An in-place drop fails with KeyError when this cell is run a second time, because
# the column is already gone; rebuilding from `ori_df` gives the same result on every run.
# "Unnamed: 0" is presumably the row index written out when the CSV was saved - TODO confirm.
df = ori_df.drop(columns=["Unnamed: 0"]).dropna()
df

Unnamed: 0,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,exchange_reserve,...,low_MeanDev48_over_StdDev48,close_MeanDev48_over_StdDev48,open_MeanDev96_over_StdDev96,high_MeanDev96_over_StdDev96,low_MeanDev96_over_StdDev96,close_MeanDev96_over_StdDev96,open_MeanDev192_over_StdDev192,high_MeanDev192_over_StdDev192,low_MeanDev192_over_StdDev192,close_MeanDev192_over_StdDev192
0,0.018495,0.009785,0.018383,0.004281,1180.825933,-0.241505,11017.0,612.021981,0.000000,0.312954,...,0.370477,0.661152,-1.950690,-1.881119,-1.871933,-1.950690,0.593925,0.969559,-0.596205,0.593925
1,0.004956,-0.008991,-0.004450,-0.003945,693.787209,-0.732426,8059.0,343.998248,0.000000,0.159984,...,0.480687,0.630321,-1.811761,-1.745879,-1.770050,-1.811761,0.371924,0.791939,-1.087814,0.371924
2,-0.003946,0.004507,-0.000473,-0.003816,639.808801,-0.784803,6949.0,297.590606,0.000000,-0.065807,...,0.389880,0.548217,-1.678097,-1.709976,-1.709619,-1.678097,-0.010608,0.631068,-1.541996,-0.010608
3,-0.003369,-0.005212,0.002062,0.006863,463.446578,-0.958557,5328.0,232.254343,0.000000,-0.314973,...,0.350033,0.460780,-1.660865,-1.701925,-1.692418,-1.660865,-0.052136,0.782304,-1.830356,-0.052136
4,0.006419,0.004713,0.002860,-0.002752,634.129456,-0.787283,7406.0,318.341325,0.000000,-0.576336,...,0.247121,0.354412,-1.587490,-1.630426,-1.655253,-1.587490,-0.142859,0.554341,-1.989684,-0.142859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38443,-0.003626,-0.002886,-0.009204,-0.008336,2613.774410,-0.703874,42193.0,1245.791750,-0.887751,-1.027353,...,0.894451,1.038517,-0.844173,-0.114303,-1.152432,-0.844173,-1.239710,-1.730291,-2.099255,-1.239710
38444,-0.008337,-0.009908,-0.006327,-0.006609,4195.854000,-0.346556,58827.0,1911.811000,-0.560005,-0.958007,...,0.435777,0.910978,-0.640086,-0.109049,-1.067451,-0.640086,-2.261169,-2.252223,-2.587937,-2.261169
38445,-0.006609,-0.000843,-0.001635,-0.008575,3271.915890,-0.552267,53095.0,1526.045120,-0.747297,-0.851685,...,0.309717,0.506155,-0.569034,0.065389,-0.977107,-0.569034,-2.692674,-2.448695,-2.436019,-2.692674
38446,-0.008575,-0.009336,-0.002849,0.002702,3588.324010,-0.482464,53154.0,1650.636910,-0.678239,-0.840105,...,0.400968,0.594548,-0.490384,0.124920,-0.895412,-0.490384,-2.263128,-2.371402,-2.230504,-2.263128


In [37]:
# k-nearest-neighbours classifier; k is the number of neighbours consulted per prediction.
N_NEIGHBORS = 10
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

In [38]:
# Target is the "label" column; every remaining column is used as a feature.
y = df["label"]
X = df.drop(columns="label")

In [39]:
# Preliminary order-preserving split (no shuffling): the trailing 15% of rows become
# the test set, then the trailing 20% of what is left becomes the validation set.
# The X_train produced here supplies the statistics for the feature scaling that follows.
split_opts = {"shuffle": False}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, **split_opts)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, **split_opts)

In [40]:
# Per-feature scale: the 0.9 quantile of |value| over the training rows only,
# plus a small epsilon so a column whose quantile is 0 does not divide by zero.
feature_scale = X_train.abs().quantile(0.9) + 1e-6
X = X / feature_scale

In [41]:
# Store every feature column as 64-bit float.
X = X.astype(np.float64)

In [42]:
# Order-preserving split (no shuffling) of the current X and y: the trailing 15% of rows
# become the test set, then the trailing 20% of the remainder becomes the validation set.
split_opts = {"shuffle": False}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, **split_opts)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, **split_opts)

In [43]:
# Fit on the training partition; the fitted estimator is the cell's displayed value.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(n_neighbors=10)

In [None]:
# Predict labels for the held-out test partition.
y_pred = knn.predict(X=X_test)

In [None]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)