### Question. First, theory tasks:

#### Which data transformation can prepare the data for linear separation?


In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, QuantileTransformer, PowerTransformer
from sklearn.linear_model import LogisticRegression

from quant.Classification import Classification
from quant.factor import get_factors

# Notebook-wide configuration: module autoreload, inline plotting, and
# pandas/NumPy display formatting used by every cell below.

# (Re)load the autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported project code are picked up without a
# kernel restart.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Show every DataFrame column (no truncation) and format floats with four decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# NumPy arrays: four decimals, and suppress scientific notation for small values.
np.set_printoptions(precision=4, suppress=True)

# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation and convergence warnings from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Source data for this experiment. Judging by the file name this is hourly
# BTCUSDT futures kline data for 2020-01-01 .. 2022-04-30 (TODO confirm).
klines_1h_file = 'data/futures_klines_1h_BTCUSDT_20200101_20220430.zip'

# The first CSV column becomes the index, and pandas is asked to parse that
# index as dates.
kline_1h_df = pd.read_csv(
    klines_1h_file,
    parse_dates=True,
    index_col=0,
)

# Derive the factor table (features plus the 'Sign' target used below) from
# the raw klines via the project helper.
kline_1h_factors = get_factors(kline_1h_df, sign_ratio=1)

In [5]:
# Columns that are not used as model inputs: price/return-derived columns and
# the 'Sign' target itself. Everything else in the factor table is a feature.
_non_feature_cols = ['Price', 'Returns', 'CumReturns', 'Log_Returns', 'Returns_Ratio', 'Sign']

X = kline_1h_factors.drop(columns=_non_feature_cols)
y = kline_1h_factors['Sign']

# Quick sanity check that features and target have the same number of rows.
X.shape, y.shape

((20217, 18), (20217,))

In [6]:
def _fit_and_score(scaler):
    """Fit the logistic-regression pipeline on ``X``/``y`` with one scaler.

    Parameters
    ----------
    scaler
        A preprocessing transformer instance (e.g. ``StandardScaler()``)
        passed straight through to ``Classification.fit_predict``.

    Returns
    -------
    tuple
        ``(model, metrics)``: the ``Classification`` object after
        ``fit_predict`` has been called, and the value returned by
        ``model.eval_metrics()``. The call sites below unpack ``metrics``
        as ``(mse, rmse, r2_train, r2_test)``.
    """
    model = Classification(X, y)
    # Same estimator and seed for every run so that only the scaler differs.
    model.fit_predict(scaler, LogisticRegression(random_state=64))
    return model, model.eval_metrics()


# One run per scaler. The original variable names are kept so that any other
# cell referring to them keeps working.
standard, (standard_mse, standard_rmse, standard_r2train, standard_r2test) = _fit_and_score(StandardScaler())
minmax, (minmax_mse, minmax_rmse, minmax_r2train, minmax_r2test) = _fit_and_score(MinMaxScaler())
maxabs, (maxabs_mse, maxabs_rmse, maxabs_r2train, maxabs_r2test) = _fit_and_score(MaxAbsScaler())
quantile, (quantile_mse, quantile_rmse, quantile_r2train, quantile_r2test) = _fit_and_score(QuantileTransformer())
power, (power_mse, power_rmse, power_r2train, power_r2test) = _fit_and_score(PowerTransformer())

# Build the comparison table in a single constructor call.
# DataFrame.append (used previously, once per row) was deprecated in
# pandas 1.4 and removed in pandas 2.0, and copied the frame on every call.
# NOTE(review): in the recorded output MSE + R2Test equals 1.0000 on every
# row, so the "R2" columns look like classification accuracy rather than a
# coefficient of determination -- confirm against Classification.eval_metrics.
result = pd.DataFrame(
    [
        ['StandardScaler', standard_mse, standard_rmse, standard_r2train, standard_r2test],
        ['MinMaxScaler', minmax_mse, minmax_rmse, minmax_r2train, minmax_r2test],
        ['MaxAbsScaler', maxabs_mse, maxabs_rmse, maxabs_r2train, maxabs_r2test],
        ['QuantileTransformer', quantile_mse, quantile_rmse, quantile_r2train, quantile_r2test],
        ['PowerTransformer', power_mse, power_rmse, power_r2train, power_r2test],
    ],
    columns=['Scaler', 'MSE', 'RMSE', 'R2Train', 'R2Test'],
)

# LaTeX version for the write-up; the bare `result` renders the rich table.
print(result.to_latex(index=False))
result

\begin{tabular}{lrrrr}
\toprule
             Scaler &    MSE &   RMSE &  R2Train &  R2Test \\
\midrule
     StandardScaler & 0.4691 & 0.6849 &   0.5368 &  0.5309 \\
       MinMaxScaler & 0.4718 & 0.6869 &   0.5360 &  0.5282 \\
       MaxAbsScaler & 0.4691 & 0.6849 &   0.5348 &  0.5309 \\
QuantileTransformer & 0.4594 & 0.6778 &   0.5423 &  0.5406 \\
   PowerTransformer & 0.4676 & 0.6838 &   0.5360 &  0.5324 \\
\bottomrule
\end{tabular}



Unnamed: 0,Scaler,MSE,RMSE,R2Train,R2Test
0,StandardScaler,0.4691,0.6849,0.5368,0.5309
1,MinMaxScaler,0.4718,0.6869,0.536,0.5282
2,MaxAbsScaler,0.4691,0.6849,0.5348,0.5309
3,QuantileTransformer,0.4594,0.6778,0.5423,0.5406
4,PowerTransformer,0.4676,0.6838,0.536,0.5324


### Conclusion

From the table, we can see that QuantileTransformer has the lowest MSE (0.4594) and RMSE (0.6778), which means the lowest amount of error, and the highest R2Train (0.5423) and R2Test (0.5406) scores, which indicate that more variability is explained. So it can be concluded that QuantileTransformer is the best scaler for this experiment.