In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, precision_recall_curve, roc_auc_score
from xgboost import XGBClassifier

from util import train_test_split, gen_dataset
from sample_weights import uniqueness_matrix, sample_weights, time_decay
from fractionally_differentiated_features import get_diff_factor, frac_diff_ffd
from financial_data_structure import cusum

In [None]:
def get_best_f1(y_test, y_pred):
    """Return the best achievable F1 score and the threshold that attains it.

    Parameters
    ----------
    y_test : array-like
        True binary labels, forwarded to ``precision_recall_curve``.
    y_pred : array-like
        Scores / probabilities for the positive class.

    Returns
    -------
    tuple
        ``(best_f1, best_threshold)`` where ``best_threshold`` is the entry of
        the curve's threshold array at which F1 is largest (first one on ties).
    """
    precision, recall, threshold = precision_recall_curve(y_test, y_pred)
    # The curve returns one more precision/recall point than thresholds; the
    # extra trailing point has no threshold, so it must not be a candidate.
    n_thresholds = len(threshold)
    precision = np.asarray(precision, dtype=float)[:n_thresholds]
    recall = np.asarray(recall, dtype=float)[:n_thresholds]
    denominator = precision + recall
    # Define F1 as 0 where precision + recall is 0 (or undefined) instead of
    # producing NaN from 0/0; this keeps argmax well-defined on every input.
    f1_values = np.zeros_like(denominator)
    np.divide(2 * precision * recall, denominator, out=f1_values, where=denominator > 0)
    best = int(np.argmax(f1_values))
    return f1_values[best], threshold[best]

In [None]:
def _read_indexed(path):
    """Read a CSV using its first column as the index, parsed as dates."""
    return pd.read_csv(path, index_col=0, parse_dates=True)


# Price data and the three label tables share the same loading options.
data, side, long_size, short_size = map(
    _read_indexed,
    (
        'data/BTCSPOT_300.csv',
        'data/side_labeling.csv',
        'data/long_labeling.csv',
        'data/short_labeling.csv',
    ),
)

# Drop price rows that come after the last labelled timestamp.
last_labelled = side.index[-1]
data = data.loc[:last_labelled]

dataset = gen_dataset(data)
# Default split; the sweep cell later rebinds index_ls per configuration.
index_ls = train_test_split(dataset, 90, 7, 1)

In [2]:
# Window configurations to compare. Each tuple is unpacked positionally into
# train_test_split(dataset, *window) by the sweep loop.
# NOTE(review): presumably (train span, test span, step) — confirm against util.train_test_split.
_window_grid = (
    (365, 30, 1),
    (180, 14, 1),
    (90, 7, 1),
    (30, 3, 1),
    (7, 1, 1),
)
space = [{'window': window} for window in _window_grid]

In [None]:
# Sweep every window configuration in `space`: for each one, fit the side
# classifier on each train split, tune a decision threshold, and record the
# test metrics of that split.
METRIC_NAMES = ['recall', 'precision', 'f1', 'auc', 'threshold']
# Only the first few splits of each configuration are evaluated.
# NOTE(review): looks like a speed/debug cap — confirm before reporting results.
MAX_WINDOWS = 5

result = []
for config in space:
    index_ls = train_test_split(dataset, *config['window'])
    # scores / long_idx are rebound per configuration, so after the sweep they
    # describe the LAST entry of `space` only; later cells read them as such.
    scores, long_idx = {}, []
    for train_idx, test_idx in tqdm(index_ls[:MAX_WINDOWS]):
        X_train, X_test = dataset.loc[train_idx], dataset.loc[test_idx]
        y_train, y_test = side.loc[X_train.index, 'bin'], side.loc[X_test.index, 'bin']
        model = XGBClassifier(eval_metric='logloss', seed=1014)
        model.fit(X_train, y_train)
        # The threshold is chosen on the training predictions.
        # NOTE(review): in-sample tuning may be optimistic — consider a validation slice.
        y_pred_train = model.predict_proba(X_train)[:, 1]
        _f1, threshold = get_best_f1(y_train, y_pred_train)
        y_pred_test = model.predict_proba(X_test)[:, 1]
        # Compute the thresholded decision once and reuse it for every metric.
        above_threshold = y_pred_test >= threshold
        recall = recall_score(y_test, above_threshold)
        precision = precision_score(y_test, above_threshold)
        f1 = f1_score(y_test, above_threshold)
        # ROC AUC is undefined when the test window holds a single class;
        # record NaN (ignored by .mean()) instead of aborting the whole sweep.
        auc = roc_auc_score(y_test, y_pred_test) if y_test.nunique() > 1 else np.nan
        scores[test_idx[-1]] = (recall, precision, f1, auc, threshold)
        long_idx.append(y_test.index[above_threshold])
    result.append(pd.DataFrame(scores, index=METRIC_NAMES).T)

In [None]:
# Mean of every metric per window configuration: one row per entry of `space`,
# labelled with its window tuple, so the values can be told apart (a plain
# concat of the mean Series repeats the metric labels with no config key).
pd.DataFrame(
    {str(config['window']): frame.mean() for config, frame in zip(space, result)}
).T

In [None]:
# Per-window test metrics of the side model. `scores` is what the sweep cell
# left behind, i.e. the metrics of the last configuration it evaluated.
# A dedicated name is used so the `result` list built by the sweep is not
# overwritten and the summary cell stays re-runnable.
side_scores = pd.DataFrame(scores).T
side_scores.columns = ['recall', 'precision', 'f1', 'auc', 'threshold']

fig, ax = plt.subplots(figsize=(24, 8))
for metric in side_scores.columns:
    # Label each line explicitly: labels handed positionally to legend() are
    # matched to artists by draw order, which is easy to get wrong.
    sns.lineplot(x=side_scores.index, y=side_scores[metric], label=metric, ax=ax)
ax.set(xlabel='test window end', ylabel='value', title='Side model: per-window test metrics')
ax.legend()
plt.show()
side_scores.mean()

In [None]:
# Second-stage evaluation: for each window, train on the full train split with
# the long-size labels and score only the test rows that the side model
# selected (long_idx). index_ls and long_idx both come from the last
# configuration evaluated by the sweep cell.
scores = {}
# zip stops at the shorter of the two lists; materialise the pairs so the
# progress bar reports the number of windows actually processed.
windows = list(zip(index_ls, long_idx))
for (train_idx, full_test_idx), test_idx in tqdm(windows):
    if len(test_idx) == 0:
        # No row was selected in this window: nothing to score. Record a zero
        # trade ratio under the window's own last timestamp.
        scores[full_test_idx[-1]] = (0.0, np.nan)
        continue
    X_train, X_test = dataset.loc[train_idx], dataset.loc[test_idx]
    y_train, y_test = long_size.loc[X_train.index, 'bin'], long_size.loc[X_test.index, 'bin']
    model = XGBClassifier(eval_metric='logloss')
    model.fit(X_train, y_train)
    y_pred_test = model.predict_proba(X_test)[:, 1]
    # ROC AUC is undefined when only one class is present among the selected rows.
    auc = roc_auc_score(y_test, y_pred_test) if y_test.nunique() > 1 else np.nan
    # trade_ratio = share of the window's test rows that were selected.
    scores[test_idx[-1]] = (len(test_idx) / len(full_test_idx), auc)

In [None]:
# Per-window trade ratio and AUC of the long-size model, taken from `scores`.
result = pd.DataFrame(scores).T
result.columns = ['trade_ratio', 'auc']

fig, ax = plt.subplots(figsize=(24, 8))
for metric in result.columns:
    # Label each line explicitly: labels handed positionally to legend() are
    # matched to artists by draw order, which is easy to get wrong.
    sns.lineplot(x=result.index, y=result[metric], label=metric, ax=ax)
ax.set(xlabel='window timestamp', ylabel='value', title='Long-size model: trade ratio and AUC per window')
ax.legend()
plt.show()