In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from problem import get_train_data
from problem import get_test_data
from catboost import CatBoost, CatBoostClassifier

In [12]:
def compute_rolling_std(X_df, feature, time_window, center=False):
    """
    Add the rolling standard deviation of a feature as a new column.

    The new column is named ``"<feature>_<time_window>_std"``. NaNs in the
    rolling result are filled forward, then backward.

    Parameters
    ----------
    X_df : dataframe
        dataframe holding `feature`; it is modified in place
    feature : str
        feature in the dataframe we wish to compute the rolling std from
    time_window : str
        string that defines the length of the time window passed to `rolling`
        (also used verbatim in the new column's name)
    center : bool
        boolean to indicate if the point of the dataframe considered is
        center or end of the window

    Returns
    -------
    X_df : dataframe
        the same object that was passed in, with the extra column added
    """
    name = "_".join([feature, time_window, "std"])
    source_dtype = X_df[feature].dtype

    rolled = X_df[feature].rolling(time_window, center=center).std()
    rolled = rolled.ffill().bfill()

    # Keep the feature's float precision (e.g. float32 stays float32), but
    # never cast the statistic to a non-float dtype: for an integer feature
    # that would truncate the standard deviation to whole numbers.
    if pd.api.types.is_float_dtype(source_dtype):
        rolled = rolled.astype(source_dtype)

    X_df[name] = rolled
    return X_df

def compute_rolling_mean(X_df, feature, time_window, center=False):
    """
    Add the rolling mean of a feature as a new column.

    The new column is named ``"<feature>_<time_window>_mean"``. NaNs in the
    rolling result are filled forward, then backward.

    Parameters
    ----------
    X_df : dataframe
        dataframe holding `feature`; it is modified in place
    feature : str
        feature in the dataframe we wish to compute the rolling mean from
    time_window : str
        string that defines the length of the time window passed to `rolling`
        (also used verbatim in the new column's name)
    center : bool
        boolean to indicate if the point of the dataframe considered is
        center or end of the window

    Returns
    -------
    X_df : dataframe
        the same object that was passed in, with the extra column added
    """
    name = "_".join([feature, time_window, "mean"])
    source_dtype = X_df[feature].dtype

    rolled = X_df[feature].rolling(time_window, center=center).mean()
    rolled = rolled.ffill().bfill()

    # Keep the feature's float precision (e.g. float32 stays float32), but
    # never cast the statistic to a non-float dtype: for an integer feature
    # that would truncate the mean to whole numbers.
    if pd.api.types.is_float_dtype(source_dtype):
        rolled = rolled.astype(source_dtype)

    X_df[name] = rolled
    return X_df

def clip_column(X_df, column, min, max):
    """
    Clip the values of one column to the range [min, max].

    The column is overwritten on `X_df` itself, and that same dataframe
    object is returned.
    """
    bounded = X_df[column].clip(lower=min, upper=max)
    X_df[column] = bounded
    return X_df

In [13]:
def transform1(X):
    """
    Clip three columns to fixed ranges, then append rolling std/mean features.

    'Beta', 'Np_nl' and 'Np' are clipped first, so the rolling statistics of
    'Beta' are computed on the clipped values. For each time window and each
    of 'B', 'Beta', 'RmsBob' and 'Vx', a rolling std column and then a rolling
    mean column are added through the helper functions.

    Parameters
    ----------
    X : dataframe
        input frame; the helpers write their results onto the frame they
        receive

    Returns
    -------
    X : dataframe
        the frame returned by the last helper call
    """
    # (column, lower bound, upper bound)
    clip_bounds = (
        ('Beta', 0, 250),
        ('Np_nl', 0, 100),
        ('Np', 0, 500),
    )
    # The iteration order below fixes the order of the generated columns.
    time_windows = ("2h", "1h", "6h", "12h")
    rolling_features = ("B", 'Beta', 'RmsBob', 'Vx')

    for column, lower, upper in clip_bounds:
        X = clip_column(X, column, lower, upper)

    for time_window in time_windows:
        for feature in rolling_features:
            X = compute_rolling_std(X, feature, time_window)
            X = compute_rolling_mean(X, feature, time_window)
    return X

In [14]:
# Load the train and test splits through the project's `problem` helpers.
X, y = get_train_data()
Xt, yt = get_test_data()
# Apply the same feature engineering to each split. transform1 is called on
# each frame separately, so rolling windows never span both splits.
X = transform1(X)
Xt = transform1(Xt)

In [15]:
# Base CatBoost classifier. depth, l2_leaf_reg and learning_rate are not set
# here; they come from the grid search run in a later cell.
cat = CatBoostClassifier(
    iterations=1000,
    loss_function='Logloss',
    custom_metric=['F1', 'Recall', 'Accuracy'],
    #auto_class_weights='SqrtBalanced',
    #use_best_model=True,
    verbose=1000,
    task_type='CPU'
)

# Disabled: direct fit on the training frame with the test split as eval set.
# cat.fit(
#     X,
#     y=y,
#     verbose=50,
#     eval_set=(Xt, yt),
#     plot=True
# )

In [16]:
# import shap
# explainer = shap.TreeExplainer(cat)
# shap_values = explainer.shap_values(X)
# shap.summary_plot(shap_values, X.values, plot_type='bar', class_names=cat.classes_, feature_names=X.columns)

In [17]:
# Hyper-parameter grid for the search: one learning rate, three depths and
# four L2 regularisation strengths (1 x 3 x 4 = 12 combinations).
params = dict(
    learning_rate=[0.1],
    depth=[3, 5, 7],
    l2_leaf_reg=[3, 5, 7, 9],
)

In [18]:
# Run CatBoost's built-in grid search over `params` on the training data and
# keep its return value in `gsr`.
# NOTE(review): shuffle=False / stratified=False presumably keep the samples
# in their original (time) order when splitting — confirm against the
# CatBoost grid_search documentation, along with how cv=5 and train_size=0.8
# interact.
gsr = cat.grid_search(
    params,
    X=X,
    y=y,
    cv=5,
    train_size=0.8,
    shuffle=False,
    stratified=False
)

0:	learn: 0.5821902	test: 0.5591280	best: 0.5591280 (0)	total: 26.8ms	remaining: 26.8s
999:	learn: 0.1323861	test: 0.0599135	best: 0.0598760 (964)	total: 14.4s	remaining: 0us

bestTest = 0.05987599335
bestIteration = 964

0:	loss: 0.0598760	best: 0.0598760 (0)	total: 14.6s	remaining: 2m 40s
0:	learn: 0.5821902	test: 0.5591280	best: 0.5591280 (0)	total: 15.7ms	remaining: 15.7s
999:	learn: 0.1322690	test: 0.0608241	best: 0.0604362 (729)	total: 14.2s	remaining: 0us

bestTest = 0.0604361597
bestIteration = 729

1:	loss: 0.0604362	best: 0.0598760 (0)	total: 28.8s	remaining: 2m 24s
0:	learn: 0.5823119	test: 0.5592733	best: 0.5592733 (0)	total: 17.4ms	remaining: 17.3s
999:	learn: 0.1321747	test: 0.0601872	best: 0.0598868 (868)	total: 14.3s	remaining: 0us

bestTest = 0.05988680617
bestIteration = 868

2:	loss: 0.0598868	best: 0.0598760 (0)	total: 43.1s	remaining: 2m 9s
0:	learn: 0.5823124	test: 0.5592739	best: 0.5592739 (0)	total: 14.3ms	remaining: 14.3s
999:	learn: 0.1320288	test: 0.0609298	b

In [19]:
# Inspect the estimator's parameters after the grid search: the output lists
# depth, l2_leaf_reg and learning_rate, which were not passed to the
# constructor.
cat.get_params()

{'iterations': 1000,
 'loss_function': 'Logloss',
 'verbose': 1000,
 'custom_metric': ['F1', 'Recall', 'Accuracy'],
 'task_type': 'CPU',
 'depth': 7,
 'l2_leaf_reg': 9,
 'learning_rate': 0.1}

In [20]:
# from catboost import cv, Pool
# cv_dataset = Pool(data=X, label=y)
# parameters = {
#     'iterations': 300,
#     'loss_function': 'Logloss',
#     'verbose': 1000,
#     'custom_metric': ['F1', 'Recall', 'Accuracy'],
#     #'task_type': 'GPU',
#     'depth': 7,
#     'l2_leaf_reg': 7,
#     'learning_rate': 0.1,
#     'allow_writing_files': False
# }
# scores = cv(cv_dataset,
#             parameters,
#             fold_count=10,
#             type='TimeSeries',
#             plot=True)