In [655]:
import pandas as pd
import numpy as np

In [656]:
import xgboost as xgb
import lightgbm as gbm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [657]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix, make_scorer

from functools import partial

In [658]:
train = pd.read_parquet('../data/processed/train.parquet')
test = pd.read_parquet('../data/processed/test.parquet')

In [659]:
metric = partial(f1_score, average='macro')
score = make_scorer(f1_score, average='macro')

In [660]:
X, y = train.loc[:, ~train.columns.isin(['revenue', "id"])], train.revenue
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)

In [661]:
def evaluate_model(clf, metric, score, data):
    """Evaluates a model and returns a summary for visualize. [just internal use]
    """
    X_train, X_test, y_train, y_test = data
    f1_cross = cross_val_score(clf, X_train, y_train, cv=5, scoring=score)
    clf.fit(X_train, y_train)
    return {
        "model": clf.__class__.__name__, 
        "test": metric(y_test, clf.predict(X_test)), 
        "train": metric(y_train, clf.predict(X_train)),
        "f1_cross_mean": np.mean(f1_cross),
        "f1_cross_std": np.std(f1_cross)
    }

In [662]:
lgb_params = {
    'n_estimators':300,
    'boosting_type': 'dart',
    'objective': 'binary',
    'learning_rate': 0.1, 
    'max_depth' : 8,
    'num_leaves': 64,
#     'is_unbalance':True, # is_unbalance vs scale_pos_weight
     ## see https://towardsdatascience.com/understanding-lightgbm-parameters-and-how-to-tune-them-6764e20c6e5b
    'scale_pos_weight': y_train.sum() / (y_train-1).abs().sum()
#     'max_bin': 512,
#     'subsample_for_bin': 200,
#     'subsample': 1,
#     'subsample_freq': 1,
#     'colsample_bytree': 0.8,
#     'reg_alpha': 5,
#     'reg_lambda': 10,
#     'min_split_gain': 1,
#     'min_child_weight': 1,
#     'min_child_samples': 5,
}

In [663]:
# I've tested out other models but I select LightGBM
models = [
    gbm.LGBMClassifier(**lgb_params,),
    
#     xgb.XGBClassifier(
#         n_estimators=400
#     ), 
#     RandomForestClassifier(),
#     LogisticRegression(),
#     AdaBoostClassifier()
]

In [664]:
results = [evaluate_model(model, metric, score, (X_train, X_val, y_train, y_val)) for model in models]

In [665]:
pd.DataFrame.from_dict(results)

Unnamed: 0,model,test,train,f1_cross_mean,f1_cross_std
0,LGBMClassifier,0.759424,0.935723,0.759777,0.008417


In [666]:
models[0].get_params()

{'boosting_type': 'dart',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 300,
 'n_jobs': -1,
 'num_leaves': 64,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'scale_pos_weight': 0.18265524625267665}

## Model evaluation
I planned to use LightGBM. 

In [637]:
# Create parameters to search
# these hypermarateres were obtained from 004 notebook with optuna.
# Hence, I let it to have a reproducible workflow.

grid_params = {
    'learning_rate': [0.08],
    'boosting_type' : ['dart'],
    'n_estimators': [300],
    'lambda_l1': [0.001, 0.02],
    'lambda_l2': [1e-3, 0.1, 5],
#     'max_depth': [17,23],
    'num_leaves': [23, 75],
#     "random_state": [42],
#     'random_state' : [501], # Updated from 'seed'
#     'colsample_bytree' : [0.65, 0.66],
#     'subsample' : [0.60, 0.75],
#     'reg_alpha' : [1,1.2],
#     'reg_lambda' : [1,1.2,1.4],
}



In [638]:
clf = gbm.LGBMClassifier(**lgb_params)

In [None]:
# Create the grid
grid = GridSearchCV(
    clf, 
    grid_params,
    cv=StratifiedKFold(2),
    n_jobs=-1,
    scoring=score,
    verbose=1
)

# Run the grid
# I setted fit_params for lightgbm, if needed, remove it (in case of changing the model)
grid.fit(
    X_train, y_train, 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    early_stopping_rounds=15,
    eval_metric=[lambda y_true, y_pred: ("F1_MACRO", metric(y_true, np.round(y_pred)), True)],
    feature_name=X_train.columns.tolist(),
    categorical_feature=["Month", "TrafficType", "Browser", "SpecialDay", "OperatingSystems", "Region", "VisitorType", "Weekend"]
)
# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [641]:
print(classification_report(y_test, grid.best_estimator_.predict(X_test)))
print(confusion_matrix(y_test, grid.best_estimator_.predict(X_test)))
print(metric(y_test, grid.best_estimator_.predict(X_test)))
print(metric(y_train, grid.best_estimator_.predict(X_train)))


              precision    recall  f1-score   support

           0       0.94      0.93      0.94      1460
           1       0.65      0.66      0.65       267

    accuracy                           0.89      1727
   macro avg       0.79      0.79      0.79      1727
weighted avg       0.89      0.89      0.89      1727

[[1364   96]
 [  92  175]]
0.793042870765575
0.8960253976445436


In [680]:
### for submition, parameters from optuna + grid-search
### dictionary update: {**dict1, **dict2}, or with dict.update(dict2)

clf = gbm.LGBMClassifier(**{**lgb_params, **grid.best_params_})

In [672]:
clf.fit(
    pd.concat([X_train, X_val], axis=0), pd.concat([y_train,y_val], axis=0),
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    early_stopping_rounds=15,
    eval_metric=[lambda y_true, y_pred: ("F1_MACRO", metric(y_true, np.round(y_pred)), True)],
    feature_name=X_train.columns.tolist(),
    categorical_feature=["Month", "TrafficType", "Browser", "SpecialDay", "OperatingSystems", "Region", "VisitorType", "Weekend"]
)

[1]	valid_0's binary_logloss: 0.414235	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.413747	valid_1's F1_MACRO: 0.458219
[2]	valid_0's binary_logloss: 0.40034	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.3999	valid_1's F1_MACRO: 0.458219
[3]	valid_0's binary_logloss: 0.388542	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.38766	valid_1's F1_MACRO: 0.458219
[4]	valid_0's binary_logloss: 0.378102	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.376341	valid_1's F1_MACRO: 0.458219
[5]	valid_0's binary_logloss: 0.368981	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.366515	valid_1's F1_MACRO: 0.458219
[6]	valid_0's binary_logloss: 0.360855	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.357981	valid_1's F1_MACRO: 0.458219
[7]	valid_0's binary_logloss: 0.353361	valid_0's F1_MACRO: 0.458158	valid_1's binary_logloss: 0.349727	valid_1's F1_MACRO: 0.458219
[8]	valid_0's binary_logloss: 0.356504	valid_0's F1_MACRO: 0.458158	valid_1's bi

LGBMClassifier(boosting_type='dart', lambda_l1=0.02, lambda_l2=5,
               learning_rate=0.08, max_depth=8, n_estimators=300, num_leaves=75,
               objective='binary', scale_pos_weight=0.18265524625267665)

In [673]:
print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

print(metric(y_train, clf.predict(X_train)))
print(metric(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4670
           1       0.86      0.78      0.82       853

    accuracy                           0.95      5523
   macro avg       0.91      0.88      0.89      5523
weighted avg       0.95      0.95      0.95      5523

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1460
           1       0.68      0.65      0.67       267

    accuracy                           0.90      1727
   macro avg       0.81      0.80      0.80      1727
weighted avg       0.90      0.90      0.90      1727

0.8946028736852122
0.8036607548885857


In [674]:
test_pred = test.loc[:, ~test.columns.isin(['revenue', "id"])]
submit = clf.predict(test_pred)

In [675]:
from datetime import datetime
get_date = lambda : datetime.now().strftime("%Y%m%d_%H%M%S")

In [676]:
date_part = get_date()
# save results
pd.DataFrame({"id":test.id, "revenue":submit}).to_csv(f'../data/results/{date_part}_lgb.csv', index=False)

# just if necessary, persist the parameters 
pd.DataFrame([clf.get_params()]).to_csv(f'../data/results/{date_part}_params.csv')

__________________________
## Model Interpretability
- *TODO*

In [None]:
clf.feature_importances_

In [None]:
gbm.plot_importance(clf, max_num_features=10)