In [55]:
import pandas as pd
import numpy as np

In [56]:
import xgboost as xgb
import lightgbm as gbm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [57]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix, make_scorer

from functools import partial

In [58]:
train = pd.read_parquet('../data/processed/train.parquet')
test = pd.read_parquet('../data/processed/test.parquet')

In [59]:
metric = partial(f1_score, average='macro')
score = make_scorer(f1_score, average='macro')

In [60]:
X, y = train.loc[:, ~train.columns.isin(['revenue', "id"])], train.revenue

from imblearn.over_sampling import SMOTE
resampler = SMOTE(random_state=0)
X, y = resampler.fit_resample(X, y)

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)

In [62]:
def evaluate_model(clf, metric, score, data):
    """Evaluates a model and returns a summary for visualize. [just internal use]
    """
    X_train, X_test, y_train, y_test = data
    f1_cross = cross_val_score(clf, X_train, y_train, cv=5, scoring=score)
    clf.fit(X_train, y_train)
    return {
        "model": clf.__class__.__name__, 
        "test": metric(y_test, clf.predict(X_test)), 
        "train": metric(y_train, clf.predict(X_train)),
        "f1_cross_mean": np.mean(f1_cross),
        "f1_cross_std": np.std(f1_cross)
    }

In [68]:
lgb_params = {
    'n_estimators':450,
    'boosting_type': 'dart',
    'objective': 'binary',
    'learning_rate': 0.1, 
    'max_depth' : 8,
    'num_leaves': 64,
#     'is_unbalance':True, # is_unbalance vs scale_pos_weight
     ## see https://towardsdatascience.com/understanding-lightgbm-parameters-and-how-to-tune-them-6764e20c6e5b
    'scale_pos_weight': y_train.sum() / (y_train-1).abs().sum()
#     'max_bin': 512,
#     'subsample_for_bin': 200,
#     'subsample': 1,
#     'subsample_freq': 1,
#     'colsample_bytree': 0.8,
#     'reg_alpha': 5,
#     'reg_lambda': 10,
#     'min_split_gain': 1,
#     'min_child_weight': 1,
#     'min_child_samples': 5,
}

In [69]:
# I've tested out other models but I select LightGBM
models = [
    gbm.LGBMClassifier(**lgb_params,),
    
#     xgb.XGBClassifier(
#         n_estimators=400
#     ), 
#     RandomForestClassifier(),
#     LogisticRegression(),
#     AdaBoostClassifier()
]

In [70]:
results = [evaluate_model(model, metric, score, (X_train, X_val, y_train, y_val)) for model in models]

In [78]:
pd.DataFrame.from_dict(results)

Unnamed: 0,model,test,train,f1_cross_mean,f1_cross_std
0,LGBMClassifier,0.945201,0.999786,0.940361,0.004936


In [79]:
models[0].get_params()

{'boosting_type': 'dart',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 450,
 'n_jobs': -1,
 'num_leaves': 64,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'scale_pos_weight': 1.0}

## Model evaluation
I planned to use LightGBM. 

In [74]:
# Create parameters to search
# these hypermarateres were obtained from 004 notebook with optuna.
# Hence, I let it to have a reproducible workflow.

grid_params = {
    'learning_rate': [0.08],
    'boosting_type' : ['dart', 'goss'],
    'n_estimators': [300, 350, 400],
    'lambda_l1': [0.001, 0.02],
    'lambda_l2': [1e-3, 0.1, 5],
    'max_depth': [8,16,25],
    'num_leaves': [23, 75],
#     "random_state": [42],
#     'random_state' : [501], # Updated from 'seed'
#     'colsample_bytree' : [0.65, 0.66],
#     'subsample' : [0.60, 0.75],
#     'reg_alpha' : [1,1.2],
#     'reg_lambda' : [1,1.2,1.4],
}



In [75]:
clf = gbm.LGBMClassifier(**lgb_params)

In [76]:
# Create the grid
grid = GridSearchCV(
    clf, 
    grid_params,
    cv=StratifiedKFold(2),
    n_jobs=-1,
    scoring=score,
    verbose=1
)

# Run the grid
# I setted fit_params for lightgbm, if needed, remove it (in case of changing the model)
grid.fit(
    X_train, y_train, 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    early_stopping_rounds=15,
    eval_metric=[lambda y_true, y_pred: ("F1_MACRO", metric(y_true, np.round(y_pred)), True)],
    feature_name=X_train.columns.tolist(),
    categorical_feature=["Month", "TrafficType", "Browser", "SpecialDay", "OperatingSystems", "Region", "VisitorType", "Weekend"]
)
# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 2 folds for each of 216 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 54.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 70.2min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed: 75.1min finished
New categorical_feature is ['Browser', 'Month', 'OperatingSystems', 'Region', 'SpecialDay', 'TrafficType', 'VisitorType', 'Weekend']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	training's binary_logloss: 0.635641	training's F1_MACRO: 0.919366	valid_1's binary_logloss: 0.638542	valid_1's F1_MACRO: 0.900676
[2]	training's binary_logloss: 0.586434	training's F1_MACRO: 0.922885	valid_1's binary_logloss: 0.591989	valid_1's F1_MACRO: 0.901939
[3]	training's binary_logloss: 0.544246	training's F1_MACRO: 0.921189	valid_1's binary_logloss: 0.552087	valid_1's F1_MACRO: 0.903669
[4]	training's binary_logloss: 0.507006	training's F1_MACRO: 0.925466	valid_1's binary_logloss: 0.516518	valid_1's F1_MACRO: 0.905377
[5]	training's binary_logloss: 0.474269	training's F1_MACRO: 0.926529	valid_1's binary_logloss: 0.485696	valid_1's F1_MACRO: 0.90879
[6]	training's binary_logloss: 0.445218	training's F1_MACRO: 0.927172	valid_1's binary_logloss: 0.4589	valid_1's F1_MACRO: 0.907942
[7]	training's binary_logloss: 0.419825	training's F1_MACRO: 0.926965	valid_1's binary_logloss: 0.435716	valid_1's F1_MACRO: 0.91094
[8]	training's binary_logloss: 0.430496	training's F1_MACRO: 0.926

In [80]:
print(classification_report(y_test, grid.best_estimator_.predict(X_test)))
print(confusion_matrix(y_test, grid.best_estimator_.predict(X_test)))
print(metric(y_test, grid.best_estimator_.predict(X_test)))
print(metric(y_train, grid.best_estimator_.predict(X_train)))


              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1460
           1       0.92      0.94      0.93      1460

    accuracy                           0.93      2920
   macro avg       0.93      0.93      0.92      2920
weighted avg       0.93      0.93      0.92      2920

[[1334  126]
 [  93 1367]]
0.9249904197043369
1.0


In [81]:
### for submition, parameters from optuna + grid-search
### dictionary update: {**dict1, **dict2}, or with dict.update(dict2)

clf = gbm.LGBMClassifier(**{**lgb_params, **grid.best_params_})

In [82]:
clf.fit(
    pd.concat([X_train, X_val], axis=0), pd.concat([y_train,y_val], axis=0),
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    early_stopping_rounds=15,
    eval_metric=[lambda y_true, y_pred: ("F1_MACRO", metric(y_true, np.round(y_pred)), True)],
    feature_name=X_train.columns.tolist(),
    categorical_feature=["Month", "TrafficType", "Browser", "SpecialDay", "OperatingSystems", "Region", "VisitorType", "Weekend"]
)

New categorical_feature is ['Browser', 'Month', 'OperatingSystems', 'Region', 'SpecialDay', 'TrafficType', 'VisitorType', 'Weekend']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's binary_logloss: 0.635865	valid_0's F1_MACRO: 0.920661	valid_1's binary_logloss: 0.635656	valid_1's F1_MACRO: 0.922517
[2]	valid_0's binary_logloss: 0.586946	valid_0's F1_MACRO: 0.923969	valid_1's binary_logloss: 0.586462	valid_1's F1_MACRO: 0.926369
[3]	valid_0's binary_logloss: 0.544638	valid_0's F1_MACRO: 0.925037	valid_1's binary_logloss: 0.544116	valid_1's F1_MACRO: 0.928938
[4]	valid_0's binary_logloss: 0.508093	valid_0's F1_MACRO: 0.923965	valid_1's binary_logloss: 0.507206	valid_1's F1_MACRO: 0.92851




[5]	valid_0's binary_logloss: 0.475195	valid_0's F1_MACRO: 0.925787	valid_1's binary_logloss: 0.474625	valid_1's F1_MACRO: 0.931077
[6]	valid_0's binary_logloss: 0.446472	valid_0's F1_MACRO: 0.925681	valid_1's binary_logloss: 0.445382	valid_1's F1_MACRO: 0.933217
[7]	valid_0's binary_logloss: 0.420769	valid_0's F1_MACRO: 0.926218	valid_1's binary_logloss: 0.419531	valid_1's F1_MACRO: 0.933218
[8]	valid_0's binary_logloss: 0.431452	valid_0's F1_MACRO: 0.926434	valid_1's binary_logloss: 0.430369	valid_1's F1_MACRO: 0.931934
[9]	valid_0's binary_logloss: 0.407337	valid_0's F1_MACRO: 0.928252	valid_1's binary_logloss: 0.406617	valid_1's F1_MACRO: 0.934929
[10]	valid_0's binary_logloss: 0.385884	valid_0's F1_MACRO: 0.929109	valid_1's binary_logloss: 0.385392	valid_1's F1_MACRO: 0.93707
[11]	valid_0's binary_logloss: 0.366712	valid_0's F1_MACRO: 0.92911	valid_1's binary_logloss: 0.366572	valid_1's F1_MACRO: 0.935358
[12]	valid_0's binary_logloss: 0.373865	valid_0's F1_MACRO: 0.928039	valid_1

LGBMClassifier(boosting_type='dart', lambda_l1=0.02, lambda_l2=0.001,
               learning_rate=0.08, max_depth=25, n_estimators=400,
               num_leaves=75, objective='binary', scale_pos_weight=1.0)

In [83]:
print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

print(metric(y_train, clf.predict(X_train)))
print(metric(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4670
           1       1.00      1.00      1.00      4670

    accuracy                           1.00      9340
   macro avg       1.00      1.00      1.00      9340
weighted avg       1.00      1.00      1.00      9340

              precision    recall  f1-score   support

           0       0.94      0.91      0.93      1460
           1       0.92      0.94      0.93      1460

    accuracy                           0.93      2920
   macro avg       0.93      0.93      0.93      2920
weighted avg       0.93      0.93      0.93      2920

0.9992505351171239
0.926358039049946


In [84]:
test_pred = test.loc[:, ~test.columns.isin(['revenue', "id"])]
submit = clf.predict(test_pred)

In [85]:
from datetime import datetime
get_date = lambda : datetime.now().strftime("%Y%m%d_%H%M%S")

In [86]:
date_part = get_date()
# save results
pd.DataFrame({"id":test.id, "revenue":submit}).to_csv(f'../data/results/{date_part}_lgb.csv', index=False)

# just if necessary, persist the parameters 
pd.DataFrame([clf.get_params()]).to_csv(f'../data/results/{date_part}_params.csv')

In [87]:
clf._Booster.save_model(f'../data/results/{date_part}_model.txt')

<lightgbm.basic.Booster at 0x7fab2013d990>

__________________________
## Model Interpretability
- *TODO*

In [None]:
clf.feature_importances_

In [None]:
gbm.plot_importance(clf, max_num_features=10)