In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
import lightgbm
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from catboost import CatBoostClassifier
import time
import os
from sklearn.linear_model import LogisticRegression

In [2]:
from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:85% !important; }</style>"))
display(HTML("<style>.prompt { min-width:10ex !important; }</style>"))
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', lambda x: '%.6f' % x) 

  from IPython.core.display import display, HTML, clear_output
  from IPython.core.display import display, HTML, clear_output


In [3]:
df_with_feats = pd.read_parquet("../data/train_with_feats_v4.parquet")

In [4]:
df_train, df_val = train_test_split(df_with_feats, test_size=0.2, random_state=42, stratify=df_with_feats.buy_post)

In [5]:
information_value = pd.read_csv('../data/feature_information_value.csv', sep=';')

IV < 0,02 — отсутствует; 

0,02 ≤ IV <0,1 — низкая; 

0,1 ≤ IV < 0,3 — средняя; 


IV ≥ 0,3 — высокая. 


In [6]:
best_iv_feats = information_value[information_value["IV"] >= 0.02].feature.values
len(best_iv_feats)

42

In [7]:
information_value[information_value["IV"] >= 0.02].sort_values(by="IV", ascending=False)

Unnamed: 0,feature,IV
62,receipt_count,0.430457
30,weekends_std,0.24315
69,strange_std,0.237534
47,dom_std,0.16164
1,revenue_sum_count_14d,0.155249
65,revenue_sum_count_28d,0.155249
3,revenue_sum_count_7d,0.155249
35,was_promo_std,0.120205
46,fun_sum,0.105191
88,dow_std,0.097692


### Modeling [gain]

importance by gain

In [8]:
df_train["buy_post"].value_counts()

buy_post
1    287276
0    112724
Name: count, dtype: int64

In [9]:
scale_pos_weight = 112724/ 287276

In [10]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': "binary",
    'num_leaves': 50,
    'max_depth': 20,
    'learning_rate': 0.15,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_child_samples': 50,
    'n_jobs': 4,
    'n_estimators': 300,
    'scale_pos_weight': scale_pos_weight,
    'class_weight': None,
    'colsample_bytree': 0.8,
    'importance_type': 'gain',
    'min_child_samples': 50,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'subsample_for_bin': 200000,
}

In [11]:
current_feats = information_value.feature.values
len(current_feats)

108

In [12]:
lgbm_model = lightgbm.LGBMModel(**params)

In [13]:
lgbm_model.fit(df_train[current_feats], df_train["buy_post"], )

[LightGBM] [Info] Number of positive: 287276, number of negative: 112724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.267005 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22195
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 108
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.718190 -> initscore=0.935501
[LightGBM] [Info] Start training from score 0.935501


In [14]:
y_pred_lgbm = lgbm_model.predict(df_val[current_feats])

In [15]:
def with_cutoff(y_pred, cut_off):
    return [int(x > cut_off) for x in y_pred]

In [16]:
current_cut_off = 0.5

print(classification_report(df_val["buy_post"], with_cutoff(y_pred_lgbm, current_cut_off)))

              precision    recall  f1-score   support

           0       0.43      0.68      0.52     28181
           1       0.84      0.64      0.72     71819

    accuracy                           0.65    100000
   macro avg       0.63      0.66      0.62    100000
weighted avg       0.72      0.65      0.67    100000



In [17]:
f1_score(df_val["buy_post"], with_cutoff(y_pred_lgbm, current_cut_off))

0.7234429482972651

In [18]:
roc_auc_score(df_val["buy_post"], y_pred_lgbm)

0.7246865524565382

In [85]:
def get_feats_importances(model, type_):
    df = pd.DataFrame()
    df["feature"] = model.feature_name_
    df[type_] = model.feature_importances_
    df = df.merge(information_value, on="feature", how="inner")
    return df

In [86]:
imp_gain = get_feats_importances(lgbm_model, "gain")

### Modeling [splits]


importance by splits

In [87]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': "binary",
    'num_leaves': 50,
    'max_depth': 20,
    'learning_rate': 0.15,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_child_samples': 50,
    'n_jobs': 4,
    'n_estimators': 300,
    'scale_pos_weight': scale_pos_weight,
    'class_weight': None,
    'colsample_bytree': 0.8,
    'importance_type': 'split',
    'min_child_samples': 50,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'subsample_for_bin': 200000,
}

In [88]:
lgbm_model = lightgbm.LGBMModel(**params)
lgbm_model.fit(df_train[current_feats], df_train["buy_post"], )
y_pred_lgbm = lgbm_model.predict(df_val[current_feats])

[LightGBM] [Info] Number of positive: 287276, number of negative: 112724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.151100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22195
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 108
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.718190 -> initscore=0.935501
[LightGBM] [Info] Start training from score 0.935501


In [89]:
imp_splits = get_feats_importances(lgbm_model, "split")

In [91]:
df_imp = imp_gain.merge(imp_splits[['feature', 'split']], on='feature', how='left')

### Modeling [permutaion]

premutaion importance

In [90]:
from sklearn.inspection import permutation_importance

In [98]:
lgbm_for_permut = lightgbm.LGBMClassifier(**params)

In [99]:
lgbm_for_permut.fit(df_train[current_feats], df_train["buy_post"])

[LightGBM] [Info] Number of positive: 287276, number of negative: 112724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.143210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22195
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 108
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.718190 -> initscore=0.935501
[LightGBM] [Info] Start training from score 0.935501


In [100]:
r = permutation_importance(lgbm_for_permut, df_val[current_feats], df_val["buy_post"], n_repeats=30,random_state=0, scoring=["f1"])

In [102]:
df_permut = pd.DataFrame()
df_permut["feature"] = lgbm_for_permut.feature_name_
df_permut["permut"] = r["f1"]["importances_mean"]

In [103]:
df_imp = df_imp.merge(df_permut[['feature', 'permut']], on='feature', how='left')

### Feature selection

Отбор фичей - выбрать какой то топ фичей по каждому imporatnce и взять объединение этих признаков

In [196]:
top = 20

In [197]:
gain_feat_top_30 = df_imp.sort_values(by='gain', ascending=False).reset_index()[:top]["feature"].values

In [198]:
split_feat_top_30 = df_imp.sort_values(by='split', ascending=False).reset_index()[:top]["feature"].values

In [199]:
permut_feat_top_30 = df_imp.sort_values(by='permut', ascending=False).reset_index()[:top]["feature"].values

In [200]:
best_feats_by_importance = list(set(split_feat_top_30).union(permut_feat_top_30))
len(best_feats_by_importance)

31

### Modeling

Построение модели классификации lightgbm

In [181]:
def plot_roc_auc_curve(y_true, y_pred):
    from sklearn.metrics import f1_score, roc_auc_score, classification_report, roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    fig = px.area(
        x=fpr, y=tpr,
        title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
        labels=dict(x='False Positive Rate', y='True Positive Rate'),
        width=700, height=500
    )
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.show()

In [173]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': "binary",
    'num_leaves': 50,
    'max_depth': 20,
    'learning_rate': 0.15,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_child_samples': 50,
    'n_jobs': 4,
    'n_estimators': 300,
    'scale_pos_weight': scale_pos_weight,
    'class_weight': None,
    'colsample_bytree': 0.8,
    'importance_type': 'split',
    'min_child_samples': 50,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'subsample_for_bin': 200000,
}

In [174]:
current_feats = best_feats_by_importance

In [175]:
lgbm_model = lightgbm.LGBMModel(**params)
lgbm_model.fit(df_train[current_feats], df_train["buy_post"], )
y_pred_lgbm = lgbm_model.predict(df_val[current_feats])

[LightGBM] [Info] Number of positive: 287276, number of negative: 112724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3254
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.718190 -> initscore=0.935501
[LightGBM] [Info] Start training from score 0.935501


In [176]:
y_pred_lgbm = lgbm_model.predict(df_val[current_feats])

In [185]:
current_cut_off = 0.54

print(classification_report(df_val["buy_post"], with_cutoff(y_pred_lgbm, current_cut_off)))
print(f"f1 score - {f1_score(df_val['buy_post'], with_cutoff(y_pred_lgbm, current_cut_off))}")
print(f"roc_auc score - {roc_auc_score(df_val['buy_post'], y_pred_lgbm)}")

              precision    recall  f1-score   support

           0       0.41      0.76      0.53     28181
           1       0.86      0.56      0.68     71819

    accuracy                           0.62    100000
   macro avg       0.63      0.66      0.60    100000
weighted avg       0.73      0.62      0.64    100000

f1 score - 0.6798915370343941
roc_auc score - 0.7243413687929148


### Test

Тестирование и отправка решения

In [160]:
df_test = pd.read_parquet("../data/test_with_feats_v4.parquet")

In [161]:
y_test_pred = lgbm_model.predict(df_test[current_feats])

In [162]:
df_test["buy_post"] = with_cutoff(y_test_pred, current_cut_off)

In [163]:
df_test[["customer_id", "buy_post"]].describe()

Unnamed: 0,customer_id,buy_post
count,112334.0,112334.0
mean,20748611.321951,0.385582
std,11582578.50584,0.486735
min,52341.0,0.0
25%,11041231.25,0.0
50%,22155898.0,0.0
75%,28861910.5,1.0
max,46668221.0,1.0


In [164]:
len(current_feats)

17

In [166]:
df_test[["customer_id", "buy_post"]].to_csv("top17_feats_by_all_imp_lgbm_55.csv", sep=';', index=False)