# Solution

## Data loading

In [8]:
import pandas as pd

# Let wide frames render up to 100 columns instead of being truncated.
pd.set_option('display.max_columns', 100)

# Explicit dtypes for the feature columns that are read from the CSV:
# five columns are loaded as pandas categoricals, two as unsigned 8-bit ints.
dtypes = dict(
    delivery_type='category',
    addr_region_reg='category',
    addr_region_fact='category',
    channel_name='category',
    channel_name_2='category',
    sas_limit_after_003_amt='uint8',
    sas_limit_last_amt='uint8',
)

# Only the typed feature columns plus the row id and the label are read.
cols_to_use = [*dtypes, 'card_id', 'target']

# `card_id` becomes the index, so it is not part of the feature columns.
train = pd.read_csv(
    'data/train.csv',
    usecols=cols_to_use,
    dtype=dtypes,
    index_col='card_id',
)
train.head()

Unnamed: 0_level_0,target,delivery_type,addr_region_reg,addr_region_fact,channel_name,channel_name_2,sas_limit_after_003_amt,sas_limit_last_amt
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cid_10620,1,cat_1,107,107,cat_0,cat_3,1,1
cid_105724,0,cat_1,9,9,cat_2,cat_5,3,3
cid_101410,1,cat_1,109,109,cat_0,cat_3,1,1
cid_38961,0,cat_1,66,66,cat_0,cat_3,3,3
cid_57462,0,cat_1,16,16,cat_0,cat_3,0,0


## Feature extraction

## Learning

In [2]:
# Separate the label from the features without touching `train` itself:
# `y` is an independent copy of the 'target' column, `X` is every other column.
y = train['target'].copy()
X = train.drop(columns='target')

In [3]:
from sklearn import model_selection

# Split features and labels into a fitting part and a validation part.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state pins the shuffle so the split is reproducible.
split_parts = model_selection.train_test_split(X, y, random_state=42)
X_fit, X_val, y_fit, y_val = split_parts

In [4]:
import lightgbm as lgb

# Wrap both partitions as LightGBM datasets; `reference=fit` ties the
# validation set to the fitting set it is evaluated against.
fit = lgb.Dataset(X_fit, y_fit)
val = lgb.Dataset(X_val, y_val, reference=fit)

# The `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
# `lgb.train` were deprecated in LightGBM 3.3 and removed in 4.0, where they
# raise TypeError. The callbacks below are the equivalent, supported API.
# `log_evaluation` was named `print_evaluation` before 3.3, so fall back to
# the old name when the new one is missing.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

model = lgb.train(
    params={
        'learning_rate': .01,
        'objective': 'binary',
        'metric': 'auc',
        'boost_from_average': True,
        'min_data_in_leaf': 20,
        'max_bin': 255,
        'num_leaves': 31,
        'min_data_per_group': 100,
        'cat_smooth': 10,
        'seed': 42
    },
    train_set=fit,
    # Upper bound only: early stopping is expected to end training sooner.
    num_boost_round=10_000,
    # The fitting set is listed too so its AUC is logged next to the
    # validation AUC.
    valid_sets=[fit, val],
    valid_names=['fit', 'val'],
    callbacks=[
        # Stop once the validation metric has not improved for 20 rounds.
        lgb.early_stopping(stopping_rounds=20),
        # Print the evaluation metrics every 100 boosting rounds.
        log_evaluation(period=100),
    ],
)

Training until validation scores don't improve for 20 rounds
[100]	fit's auc: 0.676013	val's auc: 0.666181
[200]	fit's auc: 0.680191	val's auc: 0.667645
[300]	fit's auc: 0.683536	val's auc: 0.668348
[400]	fit's auc: 0.686088	val's auc: 0.668576
Early stopping, best iteration is:
[455]	fit's auc: 0.687095	val's auc: 0.668707


In [5]:
import numpy as np
from sklearn import metrics

# Score the held-out validation rows with the trained model.
y_pred = model.predict(X_val)

def roc_auc_score_at_K(predicted_proba, target, rate=0.1):
    """ROC AUC computed on the top-scoring fraction of the samples only.

    Samples are ordered by descending predicted score, the first
    ``int(rate * n)`` of them are kept, and ``metrics.roc_auc_score`` is
    evaluated on that subset.

    Parameters
    ----------
    predicted_proba : array-like of shape (n,)
        Predicted scores, one per sample.
    target : array-like of shape (n,)
        True labels, aligned with ``predicted_proba`` by position. A pandas
        Series is accepted; its index is ignored.
    rate : float, default 0.1
        Fraction of the highest-scored samples to keep.

    Returns
    -------
    float
        Whatever ``metrics.roc_auc_score`` returns for the kept subset.

    Raises
    ------
    ValueError
        If the two inputs have different lengths. ``metrics.roc_auc_score``
        may raise on its own as well, e.g. for an empty or single-class
        subset.
    """
    # Work on plain arrays so that `[...]` is always positional. Indexing a
    # pandas Series with an integer array is label-based when the index is
    # integer, which silently picks the wrong rows after a shuffle, and the
    # positional fallback for other index types is deprecated.
    scores = np.asarray(predicted_proba)
    labels = np.asarray(target)
    if len(scores) != len(labels):
        raise ValueError(
            f'predicted_proba and target must have the same length, '
            f'got {len(scores)} and {len(labels)}'
        )

    order = np.argsort(-scores)  # positions sorted by descending score
    top_k = int(rate * len(scores))
    keep = order[:top_k]
    return metrics.roc_auc_score(labels[keep], scores[keep])

# Evaluate the validation predictions with the top-fraction AUC helper,
# leaving `rate` at its default, and display the resulting score.
val_score = roc_auc_score_at_K(predicted_proba=y_pred, target=y_val)
val_score

0.5104378263670544

In [6]:
# Gain-based feature importance, truncated to integers for readability.
importances = model.feature_importance(importance_type='gain').astype(int)

# Label each value with its feature name and list the largest first.
gain_by_feature = pd.Series(importances, index=X_fit.columns)
print(gain_by_feature.sort_values(ascending=False))

channel_name               154835
addr_region_fact            81983
channel_name_2              50571
addr_region_reg             44278
sas_limit_last_amt          29202
sas_limit_after_003_amt     11774
delivery_type               11005
dtype: int64


In [7]:
from pathlib import Path

# The validation score is embedded in the file name (4 decimal places) so
# that saved models from different runs can be told apart.
model_path = f'models/model_{val_score:.4f}.lgb'

# Create the output directory first so the cell also works on a fresh
# checkout where `models/` does not exist yet.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

model.save_model(model_path)

<lightgbm.basic.Booster at 0x7fa8fbc19d50>