# Solution

## Data loading

In [6]:
import pandas as pd

# Let wide frames show up to 100 columns when displayed.
pd.options.display.max_columns = 100

# Parser dtype for every feature column that is read from the CSV.
# Two further candidates are currently not loaded:
# inquiry_recent_period and inquiry_1_week (both tried as 'uint').
dtypes = dict(
    delivery_type='category',
    addr_region_reg='category',
    addr_region_fact='category',
    channel_name='category',
    channel_name_2='category',
    sas_limit_after_003_amt='uint8',
    sas_limit_last_amt='uint8',
    channel_name_modified_2018='category',
    clnt_education_name='category',
    clnt_marital_status_name='category',
    clnt_employment_type_name='category',
    clnt_speciality_sphere_name='category',
    clnt_sex_name='category',
    prt_name='category',
    feature_0='category',
)

# Besides the typed features, read the row identifier and the label.
cols_to_use = [*dtypes, 'card_id', 'target']

train = pd.read_csv(
    'data/train.csv',
    usecols=cols_to_use,
    dtype=dtypes,
    index_col='card_id',
)
train.head()

Unnamed: 0_level_0,target,delivery_type,addr_region_reg,addr_region_fact,channel_name,channel_name_2,channel_name_modified_2018,sas_limit_after_003_amt,sas_limit_last_amt,clnt_education_name,clnt_marital_status_name,clnt_employment_type_name,clnt_speciality_sphere_name,clnt_sex_name,prt_name,feature_0
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
cid_10620,1,cat_1,107,107,cat_0,cat_3,cat_0,1,1,cat_1,cat_4,cat_0,cat_0,cat_0,cat_9,cat_7
cid_105724,0,cat_1,9,9,cat_2,cat_5,cat_2,3,3,cat_1,cat_2,cat_3,cat_26,cat_1,cat_8,cat_9
cid_101410,1,cat_1,109,109,cat_0,cat_3,cat_0,1,1,cat_1,cat_2,cat_3,cat_15,cat_0,cat_9,cat_10
cid_38961,0,cat_1,66,66,cat_0,cat_3,cat_0,3,3,cat_1,cat_2,cat_3,cat_14,cat_0,cat_9,cat_7
cid_57462,0,cat_1,16,16,cat_0,cat_3,cat_0,0,0,cat_4,cat_2,cat_3,cat_24,cat_0,cat_9,cat_13


## Feature extraction

## Learning

In [7]:
# Separate the label column from the features; `train` itself stays untouched.
y = train['target'].copy()
X = train.drop(columns='target')

In [8]:
from sklearn import model_selection

# Hold out a validation part using the library's default split size;
# the fixed seed keeps the split identical between runs.
X_fit, X_val, y_fit, y_val = model_selection.train_test_split(X, y, random_state=42)

In [9]:
import lightgbm as lgb

# Wrap both parts as LightGBM datasets. The validation set is created with
# `reference=fit`, as LightGBM expects for validation data.
fit = lgb.Dataset(X_fit, y_fit)
val = lgb.Dataset(X_val, y_val, reference=fit)

# Train a binary classifier tracked by AUC on both parts. Up to 10,000 rounds
# are allowed; training stops once the validation score has not improved for
# 20 rounds, and the scores are logged every 100 rounds.
model = lgb.train(
    params={
        'learning_rate': .01,
        'objective': 'binary',
        'metric': 'auc',
        'boost_from_average': True,
        'min_data_in_leaf': 20,
        'max_bin': 255,
        'num_leaves': 31,
        'min_data_per_group': 100,
        'cat_smooth': 10,
        'seed': 42
    },
    train_set=fit,
    num_boost_round=10_000,
    valid_sets=[fit, val],
    valid_names=['fit', 'val'],
    # The `early_stopping_rounds=` / `verbose_eval=` keywords were removed from
    # `lgb.train` in LightGBM 4.0; the callbacks below (LightGBM >= 3.3)
    # provide the same early stopping and periodic logging.
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=100),
    ],
)

Training until validation scores don't improve for 20 rounds
[100]	fit's auc: 0.7065	val's auc: 0.690153
[200]	fit's auc: 0.71469	val's auc: 0.694616
[300]	fit's auc: 0.721188	val's auc: 0.696373
[400]	fit's auc: 0.726616	val's auc: 0.69717
[500]	fit's auc: 0.731218	val's auc: 0.697435
Early stopping, best iteration is:
[572]	fit's auc: 0.734173	val's auc: 0.697591


In [10]:
import numpy as np
from sklearn import metrics

# Score the hold-out features with the trained booster; the result is passed
# to the metric below as `predicted_proba`.
y_pred = model.predict(X_val)

def roc_auc_score_at_K(predicted_proba, target, rate=0.1):
    """ROC AUC computed only on the highest-scored share of the rows.

    The rows are ordered by descending score, the first
    ``int(rate * len(predicted_proba))`` of them are kept, and
    ``metrics.roc_auc_score`` is evaluated on that slice.

    Args:
        predicted_proba: array-like of scores, one per row.
        target: array-like of true labels aligned *by position* with
            ``predicted_proba``. A pandas Series is accepted regardless of
            its index.
        rate: fraction of the rows (taken from the top of the ranking) to keep.

    Returns:
        The value returned by ``metrics.roc_auc_score`` for the kept rows.
    """
    # Work on plain arrays: indexing a pandas Series with an integer array is
    # label-based when its index is integer, which would select the wrong rows
    # (or raise KeyError) instead of following the positional ranking.
    scores = np.asarray(predicted_proba)
    labels = np.asarray(target)

    order = np.argsort(-scores)  # positions, highest score first
    top_k = int(rate * len(scores))
    top = order[:top_k]
    return metrics.roc_auc_score(labels[top], scores[top])

# Evaluate the hold-out predictions with the top-share AUC (default rate)
# and display the resulting value.
val_score = roc_auc_score_at_K(predicted_proba=y_pred, target=y_val)
val_score

0.538140046285833

In [13]:
# Gain-based importance of every feature, truncated to whole numbers.
importances = model.feature_importance(importance_type='gain').astype(int)

# Label the values with the training column names, strongest feature first.
print(
    pd.Series(data=importances, index=X_fit.columns)
    .sort_values(ascending=False)
)

channel_name_modified_2018     137112
addr_region_fact               112116
prt_name                       101772
addr_region_reg                 65244
clnt_speciality_sphere_name     49993
channel_name_2                  47283
sas_limit_last_amt              25207
delivery_type                   13477
sas_limit_after_003_amt         11492
channel_name                     9589
feature_0                        7273
clnt_education_name              6259
clnt_marital_status_name         2979
clnt_sex_name                    1578
clnt_employment_type_name          16
dtype: int64


In [14]:
# Persist the trained booster under a fixed path.
# Disabled alternative with the validation score in the file name:
#   model.save_model(f'models/model_{val_score:.4f}.lgb')
model.save_model('track_2/model.lgb')

<lightgbm.basic.Booster at 0x7f98a21e0710>