# Solution

## Data loading

In [1]:
import pandas as pd

pd.options.display.max_columns = 100

dtypes = {
    'delivery_type': 'category',
    'addr_region_reg': 'category',
    'addr_region_fact': 'category',
    'channel_name': 'category',
    'channel_name_2': 'category',
    'sas_limit_after_003_amt': 'uint8',
    'sas_limit_last_amt': 'uint8',
    'channel_name_modified_2018': 'category',
    'clnt_education_name': 'category',
    'clnt_marital_status_name': 'category',
    'clnt_employment_type_name': 'category',
    'clnt_speciality_sphere_name': 'category',
    'clnt_sex_name': 'category',
    'prt_name': 'category',
    'feature_0': 'category',
    'clnt_income_month_avg_net_amt': 'uint',
    
}

fill_missing = [
    ('inquiry_recent_period', 0, 'uint'),
    ('inquiry_1_week', 0, 'uint')
]

cols_to_use = (
    list(dtypes.keys()) +
    list(col for col, *_ in fill_missing) + 
    ['card_id', 'target']
)

df = pd.read_csv('data/train.csv', index_col='card_id', dtype=dtypes, usecols=cols_to_use)

for col, fill, dtype in fill_missing:
    df[col] = df[col].fillna(fill).astype(dtype)

df.head()

Unnamed: 0_level_0,target,delivery_type,addr_region_reg,addr_region_fact,channel_name,channel_name_2,channel_name_modified_2018,sas_limit_after_003_amt,sas_limit_last_amt,clnt_education_name,clnt_marital_status_name,clnt_employment_type_name,clnt_speciality_sphere_name,clnt_sex_name,prt_name,feature_0,inquiry_recent_period,inquiry_1_week
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
cid_10620,1,cat_1,107,107,cat_0,cat_3,cat_0,1,1,cat_1,cat_4,cat_0,cat_0,cat_0,cat_9,cat_7,0,1
cid_105724,0,cat_1,9,9,cat_2,cat_5,cat_2,3,3,cat_1,cat_2,cat_3,cat_26,cat_1,cat_8,cat_9,58,0
cid_101410,1,cat_1,109,109,cat_0,cat_3,cat_0,1,1,cat_1,cat_2,cat_3,cat_15,cat_0,cat_9,cat_10,30,0
cid_38961,0,cat_1,66,66,cat_0,cat_3,cat_0,3,3,cat_1,cat_2,cat_3,cat_14,cat_0,cat_9,cat_7,366,0
cid_57462,0,cat_1,16,16,cat_0,cat_3,cat_0,0,0,cat_4,cat_2,cat_3,cat_24,cat_0,cat_9,cat_13,26,0


## Feature extraction

In [2]:
class MeanEncoder:
    
    def __init__(self, df, on, by, prior_count=100):
        self.on = on
        self.by = by
        counts = df.groupby(by)[on].agg(['mean', 'count'])
        avg = df[on].mean()
        self.means = (
            counts
            .eval('(mean * count + @avg * @prior_count) / (count + @prior_count)')
            .rename(str(self))
        )
    
    def __str__(self):
        return f'avg_{self.on}_by_{self.by}'
        
    def transform(self, df):
        return df.join(self.means, on=self.by)[self.means.name]

In [3]:
class NMissing:
    
    def __str__(self):
        return 'n_missing'
    
    def transform(self, df):
        return df.isnull().sum(axis='columns').rename(str(self))

In [4]:
extractors = [
    MeanEncoder(df, on='target', by='addr_region_fact'),
    NMissing()
]

In [5]:
features = []

for ex in extractors:
    print(str(ex))
    features.append(ex.transform(df)) 
    
train = pd.concat([df] + features, axis='columns')
train.head()

avg_target_by_addr_region_fact
n_missing


Unnamed: 0_level_0,target,delivery_type,addr_region_reg,addr_region_fact,channel_name,channel_name_2,channel_name_modified_2018,sas_limit_after_003_amt,sas_limit_last_amt,clnt_education_name,clnt_marital_status_name,clnt_employment_type_name,clnt_speciality_sphere_name,clnt_sex_name,prt_name,feature_0,inquiry_recent_period,inquiry_1_week,avg_target_by_addr_region_fact,n_missing
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
cid_10620,1,cat_1,107,107,cat_0,cat_3,cat_0,1,1,cat_1,cat_4,cat_0,cat_0,cat_0,cat_9,cat_7,0,1,0.662549,0
cid_105724,0,cat_1,9,9,cat_2,cat_5,cat_2,3,3,cat_1,cat_2,cat_3,cat_26,cat_1,cat_8,cat_9,58,0,0.498339,0
cid_101410,1,cat_1,109,109,cat_0,cat_3,cat_0,1,1,cat_1,cat_2,cat_3,cat_15,cat_0,cat_9,cat_10,30,0,0.434518,0
cid_38961,0,cat_1,66,66,cat_0,cat_3,cat_0,3,3,cat_1,cat_2,cat_3,cat_14,cat_0,cat_9,cat_7,366,0,0.478198,0
cid_57462,0,cat_1,16,16,cat_0,cat_3,cat_0,0,0,cat_4,cat_2,cat_3,cat_24,cat_0,cat_9,cat_13,26,0,0.525827,0


## Learning

Separate features from target.

In [6]:
X = train.drop(columns='target')
y = train['target']

Train/test split.

In [7]:
from sklearn import model_selection

X_fit, X_val, y_fit, y_val = model_selection.train_test_split(
    X, y,
    random_state=42
)

LightGBM dance.

In [8]:
import lightgbm as lgb

fit = lgb.Dataset(X_fit, y_fit)
val = lgb.Dataset(X_val, y_val, reference=fit)

model = lgb.train(
    params={
        'learning_rate': .01,
        'objective': 'binary',
        'metric': 'auc',
        'boost_from_average': True,
        'min_data_in_leaf': 20,
        'max_bin': 255,
        'num_leaves': 31,
        'min_data_per_group': 100,
        'cat_smooth': 10,
        'seed': 42
    },
    train_set=fit,
    num_boost_round=10_000,
    valid_sets=[fit, val],
    valid_names=['fit', 'val'],
    early_stopping_rounds=20,
    verbose_eval=100
)

Training until validation scores don't improve for 20 rounds
[100]	fit's auc: 0.730339	val's auc: 0.714236
[200]	fit's auc: 0.73849	val's auc: 0.718866
[300]	fit's auc: 0.745667	val's auc: 0.721493
[400]	fit's auc: 0.751705	val's auc: 0.722878
[500]	fit's auc: 0.756937	val's auc: 0.723766
[600]	fit's auc: 0.761492	val's auc: 0.724165
Early stopping, best iteration is:
[649]	fit's auc: 0.763505	val's auc: 0.724226


In [None]:
Training until validation scores don't improve for 20 rounds
[100]	fit's auc: 0.730339	val's auc: 0.714236
[200]	fit's auc: 0.73849	val's auc: 0.718866
[300]	fit's auc: 0.745667	val's auc: 0.721493
[400]	fit's auc: 0.751705	val's auc: 0.722878
[500]	fit's auc: 0.756937	val's auc: 0.723766
[600]	fit's auc: 0.761492	val's auc: 0.724165
Early stopping, best iteration is:
[649]	fit's auc: 0.763505	val's auc: 0.724226

In [9]:
import numpy as np
from sklearn import metrics

y_pred = model.predict(X_val)

def roc_auc_score_at_K(predicted_proba, target, rate=0.1):  
    order = np.argsort(-predicted_proba) 
    top_k = int(rate * len(predicted_proba)) 
    return metrics.roc_auc_score(target[order][:top_k], predicted_proba[order][:top_k]) 

val_score = roc_auc_score_at_K(y_pred, y_val)
val_score

0.5254742949234489

In [None]:
0.5254742949234489

In [10]:
importances = model.feature_importance(importance_type='gain').astype(int)
print(pd.Series(importances, index=X_fit.columns).sort_values(ascending=False))

inquiry_recent_period             186882
prt_name                          134204
addr_region_fact                  102147
addr_region_reg                    77018
inquiry_1_week                     64546
channel_name_2                     52492
clnt_speciality_sphere_name        51133
sas_limit_last_amt                 25360
channel_name                       15173
delivery_type                      11548
sas_limit_after_003_amt             7851
channel_name_modified_2018          6766
feature_0                           6520
clnt_education_name                 5602
clnt_marital_status_name            2883
avg_target_by_addr_region_fact      2228
clnt_sex_name                       1641
clnt_employment_type_name              0
n_missing                              0
dtype: int64


In [11]:
#model.save_model(f'models/model_{val_score:.4f}.lgb')
model.save_model(f'track_2/model.lgb')

<lightgbm.basic.Booster at 0x7fecddf8fa10>

In [12]:
import joblib

!rm -f track_2/*.pkl

for ex in extractors:
    joblib.dump(ex, f'track_2/{ex}.pkl')

## Custom loss

In [28]:
import lightgbm
import numpy as np
from scipy import special

def logloss_init_score(y):
    p = y.mean()
    p = np.clip(p, 1e-15, 1 - 1e-15)
    log_odds = np.log(p / (1 - p))
    return log_odds

def logloss_objective(preds, train_data):
    y = train_data.get_label()
    p = special.expit(preds)
    grad = p - y
    hess = p * (1 - p)
    #big = p > .6
    #grad[~big] = 0.1
    #hess[~big] = 0.1
    w = 3 ** (1 + p)
    return grad * w, hess * w

def logloss_metric(preds, train_data):
    y = train_data.get_label()
    p = special.expit(preds)
    is_higher_better = True
    return 'auc', metrics.roc_auc_score(y, p), is_higher_better

fit = lightgbm.Dataset(
    X_fit, y_fit,
    init_score=np.full_like(y_fit, logloss_init_score(y_fit), dtype=float)
)

val = lightgbm.Dataset(
    X_val, y_val,
    init_score=np.full_like(y_val, logloss_init_score(y_fit), dtype=float),
    reference=fit
)

model = lightgbm.train(
    params={'learning_rate': 0.01},
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    early_stopping_rounds=20,
    verbose_eval=100,
    fobj=logloss_objective,
    feval=logloss_metric
)

y_pred = special.expit(model.predict(X_val))

print()
print(f"Test's ROC AUC: {metrics.roc_auc_score(y_val, y_pred):.5f}")
print(f"Test's logloss: {metrics.log_loss(y_val, y_pred):.5f}")


Training until validation scores don't improve for 20 rounds
[100]	fit's auc: 0.73167	val's auc: 0.714288
[200]	fit's auc: 0.73969	val's auc: 0.719166
[300]	fit's auc: 0.746372	val's auc: 0.721622
[400]	fit's auc: 0.752676	val's auc: 0.722875
[500]	fit's auc: 0.757934	val's auc: 0.723622
[600]	fit's auc: 0.762459	val's auc: 0.723991
[700]	fit's auc: 0.766309	val's auc: 0.724167
Early stopping, best iteration is:
[729]	fit's auc: 0.767411	val's auc: 0.724235

Test's ROC AUC: 0.72424
Test's logloss: 0.61179


In [27]:
logloss_init_score(y_fit)

-0.047831339328793056

Training until validation scores don't improve for 20 rounds
[100]	fit's auc: 0.73167	val's auc: 0.714288
[200]	fit's auc: 0.73969	val's auc: 0.719166
[300]	fit's auc: 0.746372	val's auc: 0.721622
[400]	fit's auc: 0.752676	val's auc: 0.722875
[500]	fit's auc: 0.757934	val's auc: 0.723622
[600]	fit's auc: 0.762459	val's auc: 0.723991
[700]	fit's auc: 0.766309	val's auc: 0.724167
Early stopping, best iteration is:
[729]	fit's auc: 0.767411	val's auc: 0.724235

Test's ROC AUC: 0.72424
Test's logloss: 0.61135


In [25]:
val_score = roc_auc_score_at_K(y_pred, y_val)
val_score

0.53896355688664

0.5250457232015541

In [29]:
model.save_model(f'track_2/model.lgb')

<lightgbm.basic.Booster at 0x7fecdf6988d0>

In [None]:
Local 0.525 LB 0.513
Local 0.538 LB 