In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import os
from itertools import combinations
from sklearn.metrics import cohen_kappa_score, f1_score


# Silence every Python warning for the rest of the session.
warnings.simplefilter(action='ignore')
# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

# Use fully qualified option names. The short forms ('max_columns', 'max_rows')
# are treated as regex patterns and, on pandas >= 1.4, also match the
# 'styler.render.*' options, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# Single random seed shared by the CV splitter and the LightGBM model below.
seed = 2020

In [3]:
# Load the pre-computed feature table (labelled and unlabelled rows together).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_feature = pd.read_pickle('data/feature.pkl')

In [4]:
# Integer-encode every object-dtype column except the customer id.
# Values are cast to str first, so missing values become their own category.
object_cols = [
    col for col in df_feature.select_dtypes('object').columns
    if col != 'cust_no'
]
for col in object_cols:
    encoder = LabelEncoder()
    df_feature[col] = encoder.fit_transform(df_feature[col].astype(str))

In [5]:
# Rows with a known label form the training set; rows without one are the test set.
has_label = df_feature['label'].notna()
df_train = df_feature[has_label].copy().reset_index(drop=True)
df_test = df_feature[~has_label].copy().reset_index(drop=True)

(df_train.shape, df_test.shape)

((145296, 346), (76722, 346))

In [None]:
ycol = 'label'
# Columns excluded from the model inputs: the target, the customer id, 'fq' and 'fake'.
non_feature_cols = {ycol, 'cust_no', 'fq', 'fake'}
feature_names = [col for col in df_train.columns if col not in non_feature_cols]

# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, and
# `is_unbalance` is documented for binary / multiclassova objectives only, so both
# may be no-ops here. Left unchanged to keep results comparable - confirm intent.
model = lgb.LGBMClassifier(boosting_type='gbdt',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=10,
                           reg_lambda=12,
                           random_state=seed,
                           is_unbalance=True)

# Number of target classes taken from the data instead of a hard-coded 3.
n_classes = df_train[ycol].nunique()

# Out-of-fold probabilities for the training rows and fold-averaged
# probabilities for the test rows.
prob_oof = np.zeros((df_train.shape[0], n_classes))
test_pred_prob = np.zeros((df_test.shape[0], n_classes))

# One feature-importance frame per fold; aggregated in a later cell.
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    # `early_stopping_rounds=` and `verbose=` were removed from fit() in
    # LightGBM 4.0; these callbacks are the supported equivalents. They are
    # rebuilt for every fold because the early-stopping callback keeps state.
    fit_callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=500),
    ]

    # fit() returns the estimator itself, so `lgb_model` aliases `model`,
    # which is re-fitted from scratch on every fold.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          callbacks=fit_callbacks)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    prob_oof[val_idx] = pred_val

    # Each fold contributes 1/n_splits of the final test probability.
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)
    test_pred_prob += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Drop per-fold references so the fold's data can be garbage-collected.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val



Training until validation scores don't improve for 50 rounds
[500]	train's multi_logloss: 0.613997	valid's multi_logloss: 0.626054
[1000]	train's multi_logloss: 0.591286	valid's multi_logloss: 0.616881
[1500]	train's multi_logloss: 0.574413	valid's multi_logloss: 0.613086
[2000]	train's multi_logloss: 0.560111	valid's multi_logloss: 0.61126
[2500]	train's multi_logloss: 0.547135	valid's multi_logloss: 0.610222
[3000]	train's multi_logloss: 0.534981	valid's multi_logloss: 0.609474


In [None]:
# Average each feature's importance over the folds, most important first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

In [None]:
class_num = 3


def search_weight(valid_y, raw_prob, init_weight=[1.0]*class_num, step=0.001):
    """Greedily search per-class probability multipliers that maximise Cohen's kappa.

    Starting from ``init_weight``, each class weight is swept in turn over
    ``n_w * step`` for ``n_w`` in ``range(0, 2000, 10)`` while the other weights
    stay fixed. A candidate is kept whenever it strictly improves the kappa of
    ``argmax(raw_prob * weight)``. Sweeps over all classes repeat until a full
    round brings no improvement.

    Args:
        valid_y: true labels, aligned row by row with ``raw_prob``.
        raw_prob: 2-D array of class probabilities, one column per class.
        init_weight: starting multipliers, one per column of ``raw_prob``.
            It is copied, never modified.
        step: scale applied to the integer sweep grid.

    Returns:
        The best weight vector found, of the same container type as ``init_weight``.

    Raises:
        ValueError: if ``init_weight`` does not have one entry per class column.
    """
    probs = np.asarray(raw_prob)
    # Take the class count from the data rather than from the module-level global.
    n_classes = probs.shape[1]

    weight = init_weight.copy()
    if len(weight) != n_classes:
        raise ValueError(
            'init_weight has {} entries but raw_prob has {} columns'.format(
                len(weight), n_classes))

    f_best = cohen_kappa_score(valid_y, probs.argmax(axis=1))
    flag_score = 0
    round_num = 1
    # Stop as soon as a full round leaves the best score unchanged.
    while flag_score != f_best:
        print('round: ', round_num)
        round_num += 1
        flag_score = f_best
        for c in range(n_classes):
            for n_w in range(0, 2000, 10):
                new_weight = weight.copy()
                new_weight[c] = n_w * step

                # The multiplication already allocates a new array, so the
                # input does not need to be copied for every candidate.
                weighted_pred = (probs * np.array(new_weight)).argmax(axis=1)

                f = cohen_kappa_score(valid_y, weighted_pred)
                if f > f_best:
                    weight = new_weight
                    f_best = f
    return weight


weight = search_weight(df_train['label'], prob_oof)

weight

In [None]:
# Re-weight the averaged test probabilities with the searched class weights.
test_pred_prob_weight = test_pred_prob * np.array(weight)

# Take an explicit copy so the new column is written to an independent frame
# rather than to a slice of df_test (chained assignment).
prediction = df_test[['cust_no']].copy()
# argmax gives the class index 0..n-1; subtracting 1 shifts it to the label
# range used in the submission.
prediction['label'] = np.argmax(test_pred_prob_weight, axis=1) - 1

In [None]:
# Preview the first rows of the submission frame.
prediction.head()

In [None]:
# Persist the weight-adjusted test probabilities under prob/.
os.makedirs('prob', exist_ok=True)
np.save(file='prob/lgb1.npy', arr=test_pred_prob_weight)

In [None]:
# Apply the searched class weights to the out-of-fold probabilities.
prob_oof_weight = prob_oof * np.array(weight)

# Take an explicit copy so the columns below are written to an independent frame
# rather than to a slice of df_train (chained assignment).
df_oof = df_train[['cust_no', 'label']].copy()
# Shift both the predicted class index and the stored label down by 1 so they
# share the same label range.
df_oof['pred'] = np.argmax(prob_oof_weight, axis=1) - 1
df_oof['label'] = df_oof['label'] - 1

df_oof.head()

In [None]:
# Cohen's kappa of the weight-adjusted out-of-fold predictions against the true labels.
kappa = cohen_kappa_score(df_oof['label'], df_oof['pred'])
kappa

In [None]:
# Write the submission file; the out-of-fold kappa is embedded in the file name.
os.makedirs('sub', exist_ok=True)
submission_path = 'sub/xm_{}.csv'.format(kappa)
prediction.to_csv(submission_path, index=False)

In [None]:
# Class distribution of the out-of-fold predictions.
df_oof['pred'].value_counts()

In [None]:
# Class distribution of the true training labels, for comparison with the predictions.
df_oof['label'].value_counts()

In [None]:
# Class distribution of the predicted test labels.
prediction['label'].value_counts()