In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from sklearn.metrics import f1_score, roc_auc_score
import catboost as cbt
import gc

# Lift the pandas display limits so DataFrames are rendered with every
# column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Run-wide settings used by the cells below.
current_path = './'  # base directory that input and output paths are joined onto
n_fold = 5           # number of cross-validation folds
seed = 2019          # random seed used by the modelling cells

In [None]:
# Load the pre-built feature table from <current_path>/feature/feature.pickle.
feature_path = os.path.join(current_path, 'feature', 'feature.pickle')
df_feature = pd.read_pickle(feature_path)

In [None]:
# Preview the first rows of the loaded feature table.
df_feature.head()

In [None]:
# Integer-encode every object-dtype column except the 'id' column.
# Values are cast to str before fitting, and a fresh encoder is fitted per
# column (the fitted encoders are not kept).
object_columns = list(df_feature.select_dtypes('object'))
for col in tqdm(object_columns):
    if col in ['id']:
        continue
    encoder = LabelEncoder()
    col_as_text = df_feature[col].astype('str')
    df_feature[col] = encoder.fit_transform(col_as_text).astype('int')

In [None]:
# Rows with a non-null 'target' form the training set; rows whose 'target'
# is null form the test set. Both are independent copies.
has_target = df_feature['target'].notnull()
df_train = df_feature[has_target].copy()
df_test = df_feature[~has_target].copy()

# The combined frame is no longer needed; release it before training.
del df_feature, has_target
gc.collect()

In [None]:
# Name of the label column.
ycol = 'target'

# Columns that are never passed to the model: the label itself plus the
# columns listed below.
# NOTE(review): confirm the spelling of 'hourl' against the feature table.
excluded_columns = [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',
                    'personidentification', 'level', 'followscore', 'personalscore', 'gender',
                    'hourl', 'group']

# Every remaining column of df_train, in its original order, is a model input.
feature_names = [col for col in df_train.columns if col not in excluded_columns]

# Hyper-parameters of the LightGBM classifier that is re-fitted on every fold.
lgb_params = {
    'num_leaves': 64,
    'max_depth': 10,
    'learning_rate': 0.4,
    # Effectively unbounded: the fit call further down passes
    # early_stopping_rounds, which decides the real number of trees.
    'n_estimators': 10000000,
    # NOTE(review): LightGBM documents that row bagging only takes effect when
    # a bagging frequency (`subsample_freq`) > 0 is also set - confirm whether
    # row subsampling is intended to be active here.
    'subsample': 0.8,
    'feature_fraction': 0.8,
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
    'random_state': seed,
    'metric': 'auc',
}
model = lgb.LGBMClassifier(**lgb_params)

# model = lgb.LGBMClassifier(
#     learning_rate=0.01,
#     n_estimators=10000000,
#     num_leaves=255,
#     subsample=0.9,
#     colsample_bytree=0.8,
#     random_state=seed,
#     metric='auc'
# )

# Per-fold collectors filled by the cross-validation loop.
oof = []                 # out-of-fold prediction frames, one per fold
df_importance_list = []  # feature-importance frames, one per fold

# Accumulator for the fold-averaged test prediction.
# An explicit copy is taken so that adding the 'target' column writes to an
# independent frame instead of a selection of df_test (the chained-assignment
# pattern that triggers SettingWithCopyWarning). The column starts as float
# because fractional fold predictions are added to it.
prediction = df_test[['id']].copy()
prediction['target'] = 0.0

# Cross-validated training.
#
# Folds are stratified on the 'day' column (not on the label). The splitter
# does not shuffle, so it is built WITHOUT random_state: a seed has no effect
# when shuffle=False, and scikit-learn >= 0.24 raises a ValueError if one is
# passed in that configuration. The resulting folds are unchanged.
kfold = StratifiedKFold(n_splits=n_fold, shuffle=False)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train['day'])):
    df_fold_train = df_train.iloc[trn_idx]
    df_fold_val = df_train.iloc[val_idx]

    X_train = df_fold_train[feature_names]
    Y_train = df_fold_train[ycol]

    X_val = df_fold_val[feature_names]
    Y_val = df_fold_val[ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # fit() returns the estimator itself, so `lgb_model` is the shared `model`
    # object re-fitted on this fold's data.
    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keyword
    # arguments were removed in LightGBM 4.0; on that version pass
    # callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)] instead.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Out-of-fold positive-class probabilities at the best iteration.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_fold_val[['id', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Each fold contributes 1/n_fold of the final test prediction.
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['target'] += pred_test / n_fold

    # Keep this fold's feature importances for later averaging.
    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Drop the fold-local references and reclaim memory before the next fold.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    del df_fold_train, df_fold_val
    gc.collect()

In [None]:
# Average each feature's importance over all folds and list the features
# from most to least important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

In [None]:
# Stack the per-fold out-of-fold predictions into one frame.
df_oof = pd.concat(oof)

# Turn probabilities into hard labels by rank: rows whose prediction rank is
# at or above this fraction of the row count are labelled 1.
# NOTE(review): the constant presumably encodes the expected share of
# negatives - confirm where it was derived from.
positive_rank_cutoff = df_oof.shape[0] * 0.8934642948637943
df_oof['pred_bin'] = (df_oof['pred'].rank() >= positive_rank_cutoff).astype(int)

# AUC is a ranking metric, so it is computed from the predicted probabilities;
# scoring the thresholded 0/1 labels would throw the ranking information away.
# F1 needs hard labels and therefore uses 'pred_bin'.
auc = roc_auc_score(df_oof['target'], df_oof['pred'])
f1 = f1_score(df_oof['target'], df_oof['pred_bin'])

print('f1:', f1)
print('auc:', auc)

In [None]:
# Build the submission: binarise the fold-averaged test prediction with the
# same rank-based rule that is applied to the out-of-fold predictions.
sub = prediction.copy(deep=True)
sub['target'] = sub['target'].rank()
sub['target'] = (sub['target'] >= sub.shape[0] *
                 0.8934642948637943).astype(int)

# Make sure the output directory exists so that a missing 'sub' folder does
# not abort the run after training has already finished.
sub_dir = os.path.join(current_path, 'sub')
os.makedirs(sub_dir, exist_ok=True)

# The file is named after the out-of-fold F1 score.
sub.to_csv(os.path.join(sub_dir, '{}.csv'.format(f1)),
           index=False, encoding='utf-8')

In [None]:
# Put train (out-of-fold) and test (fold-averaged) probabilities into one
# frame with the columns 'id' and 'oof_prob'.
# NOTE: this rebinds `oof`, which held the list of per-fold frames, to the
# combined DataFrame.
oof_train = df_oof[['id', 'pred']].rename(columns={'pred': 'oof_prob'})
oof_test = prediction[['id', 'target']].rename(columns={'target': 'oof_prob'})

oof = pd.concat([oof_train, oof_test], sort=False)

In [None]:
# Persist the combined probabilities to <current_path>/prob/oof_lgb_qian.pickle.
# The directory is created first so that a missing 'prob' folder does not
# make the final save fail.
prob_dir = os.path.join(current_path, 'prob')
os.makedirs(prob_dir, exist_ok=True)
oof.to_pickle(os.path.join(prob_dir, 'oof_lgb_qian.pickle'))