In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from sklearn.metrics import f1_score, roc_auc_score
import catboost as cbt
import gc

# Notebook-wide display settings: never truncate columns or rows when a
# DataFrame is rendered, and silence every Python warning.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
warnings.filterwarnings(action='ignore')

In [None]:
# Base directory that every data path below is joined onto, and the random
# seed handed to the model.
current_path, seed = './', 2019

In [None]:
# Load the pre-computed feature table and make sure the 'id' key is a string
# column.
feature_path = os.path.join(current_path, 'feature', 'feature.pickle')
df_feature = pd.read_pickle(feature_path)
df_feature['id'] = df_feature['id'].astype(str)

In [None]:
# Load the stored LightGBM probabilities and attach them to the feature table
# as the 'lgb_oof_prob' column, keyed on 'id'.
df_oof_lgb = pd.read_pickle(os.path.join(
    current_path, 'prob', 'oof_lgb_qian.pickle'))
# The pickle must contain exactly two columns; they are renamed positionally.
df_oof_lgb.columns = ['id', 'lgb_oof_prob']
# Same dtype as df_feature['id'] so the join keys compare equal.
df_oof_lgb['id'] = df_oof_lgb['id'].astype('str')
# This cell rebinds df_feature. If it is executed a second time the previous
# 'lgb_oof_prob' column would clash with the incoming one, pandas would rename
# them to 'lgb_oof_prob_x' / 'lgb_oof_prob_y', and the lookup below would raise
# KeyError. Dropping any earlier copy first makes the cell safe to re-run; on a
# fresh kernel the drop is a no-op.
df_feature = df_feature.drop(columns=['lgb_oof_prob'], errors='ignore').merge(
    df_oof_lgb, how='left', on='id')
# Number of feature rows that found no matching probability.
print(df_feature['lgb_oof_prob'].isnull().sum())

In [None]:
# Preview the first five rows of the merged feature table.
df_feature.head(n=5)

In [None]:
# Replace every object-dtype column except the 'id' key with integer codes.
for column in tqdm(list(df_feature.select_dtypes('object'))):
    if column == 'id':
        continue
    # Values are cast to str before being encoded; a fresh encoder is fitted
    # per column.
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df_feature[column].astype('str'))
    df_feature[column] = codes.astype('int')

In [None]:
# Time-based split: rows of day 10 that carry a target form the validation
# set, every row from an earlier day forms the training set.
has_target = df_feature['target'].notnull()
is_day_10 = df_feature['day'] == 10
df_val = df_feature[has_target & is_day_10]
df_train = df_feature[df_feature['day'] < 10]

In [None]:
gc.collect()

ycol = 'target'

# Model inputs are all training columns except the label and a fixed list of
# columns that are deliberately left out. Column order follows df_train.
excluded_columns = {
    ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',
    'personidentification', 'level', 'followscore', 'personalscore', 'gender',
    'hourl', 'group',
}
feature_names = [col for col in df_train.columns if col not in excluded_columns]

# n_estimators is set very high on purpose: the number of trees actually kept
# is decided by early stopping in fit() below.
# NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0,
# so subsample=0.8 is probably a no-op here - confirm before relying on it.
lgb_params = dict(
    num_leaves=64,
    max_depth=10,
    learning_rate=0.4,
    n_estimators=100000,
    subsample=0.8,
    feature_fraction=0.8,
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=seed,
    metric='auc',
)
model = lgb.LGBMClassifier(**lgb_params)

# Not read anywhere in the cells visible here; kept in case later cells use them.
loss = 0
df_importance_list = []
oof_list = []

X_train, Y_train = df_train[feature_names], df_train[ycol]
X_val, Y_val = df_val[feature_names], df_val[ycol]

# Fit while tracking AUC on both sets under the names 'train' and 'valid',
# logging every 50 rounds and stopping after 50 rounds without improvement.
# NOTE(review): the verbose / early_stopping_rounds arguments of fit() were
# removed in LightGBM 4.0 (callbacks replace them), so this call needs an
# older LightGBM release - confirm the pinned version.
lgb_model = model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_val, Y_val)],
    eval_names=['train', 'valid'],
    eval_metric='auc',
    early_stopping_rounds=50,
    verbose=50,
)

In [None]:
# Validation AUC as recorded in the fitted model's best_score_ mapping.
valid_scores = lgb_model.best_score_['valid']
auc = valid_scores['auc']
print(auc)

In [None]:
# Feature-importance table, most important feature first.
df_importance = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)
df_importance

In [None]:
# Score the validation rows with the model truncated at its best iteration and
# keep the second column of predict_proba (class index 1).
val_pred = lgb_model.predict_proba(
    X_val, num_iteration=lgb_model.best_iteration_)[:, 1]

df_oof = pd.DataFrame({
    'lgb_pred': val_pred,
    'target': Y_val.values,
})

# Labels are assigned by rank rather than by a probability threshold: a row is
# predicted 1 when the rank of its score reaches the cutoff below, i.e. only
# the highest-scored share of the validation rows is flagged.
rank_cutoff = df_oof.shape[0] * 0.8934642948637943
df_oof['pred_label'] = (df_oof['lgb_pred'].rank() >= rank_cutoff).astype(int)

f1 = f1_score(df_oof['target'], df_oof['pred_label'])
print('f1:', f1)

In [None]:
# Persist the early-stopped tree count and the validation F1 to best_it.txt:
# iteration on the first line, F1 on the second (no trailing newline).
best_iteration = lgb_model.best_iteration_

report = '\n'.join([str(best_iteration), str(f1)])
with open(os.path.join(current_path, 'best_it.txt'), 'w') as out_file:
    out_file.write(report)