In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from sklearn.metrics import f1_score
import catboost as cbt
import gc

# Show every column and every row when a frame is displayed, and silence all
# Python warnings for the rest of the session.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide configuration.
# seed: random_state handed to the LightGBM model below.
# current_path: root directory under which the 'feature', 'prob' and 'sub'
# folders and 'best_it.txt' are looked up.
seed = 2019
current_path = './'

In [None]:
# Load the pre-computed feature table and normalise the join key to str so
# later merges on 'id' compare like with like.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by a trusted pipeline.
feature_path = os.path.join(current_path, 'feature', 'feature_1.pickle')
df_feature = pd.read_pickle(feature_path)
df_feature['id'] = df_feature['id'].astype('str')

In [None]:
# Sanity check: display the dimensions of the loaded feature table.
df_feature.shape

In [None]:
# Attach the LightGBM out-of-fold probabilities as a stacking feature.
df_oof_lgb = pd.read_pickle(os.path.join(
    current_path, 'prob', 'oof_lgb_qian.pickle'))
df_oof_lgb.columns = ['id', 'lgb_oof_prob']
# df_feature['id'] was cast to str when it was loaded; cast this key the same
# way so the merge never compares str against a numeric dtype.
df_oof_lgb['id'] = df_oof_lgb['id'].astype('str')

# Make the cell safe to re-run: without this, a second execution would merge
# again and produce 'lgb_oof_prob_x' / 'lgb_oof_prob_y' instead.
if 'lgb_oof_prob' in df_feature.columns:
    df_feature = df_feature.drop(columns=['lgb_oof_prob'])
df_feature = df_feature.merge(df_oof_lgb, how='left', on='id')

# Number of feature rows that found no OOF probability (left join -> NaN).
print(df_feature['lgb_oof_prob'].isnull().sum())

In [None]:
# Attach the CatBoost out-of-fold probabilities as a stacking feature.
df_oof_cat = pd.read_pickle(os.path.join(
    current_path, 'prob', 'oof_cat.pickle'))
df_oof_cat.columns = ['id', 'cat_oof_prob']
# df_feature['id'] was cast to str when it was loaded; cast this key the same
# way so the merge never compares str against a numeric dtype.
df_oof_cat['id'] = df_oof_cat['id'].astype('str')

# Make the cell safe to re-run: without this, a second execution would merge
# again and produce 'cat_oof_prob_x' / 'cat_oof_prob_y' instead.
if 'cat_oof_prob' in df_feature.columns:
    df_feature = df_feature.drop(columns=['cat_oof_prob'])
df_feature = df_feature.merge(df_oof_cat, how='left', on='id')

# Number of feature rows that found no OOF probability (left join -> NaN).
print(df_feature['cat_oof_prob'].isnull().sum())

In [None]:
# Preview the first rows after both OOF probability columns were merged in.
df_feature.head()

In [None]:
# best_it.txt holds two lines: an integer boosting-round count followed by a
# float score (presumably the validation F1 of an earlier run — confirm).
with open(os.path.join(current_path, 'best_it.txt'), 'r') as f:
    lines = f.readlines()

# int() / float() tolerate the trailing newline left by readlines().
best_iteration = int(lines[0])
f1 = float(lines[1])
print(best_iteration, f1)

In [None]:
# Number of boosting rounds for the final refit: best_iteration times a
# scaling factor, which is currently 1 (i.e. no scaling).
bt = int(1 * best_iteration)
print(bt)

In [None]:
# Integer-encode every object-dtype column except the 'id' join key.
# Values are cast to str before encoding, so non-string entries (including
# missing ones) are encoded through their string form.
object_columns = list(df_feature.select_dtypes('object'))
for f in tqdm(object_columns):
    if f == 'id':
        continue
    le = LabelEncoder()
    encoded = le.fit_transform(df_feature[f].astype('str'))
    df_feature[f] = encoded.astype('int')

In [None]:
# Rows with a target are the training set; rows whose target is missing are
# the ones to be predicted.
df_train = df_feature.loc[df_feature['target'].notna()]
df_test = df_feature.loc[df_feature['target'].isna()]

# The combined frame and the raw OOF tables are no longer needed; release
# them before training.
del df_oof_cat, df_oof_lgb, df_feature
gc.collect()

In [None]:
ycol = 'target'

# Columns that must not be fed to the model: the label, the row key, and the
# listed time/profile columns. Every other column of df_train is a feature.
non_feature_cols = (
    ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime',
    'minute10', 'personidentification', 'level', 'followscore',
    'personalscore', 'gender', 'hourl', 'group',
)
feature_names = [col for col in df_train.columns
                 if col not in non_feature_cols]

X_train = df_train[feature_names]
Y_train = df_train[ycol]

# Final LightGBM classifier, refit on all labelled rows for exactly `bt`
# boosting rounds; no early stopping is configured in this cell.
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes
# effect when `subsample_freq` > 0, so row bagging is presumably inactive
# here. Left untouched because `bt` was read from a previous run and changing
# the sampling would change the model — confirm before altering.
model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.4,
                           n_estimators=bt,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           metric='auc',
                           )

# The only eval set is the training data itself, so the AUC logged every 50
# rounds is a training AUC (a progress trace, not a validation score).
# eval_names carries exactly one name to match the single eval_set entry.
lgb_model2 = model.fit(X_train,
                       Y_train,
                       eval_names=['train'],
                       eval_set=[(X_train, Y_train)],
                       verbose=50,
                       eval_metric='auc')

In [None]:
# Feature-importance table for the fitted model, most important first.
df_importance = pd.DataFrame(
    {'feature': feature_names,
     'importance': lgb_model2.feature_importances_}
).sort_values(by='importance', ascending=False)
df_importance

In [None]:
# Score the unlabelled rows; column 1 of predict_proba is the probability of
# the second class in the fitted model's class order.
test_pred = lgb_model2.predict_proba(
    df_test[feature_names], num_iteration=bt)[:, 1]

# Build the output with assign() so a new frame is created instead of writing
# a column into a selection of df_test (which is a chained assignment whose
# SettingWithCopyWarning is hidden by the global warnings filter).
prediction = df_test[['id']].assign(target=test_pred)

# Persist the raw (id, probability) pairs, keyed by the score read from
# best_it.txt, under the 'prob' folder.
np.save(os.path.join(current_path, 'prob',
                     'sub_{}.npy'.format(f1)), prediction.values)

In [None]:
# Fraction of test rows, ordered by ascending predicted probability, that are
# labelled 0; rows ranked at or above this fraction are labelled 1.
# NOTE(review): the origin of this constant is not shown in the notebook —
# presumably tuned on the training data; confirm before reuse.
NEGATIVE_FRACTION = 0.8934642948637943

# Turn probabilities into hard labels by rank rather than by a fixed
# probability cut-off.
sub = prediction.copy(deep=True)
sub['target'] = sub['target'].rank()
sub['target'] = (sub['target'] >= sub.shape[0] *
                 NEGATIVE_FRACTION).astype(int)

# Create the output folder if needed so the run does not fail at the very
# last step after training has already completed.
sub_dir = os.path.join(current_path, 'sub')
os.makedirs(sub_dir, exist_ok=True)
sub.to_csv(os.path.join(sub_dir, '{}.csv'.format(f1)),
           index=False, encoding='utf-8')