In [None]:
import pandas as pd
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import time
from itertools import combinations

# Lift pandas' display limits so frames are rendered with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Random seed shared by both StratifiedKFold splitters and the LightGBM model.
seed = 2021

In [None]:
# Load the train and test CSVs (absolute paths - adjust for your environment).
df_train = pd.read_csv('/home/mw/input/pre8881/train.csv')
df_test = pd.read_csv('/home/mw/input/pretest_a3048/test_a.csv')

In [None]:
# Sanity check: (rows, columns) of the train and test frames.
df_train.shape, df_test.shape

In [None]:
# Stack train and test so every feature below is computed on both at once.
# pd.concat is the drop-in replacement for DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. The original row labels
# are kept, exactly as append() did.
df_feature = pd.concat([df_train, df_test], sort=False)

In [None]:
# Preview the first rows of the combined train + test frame.
df_feature.head()

In [None]:
# Ratio feature: nprem_tp divided element-wise by si_tp.
# NOTE(review): rows where si_tp is 0 will produce inf/NaN - confirm that is acceptable.
df_feature['tp_ratio'] = df_feature['nprem_tp'].div(df_feature['si_tp'])

In [None]:
# Count encoding: for each categorical column add '<column>_count', the number
# of rows (train and test together) that share the same value.
for count_col in ('dpt', 'client_no', 'trademark_cn', 'brand_cn', 'make_cn', 'series'):
    value_counts = (
        df_feature.groupby([count_col])
        .size()
        .reset_index(name='{}_count'.format(count_col))
    )
    # No `on=` given: merge joins on the only shared column, i.e. count_col.
    df_feature = df_feature.merge(value_counts, how='left')

In [None]:
def _birth_month_to_int(value):
    """Convert one raw ``birth_month`` entry to an integer.

    * ``str``   -> the text without its last character, parsed as ``int``
      (the last character is presumably a unit suffix - TODO confirm).
    * ``float`` -> ``0`` (this is how missing values / NaN arrive).
    * anything else (e.g. an int produced by an earlier run of this cell)
      is returned unchanged, so re-running the cell is safe.
    """
    if isinstance(value, str):
        return int(value[:-1])
    if isinstance(value, float):
        return 0
    return value


df_feature['birth_month'] = df_feature['birth_month'].apply(_birth_month_to_int)

In [None]:
# Simple group statistics
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` per group and left-join the result onto ``df_merge``.

    Args:
        df: frame the statistics are computed from.
        df_merge: frame that receives the new statistic columns.
        group_by: list of column names to group by (also the join keys).
        agg: dict mapping a column name to a list of aggregation method names.

    Returns:
        A new frame: ``df_merge`` plus one column per (column, method) pair,
        named ``'<group keys joined by _>_<column>_<method>'``. Groups that do
        not occur in ``df`` get NaN.
    """
    key_prefix = '_'.join(group_by)
    # Flat column names, in the same order the dict-based agg emits its columns.
    flat_names = [
        '{}_{}_{}'.format(key_prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]

    grouped = df.groupby(group_by).agg(agg)
    grouped.columns = flat_names
    grouped = grouped.reset_index()

    merged = df_merge.merge(grouped, on=group_by, how='left')

    # Release the intermediate aggregate before returning.
    del grouped
    gc.collect()

    return merged


def statis_feat(df_know, df_unknow):
    """Attach target-mean features computed on ``df_know`` to ``df_unknow``.

    For each of 'p1_census_register' and 'dpt', the mean of 'y1_is_purchase'
    per category is taken from ``df_know`` and left-joined onto ``df_unknow``
    via ``stat``. Returns the enriched frame.
    """
    target_mean = {'y1_is_purchase': ['mean']}
    for key_col in tqdm(['p1_census_register', 'dpt']):
        df_unknow = stat(df_know, df_unknow, [key_col], target_mean)

    return df_unknow

In [None]:
# 5-fold out-of-fold target statistics.
# Each labelled row receives target-mean features computed only from the other
# folds, so its own label never feeds its own feature.
df_train = df_feature[~df_feature['y1_is_purchase'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['y1_is_purchase'].isnull()]

# Collect the per-fold frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop starting from None.
fold_frames = []
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for train_index, val_index in kfold.split(df_train, df_train['y1_is_purchase']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    fold_frames.append(statis_feat(df_fold_train, df_fold_val))

    del df_fold_train, df_fold_val
    gc.collect()

# Unlabelled rows use statistics from the full labelled set.
df_test = statis_feat(df_train, df_test)

# Every piece comes back from a merge with its own 0..n-1 index; ignore_index
# gives the combined frame unique row labels instead of repeated ones.
df_feature = pd.concat(fold_frames + [df_test], axis=0, ignore_index=True)

del fold_frames, df_train, df_test
gc.collect()

In [None]:
# Preview the frame after the target-mean features were added.
df_feature.head()

# 模型训练

In [None]:
# Integer-encode every object-typed column except 'carid' and 'regdate'.
# Values are cast to str first, so missing values become their own category.
object_cols = [
    col for col in df_feature.select_dtypes('object').columns
    if col not in ['carid', 'regdate']
]
for col in object_cols:
    encoder = LabelEncoder()
    as_text = df_feature[col].astype('str')
    df_feature[col] = encoder.fit_transform(as_text).astype('int')

In [None]:
# Split back into labelled (train) and unlabelled (test) rows by target presence.
has_label = df_feature['y1_is_purchase'].notnull()
df_train = df_feature[has_label]
df_test = df_feature[~has_label]

In [None]:
ycol = 'y1_is_purchase'
# Train on every column except the target, 'regdate' and 'carid'.
feature_names = [
    col for col in df_train.columns if col not in [ycol, 'regdate', 'carid']]

# NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0;
# it is not set here, so subsample=0.8 is presumably a no-op - confirm.
# metric=None is kept as-is; the evaluation metric is passed to fit() through
# eval_metric='auc'.
model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           metric=None)

oof = []
# .copy() gives `prediction` its own data, so adding the 'label' column never
# writes into a slice of df_test (the resulting SettingWithCopyWarning would be
# hidden by the global warnings filter).
prediction = df_test[['carid']].copy()
# Float accumulator for the fold-averaged positive-class probability.
prediction['label'] = 0.0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
# Average over however many folds the splitter produces, rather than a
# separately hard-coded 5 that could drift from n_splits.
n_folds = kfold.get_n_splits()
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(
        df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # NOTE(review): the `verbose` and `early_stopping_rounds` fit() arguments
    # belong to the pre-4.0 LightGBM API; when upgrading, replace them with
    # callbacks (lgb.log_evaluation / lgb.early_stopping).
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid'],
                          eval_set=[(X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Out-of-fold predictions for this fold's validation rows.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_train.iloc[val_idx][[
        'carid', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Each fold contributes an equal share to the final test prediction.
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['label'] += pred_test / n_folds

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()

In [None]:
# Mean feature importance across folds, most important first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

In [None]:
# Stack the per-fold validation predictions and compute a single ROC AUC over
# all out-of-fold rows.
df_oof = pd.concat(oof)
score = roc_auc_score(df_oof['y1_is_purchase'], df_oof['pred'])
score

In [None]:
# Display the out-of-fold AUC again.
score

In [None]:
# Inspect the first 20 out-of-fold predictions next to their true labels.
df_oof.head(20)

In [None]:
# Preview the fold-averaged test predictions.
prediction.head()

In [None]:
# Write the submission twice: once under a file name carrying the OOF AUC
# (keeps a history of runs) and once as the fixed name sub/sub.csv.
os.makedirs('sub', exist_ok=True)
for out_path in (f'sub/{score}.csv', 'sub/sub.csv'):
    prediction.to_csv(out_path, index=False)