In [1]:
import warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

import gc
import os

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides pandas / LightGBM deprecation warnings.
warnings.simplefilter('ignore')
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

# Widen pandas' display limits so wide feature tables are shown in full.
# Use fully-qualified option keys: the short forms ('max_columns',
# 'max_rows') are treated as regex patterns and match more than one option
# on pandas >= 1.4 (e.g. 'styler.render.max_columns'), which raises
# OptionError. The full keys work on old and new pandas alike.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 200)

In [2]:
# Random seed shared by the StratifiedKFold splitters and the LightGBM model.
seed = 2020

In [3]:
# Load the raw data: labels, base (per-user) tables and transaction logs.
df_train_label = pd.read_csv('raw_data/train/train_label.csv')
df_train_base = pd.read_csv('raw_data/train/train_base.csv')
df_train_trans = pd.read_csv('raw_data/train/train_trans.csv')

df_test_base = pd.read_csv('raw_data/test_a/test_a_base.csv')
df_test_trans = pd.read_csv('raw_data/test_a/test_a_trans.csv')

# Stack train and test transactions into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result on old and new pandas, and `ignore_index=True` replaces the separate
# `reset_index(drop=True)` step.
df_trans = pd.concat([df_train_trans, df_test_trans],
                     ignore_index=True,
                     sort=False)

In [4]:
def parse_time(tm):
    """Convert a '<N> days HH:MM:SS.fffffffff' string into a datetime.

    The day count is applied as an offset from 2020-01-01 and the clock part
    is kept at second resolution (everything after the '.' is discarded).

    Parameters
    ----------
    tm : str
        Three space-separated tokens: day count, a unit word (ignored) and
        the clock time.

    Returns
    -------
    datetime.datetime
        2020-01-01 + N days at the given clock time.
    """
    day_offset, _unit, clock = tm.split(' ')
    whole_seconds = clock.split('.')[0]

    anchor = datetime.strptime('2020-1-1 ' + whole_seconds,
                               '%Y-%m-%d %H:%M:%S')
    return anchor + timedelta(days=int(day_offset))


# Derive calendar columns from the relative `tm_diff` offset:
#   date - synthetic timestamp anchored on 2020-01-01 (see parse_time)
#   day  - day of month of that synthetic timestamp
#   hour - hour of day
df_trans = df_trans.assign(
    date=lambda d: d['tm_diff'].apply(parse_time),
    day=lambda d: d['date'].dt.day,
    hour=lambda d: d['date'].dt.hour,
)

In [5]:
# Order transactions chronologically within each user and renumber the rows.
df_trans = (
    df_trans
    .sort_values(['user', 'date'])
    .reset_index(drop=True)
)

In [6]:
# Preview the sorted transaction table, including the derived date/day/hour.
df_trans.head()

Unnamed: 0,user,platform,tunnel_in,tunnel_out,amount,type1,ip,type2,ip_3,tm_diff,date,day,hour
0,TestA_00001,46c69cbbce5f1568,b2e7fa260df4998d,6ee790756007e69a,84299,45a1168437c708ff,,11a213398ee0c623,,2 days 09:38:22.000000000,2020-01-03 09:38:22,3,9
1,TestA_00001,46c69cbbce5f1568,b2e7fa260df4998d,6ee790756007e69a,100537,45a1168437c708ff,,11a213398ee0c623,,19 days 12:50:46.000000000,2020-01-20 12:50:46,20,12
2,TestA_00001,46c69cbbce5f1568,b2e7fa260df4998d,6ee790756007e69a,103071,45a1168437c708ff,,11a213398ee0c623,,19 days 12:50:47.000000000,2020-01-20 12:50:47,20,12
3,TestA_00001,46c69cbbce5f1568,b2e7fa260df4998d,6ee790756007e69a,47289,45a1168437c708ff,,11a213398ee0c623,,20 days 12:33:42.000000000,2020-01-21 12:33:42,21,12
4,TestA_00002,42573d7287a8c9c2,,6ee790756007e69a,41187,f67d4b5a05a1352a,,,,11 days 17:51:53.000000000,2020-01-12 17:51:53,12,17


In [7]:
# Attach labels to the training base table. With no `on=` argument the merge
# joins on every column the two frames have in common.
df_train = df_train_base.merge(df_train_label, how='left')
df_test = df_test_base

# Stack train rows above test rows so features are built once for both.
# Columns present in only one frame are filled with NaN for the other.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the drop-in
# replacement and works on older pandas as well.
df_feature = pd.concat([df_train, df_test], sort=False)

# 特征工程

## 交易信息特征

In [8]:
# Per-user summary statistics of the transaction amount
# (mean / std / sum / max / min), one row per user.
df_temp = (
    df_trans
    .groupby(['user'])['amount']
    .agg(
        amount_mean='mean',
        amount_std='std',
        amount_sum='sum',
        amount_max='max',
        amount_min='min',
    )
    .reset_index()
)
# Left-join onto the feature table; users without transactions get NaN.
df_feature = df_feature.merge(df_temp, how='left')

## 基本信息

In [9]:
def _parse_level(value):
    """Return the integer after the first space of a '<word> <int>' string.

    Non-string inputs are passed through unchanged (NaN stays NaN and values
    that are already numeric are kept), and None becomes NaN, so re-running
    the cell on already-converted columns is harmless.
    """
    if isinstance(value, str):
        return int(value.split(' ')[1])
    return np.nan if value is None else value


# Convert the string-encoded balance / product columns to numbers.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
for f in [
        'balance', 'balance_avg', 'balance1', 'balance1_avg', 'balance2',
        'balance2_avg', 'product1_amount', 'product2_amount',
        'product3_amount', 'product4_amount', 'product5_amount', 'product6_amount'
]:
    df_feature[f] = df_feature[f].apply(_parse_level)

In [10]:
# Ratio of product7_fail_cnt to product7_cnt.
df_feature['product7_fail_ratio'] = df_feature['product7_fail_cnt'].div(
    df_feature['product7_cnt'])

# Sum of the four card counters; a missing value in any of them gives NaN.
df_feature['card_cnt'] = (
    df_feature['card_a_cnt']
    .add(df_feature['card_b_cnt'])
    .add(df_feature['card_c_cnt'])
    .add(df_feature['card_d_cnt'])
)

# acc_count relative to the total card count.
df_feature['acc_card_ratio'] = df_feature['acc_count'].div(
    df_feature['card_cnt'])

# Logins summed over the two periods.
df_feature['login_cnt'] = df_feature['login_cnt_period1'].add(
    df_feature['login_cnt_period2'])

In [11]:
# Fraud rate
def stat(df, df_merge, group_by, agg):
    """Aggregate `df` by `group_by` and left-join the result onto `df_merge`.

    Parameters
    ----------
    df : DataFrame
        Frame the statistics are computed from.
    df_merge : DataFrame
        Frame that receives the new columns.
    group_by : list of str
        Key columns used both for grouping and for the join.
    agg : dict
        Maps a column name to a list of aggregation method names.

    Returns
    -------
    DataFrame
        `df_merge` plus one column per (column, method) pair, named
        '<group_by joined by "_">_<column>_<method>'. Keys that do not
        occur in `df` get NaN.
    """
    stats = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) MultiIndex into single-level names.
    prefix = '_'.join(group_by)
    stats.columns = [
        '{}_{}_{}'.format(prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]
    stats = stats.reset_index()
    merged = df_merge.merge(stats, on=group_by, how='left')

    del stats
    gc.collect()
    return merged


def statis_feat(df_know, df_unknow):
    """Add the mean label per province and per city to `df_unknow`.

    The means are computed on `df_know` only and joined onto `df_unknow`
    as 'province_label_mean' and 'city_label_mean'.
    """
    for key in ('province', 'city'):
        df_unknow = stat(df_know, df_unknow, [key], {'label': ['mean']})

    return df_unknow


# Labelled rows are the training set; rows without a label are the test set.
df_train = df_feature[df_feature['label'].notnull()].reset_index(drop=True)
df_test = df_feature[df_feature['label'].isnull()]

# Out-of-fold encoding: each validation fold gets label means computed from
# the other four folds only, so a row never sees its own label.
kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
fold_frames = []
for fit_idx, holdout_idx in kf.split(df_train, df_train['label']):
    fold_frames.append(
        statis_feat(df_train.iloc[fit_idx], df_train.iloc[holdout_idx]))
    gc.collect()
df_stas_feat = pd.concat(fold_frames, axis=0)

# Test rows are encoded with statistics from the full training set.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test],
                       axis=0).reset_index(drop=True)

# Drop the intermediates to free memory.
del fold_frames
del df_stas_feat
del df_train
del df_test
gc.collect()

0

In [12]:
# Preview the feature table, including the new *_label_mean columns.
df_feature.head()

Unnamed: 0,user,sex,age,provider,level,verified,using_time,regist_type,card_a_cnt,card_b_cnt,card_c_cnt,agreement1,op1_cnt,op2_cnt,card_d_cnt,agreement_total,service1_cnt,service1_amt,service2_cnt,agreement2,agreement3,agreement4,acc_count,login_cnt_period1,login_cnt_period2,ip_cnt,login_cnt_avg,login_days_cnt,province,city,balance,balance_avg,balance1,balance1_avg,balance2,balance2_avg,service3,service3_level,product1_amount,product2_amount,product3_amount,product4_amount,product5_amount,product6_amount,product7_cnt,product7_fail_cnt,label,amount_mean,amount_std,amount_sum,amount_max,amount_min,product7_fail_ratio,card_cnt,acc_card_ratio,login_cnt,province_label_mean,city_label_mean
0,Train_41088,category 0,24853,category 0,category 2,category 0,24731,category 7,24712,24712,24706,category 0,24712,24712,24706,24737,24706,24706,24706,category 1,category 0,category 0,24737,25394,25023,24791,24725,24791,c3e48f852a0da7b6,1b25064aa7fe4945,11,8.0,4,4.0,10,4,category 0,,2,9,1,0,0,5,24712,24706,1.0,44079.0,6399.927734,132237.0,47774.0,36689.0,0.999757,98836,0.250283,50417,0.360114,0.445455
1,Train_31781,category 1,24859,category 0,category 1,category 0,24717,category 1,24731,24712,24725,category 0,24731,24712,24706,24749,24706,24706,24706,category 0,category 0,category 0,24743,29840,26251,25011,24737,24974,6dd7071b6edc22d2,998bea7dacc1ac2e,1,1.0,1,1.0,1,1,category 0,,1,1,1,0,0,1,24706,24706,1.0,97000.0,85970.142515,291000.0,195918.0,40310.0,1.0,98874,0.250248,56091,0.270062,0.263736
2,Train_31874,category 0,24895,category 0,category 2,category 0,24707,category 1,24712,24712,24706,category 0,24719,24712,24706,24737,24706,24706,24706,category 1,category 1,category 0,24725,25163,25163,24804,24725,24829,71c3649e6dfc18fe,a18aa4ca362cb2c9,6,8.0,11,12.0,6,5,category 0,,1,1,1,0,0,1,24712,24706,0.0,28347.708333,4623.104598,680345.0,42553.0,24767.0,0.999757,98836,0.250162,50326,0.313309,0.295139
3,Train_17154,category 0,24846,category 0,category 2,category 0,24732,category 7,24719,24719,24706,category 0,24737,24712,24706,24749,24706,24706,24706,category 0,category 1,category 0,24712,25680,25005,24834,24719,24822,648024953e363510,e174abcd1cd8c033,7,7.0,1,1.0,7,5,category 0,,2,6,1,0,0,1,24712,24706,0.0,54089.933333,36418.750442,811349.0,156321.0,36689.0,0.999757,98850,0.249995,50685,0.215081,0.236908
4,Train_40531,category 1,24908,category 0,category 2,category 0,24728,category 1,24712,24712,24706,category 0,24725,24712,24706,24761,24706,24706,24706,category 0,category 1,category 1,24725,25577,25048,24761,24743,24761,21c43413032d5522,76cc98309376a6d2,2,2.0,3,3.0,1,1,category 0,,2,2,1,0,0,1,24712,24706,1.0,404972.666667,419810.655236,1214918.0,858932.0,30746.0,0.999757,98836,0.250162,50625,0.228516,0.138365


# 模型训练

In [13]:
# Integer-encode every object-dtype column except the user id.
# Values are cast to str first, so missing values become their own category.
categorical_cols = [
    col for col in df_feature.select_dtypes('object').columns
    if col != 'user'
]
for f in categorical_cols:
    df_feature[f] = LabelEncoder().fit_transform(df_feature[f].astype(str))

In [14]:
# Persist the engineered features. Create the output directory first so the
# cell also works on a fresh checkout, as the submission cell does for 'sub'.
os.makedirs('data', exist_ok=True)
df_feature.to_pickle('data/feature.pkl')

In [15]:
# Split back into labelled (train) and unlabelled (test) rows.
is_labelled = df_feature['label'].notna()
df_train = df_feature.loc[is_labelled].copy()
df_test = df_feature.loc[~is_labelled].copy()

df_train.shape, df_test.shape

((47782, 58), (24315, 58))

In [16]:
ycol = 'label'
# Every column except the target and the user id is used as a model input.
feature_names = list(
    filter(lambda x: x not in [ycol, 'user'], df_train.columns))

# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0,
# so row subsampling is presumably inactive here - confirm before relying on it.
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=10,
                           reg_lambda=12,
                           random_state=seed,
                           is_unbalance=True,
                           metric='auc')

# Out-of-fold predictions for the training rows. The column starts as float
# so writing probabilities into it never changes its dtype.
df_oof = df_train[['user', ycol]].copy()
df_oof['prob'] = 0.0
# Fold-averaged predictions for the test rows. `.copy()` makes this an
# independent frame, so adding a column is not a write into a slice of df_test.
prediction = df_test[['user']].copy()
prediction['prob'] = 0.0
df_importance_list = []

# Column position of 'prob', used for positional writes inside the loop.
prob_col = df_oof.columns.get_loc('prob')

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    # NOTE(review): `verbose` and `early_stopping_rounds` are `fit` arguments
    # only in LightGBM < 4.0; newer releases expect
    # callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)] instead.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=100,
                          early_stopping_rounds=50)

    # Column 1 of predict_proba is the second class (label 1 here).
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    # `val_idx` holds row positions, so write by position; label-based `.loc`
    # is only correct while df_train happens to have a 0..n-1 index.
    df_oof.iloc[val_idx, prob_col] = pred_val

    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    # Average the test predictions over the folds.
    prediction['prob'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.719676	valid's auc: 0.698623
[200]	train's auc: 0.731708	valid's auc: 0.704727
[300]	train's auc: 0.740915	valid's auc: 0.708396
[400]	train's auc: 0.748864	valid's auc: 0.711081
[500]	train's auc: 0.756819	valid's auc: 0.71332
[600]	train's auc: 0.763761	valid's auc: 0.714557
[700]	train's auc: 0.76987	valid's auc: 0.71521
[800]	train's auc: 0.775886	valid's auc: 0.715716
[900]	train's auc: 0.781564	valid's auc: 0.716062
[1000]	train's auc: 0.786915	valid's auc: 0.716472
[1100]	train's auc: 0.792077	valid's auc: 0.716754
[1200]	train's auc: 0.796878	valid's auc: 0.717306
[1300]	train's auc: 0.801879	valid's auc: 0.71751
[1400]	train's auc: 0.80624	valid's auc: 0.717794
[1500]	train's auc: 0.810552	valid's auc: 0.717991
[1600]	train's auc: 0.814734	valid's auc: 0.718238
[1700]	train's auc: 0.818738	valid's auc: 0.718484
[1800]	train's auc: 0.822545	valid's auc: 0.718589
[1900]	train's auc: 0.826348	vali

In [17]:
# Average each feature's importance over the folds, most important first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,amount_mean,2697.8
1,amount_min,2442.8
2,amount_sum,2433.2
3,city_label_mean,2322.4
4,amount_std,2041.6
5,amount_max,1955.0
6,city,1953.0
7,age,1867.8
8,province_label_mean,1867.8
9,login_cnt_period1,1713.2


In [18]:
# ROC AUC of the out-of-fold predictions over all training rows.
# `auc` is reused below in the submission file name.
auc = roc_auc_score(df_oof[ycol], df_oof['prob'])
print('auc:', auc)

auc: 0.7154059918761352


In [19]:
# Write the fold-averaged test predictions (columns: user, prob) to sub/.
# The out-of-fold AUC is embedded in the file name.
os.makedirs('sub', exist_ok=True)
prediction.to_csv('sub/yizhifu_{}.csv'.format(auc), index=False)