In [None]:
import warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

import gc
import os

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec
from sklearn.feature_selection import SelectPercentile, f_classif, chi2

# Silence every warning notebook-wide (this also hides library deprecation notices).
warnings.simplefilter('ignore')
# Enable tqdm's pandas integration.
tqdm.pandas()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Display settings: show every column and row, and up to 200 characters per cell.
# The fully qualified "display." names are used on purpose: short patterns such
# as 'max_columns' are regex-matched against all option names and raise
# OptionError on pandas versions where more than one option matches
# (e.g. 'styler.render.max_columns').
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

In [None]:
# Global random seed, passed to Word2Vec, TruncatedSVD and StratifiedKFold below.
seed = 2020

In [None]:
# Load the raw data
df_train_label = pd.read_csv('raw_data/train/train_label.csv')
df_train_base = pd.read_csv('raw_data/train/train_base.csv')
df_train_trans = pd.read_csv('raw_data/train/train_trans.csv')
df_train_op = pd.read_csv('raw_data/train/train_op.csv')

df_test_base = pd.read_csv('raw_data/test_b/testb_base.csv')
df_test_trans = pd.read_csv('raw_data/test_b/testb_trans.csv')
df_test_op = pd.read_csv('raw_data/test_b/testb_op.csv')

# Stack train and test logs so that per-user features are computed in one pass.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_trans = pd.concat([df_train_trans, df_test_trans]).reset_index(drop=True)
df_op = pd.concat([df_train_op, df_test_op]).reset_index(drop=True)

In [None]:
def parse_time(tm):
    """Turn a relative time string into an absolute ``datetime``.

    ``tm`` must consist of three space-separated tokens: an integer day
    count, a word that is ignored, and a clock time ``HH:MM:SS`` optionally
    followed by ``.fraction`` (the fraction is discarded). The result is
    2020-01-01 at that clock time, shifted forward by the day count.
    """
    day_count, _, clock = tm.split(' ')
    clock = clock.split('.')[0]
    anchor = datetime.strptime('2020-1-1 ' + clock, '%Y-%m-%d %H:%M:%S')
    return anchor + timedelta(days=int(day_count))


# Add an absolute timestamp plus its day-of-month and hour-of-day to both logs
# (transactions first, then operations).
for frame in (df_trans, df_op):
    frame['date'] = frame['tm_diff'].apply(parse_time)
    frame['day'] = frame['date'].dt.day
    frame['hour'] = frame['date'].dt.hour

In [None]:
# Order both logs chronologically within each user; the per-user diff()
# features computed further down depend on this ordering.
df_trans = df_trans.sort_values(['user', 'date']).reset_index(drop=True)
df_op = df_op.sort_values(['user', 'date']).reset_index(drop=True)

In [None]:
# Preview the first rows of the training base table.
df_train_base.head()

In [None]:
# Attach labels to the training base table. No `on` is given, so the join uses
# every column the two frames have in common.
df_train = df_train_base.merge(df_train_label, how='left')
df_test = df_test_base

# Train rows followed by test rows in a single frame so that features are built
# once for both; the rows are split again later on whether `label` is null.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_feature = pd.concat([df_train, df_test])

In [None]:
def select_feature(df, select_feature, ycol, p):
    """Return the candidate features rejected by a chi2 + ANOVA-F percentile filter.

    Args:
        df: frame holding the candidate columns and the target column.
        select_feature: list of candidate feature column names.
        ycol: name of the target column.
        p: percentile of features each selector keeps.

    Returns:
        Sorted list of the names in ``select_feature`` that were not kept by
        BOTH selectors, i.e. the features to delete.
    """
    # Missing values are scored as 0.
    features = df[select_feature].fillna(0)
    target = df[ycol]

    chi2_selector = SelectPercentile(chi2, percentile=p).fit(features, target)
    f_selector = SelectPercentile(f_classif, percentile=p).fit(features, target)

    chi2_mask = chi2_selector.get_support()
    print('Chi2 selected {} features.'.format(chi2_mask.sum()))
    f_mask = f_selector.get_support()
    print('F_classif selected {} features.'.format(f_mask.sum()))

    # A feature survives only when both tests keep it.
    keep_mask = chi2_mask & f_mask
    print('Chi2 & F_classif selected {} features'.format(keep_mask.sum()))
    kept = {name for name, keep in zip(select_feature, keep_mask) if keep}

    return sorted(set(select_feature) - kept)

# 特征工程

In [None]:
# Cache directories used by w2v_emb: saved Word2Vec models and pickled embeddings.
for cache_dir in ('model', 'embedding'):
    os.makedirs(cache_dir, exist_ok=True)


def w2v_emb(df, f1, f2):
    """Embed each ``f1`` group as the mean Word2Vec vector of its ``f2`` values.

    Rows of ``df`` are grouped by ``f1``; the ``f2`` values of a group, cast to
    ``str`` and kept in row order, form one "sentence". A Word2Vec model is
    trained on all sentences and every group is represented by the average
    vector of its in-vocabulary tokens (a zero vector when it has none).

    Both the model (``model/w2v_<f1>_<f2>_16.m``) and the resulting frame
    (``embedding/<f1>_<f2>_16.pkl``) are cached on disk. The cache key does not
    depend on the data, so stale files must be removed by hand when ``df``
    changes.

    Args:
        df: source frame containing the columns ``f1`` and ``f2``.
        f1: name of the grouping column.
        f2: name of the column whose values are used as tokens.

    Returns:
        DataFrame with one row per ``f1`` value: the column ``f1`` followed by
        ``<f1>_<f2>_emb_0`` .. ``<f1>_<f2>_emb_15``.
    """
    emb_size = 16

    model_path = 'model/w2v_{}_{}_{}.m'.format(f1, f2, emb_size)
    embedding_path = 'embedding/{}_{}_{}.pkl'.format(f1, f2, emb_size)

    if os.path.exists(embedding_path):
        # Cache file written by an earlier run of this function.
        return pd.read_pickle(embedding_path)

    # One token list per f1 value. groupby(...).apply(list) replaces the
    # dict-renaming agg({'name': list}), which current pandas rejects.
    grouped = df.groupby(f1)[f2].apply(list)
    keys = pd.DataFrame({f1: grouped.index})
    sentences = [[str(x) for x in seq] for seq in grouped]

    if os.path.exists(model_path):
        model = Word2Vec.load(model_path)
    else:
        w2v_kwargs = dict(window=5, min_count=5, sg=0, hs=1, seed=seed)
        try:
            # gensim < 4 calls the embedding dimensionality `size` ...
            model = Word2Vec(sentences, size=emb_size, **w2v_kwargs)
        except TypeError:
            # ... gensim >= 4 renamed it to `vector_size`.
            model = Word2Vec(sentences, vector_size=emb_size, **w2v_kwargs)
        model.save(model_path)

    # Look tokens up through model.wv: membership tests and indexing on the
    # Word2Vec object itself were removed in gensim 4.
    word_vectors = model.wv

    emb_matrix = []
    for seq in tqdm(sentences):
        # Tokens rarer than min_count are not in the vocabulary and are skipped.
        vecs = [word_vectors[w] for w in seq if w in word_vectors]
        if vecs:
            emb_matrix.append(np.mean(vecs, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    df_emb = pd.DataFrame(emb_matrix)
    df_emb.columns = [
        '{}_{}_emb_{}'.format(f1, f2, i) for i in range(emb_size)
    ]

    # keys and df_emb share the same 0..n-1 index, so this is a row-wise join.
    embedding = pd.concat([keys, df_emb], axis=1)
    embedding.to_pickle(embedding_path)

    return embedding

In [None]:
def tfidf_emb(df, f1, f2):
    """Embed each ``f1`` group's ``f2`` values with TF-IDF followed by truncated SVD.

    The ``f2`` values of every ``f1`` group are joined with commas into one
    document, vectorised with ``TfidfVectorizer`` and reduced to 10 components
    with ``TruncatedSVD`` (seeded by the module-level ``seed``).

    Side effect: ``df[f2]`` is overwritten with its string form, with missing
    values replaced by ``'-1'``.

    Args:
        df: source frame containing the columns ``f1`` and ``f2``.
        f1: name of the grouping column.
        f2: name of the column whose values form the documents.

    Returns:
        DataFrame with one row per ``f1`` value: the column ``f1`` followed by
        ``svd_tfidf_<f2>_0`` .. ``svd_tfidf_<f2>_9``.
    """
    emb_size = 10

    # Fill missing values BEFORE casting: astype(str) turns NaN into the
    # literal 'nan', so the previous astype-then-fillna order never replaced
    # anything.
    # NOTE(review): the vectorizer's default token pattern keeps only tokens of
    # two or more word characters, so the '-1' placeholder probably contributes
    # no token of its own - confirm this is acceptable.
    df[f2] = df[f2].fillna('-1').astype(str)

    docs = df.groupby(f1)[f2].apply(list).reset_index()
    docs.columns = [f1, 'list']
    docs['list'] = docs['list'].apply(','.join)

    tfidf_vec = TfidfVectorizer().fit_transform(docs['list'])
    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    vec_svd = pd.DataFrame(svd_enc.fit_transform(tfidf_vec))
    vec_svd.columns = ['svd_tfidf_{}_{}'.format(
        f2, i) for i in range(emb_size)]

    # docs and vec_svd share the same 0..n-1 index, so this is a row-wise join.
    group_df = pd.concat([docs, vec_svd], axis=1)
    del group_df['list']
    return group_df

In [None]:
def countvec_emb(df, f1, f2):
    """Embed each ``f1`` group's ``f2`` values with token counts followed by truncated SVD.

    The ``f2`` values of every ``f1`` group are joined with commas into one
    document, vectorised with ``CountVectorizer`` and reduced to 10 components
    with ``TruncatedSVD`` (seeded by the module-level ``seed``).

    Side effect: ``df[f2]`` is overwritten with its string form, with missing
    values replaced by ``'-1'``.

    Args:
        df: source frame containing the columns ``f1`` and ``f2``.
        f1: name of the grouping column.
        f2: name of the column whose values form the documents.

    Returns:
        DataFrame with one row per ``f1`` value: the column ``f1`` followed by
        ``svd_countvec_<f2>_0`` .. ``svd_countvec_<f2>_9``.
    """
    emb_size = 10

    # Fill missing values BEFORE casting: astype(str) turns NaN into the
    # literal 'nan', so the previous astype-then-fillna order never replaced
    # anything.
    # NOTE(review): the vectorizer's default token pattern keeps only tokens of
    # two or more word characters, so the '-1' placeholder probably contributes
    # no token of its own - confirm this is acceptable.
    df[f2] = df[f2].fillna('-1').astype(str)

    docs = df.groupby(f1)[f2].apply(list).reset_index()
    docs.columns = [f1, 'list']
    docs['list'] = docs['list'].apply(','.join)

    count_vec = CountVectorizer().fit_transform(docs['list'])
    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    vec_svd = pd.DataFrame(svd_enc.fit_transform(count_vec))
    vec_svd.columns = ['svd_countvec_{}_{}'.format(
        f2, i) for i in range(emb_size)]

    # docs and vec_svd share the same 0..n-1 index, so this is a row-wise join.
    group_df = pd.concat([docs, vec_svd], axis=1)
    del group_df['list']
    return group_df

## 操作信息特征

In [None]:
# Preview the first rows of the combined operation log.
df_op.head()

In [None]:
# TF-IDF/SVD embeddings of each user's op_mode and op_type values.
for key_col, token_col in (('user', 'op_mode'), ('user', 'op_type')):
    tfidf_feats = tfidf_emb(df_op, key_col, token_col)
    df_feature = df_feature.merge(tfidf_feats, on=key_col, how='left')

In [None]:
# Count-vector/SVD embeddings of each user's op_mode and op_type values.
for key_col, token_col in (('user', 'op_mode'), ('user', 'op_type')):
    countvec_feats = countvec_emb(df_op, key_col, token_col)
    df_feature = df_feature.merge(countvec_feats, on=key_col, how='left')

In [None]:
# Per-user number of operations for every distinct channel / op_mode value
# (one output column per value, 0 where the user never had that value).
for col in ['channel', 'op_mode']:
    counts = df_op[['user', col]].copy()
    counts['tmp'] = 1  # constant to sum, so the pivot yields row counts
    counts = counts.pivot_table(
        index='user', columns=col, values='tmp', aggfunc=np.sum)
    counts = counts.reset_index().fillna(0)
    counts.columns = [
        c if c == 'user' else 'op_{}_{}_count'.format(col, c)
        for c in counts.columns
    ]
    df_feature = df_feature.merge(counts, how='left')

In [None]:
# Per-user mean hour-of-day for every distinct op_type value; users that never
# performed a given op_type get 0 in that column.
for col in ['op_type']:
    hour_means = df_op[['user', 'hour', col]].copy()
    hour_means = hour_means.pivot_table(
        index='user', columns=col, values='hour', aggfunc=np.mean)
    hour_means = hour_means.reset_index().fillna(0)
    hour_means.columns = [
        c if c == 'user' else 'op_{}_{}_hour_mean'.format(col, c)
        for c in hour_means.columns
    ]
    df_feature = df_feature.merge(hour_means, how='left')

In [None]:
# Pick one op_device per user: after the ascending sort, keep='last' retains
# the device that sorts last among those the user was seen with.
# NOTE(review): the per-(user, device) count produced by size() is discarded
# before sorting, so this is NOT the most frequently used device - confirm
# that this is the intended feature.
df_temp = df_op.groupby(['user', 'op_device']).size().reset_index()
df_temp = df_temp.drop(columns=[0])  # the size() result column is named 0
# `ascending` must be boolean. The previous ['asc', 'asc'] strings only worked
# because non-empty strings are truthy, and newer pandas rejects them.
df_temp = df_temp.sort_values(by=['user', 'op_device'], ascending=True)
df_temp = df_temp.drop_duplicates('user', keep='last')
df_feature = df_feature.merge(df_temp, how='left')

In [None]:
# Spread of each user's operation hour.
op_hour_stats = (
    df_op.groupby('user')['hour'].std().rename('op_hour_std').reset_index()
)
df_feature = df_feature.merge(op_hour_stats, how='left')

# Spread and maximum of each user's operation day.
op_day_stats = df_op.groupby('user')['day'].agg(['std', 'max'])
op_day_stats.columns = ['op_day_std', 'op_day_max']
df_feature = df_feature.merge(op_day_stats.reset_index(), how='left')

In [None]:
# Time elapsed since the same user's previous operation (df_op was sorted by
# user and date in an earlier cell; each user's first row gets NaT).
df_op['date_diff'] = df_op.groupby('user')['date'].diff()
# total_seconds() is the full elapsed time. `.dt.seconds` only returns the
# sub-day component (0..86399) and silently drops whole days, which made the
# hour/day gaps below wrong for any gap of a day or more.
df_op['op_second_diff'] = df_op['date_diff'].dt.total_seconds()
df_op['op_hour_diff'] = df_op['op_second_diff'] / 3600
df_op['op_day_diff'] = df_op['op_hour_diff'] / 24

df_temp = df_op.groupby('user')['op_second_diff'].agg(
    op_second_diff_mean='mean').reset_index()
df_feature = df_feature.merge(df_temp, how='left')

In [None]:
# Preview the feature table after the operation-log features were merged in.
df_feature.head()

## 交易信息特征

In [None]:
# Preview the first rows of the combined transaction log.
df_trans.head()

In [None]:
# Word2Vec embedding of each user's sequence of transaction amounts.
for key_col, token_col in (('user', 'amount'),):
    w2v_feats = w2v_emb(df_trans, key_col, token_col)
    df_feature = df_feature.merge(w2v_feats, on=key_col, how='left')

In [None]:
# Per-user statistics of the transaction amount, pivoted so that every distinct
# value of the category column becomes its own feature column.
# Each entry: (aggregation, output-name template, category columns).
amount_pivot_specs = [
    ('sum', 'trans_{}_{}_amount_sum', ['platform', 'type1', 'type2', 'hour']),
    ('mean', 'trans_{}_{}_amount_mean', ['type1', 'hour']),
    ('max', 'trans_{}_{}_amount_max', ['type1']),
    ('min', 'trans_{}_{}_amount_min', ['type1']),
]

for aggfunc, name_template, category_cols in amount_pivot_specs:
    for col in category_cols:
        pivoted = df_trans.pivot_table(
            index='user', columns=col, values='amount', aggfunc=aggfunc)
        pivoted = pivoted.reset_index()
        pivoted.columns = [
            c if c == 'user' else name_template.format(col, c)
            for c in pivoted.columns
        ]
        df_feature = df_feature.merge(pivoted, how='left')

In [None]:
# Per-user amount statistics over the rows whose day is greater than
# 31 - window (window=31 therefore keeps every row with day > 0).
for window in [31, 3, 5, 10, 15]:
    recent = df_trans[df_trans['day'] > 31 - window]
    # Named aggregation (output_name=func), as used elsewhere in this notebook.
    # Passing a {name: func} dict to SeriesGroupBy.agg raises
    # SpecificationError ("nested renamer is not supported") on pandas >= 1.0.
    df_temp = recent.groupby('user')['amount'].agg(**{
        'trans_amount_mean_{}d'.format(window): 'mean',
        'trans_amount_std_{}d'.format(window): 'std',
        'trans_amount_max_{}d'.format(window): 'max',
        'trans_amount_min_{}d'.format(window): 'min',
        'trans_amount_sum_{}d'.format(window): 'sum',
    }).reset_index()
    df_feature = df_feature.merge(df_temp, how='left')

In [None]:
# Per-user amount sum for each type1 value, restricted to the rows whose day
# is greater than 31 - window.
for window in [3, 5, 10]:
    recent = df_trans[df_trans['day'] > 31-window]
    for col in ['type1']:
        window_sums = recent.pivot_table(
            index='user', columns=col, values='amount', aggfunc='sum')
        window_sums = window_sums.reset_index()
        window_sums.columns = [
            c if c == 'user' else 'trans_{}_{}_amount_sum_{}d'.format(
                col, c, window)
            for c in window_sums.columns
        ]
        df_feature = df_feature.merge(window_sums, how='left')

In [None]:
# Per-user number of transactions with a non-null ip / ip_3 value.
for ip_col, feature_name in (('ip', 'trans_ip_count'),
                             ('ip_3', 'trans_ip_3_count')):
    ip_counts = df_trans.groupby(['user'])[ip_col].count()
    ip_counts = ip_counts.rename(feature_name).reset_index()
    df_feature = df_feature.merge(ip_counts, how='left')

In [None]:
# Spread of each user's transaction hour.
trans_hour_stats = (
    df_trans.groupby('user')['hour'].std().rename('trans_hour_std').reset_index()
)
df_feature = df_feature.merge(trans_hour_stats, how='left')

In [None]:
# Time elapsed since the same user's previous transaction (df_trans was sorted
# by user and date in an earlier cell; each user's first row gets NaT).
df_trans['date_diff'] = df_trans.groupby('user')['date'].diff()
# total_seconds() is the full elapsed time. `.dt.seconds` only returns the
# sub-day component (0..86399), which capped every "day" gap below 1 day.
df_trans['trans_second_diff'] = df_trans['date_diff'].dt.total_seconds()
df_trans['trans_hour_diff'] = df_trans['trans_second_diff'] / 3600
df_trans['trans_day_diff'] = df_trans['trans_hour_diff'] / 24

df_temp = df_trans.groupby('user')['trans_day_diff'].agg(
    trans_day_diff_mean='mean', trans_day_diff_std='std').reset_index()
df_feature = df_feature.merge(df_temp, how='left')

In [None]:
# Preview the feature table after the transaction features were merged in.
df_feature.head()

## 基本信息

In [None]:
# These columns hold strings whose second space-separated token is an integer;
# replace each string with that integer.
for f in [
        'balance', 'balance_avg', 'balance1', 'balance1_avg', 'balance2',
        'balance2_avg', 'product1_amount', 'product2_amount',
        'product3_amount', 'product4_amount', 'product5_amount', 'product6_amount'
]:
    # Only strings are parsed. Missing values (float NaN) pass through
    # unchanged, and so do values that are already numeric, which keeps the
    # cell safe to re-run. This also drops the np.NaN alias, which NumPy 2.0
    # removed.
    df_feature[f] = df_feature[f].apply(
        lambda x: int(x.split(' ')[1]) if isinstance(x, str) else x)

In [None]:
# Ratio and total features derived from the base-table counters.
# Share of failed product7 attempts.
df_feature['product7_fail_ratio'] = (
    df_feature['product7_fail_cnt'] / df_feature['product7_cnt'])

# Total number of cards across the four card types.
card_total = df_feature['card_a_cnt']
for card_col in ['card_b_cnt', 'card_c_cnt', 'card_d_cnt']:
    card_total = card_total + df_feature[card_col]
df_feature['card_cnt'] = card_total

# Accounts per card.
df_feature['acc_card_ratio'] = df_feature['acc_count'] / df_feature['card_cnt']

# Total logins and the share contributed by each period.
login_total = df_feature['login_cnt_period1'] + df_feature['login_cnt_period2']
df_feature['login_cnt'] = login_total
df_feature['login_cnt_period2_login_cnt_ratio'] = (
    df_feature['login_cnt_period2'] / df_feature['login_cnt'])
df_feature['login_cnt_period1_login_cnt_ratio'] = (
    df_feature['login_cnt_period1'] / df_feature['login_cnt'])

# using_time relative to each operation counter.
usage_time = df_feature['using_time']
df_feature['using_time_op2_cnt_ratio'] = usage_time / df_feature['op2_cnt']
df_feature['using_time_op1_cnt_ratio'] = usage_time / df_feature['op1_cnt']

In [None]:
# Fraud-rate features
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` by ``group_by`` and left-join the result onto ``df_merge``.

    Args:
        df: frame the statistics are computed from.
        df_merge: frame that receives the statistics.
        group_by: list of column names to group and join on.
        agg: mapping ``{column: [method, ...]}`` passed to ``groupby().agg``.

    Returns:
        A new frame: ``df_merge`` plus one column per (column, method) pair,
        named ``<group_by joined by '_'>_<column>_<method>``. Rows whose key
        does not occur in ``df`` get NaN.
    """
    prefix = '_'.join(group_by)
    group = df.groupby(group_by).agg(agg)
    # Flatten the (column, method) result columns into plain names.
    group.columns = [
        '{}_{}_{}'.format(prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]
    group = group.reset_index()
    merged = df_merge.merge(group, on=group_by, how='left')

    del group
    gc.collect()
    return merged


def statis_feat(df_know, df_unknow):
    """Add mean-label features computed on ``df_know`` to ``df_unknow``.

    For each key combination below, the mean of ``label`` per group is taken
    from ``df_know`` and left-joined onto ``df_unknow`` via ``stat``.

    Args:
        df_know: frame with a ``label`` column, used to compute the means.
        df_unknow: frame that receives the new columns.

    Returns:
        ``df_unknow`` extended with one ``<keys>_label_mean`` column per key
        combination.
    """
    group_keys = [
        ['province'],
        ['city'],
        ['city', 'level'],
        ['op_device'],
        ['age', 'op_device'],
        ['using_time'],
        ['city', 'op_device'],
        ['age', 'city'],
        ['op_device', 'level'],
    ]
    for keys in group_keys:
        df_unknow = stat(df_know, df_unknow, keys, {'label': ['mean']})

    return df_unknow


# Split back into labelled (train) and unlabelled (test) rows.
has_label = df_feature['label'].notnull()
df_train = df_feature[has_label].reset_index(drop=True)
df_test = df_feature[~has_label]

# Train rows get their label means out-of-fold: the statistics for each
# validation fold are computed from the other four folds only.
kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
fold_frames = []
for train_index, val_index in kf.split(df_train, df_train['label']):
    fold_frames.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index]))
    gc.collect()
df_stas_feat = pd.concat(fold_frames, axis=0)

# Test rows use statistics computed from the whole training set.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0).reset_index(drop=True)

del fold_frames, has_label
del df_stas_feat, df_train, df_test
gc.collect()

In [None]:
# Persist the finished feature table. The output directory is created first
# because to_pickle does not create missing parent directories.
os.makedirs('data', exist_ok=True)
df_feature.to_pickle('data/feature2.pkl')