In [1]:
import os
import re
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import scorpyo as sp

from null_importance import get_null_importance


# Widen the notebook's DataFrame display limits.
# The fully-qualified option names are required: on pandas >= 1.4 the short
# patterns 'max_rows' / 'max_columns' also match 'styler.render.*' and raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 500, 'display.max_columns', 200)

In [2]:
# Root directory of the project; every other path below is derived from it.
path_project = r'/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification'

# Sub-directories for raw inputs, engineered datasets and results.
path_row_data, path_new_data, path_results = (
    os.path.join(path_project, folder)
    for folder in ('row_data', 'new_data', 'results')
)
path_results_jupyter = os.path.join(path_results, 'jupyter')

# Raw input files.
path_train, path_test, path_sample_submission = (
    os.path.join(path_row_data, filename)
    for filename in ('train.csv', 'evaluation_public.csv', 'submit_example.csv')
)

# Engineered train / test datasets.
path_new_train, path_new_test = (
    os.path.join(path_new_data, filename)
    for filename in ('train_lightgbm_20221014.csv', 'test_lightgbm_20221014.csv')
)

# Excel report passed as excel_path to the scorpyo helpers.
path_output_report = os.path.join(path_results, '01_原始数据探察_20221014.xlsx')

# Name of the target column.
y_label = "is_risk"

In [3]:
# Load the raw training and evaluation tables.
df_row_train = sp.read_data(path_train)
df_row_val  = sp.read_data(path_test)

# Split 'url' on '/' and keep the first two segments as separate features.
# The .str accessor yields NaN when a url is missing or has no '/', where the
# previous per-row x.split('/')[1] raised and aborted the whole cell.
for _frame in (df_row_train, df_row_val):
    _url_parts = _frame['url'].str.split('/')
    _frame['url_sit'] = _url_parts.str[0]
    _frame['url_page'] = _url_parts.str[1]

# Time-ordered copy of the training rows, kept for the error analysis at the
# end of the notebook. The original row labels are kept as they are.
df_train_info = df_row_train.copy().sort_values(by='op_datetime')

## 定义一次/一天/一月
1. op\_times\_groups 一次
2. op\_days 一天
3. op\_month 一月

In [4]:
# Gap thresholds (seconds) that start a new "session".
DEVICE_SESSION_GAP_SECONDS = 10    # per device
SYSTEM_SESSION_GAP_SECONDS = 400   # across the whole log

# Stack train and evaluation rows so the time-based features share one timeline.
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)

# Parse the timestamp BEFORE sorting so the order is chronological rather than
# lexicographic; a stable sort keeps the order of equal timestamps deterministic.
df['op_datetime'] = pd.to_datetime(df['op_datetime'])
df = df.sort_values(by='op_datetime', kind='mergesort')

# Calendar day of each operation.
df['op_days'] = df['op_datetime'].dt.strftime('%Y-%m-%d')

# Later cells group by 'op_month'. Derive it only when the input does not
# already provide it, so an existing column is never overwritten.
if 'op_month' not in df.columns:
    df['op_month'] = df['op_datetime'].dt.strftime('%Y-%m')

# Seconds since the previous operation of the same device (NaN for the first one).
_prev_device_op = df.groupby('device_num_transform')['op_datetime'].shift(1)
df['op_diff_second1'] = (df['op_datetime'] - _prev_device_op).dt.total_seconds()
# Running session number per device: increments whenever the gap exceeds the threshold.
# GroupBy.cumsum keeps the original row labels; groupby.apply(lambda x: x.cumsum())
# does not do so reliably across pandas versions.
_new_device_session = df['op_diff_second1'] > DEVICE_SESSION_GAP_SECONDS
df['op_times_groups'] = _new_device_session.groupby(df['device_num_transform']).cumsum()

# Same idea at system level: seconds since the previous row of the whole log ...
df['system_op_diff_second1'] = df['op_datetime'].diff().dt.total_seconds()
# ... and a running system-wide session number.
df['system_op_times_groups'] = (df['system_op_diff_second1'] > SYSTEM_SESSION_GAP_SECONDS).cumsum()

## 环境特征

In [5]:

# Environment ("system") level features: running counts inside each time
# window, regardless of which user / device / ip produced the row.
time_feats = ['system_op_times_groups', 'op_days', 'op_month']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url', 'url_page','url_sit']

# Features for which no per-value columns are generated.
skip_per_value_feats = ['ip_transform', 'user_name', 'device_num_transform']

# Helper columns (all removed again at the end of the cell).
# Constant 1, so that cumsum() counts rows.
df['helper'] = 1
# 1 when the status code is one of the listed error codes.
df['http_status_code_helper'] = df['http_status_code'].isin([400, 500, 502, 404]).astype(int)
# Running row number in the current row order.
df['sampler_index_helper'] = df['helper'].cumsum()

for i in time_feats:
    i_tmp = df.groupby([i])
    # Rows seen so far inside the window.
    # (.format(i) was missing, so one literal 'system_{}_cumsum' column was
    # overwritten on every iteration.)
    df['system_{}_cumsum'.format(i)] = i_tmp['helper'].cumsum()
    # Error-code rows seen so far inside the window.
    df['system_{}_error_code_cumsum'.format(i)] = i_tmp['http_status_code_helper'].cumsum()
    for j in cate_feats:
        # Flag the first row of every (window, value) pair ...
        first_rows = df.groupby([i, j])['sampler_index_helper'].first()
        df['tmp_helper'] = df['sampler_index_helper'].isin(first_rows.values).astype(int)
        # ... and accumulate the flags per window: number of distinct values of
        # j seen so far in the window. Grouping by [i, j] here, as before, makes
        # the result a constant 1.
        df['system_{}_{}_cumunique'.format(i, j)] = df.groupby([i])['tmp_helper'].cumsum()

        if j not in skip_per_value_feats:
            # NOTE(review): this adds two columns per distinct value of j for
            # every time window, which is very expensive for 'url'.
            # Missing values are skipped; they can never match `df[j] == k`.
            for k in df[j].dropna().unique():
                tmp = df[df[j]==k].groupby([i])

                # Rows with j == k seen so far in the window (NaN on other rows).
                df['system_{}_{}_{}_cumsum'.format(i,j,k)] = tmp['helper'].cumsum()
                # Error-code rows with j == k seen so far in the window.
                df['system_{}_{}_{}_error_code_cumsum'.format(i,j,k)] = tmp['http_status_code_helper'].cumsum()

# Drop every helper column created above.
remove_cols = [x for x in df.columns if x.endswith('helper')]

df = df.drop(columns=remove_cols)


KeyboardInterrupt: 

## 对象层面

In [None]:

# Object level features: the same running statistics, computed per
# ip / user / device inside each time window.
time_feats = ['op_times_groups', 'op_days', 'op_month']

object_feats = ['ip_transform', 'user_name', 'device_num_transform']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url', 'url_page','url_sit']

# Helper columns (all removed again at the end of the cell).
# Constant 1, so that cumsum() counts rows.
df['helper'] = 1
# 1 when the status code is one of the listed error codes.
df['http_status_code_helper'] = df['http_status_code'].isin([400, 500, 502, 404]).astype(int)
# Running row number in the current row order.
df['sampler_index_helper'] = df['helper'].cumsum()


for i in time_feats:
    for j in object_feats:
        j_tmp = df.groupby([i,j])
        # Rows seen so far for this object inside the window.
        df['{}_{}_cumsum'.format(i,j)] = j_tmp['helper'].cumsum()
        # Error-code rows seen so far (this summed 'helper' before, which
        # duplicated the plain row count).
        df['{}_{}_error_code_cumsum'.format(i,j)] = j_tmp['http_status_code_helper'].cumsum()
        for k in cate_feats:
            if k == j: continue
            # Flag the first row of every (window, object, value) triple ...
            first_rows = df.groupby([i,j,k])['sampler_index_helper'].first()
            df['tmp_helper'] = df['sampler_index_helper'].isin(first_rows.values).astype(int)
            # ... and accumulate per (window, object): number of distinct values
            # of k seen so far. Grouping by [i, j, k] made this a constant 1.
            df['{}_{}_{}_cumunique'.format(i,j,k)] = df.groupby([i,j])['tmp_helper'].cumsum()

            # The guard must test k: j is always one of object_feats, so the
            # branch below could never run.
            if k not in object_feats:
                # NOTE(review): adds two columns per distinct value of k for
                # every (window, object) pair; expect this to be expensive for
                # high-cardinality columns such as 'url'.
                for v in df[k].dropna().unique():
                    v_tmp = df[df[k]==v].groupby([i,j])
                    # Select one column before cumsum(); a whole-frame cumsum
                    # cannot be assigned to a single column.
                    df['{}_{}_{}_{}_cumsum'.format(i,j,k,v)] = v_tmp['helper'].cumsum()
                    df['{}_{}_{}_{}_error_code_cumsum'.format(i,j,k,v)] = v_tmp['http_status_code_helper'].cumsum()

# Drop every helper column created above.
remove_cols = [x for x in df.columns if x.endswith('helper')]

df = df.drop(columns=remove_cols)

## 其它操作

In [None]:
# 几点钟
df['hour'] = df['op_datetime'].dt.hour
df['hour_sin'] = np.sin(df['hour']/24*2*np.pi)
df['hour_cos'] = np.cos(df['hour']/24*2*np.pi)

# 周几
df['dayofweek'] = df['op_datetime'].dt.dayofweek
df['dayofweek_sin'] = np.sin(df['dayofweek']/7*2*np.pi)
df['dayofweek_cos'] = np.cos(df['dayofweek']/7*2*np.pi)

# 一个月的第几天
df['day'] = df['op_datetime'].dt.day
df['day_sin'] = np.sin(df['day']/31*2*np.pi)
df['day_cos'] = np.cos(df['day']/31*2*np.pi)

## 特征筛选

In [None]:
# 删除时间及类别型变量过多的特征
remove_col = ['op_datetime', 'op_month', 'user_name', 'ip_transform', 'device_num_transform', 'op_days', 'ts', 'ts1', 'ts2','ts3']

remove_col = [x for x in df.columns if x in remove_col]
df = df.drop(columns=remove_col)
df.head()

### 评分卡特征筛选

In [None]:
# Rows that carry a label form the training set; the rest are the evaluation set.
has_label = df[y_label].notna()
df_row_train = df[has_label].reset_index(drop=True)
df_row_val = df[~has_label].reset_index(drop=True)

# Run the scorpyo transform on both sets; it is also given the Excel report path.
df_train, df_test, convert_cols = sp.transform_data_detail(
    df_row_train,
    df_row_val,
    y_label,
    excel_path=path_output_report,
)
df_train.head()

In [None]:
# scorpyo binning step on the transformed train / test frames; 'id' is passed as var_skip.
binning_options = dict(
    train=df_train,
    test=df_test,
    y=y_label,
    excel_path=path_output_report,
    var_skip=['id'],
)
train_woe, val_woe, bins_adj = sp.binning_data_detail(**binning_options)

In [None]:
# scorpyo feature-selection step on the binned data; 'id' is passed as exclude.
selection_options = dict(
    exclude=['id'],
    iv_max_threshold=1.5,
    remove_negitive_coef=False,
    corr_threshold=0.7,
    # chi_threshold=0.00001,
    excel_path=path_output_report,
)
res_data, res_val, bins_selection = sp.select_feat_detail(
    train_woe, val_woe, df_test, y_label, bins_adj, **selection_options
)

### null importance特征筛选

In [None]:
# Null-importance selection: pass the feature matrix (without label and id)
# and the label to the project helper.
null_imp_X = df_train.drop(columns=[y_label, 'id'])
null_imp_y = df_train[y_label]
feats, categorical_feats = get_null_importance(null_imp_X, null_imp_y, thresholds=15)

In [None]:

object_feats = ['ip_transform', 'user_name', 'device_num_transform']

# Selected null-importance features plus the scorecard output.
train_features = pd.concat([df_train[feats], res_data], axis=1)

# Attach the object columns by 'id' rather than by row label. df_train_info
# still carries the raw file's row labels, while df_train comes from a frame
# that was re-sorted and re-indexed, so a label-based concat pairs each row
# with the identifiers of a different row.
# NOTE(review): assumes 'id' is unique in df_train_info and is a single column
# of train_features (it must come from res_data) - confirm.
object_cols = df_train_info.set_index('id')[object_feats].reindex(train_features['id'])
object_cols.index = train_features.index

df_train = pd.concat([object_cols, train_features], axis=1)
df_test = pd.concat([df_test[feats], res_val], axis=1)

## modeling

In [None]:

# Model inputs: every column except id, label, object identifiers and the
# columns listed in categorical_feats.
non_feature_cols = ['id', y_label] + object_feats
feats = df_train.columns.drop(non_feature_cols).drop(categorical_feats)

In [None]:
# Display categorical_feats, the second value returned by get_null_importance
# (these columns were removed from `feats` in the previous cell).
categorical_feats

In [None]:
# Optional second pass: restrict the model to the most important features of
# a previous training run.
# `feats_importance` is only created by the training cell further down, so on
# a fresh kernel (Restart & Run All) this cell used to fail with NameError.
# It is now skipped until an importance table exists.
TOP_N_FEATURES = 20
if 'feats_importance' in globals():
    feats = feats_importance.sort_values('importance', ascending=False)[:TOP_N_FEATURES]['name'].values

In [None]:

import time
from sklearn.metrics import roc_auc_score as auc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold

In [None]:
# LightGBM training parameters (binary objective, AUC metric).
params = dict(
    learning_rate=0.05,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=31,
    verbose=-1,
    seed=2222,
    n_jobs=-1,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=4,
    # min_child_weight=10,
)

In [None]:
# Stratified K-fold LightGBM training with out-of-fold (OOF) predictions,
# averaged test predictions, gain importance and a submission file.
fold_num = 5
seeds = [2022]
n_models = fold_num * len(seeds)

X_train = df_train[feats]
y_train = df_train[y_label]
X_test = df_test[feats]

oof = np.zeros(len(df_train))
importance = 0
pred_y = pd.DataFrame()
score = []
for seed in seeds:
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    # kf = KFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print('-----------', fold)
        # kf.split yields POSITIONS, so select with .iloc; .loc is only
        # equivalent when the index happens to be 0..n-1.
        train = lgb.Dataset(X_train.iloc[train_idx],
                            y_train.iloc[train_idx],
                            # categorical_feature=categorical_feats
                            )
        val = lgb.Dataset(X_train.iloc[val_idx],
                          y_train.iloc[val_idx],
                          # categorical_feature=categorical_feats
                          )
        # Early stopping goes through the callback API: the
        # `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4.
        model = lgb.train(params, train, valid_sets=[val],
                          num_boost_round=20000,
                          callbacks=[lgb.early_stopping(stopping_rounds=100)])

        # Predict the validation fold once and reuse it for OOF and the score.
        val_pred = model.predict(X_train.iloc[val_idx])
        oof[val_idx] += val_pred / len(seeds)
        pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(X_test)
        # Average over every trained model (folds x seeds), not only the folds.
        importance += model.feature_importance(importance_type='gain') / n_models
        score.append(auc(y_train.iloc[val_idx], val_pred))

# Mean gain importance per feature.
feats_importance = pd.DataFrame()
feats_importance['name'] = feats
feats_importance['importance'] = importance
display(feats_importance.sort_values('importance', ascending=False)[:30])

df_train['oof'] = oof
display(np.mean(score), np.std(score))

score = np.mean(score)
# Test prediction = mean over all fold / seed models.
df_test[y_label] = pred_y.mean(axis=1).values
df_test = df_test.sort_values('id').reset_index(drop=True)

# NOTE(review): values are copied positionally, which presumes the sample
# submission is ordered by ascending id - confirm.
sub = pd.read_csv(path_sample_submission)
sub[y_label] = df_test[y_label].values
# Make sure the output folder exists before writing the submission.
os.makedirs(path_results_jupyter, exist_ok=True)
sub.to_csv(os.path.join(path_results_jupyter,time.strftime('lgb_%Y%m%d%H%M_')+'%.5f.csv'%score), index=False)

In [None]:
# The 50 most important features, highest importance first.
feats_importance.sort_values('importance', ascending=False).head(50)

In [None]:
# Names of the 50 most important features, as an array.
feats_importance.sort_values('importance', ascending=False)['name'].values[:50]

In [None]:
# Bring the out-of-fold prediction back onto the raw training rows by 'id'.
# A plain column assignment aligns on row labels, and df_train_info still has
# the raw file's labels while df_train does not, so rows got paired with the
# prediction of a different sample.
# NOTE(review): assumes 'id' is present and unique in both frames - confirm.
oof_by_id = df_train.set_index('id')['oof']
df_train_info['oof'] = df_train_info['id'].map(oof_by_id)

# Devices with the most badly mis-predicted rows (|label - oof| > 0.7).
large_error = (df_train_info['is_risk'] - df_train_info['oof']).abs() > 0.7
df_train_info.loc[large_error, 'device_num_transform'].value_counts()

In [None]:
# All raw training rows of one specific device.
is_target_device = df_train_info['device_num_transform'].eq('Rfv57YyO3vny')
df_train_info.loc[is_target_device]