In [1]:
import os
import re
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import scorpyo as sp

from null_importance import get_null_importance


pd.set_option('max_rows', 500, 'max_columns', 200)

In [2]:
path_project = r'/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification'

# path dir
path_row_data = os.path.join(path_project, 'row_data')
path_new_data = os.path.join(path_project, 'new_data')
path_results  = os.path.join(path_project, 'results')
path_results_jupyter  = os.path.join(path_results, 'jupyter')

# path row_data
path_train = os.path.join(path_row_data, 'train.csv')
path_test  = os.path.join(path_row_data, 'evaluation_public.csv')
path_sample_submission = os.path.join(path_row_data, 'submit_example.csv')


path_new_train = os.path.join(path_new_data, 'train_lightgbm_20221014.csv')
path_new_test  = os.path.join(path_new_data, 'test_lightgbm_20221014.csv')

## results
path_output_report = os.path.join(path_results, '01_原始数据探察_20221014.xlsx')

y_label = "is_risk"

In [3]:
df_row_train = sp.read_data(path_train)
df_row_val  = sp.read_data(path_test)

df_row_train['url_sit'] = df_row_train['url'].map(lambda x: x.split('/')[0])
df_row_train['url_page'] = df_row_train['url'].map(lambda x: x.split('/')[1])

df_row_val['url_sit'] = df_row_val['url'].map(lambda x: x.split('/')[0])
df_row_val['url_page'] = df_row_val['url'].map(lambda x: x.split('/')[1])



df_train_info = df_row_train.copy().sort_values(by='op_datetime')

## 定义一次/一天/一月
1. op\_times\_groups 一次
2. op\_days 一天
3. op\_month 一月

In [4]:
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)
df = df.sort_values(by='op_datetime')

# 认证日期时间
df['op_datetime'] = pd.to_datetime(df['op_datetime'])
# 将数据分为每一天
df['op_days'] = df['op_datetime'].map(lambda x: x.strftime('%Y-%m-%d'))

# 按客户进行统计这次认证和上次认证的时间差
df['op_second'] = df['op_datetime']
df['op_second1'] = df.groupby('device_num_transform')['op_second'].shift(1)
df['op_diff_second1'] = (df['op_second'] - df['op_second1']).map(lambda x: x.total_seconds())
df['op_diff_second1_log'] = df['op_diff_second1'].apply(np.log)
df['op_diff_second1_log_log'] = df['op_diff_second1'].apply(np.log).apply(np.log)

df['op_diff_second1_tmp'] = df['op_diff_second1']>10
# 客户第几次登录
df['op_times_groups'] = df.groupby('device_num_transform')['op_diff_second1_tmp'].apply(lambda x: x.cumsum())

# 系统层面的一段时间
df['system_op_second'] = df['op_datetime']
df['system_op_second1'] = df['system_op_second'].shift(1)
df['system_op_diff_second1'] = (df['system_op_second'] - df['system_op_second1']).map(lambda x: x.total_seconds())

df['system_op_diff_second1_log'] = df['system_op_diff_second1'].apply(np.log)
df['system_op_diff_second1_log_log'] = df['system_op_diff_second1'].apply(np.log).apply(np.log)
df['system_op_diff_second1_tmp'] = df['system_op_diff_second1']>400
# 客户第几次登录
df['system_op_times_groups'] = df['system_op_diff_second1_tmp'].cumsum()

df = df.drop(columns=['op_second','op_second1','op_diff_second1_tmp', 
                      'system_op_second', 'system_op_second1', 'system_op_diff_second1_tmp'])

## 环境特征

In [5]:

time_feats = ['system_op_times_groups', 'op_days', 'op_month']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url', 'url_page','url_sit']

# 客户+时间+按时间cumsum/cumunique
# 累计量统计
df['helper'] = 1
# 是否为异常状态码
df['http_status_code_helper'] = df['http_status_code'].map(lambda x: 1 if x in [400, 500, 502, 404] else 0) 
# 给样本编号
df['sampler_index_helper'] = df['helper'].cumsum()

for i in time_feats:
    i_tmp = df.groupby([i])
    # 系统往前看，处理了多少事
    df['system_{}_cumsum'.format(i)] = i_tmp['helper'].cumsum()
    # 系统往前看，处理了多少坏事
    df['system_{}_error_code_cumsum'.format(i)] = i_tmp['http_status_code_helper'].cumsum()
    for method in ['mean', 'std', "prod"]:
        df['system_{}_op_diff_second1_{}'.format(i, method)] = i_tmp['op_diff_second1'].transform(method)
    for j in cate_feats:
        index_set = set(df.groupby([i, j],as_index=False).first()['sampler_index_helper'].values)
        df['tmp_helper'] = df['sampler_index_helper'].map(lambda x: 1 if x in index_set else 0)
        j_tmp = df.groupby([i, j])
        # 系统往前看，不同维度的处理了多少情况
        df['system_{}_{}_cumunique'.format(i, j)] = j_tmp['tmp_helper'].cumsum()
        
        if i=='system_op_times_groups':
            for method in ['mean', 'std']:
                df['system_{}_{}_op_diff_second1_{}'.format(i,j, method)] = j_tmp['op_diff_second1'].transform(method)

        if j not in ['ip_transform', 'user_name', 'device_num_transform']:
            for k in df[j].unique():
                tmp = df[df[j]==k].groupby([i])

                # 系统往前看，不同维度不同情况分别处理了多少次
                df['system_{}_{}_{}_cumsum'.format(i,j,k)] = tmp['helper'].cumsum()
                # 系统往前看，不同维度不同情况error_code分别处理了多少次
                df['system_{}_{}_{}_error_code_cumsum'.format(i,j,k)] = tmp['http_status_code_helper'].cumsum()

remove_cols = [x for x in df.columns if x[-6:]=='helper']

df = df.drop(columns=remove_cols)

## 对象层面

In [6]:

time_feats = ['op_times_groups', 'op_days', 'op_month']

object_feats = ['ip_transform', 'user_name', 'device_num_transform']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url', 'url_page','url_sit']

# 客户+时间+按时间cumsum/cumunique
# 累计量统计
df['helper'] = 1
# 是否为异常状态码
df['http_status_code_helper'] = df['http_status_code'].map(lambda x: 1 if x in [400, 500, 502, 404] else 0) 
# 给样本编号
df['sampler_index_helper'] = df['helper'].cumsum()


for i in time_feats:
    for j in object_feats:
        j_tmp = df.groupby([i,j])
        df['{}_{}_cumsum'.format(i,j)] = j_tmp['helper'].cumsum()
        df['{}_{}_error_code_cumsum'.format(i,j)] = j_tmp['http_status_code_helper'].cumsum()
        
        if i=='op_times_groups':
            for method in ['mean', 'std']:
                df['{}_{}_op_diff_second1_{}'.format(i, j, method)] = j_tmp['op_diff_second1'].transform(method)

        for k in cate_feats:
            if k == j: continue
            index_set = set(df.groupby([i,j,k], as_index=False).first()['sampler_index_helper'].values)            
            df['tmp_helper'] = df['sampler_index_helper'].map(lambda x: 1 if x in index_set else 0)
            k_tmp = df.groupby([i,j,k])
            df['{}_{}_{}_cumunique'.format(i,j,k)] = k_tmp['tmp_helper'].cumsum()
            for method in ['mean', 'std', "prod"]:
                df['{}_{}_{}_op_diff_second1_{}'.format(i, j, k, method)] = k_tmp['op_diff_second1'].transform(method)

            if k not in ['ip_transform', 'user_name', 'device_num_transform']:
                for v in df[k].unique():
                    v_tmp = df[df[k]==v].groupby([i,j])
                    df['{}_{}_{}_{}_cumsum'.format(i,j,k,v)] = v_tmp['helper'].cumsum()
                    df['{}_{}_{}_{}_error_code_cumsum'.format(i,j,k,v)] = v_tmp['http_status_code_helper'].cumsum()

remove_cols = [x for x in df.columns if x[-6:]=='helper']

df = df.drop(columns=remove_cols)

## 其它操作

In [7]:
# 几点钟
df['hour'] = df['op_datetime'].dt.hour
df['hour_sin'] = np.sin(df['hour']/24*2*np.pi)
df['hour_cos'] = np.cos(df['hour']/24*2*np.pi)

# 周几
df['dayofweek'] = df['op_datetime'].dt.dayofweek
df['dayofweek_sin'] = np.sin(df['dayofweek']/7*2*np.pi)
df['dayofweek_cos'] = np.cos(df['dayofweek']/7*2*np.pi)

# 一个月的第几天
df['day'] = df['op_datetime'].dt.day
df['day_sin'] = np.sin(df['day']/31*2*np.pi)
df['day_cos'] = np.cos(df['day']/31*2*np.pi)

## 特征筛选

In [8]:
df_tmp = pd.read_csv('/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification/new_data/corpus_txt/models_device_num_transform_embedding.txt', skiprows=1, header=None, sep=' ' )
df_tmp.columns = ['corpus_{}'.format(i) for i in range(6)]
df_tmp

Unnamed: 0,corpus_0,corpus_1,corpus_2,corpus_3,corpus_4,corpus_5
0,u9diCFdYZ,1.191250,0.884346,0.960100,-0.587945,-0.437067
1,w2CfuqTz3,-0.310487,0.538982,1.129704,-1.143607,0.040968
2,YBCE8ld50,0.031544,1.011728,-0.039371,-1.519553,-0.246772
3,5KbVyNsBf,0.134305,0.212648,0.355266,-1.386184,-0.776587
4,FQND8WWo5,0.745319,0.974483,0.725129,-0.819523,-1.404120
...,...,...,...,...,...,...
3105,4XpDhY9N8,0.447430,0.264440,0.598619,-1.016775,-0.057525
3106,b8PyvUL9p,0.446785,0.555997,0.494677,-0.910030,-0.297620
3107,6CzA6Vd7a,0.249200,0.495751,0.629262,-0.913831,-0.121946
3108,6zzVVzNbn,0.279105,0.324233,0.421833,-1.026533,-0.260341


In [9]:
df = pd.merge(left=df, right=df_tmp, how='left', left_on='ip_transform', right_on='corpus_0')

In [10]:
# 删除时间及类别型变量过多的特征
remove_col = ['op_datetime', 'op_month', 'user_name', 'ip_transform', 'device_num_transform', 'op_days', 'ts', 'ts1',
              'ts2','ts3', 'corpus_0']

remove_col = [x for x in df.columns if x in remove_col]
df = df.drop(columns=remove_col)
df.head()

Unnamed: 0,id,department,browser_version,browser,os_type,os_version,ip_type,http_status_code,op_city,log_system_transform,url,is_risk,url_sit,url_page,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups,system_system_op_times_groups_cumsum,system_system_op_times_groups_error_code_cumsum,system_system_op_times_groups_op_diff_second1_mean,system_system_op_times_groups_op_diff_second1_std,system_system_op_times_groups_op_diff_second1_prod,system_system_op_times_groups_ip_transform_cumunique,system_system_op_times_groups_ip_transform_op_diff_second1_mean,system_system_op_times_groups_ip_transform_op_diff_second1_std,system_system_op_times_groups_user_name_cumunique,system_system_op_times_groups_user_name_op_diff_second1_mean,system_system_op_times_groups_user_name_op_diff_second1_std,system_system_op_times_groups_device_num_transform_cumunique,system_system_op_times_groups_device_num_transform_op_diff_second1_mean,system_system_op_times_groups_device_num_transform_op_diff_second1_std,system_system_op_times_groups_department_cumunique,system_system_op_times_groups_department_op_diff_second1_mean,system_system_op_times_groups_department_op_diff_second1_std,system_system_op_times_groups_department_rd_cumsum,system_system_op_times_groups_department_rd_error_code_cumsum,system_system_op_times_groups_department_hr_cumsum,system_system_op_times_groups_department_hr_error_code_cumsum,system_system_op_times_groups_department_sales_cumsum,system_system_op_times_groups_department_sales_error_code_cumsum,system_system_op_times_groups_department_other_cumsum,system_system_op_times_groups_department_other_error_code_cumsum,system_system_op_times_groups_department_nan_cumsum,system_system_op_times_groups_department_nan_error_code_cumsum,system_system_op_times_groups_department_accounting_cumsum,system_system_op_times_groups_department_accounting_error_code_cumsum,system_system_op_times_groups_browser_version_cumunique,system_system_op_times_groups_browser_version_op_diff_second1_mean,system_system_op_times_groups_browser_version_op_diff_second1_std,system_system_op_times_groups_browser_version_chrome_93_cumsum,system_system_op_times_groups_browser_version_chrome_93_error_code_cumsum,system_system_op_times_groups_browser_version_safari_13_cumsum,system_system_op_times_groups_browser_version_safari_13_error_code_cumsum,system_system_op_times_groups_browser_version_firefox_78_cumsum,system_system_op_times_groups_browser_version_firefox_78_error_code_cumsum,system_system_op_times_groups_browser_version_edge_93_cumsum,system_system_op_times_groups_browser_version_edge_93_error_code_cumsum,system_system_op_times_groups_browser_version_ie_9_cumsum,system_system_op_times_groups_browser_version_ie_9_error_code_cumsum,system_system_op_times_groups_browser_version_chrome_77_cumsum,system_system_op_times_groups_browser_version_chrome_77_error_code_cumsum,system_system_op_times_groups_browser_version_chrome_90_cumsum,system_system_op_times_groups_browser_version_chrome_90_error_code_cumsum,system_system_op_times_groups_browser_version_ie_11_cumsum,system_system_op_times_groups_browser_version_ie_11_error_code_cumsum,system_system_op_times_groups_browser_cumunique,system_system_op_times_groups_browser_op_diff_second1_mean,system_system_op_times_groups_browser_op_diff_second1_std,system_system_op_times_groups_browser_chrome_cumsum,system_system_op_times_groups_browser_chrome_error_code_cumsum,system_system_op_times_groups_browser_safari_cumsum,system_system_op_times_groups_browser_safari_error_code_cumsum,system_system_op_times_groups_browser_firefox_cumsum,system_system_op_times_groups_browser_firefox_error_code_cumsum,system_system_op_times_groups_browser_edge_cumsum,system_system_op_times_groups_browser_edge_error_code_cumsum,system_system_op_times_groups_browser_ie_cumsum,system_system_op_times_groups_browser_ie_error_code_cumsum,system_system_op_times_groups_os_type_cumunique,system_system_op_times_groups_os_type_op_diff_second1_mean,system_system_op_times_groups_os_type_op_diff_second1_std,system_system_op_times_groups_os_type_win_cumsum,system_system_op_times_groups_os_type_win_error_code_cumsum,system_system_op_times_groups_os_type_macos_cumsum,system_system_op_times_groups_os_type_macos_error_code_cumsum,system_system_op_times_groups_os_version_cumunique,system_system_op_times_groups_os_version_op_diff_second1_mean,system_system_op_times_groups_os_version_op_diff_second1_std,system_system_op_times_groups_os_version_win10_cumsum,system_system_op_times_groups_os_version_win10_error_code_cumsum,system_system_op_times_groups_os_version_macos_big_sur_11_cumsum,system_system_op_times_groups_os_version_macos_big_sur_11_error_code_cumsum,system_system_op_times_groups_os_version_win7_cumsum,system_system_op_times_groups_os_version_win7_error_code_cumsum,system_system_op_times_groups_os_version_win11_cumsum,...,op_month_device_num_transform_log_system_transform_2umVQwhiiwNJ_cumsum,op_month_device_num_transform_log_system_transform_2umVQwhiiwNJ_error_code_cumsum,op_month_device_num_transform_log_system_transform_fwM6KZKjrzjm_cumsum,op_month_device_num_transform_log_system_transform_fwM6KZKjrzjm_error_code_cumsum,op_month_device_num_transform_log_system_transform_nan_cumsum,op_month_device_num_transform_log_system_transform_nan_error_code_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_error_code_cumsum,op_month_device_num_transform_log_system_transform_RwHe8Q1R7AlB_cumsum,op_month_device_num_transform_log_system_transform_RwHe8Q1R7AlB_error_code_cumsum,op_month_device_num_transform_log_system_transform_9RAS6RNfETj5_cumsum,op_month_device_num_transform_log_system_transform_9RAS6RNfETj5_error_code_cumsum,op_month_device_num_transform_log_system_transform_2UNHLdxlhIzv_cumsum,op_month_device_num_transform_log_system_transform_2UNHLdxlhIzv_error_code_cumsum,op_month_device_num_transform_url_cumunique,op_month_device_num_transform_url_op_diff_second1_mean,op_month_device_num_transform_url_op_diff_second1_std,op_month_device_num_transform_url_op_diff_second1_prod,op_month_device_num_transform_url_xxx.com/github_cumsum,op_month_device_num_transform_url_xxx.com/github_error_code_cumsum,op_month_device_num_transform_url_hr.xxx.com/_cumsum,op_month_device_num_transform_url_hr.xxx.com/_error_code_cumsum,op_month_device_num_transform_url_work.xxx.com/task_cumsum,op_month_device_num_transform_url_work.xxx.com/task_error_code_cumsum,op_month_device_num_transform_url_xxx.com/mail_cumsum,op_month_device_num_transform_url_xxx.com/mail_error_code_cumsum,op_month_device_num_transform_url_xxx.com/oa_cumsum,op_month_device_num_transform_url_xxx.com/oa_error_code_cumsum,op_month_device_num_transform_url_xxx.com/getVerifyCode_cumsum,op_month_device_num_transform_url_xxx.com/getVerifyCode_error_code_cumsum,op_month_device_num_transform_url_xxx.com/loginAuth_cumsum,op_month_device_num_transform_url_xxx.com/loginAuth_error_code_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/kdocs_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/kdocs_error_code_cumsum,op_month_device_num_transform_url_business.xxx.com/_cumsum,op_month_device_num_transform_url_business.xxx.com/_error_code_cumsum,op_month_device_num_transform_url_xxx.com/checkingin_cumsum,op_month_device_num_transform_url_xxx.com/checkingin_error_code_cumsum,op_month_device_num_transform_url_xxx.com/getLoginType_cumsum,op_month_device_num_transform_url_xxx.com/getLoginType_error_code_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_error_code_cumsum,op_month_device_num_transform_url_xxx.com/accounting_cumsum,op_month_device_num_transform_url_xxx.com/accounting_error_code_cumsum,op_month_device_num_transform_url_page_cumunique,op_month_device_num_transform_url_page_op_diff_second1_mean,op_month_device_num_transform_url_page_op_diff_second1_std,op_month_device_num_transform_url_page_op_diff_second1_prod,op_month_device_num_transform_url_page_github_cumsum,op_month_device_num_transform_url_page_github_error_code_cumsum,op_month_device_num_transform_url_page__cumsum,op_month_device_num_transform_url_page__error_code_cumsum,op_month_device_num_transform_url_page_task_cumsum,op_month_device_num_transform_url_page_task_error_code_cumsum,op_month_device_num_transform_url_page_mail_cumsum,op_month_device_num_transform_url_page_mail_error_code_cumsum,op_month_device_num_transform_url_page_oa_cumsum,op_month_device_num_transform_url_page_oa_error_code_cumsum,op_month_device_num_transform_url_page_getVerifyCode_cumsum,op_month_device_num_transform_url_page_getVerifyCode_error_code_cumsum,op_month_device_num_transform_url_page_loginAuth_cumsum,op_month_device_num_transform_url_page_loginAuth_error_code_cumsum,op_month_device_num_transform_url_page_kdocs_cumsum,op_month_device_num_transform_url_page_kdocs_error_code_cumsum,op_month_device_num_transform_url_page_checkingin_cumsum,op_month_device_num_transform_url_page_checkingin_error_code_cumsum,op_month_device_num_transform_url_page_getLoginType_cumsum,op_month_device_num_transform_url_page_getLoginType_error_code_cumsum,op_month_device_num_transform_url_page_download_cumsum,op_month_device_num_transform_url_page_download_error_code_cumsum,op_month_device_num_transform_url_page_accounting_cumsum,op_month_device_num_transform_url_page_accounting_error_code_cumsum,op_month_device_num_transform_url_sit_cumunique,op_month_device_num_transform_url_sit_op_diff_second1_mean,op_month_device_num_transform_url_sit_op_diff_second1_std,op_month_device_num_transform_url_sit_op_diff_second1_prod,op_month_device_num_transform_url_sit_xxx.com_cumsum,op_month_device_num_transform_url_sit_xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_hr.xxx.com_cumsum,op_month_device_num_transform_url_sit_hr.xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_work.xxx.com_cumsum,op_month_device_num_transform_url_sit_work.xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_business.xxx.com_cumsum,op_month_device_num_transform_url_sit_business.xxx.com_error_code_cumsum,hour,hour_sin,hour_cos,dayofweek,dayofweek_sin,dayofweek_cos,day,day_sin,day_cos,corpus_1,corpus_2,corpus_3,corpus_4,corpus_5
0,44477,rd,chrome_93,chrome,win,win10,内网,200,深圳,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,,,,0,1,0,,,1.0,1,,,1,,,1,,,1,,,1.0,0.0,,,,,,,,,,,1,,,1.0,0.0,,,,,,,,,,,,,,,1,,,1.0,0.0,,,,,,,,,1,,,1.0,0.0,,,1,,,1.0,0.0,,,,,,...,,,,,,,,,,,,,,,1,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,48144.097561,60210.665583,5.648623999999999e+168,1.0,0.0,,,,,,,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428,0.809328,1.102687,0.245249,-1.10431,0.374233
1,45489,rd,safari_13,safari,macos,macos_big_sur_11,内网,200,成都,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,603.0,6.401917,1.856598,1,1,0,,,1.0,1,,,1,,,1,,,1,,,1.0,0.0,,,,,,,,,,,1,,,,,1.0,0.0,,,,,,,,,,,,,1,,,,,1.0,0.0,,,,,,,1,,,,,1.0,0.0,1,,,,,1.0,0.0,,,,...,,,,,,,,,,,,,,,1,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,50344.564103,65831.591909,3.897362e+149,1.0,0.0,,,,,,,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428,0.343569,0.56189,0.66424,-1.121874,-0.210729
2,45706,hr,firefox_78,firefox,win,win7,内网,200,深圳,a5G25puBl9xj,hr.xxx.com/,1.0,hr.xxx.com,,,,,0,384.0,5.950643,1.783499,1,2,0,,,1.0,1,,,1,,,1,,,1,,,,,1.0,0.0,,,,,,,,,1,,,,,,,1.0,0.0,,,,,,,,,,,1,,,,,,,1.0,0.0,,,,,1,,,1.0,0.0,,,1,,,,,,,1.0,0.0,,...,,,,,,,,,,,,,,,1,42199.647059,44769.681976,7.108256e+68,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,42199.647059,44769.681976,7.108256e+68,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,1,42199.647059,44769.681976,7.108256e+68,,,1.0,0.0,,,,,,,3,0.707107,0.707107,4,-0.433884,-0.900969,7,0.988468,0.151428,0.867425,0.218973,0.902779,-0.86094,-0.92494
3,45901,rd,edge_93,edge,win,win10,内网,200,杭州,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,5318.0,8.578853,2.1493,2,1,0,,,1.0,1,,,1,,,1,,,1,,,1.0,0.0,,,,,,,,,,,1,,,,,,,,,1.0,0.0,,,,,,,,,1,,,,,,,,,1.0,0.0,,,1,,,1.0,0.0,,,1,,,1.0,0.0,,,,,,...,,,,,,,,,,,,,,,1,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,38517.446809,67366.099455,3.8445339999999997e+174,1.0,0.0,,,,,,,,,4,0.866025,0.5,4,-0.433884,-0.900969,7,0.988468,0.151428,-0.110667,0.980962,1.108754,-0.948152,-0.148743
4,43827,sales,ie_9,ie,win,win10,内网,200,重庆,sW0whYIx8LFM,work.xxx.com/task,1.0,work.xxx.com,task,,,,0,2890.0,7.969012,2.07556,3,1,0,,,1.0,1,,,1,,,1,,,1,,,,,,,1.0,0.0,,,,,,,1,,,,,,,,,,,1.0,0.0,,,,,,,1,,,,,,,,,,,1.0,0.0,1,,,1.0,0.0,,,1,,,1.0,0.0,,,,,,...,,,,,,,,,,,,,,,1,36298.0,49621.220854,4771291000000.0,,,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,1,36298.0,49621.220854,4771291000000.0,,,,,1.0,0.0,,,,,,,,,,,,,,,,,,,1,36298.0,49621.220854,4771291000000.0,,,,,1.0,0.0,,,,,5,0.965926,0.258819,4,-0.433884,-0.900969,7,0.988468,0.151428,-0.100326,-0.134813,0.719877,-1.437971,-0.635148


In [11]:
df_row_train = df[df[y_label].notna()].reset_index(drop=True)
df_row_val = df[df[y_label].isna()].reset_index(drop=True)

df_train, df_test, convert_cols = sp.transform_data_detail(df_row_train, df_row_val, y_label, excel_path=path_output_report)
df_train.head()

sheet05.可能为数值类型的object类型数据统计在/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification/results/01_原始数据探察_20221014.xlsx中已经存在，我们将对原文件进行覆盖
sheet06.数据预处理在/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification/results/01_原始数据探察_20221014.xlsx中已经存在，我们将对原文件进行覆盖


Unnamed: 0,id,department,browser_version,browser,os_version,op_city,log_system_transform,url,is_risk,url_sit,url_page,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups,system_system_op_times_groups_cumsum,system_system_op_times_groups_error_code_cumsum,system_system_op_times_groups_op_diff_second1_mean,system_system_op_times_groups_op_diff_second1_std,system_system_op_times_groups_op_diff_second1_prod,system_system_op_times_groups_ip_transform_op_diff_second1_mean,system_system_op_times_groups_ip_transform_op_diff_second1_std,system_system_op_times_groups_user_name_op_diff_second1_mean,system_system_op_times_groups_user_name_op_diff_second1_std,system_system_op_times_groups_device_num_transform_op_diff_second1_mean,system_system_op_times_groups_device_num_transform_op_diff_second1_std,system_system_op_times_groups_department_op_diff_second1_mean,system_system_op_times_groups_department_op_diff_second1_std,system_system_op_times_groups_department_rd_cumsum,system_system_op_times_groups_department_rd_error_code_cumsum,system_system_op_times_groups_department_sales_cumsum,system_system_op_times_groups_department_sales_error_code_cumsum,system_system_op_times_groups_browser_version_op_diff_second1_mean,system_system_op_times_groups_browser_version_op_diff_second1_std,system_system_op_times_groups_browser_version_edge_93_cumsum,system_system_op_times_groups_browser_version_edge_93_error_code_cumsum,system_system_op_times_groups_browser_version_chrome_90_cumsum,system_system_op_times_groups_browser_version_chrome_90_error_code_cumsum,system_system_op_times_groups_browser_op_diff_second1_mean,system_system_op_times_groups_browser_op_diff_second1_std,system_system_op_times_groups_browser_chrome_cumsum,system_system_op_times_groups_browser_chrome_error_code_cumsum,system_system_op_times_groups_browser_edge_cumsum,system_system_op_times_groups_browser_edge_error_code_cumsum,system_system_op_times_groups_os_type_op_diff_second1_mean,system_system_op_times_groups_os_type_op_diff_second1_std,system_system_op_times_groups_os_type_win_cumsum,system_system_op_times_groups_os_type_win_error_code_cumsum,system_system_op_times_groups_os_version_op_diff_second1_mean,system_system_op_times_groups_os_version_op_diff_second1_std,system_system_op_times_groups_os_version_win10_cumsum,system_system_op_times_groups_os_version_win10_error_code_cumsum,system_system_op_times_groups_os_version_win7_cumsum,system_system_op_times_groups_os_version_win7_error_code_cumsum,system_system_op_times_groups_ip_type_op_diff_second1_mean,system_system_op_times_groups_ip_type_op_diff_second1_std,system_system_op_times_groups_ip_type_内网_cumsum,system_system_op_times_groups_ip_type_内网_error_code_cumsum,system_system_op_times_groups_http_status_code_op_diff_second1_mean,system_system_op_times_groups_http_status_code_op_diff_second1_std,system_system_op_times_groups_http_status_code_200_cumsum,system_system_op_times_groups_op_city_op_diff_second1_mean,system_system_op_times_groups_op_city_op_diff_second1_std,system_system_op_times_groups_op_city_深圳_cumsum,system_system_op_times_groups_op_city_深圳_error_code_cumsum,system_system_op_times_groups_op_city_成都_cumsum,system_system_op_times_groups_op_city_成都_error_code_cumsum,system_system_op_times_groups_op_city_杭州_cumsum,system_system_op_times_groups_op_city_杭州_error_code_cumsum,system_system_op_times_groups_op_city_北京_cumsum,system_system_op_times_groups_op_city_北京_error_code_cumsum,system_system_op_times_groups_log_system_transform_op_diff_second1_mean,system_system_op_times_groups_log_system_transform_op_diff_second1_std,system_system_op_times_groups_log_system_transform_nHrKgKdJ1Mzt_cumsum,system_system_op_times_groups_log_system_transform_nHrKgKdJ1Mzt_error_code_cumsum,system_system_op_times_groups_log_system_transform_dwS3cdn15GK4_cumsum,system_system_op_times_groups_log_system_transform_dwS3cdn15GK4_error_code_cumsum,system_system_op_times_groups_url_op_diff_second1_mean,system_system_op_times_groups_url_op_diff_second1_std,system_system_op_times_groups_url_xxx.com/github_cumsum,system_system_op_times_groups_url_xxx.com/github_error_code_cumsum,system_system_op_times_groups_url_wpsdoc.xxx.com/download_cumsum,system_system_op_times_groups_url_wpsdoc.xxx.com/download_error_code_cumsum,system_system_op_times_groups_url_page_op_diff_second1_mean,system_system_op_times_groups_url_page_op_diff_second1_std,system_system_op_times_groups_url_page_github_cumsum,system_system_op_times_groups_url_page_github_error_code_cumsum,system_system_op_times_groups_url_page_download_cumsum,system_system_op_times_groups_url_page_download_error_code_cumsum,system_system_op_times_groups_url_sit_op_diff_second1_mean,system_system_op_times_groups_url_sit_op_diff_second1_std,system_system_op_times_groups_url_sit_xxx.com_cumsum,system_system_op_times_groups_url_sit_xxx.com_error_code_cumsum,system_system_op_times_groups_url_sit_wpsdoc.xxx.com_cumsum,system_system_op_times_groups_url_sit_wpsdoc.xxx.com_error_code_cumsum,system_op_days_cumsum,...,op_month_device_num_transform_ip_transform_op_diff_second1_std,op_month_device_num_transform_ip_transform_op_diff_second1_prod,op_month_device_num_transform_user_name_op_diff_second1_mean,op_month_device_num_transform_user_name_op_diff_second1_std,op_month_device_num_transform_user_name_op_diff_second1_prod,op_month_device_num_transform_department_op_diff_second1_mean,op_month_device_num_transform_department_op_diff_second1_std,op_month_device_num_transform_department_op_diff_second1_prod,op_month_device_num_transform_department_rd_cumsum,op_month_device_num_transform_department_rd_error_code_cumsum,op_month_device_num_transform_department_sales_cumsum,op_month_device_num_transform_department_sales_error_code_cumsum,op_month_device_num_transform_browser_version_op_diff_second1_mean,op_month_device_num_transform_browser_version_op_diff_second1_std,op_month_device_num_transform_browser_version_op_diff_second1_prod,op_month_device_num_transform_browser_version_edge_93_cumsum,op_month_device_num_transform_browser_version_edge_93_error_code_cumsum,op_month_device_num_transform_browser_version_chrome_90_cumsum,op_month_device_num_transform_browser_version_chrome_90_error_code_cumsum,op_month_device_num_transform_browser_op_diff_second1_mean,op_month_device_num_transform_browser_op_diff_second1_std,op_month_device_num_transform_browser_op_diff_second1_prod,op_month_device_num_transform_browser_chrome_cumsum,op_month_device_num_transform_browser_chrome_error_code_cumsum,op_month_device_num_transform_browser_edge_cumsum,op_month_device_num_transform_browser_edge_error_code_cumsum,op_month_device_num_transform_os_type_op_diff_second1_mean,op_month_device_num_transform_os_type_op_diff_second1_std,op_month_device_num_transform_os_type_op_diff_second1_prod,op_month_device_num_transform_os_type_win_cumsum,op_month_device_num_transform_os_type_win_error_code_cumsum,op_month_device_num_transform_os_version_op_diff_second1_mean,op_month_device_num_transform_os_version_op_diff_second1_std,op_month_device_num_transform_os_version_op_diff_second1_prod,op_month_device_num_transform_os_version_win10_cumsum,op_month_device_num_transform_os_version_win10_error_code_cumsum,op_month_device_num_transform_os_version_win7_cumsum,op_month_device_num_transform_os_version_win7_error_code_cumsum,op_month_device_num_transform_ip_type_op_diff_second1_mean,op_month_device_num_transform_ip_type_op_diff_second1_std,op_month_device_num_transform_ip_type_op_diff_second1_prod,op_month_device_num_transform_ip_type_内网_cumsum,op_month_device_num_transform_ip_type_内网_error_code_cumsum,op_month_device_num_transform_http_status_code_op_diff_second1_mean,op_month_device_num_transform_http_status_code_op_diff_second1_std,op_month_device_num_transform_http_status_code_op_diff_second1_prod,op_month_device_num_transform_http_status_code_200_cumsum,op_month_device_num_transform_op_city_op_diff_second1_mean,op_month_device_num_transform_op_city_op_diff_second1_std,op_month_device_num_transform_op_city_op_diff_second1_prod,op_month_device_num_transform_op_city_深圳_cumsum,op_month_device_num_transform_op_city_深圳_error_code_cumsum,op_month_device_num_transform_op_city_成都_cumsum,op_month_device_num_transform_op_city_成都_error_code_cumsum,op_month_device_num_transform_op_city_杭州_cumsum,op_month_device_num_transform_op_city_杭州_error_code_cumsum,op_month_device_num_transform_op_city_北京_cumsum,op_month_device_num_transform_op_city_北京_error_code_cumsum,op_month_device_num_transform_log_system_transform_op_diff_second1_mean,op_month_device_num_transform_log_system_transform_op_diff_second1_std,op_month_device_num_transform_log_system_transform_op_diff_second1_prod,op_month_device_num_transform_log_system_transform_nHrKgKdJ1Mzt_cumsum,op_month_device_num_transform_log_system_transform_nHrKgKdJ1Mzt_error_code_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_error_code_cumsum,op_month_device_num_transform_url_op_diff_second1_mean,op_month_device_num_transform_url_op_diff_second1_std,op_month_device_num_transform_url_op_diff_second1_prod,op_month_device_num_transform_url_xxx.com/github_cumsum,op_month_device_num_transform_url_xxx.com/github_error_code_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_error_code_cumsum,op_month_device_num_transform_url_page_op_diff_second1_mean,op_month_device_num_transform_url_page_op_diff_second1_std,op_month_device_num_transform_url_page_op_diff_second1_prod,op_month_device_num_transform_url_page_github_cumsum,op_month_device_num_transform_url_page_github_error_code_cumsum,op_month_device_num_transform_url_page_download_cumsum,op_month_device_num_transform_url_page_download_error_code_cumsum,op_month_device_num_transform_url_sit_op_diff_second1_mean,op_month_device_num_transform_url_sit_op_diff_second1_std,op_month_device_num_transform_url_sit_op_diff_second1_prod,op_month_device_num_transform_url_sit_xxx.com_cumsum,op_month_device_num_transform_url_sit_xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_error_code_cumsum,hour,hour_sin,hour_cos,dayofweek,dayofweek_sin,dayofweek_cos,day,day_sin,day_cos,corpus_1,corpus_2,corpus_3,corpus_4,corpus_5
0,44477,rd,chrome_93,chrome,win10,深圳,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,,,,0,1,0,,,1.0,,,,,,,,,1.0,0.0,,,,,,,,,,,1.0,0.0,,,,,1.0,0.0,,,1.0,0.0,,,,,1.0,0.0,,,1.0,,,1.0,0.0,,,,,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,1,...,56036.133473,2.584153e+181,36381.45283,55264.776208,7.660154e+179,36381.45283,55264.776208,7.660154e+179,1.0,0.0,,,37129.545455,55416.665714,1.0991990000000001e+188,,,,,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,,,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,,,36381.45283,55264.776208,7.660154e+179,1.0,0.0,39271.557692,56266.221127,5.495995999999999e+187,1.0,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,,,,,,,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,48144.097561,60210.665583,5.648623999999999e+168,1.0,0.0,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428,0.809328,1.102687,0.245249,-1.10431,0.374233
1,45489,rd,safari_13,safari,macos_big_sur_11,成都,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,603.0,6.401917,1.856598,1,1,0,,,1.0,,,,,,,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,,,1.0,,,,,1.0,0.0,,,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,2,...,63685.848336,2.936996e+163,48849.0,65576.934888,2.217189e+152,48849.0,65576.934888,2.217189e+152,1.0,0.0,,,46443.636364,62987.038358,3.1406359999999997e+169,,,,,46443.636364,62987.038358,3.1406359999999997e+169,,,,,46443.636364,62987.038358,3.1406359999999997e+169,,,46443.636364,62987.038358,3.1406359999999997e+169,,,,,48849.0,65576.934888,2.217189e+152,1.0,0.0,46443.636364,62987.038358,3.1406359999999997e+169,1.0,46443.636364,62987.038358,3.1406359999999997e+169,,,1.0,0.0,,,,,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,50344.564103,65831.591909,3.897362e+149,1.0,0.0,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428,0.343569,0.56189,0.66424,-1.121874,-0.210729
2,45706,hr,firefox_78,firefox,win7,深圳,a5G25puBl9xj,hr.xxx.com/,1.0,hr.xxx.com,,,,,0,384.0,5.950643,1.783499,1,2,0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,,,,,1.0,0.0,,,2.0,0.0,,,2.0,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,...,49812.427023,2.576431e+210,29931.2,51079.557818,1.505465e+187,29931.2,51079.557818,1.505465e+187,,,,,30921.742424,49560.274727,5.0111590000000005e+213,,,,,30921.742424,49560.274727,5.0111590000000005e+213,,,,,30921.742424,49560.274727,5.0111590000000005e+213,1.0,0.0,30921.742424,49560.274727,5.0111590000000005e+213,,,1.0,0.0,29931.2,51079.557818,1.505465e+187,1.0,0.0,31887.953125,50028.986631,5.567954e+212,1.0,30921.742424,49560.274727,5.0111590000000005e+213,1.0,0.0,,,,,,,42199.647059,44769.681976,7.108256e+68,,,,,42199.647059,44769.681976,7.108256e+68,,,,,42199.647059,44769.681976,7.108256e+68,,,,,42199.647059,44769.681976,7.108256e+68,,,,,3,0.707107,0.707107,4,-0.433884,-0.900969,7,0.988468,0.151428,0.867425,0.218973,0.902779,-0.86094,-0.92494
3,45901,rd,edge_93,edge,win10,杭州,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,5318.0,8.578853,2.1493,2,1,0,,,1.0,,,,,,,,,1.0,0.0,,,,,1.0,0.0,,,,,,,1.0,0.0,,,1.0,0.0,,,1.0,0.0,,,,,1.0,0.0,,,1.0,,,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,,,1.0,0.0,,,4,...,60165.310261,1.776476e+198,31518.435484,61527.749882,8.395597999999999e+185,31518.435484,61527.749882,8.395597999999999e+185,1.0,0.0,,,29992.328358,59448.52142,9.301113000000001e+204,1.0,0.0,,,29992.328358,59448.52142,9.301113000000001e+204,,,1.0,0.0,29992.328358,59448.52142,9.301113000000001e+204,1.0,0.0,29992.328358,59448.52142,9.301113000000001e+204,1.0,0.0,,,31518.435484,61527.749882,8.395597999999999e+185,1.0,0.0,30446.742424,59786.704706,9.301113000000001e+204,1.0,29992.328358,59448.52142,9.301113000000001e+204,,,,,1.0,0.0,,,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,38517.446809,67366.099455,3.8445339999999997e+174,1.0,0.0,,,4,0.866025,0.5,4,-0.433884,-0.900969,7,0.988468,0.151428,-0.110667,0.980962,1.108754,-0.948152,-0.148743
4,43827,sales,ie_9,ie,win10,重庆,sW0whYIx8LFM,work.xxx.com/task,1.0,work.xxx.com,task,,,,0,2890.0,7.969012,2.07556,3,1,0,,,1.0,,,,,,,,,,,1.0,0.0,,,,,,,,,,,,,,,1.0,0.0,,,1.0,0.0,,,,,1.0,0.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,...,52774.068487,2.208497e+162,52821.432432,53012.083656,2.4825730000000003e+158,52821.432432,53012.083656,2.4825730000000003e+158,,,1.0,0.0,51665.5,52774.068487,2.208497e+162,,,,,51665.5,52774.068487,2.208497e+162,,,,,51665.5,52774.068487,2.208497e+162,1.0,0.0,51665.5,52774.068487,2.208497e+162,1.0,0.0,,,52821.432432,53012.083656,2.4825730000000003e+158,1.0,0.0,51665.5,52774.068487,2.208497e+162,1.0,22356.4,35448.820154,2.219075e+19,,,,,,,,,36298.0,49621.220854,4771291000000.0,,,,,36298.0,49621.220854,4771291000000.0,,,,,36298.0,49621.220854,4771291000000.0,,,,,36298.0,49621.220854,4771291000000.0,,,,,5,0.965926,0.258819,4,-0.433884,-0.900969,7,0.988468,0.151428,-0.100326,-0.134813,0.719877,-1.437971,-0.635148


In [12]:
# df_train = df[df[y_label].notna()].reset_index(drop=True)
# df_test = df[df[y_label].isna()].reset_index(drop=True)

df_train_extend = df_train[[y_label,'id']]
df_test_extend = df_test[[y_label,'id']]

In [13]:
# df_train = df_train.fillna(-999)
# df_test = df_test.fillna(-999)

### null importance特征筛选

In [14]:
feats, categorical_feats = get_null_importance(df_train.drop(columns=[y_label,'id']).copy(),
                                               df_train[y_label].copy(), 
                                               thresholds=15)

In [15]:

object_feats = ['ip_transform', 'user_name', 'device_num_transform']

df_train = pd.concat([df_train_info[object_feats], df_train[feats], df_train_extend], axis=1)
df_test = pd.concat([df_test[feats], df_test_extend], axis=1)

In [43]:
feats

Index(['op_diff_second1_log', 'op_diff_second1_log_log', 'op_times_groups',
       'system_op_diff_second1_log', 'system_op_diff_second1_log_log',
       'system_op_times_groups', 'system_system_op_times_groups_cumsum',
       'system_system_op_times_groups_error_code_cumsum',
       'system_system_op_times_groups_op_diff_second1_mean',
       'system_system_op_times_groups_op_diff_second1_std',
       ...
       'hour_cos', 'dayofweek', 'dayofweek_sin', 'dayofweek_cos', 'day',
       'day_cos', 'corpus_2', 'corpus_4', 'corpus_5', 'oof'],
      dtype='object', length=412)

## modeling

In [42]:
feats = df_train.columns.drop(['id', y_label]+object_feats)
feats = feats.drop(categorical_feats)
#feats 

In [37]:
feats = feats_importance.sort_values('importance', ascending=False)[:50]['name'].values

In [38]:

import time
from sklearn.metrics import roc_auc_score as auc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold

In [39]:
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'verbose': -1,
    'seed': 2222,
    'n_jobs': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 4,
    # 'min_child_weight': 10,
}

In [40]:
fold_num = 5
seeds = [2022]
oof = np.zeros(len(df_train))
importance = 0
pred_y = pd.DataFrame()
score = []
for seed in seeds:
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    # kf = KFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[y_label])):
        print('-----------', fold)
        train = lgb.Dataset(df_train.loc[train_idx, feats],
                            df_train.loc[train_idx, y_label],
                           # categorical_feature=categorical_feats
                           )
        val = lgb.Dataset(df_train.loc[val_idx, feats],
                          df_train.loc[val_idx, y_label],
                          #categorical_feature=categorical_feats
                         )
        model = lgb.train(params, train, valid_sets=[val], 
                          num_boost_round=20000, early_stopping_rounds=100)

        oof[val_idx] += model.predict(df_train.loc[val_idx, feats]) / len(seeds)
        pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
        importance += model.feature_importance(importance_type='gain') / fold_num
        score.append(auc(df_train.loc[val_idx, y_label], model.predict(df_train.loc[val_idx, feats])))
feats_importance = pd.DataFrame()
feats_importance['name'] = feats
feats_importance['importance'] = importance
display(feats_importance.sort_values('importance', ascending=False)[:30])

df_train['oof'] = oof
display(np.mean(score), np.std(score))

score = np.mean(score)
df_test[y_label] = pred_y.mean(axis=1).values
df_test = df_test.sort_values('id').reset_index(drop=True)

sub = pd.read_csv(path_sample_submission)
sub[y_label] = df_test[y_label].values
sub.to_csv(os.path.join(path_results_jupyter,time.strftime('lgb_%Y%m%d%H%M_')+'%.5f.csv'%score), index=False)

----------- 0
[1]	valid_0's auc: 0.935692
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.936441
[3]	valid_0's auc: 0.940583
[4]	valid_0's auc: 0.940294
[5]	valid_0's auc: 0.940425
[6]	valid_0's auc: 0.940208
[7]	valid_0's auc: 0.939996
[8]	valid_0's auc: 0.939812
[9]	valid_0's auc: 0.93979
[10]	valid_0's auc: 0.939819
[11]	valid_0's auc: 0.939843
[12]	valid_0's auc: 0.939459
[13]	valid_0's auc: 0.939466
[14]	valid_0's auc: 0.939403
[15]	valid_0's auc: 0.939393
[16]	valid_0's auc: 0.939702
[17]	valid_0's auc: 0.939737
[18]	valid_0's auc: 0.939746
[19]	valid_0's auc: 0.939764
[20]	valid_0's auc: 0.93979
[21]	valid_0's auc: 0.939812
[22]	valid_0's auc: 0.944237
[23]	valid_0's auc: 0.944227
[24]	valid_0's auc: 0.944384
[25]	valid_0's auc: 0.944731
[26]	valid_0's auc: 0.944678
[27]	valid_0's auc: 0.944709
[28]	valid_0's auc: 0.944761
[29]	valid_0's auc: 0.9448
[30]	valid_0's auc: 0.944751
[31]	valid_0's auc: 0.944779
[32]	valid_0's auc: 0.944244
[33]	vali

[168]	valid_0's auc: 0.941424
[169]	valid_0's auc: 0.941319
[170]	valid_0's auc: 0.94141
[171]	valid_0's auc: 0.941406
[172]	valid_0's auc: 0.941132
[173]	valid_0's auc: 0.9414
[174]	valid_0's auc: 0.941442
[175]	valid_0's auc: 0.941535
[176]	valid_0's auc: 0.941462
[177]	valid_0's auc: 0.941521
[178]	valid_0's auc: 0.941703
[179]	valid_0's auc: 0.94171
[180]	valid_0's auc: 0.941728
[181]	valid_0's auc: 0.941686
[182]	valid_0's auc: 0.941782
[183]	valid_0's auc: 0.941951
[184]	valid_0's auc: 0.941854
[185]	valid_0's auc: 0.941861
[186]	valid_0's auc: 0.941755
[187]	valid_0's auc: 0.94164
[188]	valid_0's auc: 0.941642
[189]	valid_0's auc: 0.941613
[190]	valid_0's auc: 0.94159
[191]	valid_0's auc: 0.941415
[192]	valid_0's auc: 0.941619
[193]	valid_0's auc: 0.941691
[194]	valid_0's auc: 0.941686
[195]	valid_0's auc: 0.941644
[196]	valid_0's auc: 0.941529
[197]	valid_0's auc: 0.941518
[198]	valid_0's auc: 0.941366
[199]	valid_0's auc: 0.941262
[200]	valid_0's auc: 0.941238
[201]	valid_0's 

[241]	valid_0's auc: 0.939967
[242]	valid_0's auc: 0.940104
[243]	valid_0's auc: 0.940109
[244]	valid_0's auc: 0.940171
[245]	valid_0's auc: 0.940206
[246]	valid_0's auc: 0.940075
[247]	valid_0's auc: 0.940103
[248]	valid_0's auc: 0.940121
[249]	valid_0's auc: 0.94021
[250]	valid_0's auc: 0.940275
[251]	valid_0's auc: 0.940318
[252]	valid_0's auc: 0.940333
[253]	valid_0's auc: 0.940348
[254]	valid_0's auc: 0.940502
[255]	valid_0's auc: 0.940463
[256]	valid_0's auc: 0.940296
[257]	valid_0's auc: 0.940271
[258]	valid_0's auc: 0.940256
[259]	valid_0's auc: 0.94023
[260]	valid_0's auc: 0.940201
[261]	valid_0's auc: 0.940143
[262]	valid_0's auc: 0.940126
[263]	valid_0's auc: 0.940048
[264]	valid_0's auc: 0.940091
[265]	valid_0's auc: 0.94031
[266]	valid_0's auc: 0.940313
[267]	valid_0's auc: 0.940261
[268]	valid_0's auc: 0.940259
[269]	valid_0's auc: 0.940205
[270]	valid_0's auc: 0.940184
[271]	valid_0's auc: 0.939953
[272]	valid_0's auc: 0.939995
[273]	valid_0's auc: 0.939991
[274]	valid_0

[18]	valid_0's auc: 0.939718
[19]	valid_0's auc: 0.939733
[20]	valid_0's auc: 0.939796
[21]	valid_0's auc: 0.939723
[22]	valid_0's auc: 0.941908
[23]	valid_0's auc: 0.941955
[24]	valid_0's auc: 0.942002
[25]	valid_0's auc: 0.942025
[26]	valid_0's auc: 0.942037
[27]	valid_0's auc: 0.942089
[28]	valid_0's auc: 0.942101
[29]	valid_0's auc: 0.942109
[30]	valid_0's auc: 0.942095
[31]	valid_0's auc: 0.94215
[32]	valid_0's auc: 0.942161
[33]	valid_0's auc: 0.94184
[34]	valid_0's auc: 0.94154
[35]	valid_0's auc: 0.941587
[36]	valid_0's auc: 0.941615
[37]	valid_0's auc: 0.941615
[38]	valid_0's auc: 0.941412
[39]	valid_0's auc: 0.941742
[40]	valid_0's auc: 0.941477
[41]	valid_0's auc: 0.941223
[42]	valid_0's auc: 0.94115
[43]	valid_0's auc: 0.94153
[44]	valid_0's auc: 0.941569
[45]	valid_0's auc: 0.941158
[46]	valid_0's auc: 0.941078
[47]	valid_0's auc: 0.940938
[48]	valid_0's auc: 0.940724
[49]	valid_0's auc: 0.940555
[50]	valid_0's auc: 0.940664
[51]	valid_0's auc: 0.940537
[52]	valid_0's auc:

Unnamed: 0,name,importance
0,system_system_op_times_groups_http_status_code...,83892.036866
1,op_month_ip_transform_device_num_transform_op_...,34090.015855
2,system_system_op_times_groups_http_status_code...,29493.262698
3,op_times_groups_ip_transform_url_wpsdoc.xxx.co...,17122.784873
4,op_times_groups_ip_transform_log_system_transf...,6552.22227
7,system_system_op_times_groups_op_diff_second1_...,6422.437326
6,op_month_ip_transform_browser_version_op_diff_...,5477.231935
5,hour_cos,4902.397981
8,system_op_days_http_status_code_200_cumsum,3945.466135
11,op_month_ip_transform_http_status_code_op_diff...,1947.542069


0.943557828929098

0.0020662637709650375

In [41]:
feats_importance.sort_values('importance', ascending=False)[:50]

Unnamed: 0,name,importance
0,system_system_op_times_groups_http_status_code...,83892.036866
1,op_month_ip_transform_device_num_transform_op_...,34090.015855
2,system_system_op_times_groups_http_status_code...,29493.262698
3,op_times_groups_ip_transform_url_wpsdoc.xxx.co...,17122.784873
4,op_times_groups_ip_transform_log_system_transf...,6552.22227
7,system_system_op_times_groups_op_diff_second1_...,6422.437326
6,op_month_ip_transform_browser_version_op_diff_...,5477.231935
5,hour_cos,4902.397981
8,system_op_days_http_status_code_200_cumsum,3945.466135
11,op_month_ip_transform_http_status_code_op_diff...,1947.542069


In [22]:
feats_importance.sort_values('importance', ascending=False)['name'].values

array(['system_system_op_times_groups_http_status_code_200_cumsum',
       'system_system_op_times_groups_http_status_code_op_diff_second1_std',
       'op_month_ip_transform_device_num_transform_op_diff_second1_std',
       'op_times_groups_ip_transform_ip_type_内网_cumsum',
       'system_system_op_times_groups_op_diff_second1_prod',
       'op_month_ip_transform_browser_version_op_diff_second1_std',
       'system_op_days_http_status_code_200_cumsum',
       'op_times_groups_ip_transform_url_wpsdoc.xxx.com/download_cumsum',
       'hour_cos',
       'op_times_groups_ip_transform_log_system_transform_dwS3cdn15GK4_cumsum',
       'op_month_ip_transform_http_status_code_op_diff_second1_std',
       'hour', 'op_month_ip_transform_browser_op_diff_second1_std',
       'system_system_op_times_groups_department_sales_cumsum',
       'system_op_days_ip_type_内网_cumsum',
       'op_days_ip_transform_http_status_code_op_diff_second1_std',
       'system_system_op_times_groups_op_city_op_diff_seco

In [23]:
df_train_info['oof']=df_train['oof']

df_train_info[abs(df_train_info['is_risk']-df_train_info['oof'])>0.7]['device_num_transform'].value_counts()

HL3vrsyu1H3Z    96
Rfv57YyO3vny    91
2EmjEhrepKLJ    90
0dV6LzVsv7pW    88
5DmlITfRNR36    87
                ..
6DDzOi2BV383     1
9nRsk1CCOdLt     1
3uBfpGYbfD3Q     1
7D0GPTvDM4Fn     1
PQZXB2FgV30B     1
Name: device_num_transform, Length: 720, dtype: int64

In [24]:
df_train_info[df_train_info['device_num_transform']=='0dV6LzVsv7pW']

Unnamed: 0,id,user_name,department,ip_transform,device_num_transform,browser_version,browser,os_type,os_version,op_datetime,ip_type,http_status_code,op_city,log_system_transform,url,op_month,is_risk,url_sit,url_page,oof
581,581,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-07 18:21:48,内网,200,杭州,nHrKgKdJ1Mzt,xxx.com/github,2022-01,0,xxx.com,github,0.908678
44887,44887,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-10 23:31:25,内网,200,杭州,fwM6KZKjrzjm,xxx.com/oa,2022-01,1,xxx.com,oa,0.020971
14489,14489,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:00,内网,200,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/kdocs,2022-01,0,wpsdoc.xxx.com,kdocs,0.019711
36352,36352,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:01,内网,200,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,0,wpsdoc.xxx.com,download,0.020637
36353,36353,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:04,内网,200,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,0,wpsdoc.xxx.com,download,0.020079
36354,36354,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:06,内网,200,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,0,wpsdoc.xxx.com,download,0.01932
36355,36355,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:07,内网,200,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,0,wpsdoc.xxx.com,download,0.021659
36356,36356,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:10,内网,200,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,0,wpsdoc.xxx.com,download,0.035991
36357,36357,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:12,内网,400,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,1,wpsdoc.xxx.com,download,0.020268
36358,36358,lufan2545,rd,5KbVyNsBf,0dV6LzVsv7pW,edge_93,edge,win,win10,2022-01-11 10:33:14,内网,400,杭州,dwS3cdn15GK4,wpsdoc.xxx.com/download,2022-01,1,wpsdoc.xxx.com,download,0.021276
