In [1]:
import os
import re
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import scorpyo as sp

from null_importance import get_null_importance


pd.set_option('max_rows', 500, 'max_columns', 200)

In [2]:
path_project = r'/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification'

# path dir
path_row_data = os.path.join(path_project, 'row_data')
path_new_data = os.path.join(path_project, 'new_data')
path_results  = os.path.join(path_project, 'results')
path_results_jupyter  = os.path.join(path_results, 'jupyter')

# path row_data
path_train = os.path.join(path_row_data, 'train.csv')
path_test  = os.path.join(path_row_data, 'evaluation_public.csv')
path_sample_submission = os.path.join(path_row_data, 'submit_example.csv')


path_new_train = os.path.join(path_new_data, 'train_lightgbm_20221014.csv')
path_new_test  = os.path.join(path_new_data, 'test_lightgbm_20221014.csv')

## results
path_output_report = os.path.join(path_results, '01_原始数据探察_20221014.xlsx')

y_label = "is_risk"

In [3]:
df_row_train = sp.read_data(path_train)
df_row_val  = sp.read_data(path_test)

df_row_train['url_sit'] = df_row_train['url'].map(lambda x: x.split('/')[0])
df_row_train['url_page'] = df_row_train['url'].map(lambda x: x.split('/')[1])

df_row_val['url_sit'] = df_row_val['url'].map(lambda x: x.split('/')[0])
df_row_val['url_page'] = df_row_val['url'].map(lambda x: x.split('/')[1])



df_train_info = df_row_train.copy().sort_values(by='op_datetime')

## 定义一次/一天/一月
1. op\_times\_groups 一次
2. op\_days 一天
3. op\_month 一月

In [4]:
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)
df = df.sort_values(by='op_datetime')

# 认证日期时间
df['op_datetime'] = pd.to_datetime(df['op_datetime'])
# 将数据分为每一天
df['op_days'] = df['op_datetime'].map(lambda x: x.strftime('%Y-%m-%d'))

# 按客户进行统计这次认证和上次认证的时间差
df['op_second'] = df['op_datetime']
df['op_second1'] = df.groupby('device_num_transform')['op_second'].shift(1)
df['op_diff_second1'] = (df['op_second'] - df['op_second1']).map(lambda x: x.total_seconds())
df['op_diff_second1_log'] = df['op_diff_second1'].apply(np.log)
df['op_diff_second1_log_log'] = df['op_diff_second1'].apply(np.log).apply(np.log)

df['op_diff_second1_tmp'] = df['op_diff_second1']>10
# 客户第几次登录
df['op_times_groups'] = df.groupby('device_num_transform')['op_diff_second1_tmp'].apply(lambda x: x.cumsum())

# 系统层面的一段时间
df['system_op_second'] = df['op_datetime']
df['system_op_second1'] = df['system_op_second'].shift(1)
df['system_op_diff_second1'] = (df['system_op_second'] - df['system_op_second1']).map(lambda x: x.total_seconds())

df['system_op_diff_second1_log'] = df['system_op_diff_second1'].apply(np.log)
df['system_op_diff_second1_log_log'] = df['system_op_diff_second1'].apply(np.log).apply(np.log)
df['system_op_diff_second1_tmp'] = df['system_op_diff_second1']>400
# 客户第几次登录
df['system_op_times_groups'] = df['system_op_diff_second1_tmp'].cumsum()

df = df.drop(columns=['op_second','op_second1','op_diff_second1_tmp', 
                      'system_op_second', 'system_op_second1', 'system_op_diff_second1_tmp'])

## 环境特征

In [5]:

time_feats = ['system_op_times_groups', 'op_days', 'op_month']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url', 'url_page','url_sit']

# 客户+时间+按时间cumsum/cumunique
# 累计量统计
df['helper'] = 1
# 是否为异常状态码
df['http_status_code_helper'] = df['http_status_code'].map(lambda x: 1 if x in [400, 500, 502, 404] else 0) 
# 给样本编号
df['sampler_index_helper'] = df['helper'].cumsum()

for i in time_feats:
    i_tmp = df.groupby([i])
    # 系统往前看，处理了多少事
    df['system_{}_cumsum'.format(i)] = i_tmp['helper'].cumsum()
    # 系统往前看，处理了多少坏事
    df['system_{}_error_code_cumsum'.format(i)] = i_tmp['http_status_code_helper'].cumsum()
    for method in ['mean', 'std', "prod"]:
        df['system_{}_op_diff_second1_{}'.format(i, method)] = i_tmp['op_diff_second1'].transform(method)
    for j in cate_feats:
        index_set = set(df.groupby([i, j],as_index=False).first()['sampler_index_helper'].values)
        df['tmp_helper'] = df['sampler_index_helper'].map(lambda x: 1 if x in index_set else 0)
        j_tmp = df.groupby([i, j])
        # 系统往前看，不同维度的处理了多少情况
        df['system_{}_{}_cumunique'.format(i, j)] = j_tmp['tmp_helper'].cumsum()
        
        for method in ['mean', 'std', "prod"]:
            df['system_{}_{}_op_diff_second1_{}'.format(i,j, method)] = j_tmp['op_diff_second1'].transform(method)

        if j not in ['ip_transform', 'user_name', 'device_num_transform']:
            for k in df[j].unique():
                tmp = df[df[j]==k].groupby([i])

                # 系统往前看，不同维度不同情况分别处理了多少次
                df['system_{}_{}_{}_cumsum'.format(i,j,k)] = tmp['helper'].cumsum()
                # 系统往前看，不同维度不同情况error_code分别处理了多少次
                df['system_{}_{}_{}_error_code_cumsum'.format(i,j,k)] = tmp['http_status_code_helper'].cumsum()

remove_cols = [x for x in df.columns if x[-6:]=='helper']

df = df.drop(columns=remove_cols)

## 对象层面

In [6]:

time_feats = ['op_times_groups', 'op_days', 'op_month']

object_feats = ['ip_transform', 'user_name', 'device_num_transform']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url', 'url_page','url_sit']

# 客户+时间+按时间cumsum/cumunique
# 累计量统计
df['helper'] = 1
# 是否为异常状态码
df['http_status_code_helper'] = df['http_status_code'].map(lambda x: 1 if x in [400, 500, 502, 404] else 0) 
# 给样本编号
df['sampler_index_helper'] = df['helper'].cumsum()


for i in time_feats:
    for j in object_feats:
        j_tmp = df.groupby([i,j])
        df['{}_{}_cumsum'.format(i,j)] = j_tmp['helper'].cumsum()
        df['{}_{}_error_code_cumsum'.format(i,j)] = j_tmp['http_status_code_helper'].cumsum()
        
        for method in ['mean', 'std', "prod"]:
            df['{}_{}_op_diff_second1_{}'.format(i, j, method)] = j_tmp['op_diff_second1'].transform(method)
        for k in cate_feats:
            if k == j: continue
            index_set = set(df.groupby([i,j,k], as_index=False).first()['sampler_index_helper'].values)            
            df['tmp_helper'] = df['sampler_index_helper'].map(lambda x: 1 if x in index_set else 0)
            k_tmp = df.groupby([i,j,k])
            df['{}_{}_{}_cumunique'.format(i,j,k)] = k_tmp['tmp_helper'].cumsum()
            for method in ['mean', 'std', "prod"]:
                df['{}_{}_{}_op_diff_second1_{}'.format(i, j, k, method)] = k_tmp['op_diff_second1'].transform(method)

            if k not in ['ip_transform', 'user_name', 'device_num_transform']:
                for v in df[k].unique():
                    v_tmp = df[df[k]==v].groupby([i,j])
                    df['{}_{}_{}_{}_cumsum'.format(i,j,k,v)] = v_tmp['helper'].cumsum()
                    df['{}_{}_{}_{}_error_code_cumsum'.format(i,j,k,v)] = v_tmp['http_status_code_helper'].cumsum()

remove_cols = [x for x in df.columns if x[-6:]=='helper']

df = df.drop(columns=remove_cols)

## 其它操作

In [7]:
# 几点钟
df['hour'] = df['op_datetime'].dt.hour
df['hour_sin'] = np.sin(df['hour']/24*2*np.pi)
df['hour_cos'] = np.cos(df['hour']/24*2*np.pi)

# 周几
df['dayofweek'] = df['op_datetime'].dt.dayofweek
df['dayofweek_sin'] = np.sin(df['dayofweek']/7*2*np.pi)
df['dayofweek_cos'] = np.cos(df['dayofweek']/7*2*np.pi)

# 一个月的第几天
df['day'] = df['op_datetime'].dt.day
df['day_sin'] = np.sin(df['day']/31*2*np.pi)
df['day_cos'] = np.cos(df['day']/31*2*np.pi)

## 特征筛选

In [8]:
# 删除时间及类别型变量过多的特征
remove_col = ['op_datetime', 'op_month', 'user_name', 'ip_transform', 'device_num_transform', 'op_days', 'ts', 'ts1', 'ts2','ts3']

remove_col = [x for x in df.columns if x in remove_col]
df = df.drop(columns=remove_col)
df.head()

Unnamed: 0,id,department,browser_version,browser,os_type,os_version,ip_type,http_status_code,op_city,log_system_transform,url,is_risk,url_sit,url_page,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups,system_system_op_times_groups_cumsum,system_system_op_times_groups_error_code_cumsum,system_system_op_times_groups_op_diff_second1_mean,system_system_op_times_groups_op_diff_second1_std,system_system_op_times_groups_op_diff_second1_prod,system_system_op_times_groups_ip_transform_cumunique,system_system_op_times_groups_ip_transform_op_diff_second1_mean,system_system_op_times_groups_ip_transform_op_diff_second1_std,system_system_op_times_groups_ip_transform_op_diff_second1_prod,system_system_op_times_groups_user_name_cumunique,system_system_op_times_groups_user_name_op_diff_second1_mean,system_system_op_times_groups_user_name_op_diff_second1_std,system_system_op_times_groups_user_name_op_diff_second1_prod,system_system_op_times_groups_device_num_transform_cumunique,system_system_op_times_groups_device_num_transform_op_diff_second1_mean,system_system_op_times_groups_device_num_transform_op_diff_second1_std,system_system_op_times_groups_device_num_transform_op_diff_second1_prod,system_system_op_times_groups_department_cumunique,system_system_op_times_groups_department_op_diff_second1_mean,system_system_op_times_groups_department_op_diff_second1_std,system_system_op_times_groups_department_op_diff_second1_prod,system_system_op_times_groups_department_rd_cumsum,system_system_op_times_groups_department_rd_error_code_cumsum,system_system_op_times_groups_department_hr_cumsum,system_system_op_times_groups_department_hr_error_code_cumsum,system_system_op_times_groups_department_sales_cumsum,system_system_op_times_groups_department_sales_error_code_cumsum,system_system_op_times_groups_department_other_cumsum,system_system_op_times_groups_department_other_error_code_cumsum,system_system_op_times_groups_department_nan_cumsum,system_system_op_times_groups_department_nan_error_code_cumsum,system_system_op_times_groups_department_accounting_cumsum,system_system_op_times_groups_department_accounting_error_code_cumsum,system_system_op_times_groups_browser_version_cumunique,system_system_op_times_groups_browser_version_op_diff_second1_mean,system_system_op_times_groups_browser_version_op_diff_second1_std,system_system_op_times_groups_browser_version_op_diff_second1_prod,system_system_op_times_groups_browser_version_chrome_93_cumsum,system_system_op_times_groups_browser_version_chrome_93_error_code_cumsum,system_system_op_times_groups_browser_version_safari_13_cumsum,system_system_op_times_groups_browser_version_safari_13_error_code_cumsum,system_system_op_times_groups_browser_version_firefox_78_cumsum,system_system_op_times_groups_browser_version_firefox_78_error_code_cumsum,system_system_op_times_groups_browser_version_edge_93_cumsum,system_system_op_times_groups_browser_version_edge_93_error_code_cumsum,system_system_op_times_groups_browser_version_ie_9_cumsum,system_system_op_times_groups_browser_version_ie_9_error_code_cumsum,system_system_op_times_groups_browser_version_chrome_77_cumsum,system_system_op_times_groups_browser_version_chrome_77_error_code_cumsum,system_system_op_times_groups_browser_version_chrome_90_cumsum,system_system_op_times_groups_browser_version_chrome_90_error_code_cumsum,system_system_op_times_groups_browser_version_ie_11_cumsum,system_system_op_times_groups_browser_version_ie_11_error_code_cumsum,system_system_op_times_groups_browser_cumunique,system_system_op_times_groups_browser_op_diff_second1_mean,system_system_op_times_groups_browser_op_diff_second1_std,system_system_op_times_groups_browser_op_diff_second1_prod,system_system_op_times_groups_browser_chrome_cumsum,system_system_op_times_groups_browser_chrome_error_code_cumsum,system_system_op_times_groups_browser_safari_cumsum,system_system_op_times_groups_browser_safari_error_code_cumsum,system_system_op_times_groups_browser_firefox_cumsum,system_system_op_times_groups_browser_firefox_error_code_cumsum,system_system_op_times_groups_browser_edge_cumsum,system_system_op_times_groups_browser_edge_error_code_cumsum,system_system_op_times_groups_browser_ie_cumsum,system_system_op_times_groups_browser_ie_error_code_cumsum,system_system_op_times_groups_os_type_cumunique,system_system_op_times_groups_os_type_op_diff_second1_mean,system_system_op_times_groups_os_type_op_diff_second1_std,system_system_op_times_groups_os_type_op_diff_second1_prod,system_system_op_times_groups_os_type_win_cumsum,system_system_op_times_groups_os_type_win_error_code_cumsum,system_system_op_times_groups_os_type_macos_cumsum,system_system_op_times_groups_os_type_macos_error_code_cumsum,system_system_op_times_groups_os_version_cumunique,system_system_op_times_groups_os_version_op_diff_second1_mean,system_system_op_times_groups_os_version_op_diff_second1_std,...,op_month_device_num_transform_log_system_transform_nHrKgKdJ1Mzt_error_code_cumsum,op_month_device_num_transform_log_system_transform_a5G25puBl9xj_cumsum,op_month_device_num_transform_log_system_transform_a5G25puBl9xj_error_code_cumsum,op_month_device_num_transform_log_system_transform_sW0whYIx8LFM_cumsum,op_month_device_num_transform_log_system_transform_sW0whYIx8LFM_error_code_cumsum,op_month_device_num_transform_log_system_transform_2umVQwhiiwNJ_cumsum,op_month_device_num_transform_log_system_transform_2umVQwhiiwNJ_error_code_cumsum,op_month_device_num_transform_log_system_transform_fwM6KZKjrzjm_cumsum,op_month_device_num_transform_log_system_transform_fwM6KZKjrzjm_error_code_cumsum,op_month_device_num_transform_log_system_transform_nan_cumsum,op_month_device_num_transform_log_system_transform_nan_error_code_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_error_code_cumsum,op_month_device_num_transform_log_system_transform_RwHe8Q1R7AlB_cumsum,op_month_device_num_transform_log_system_transform_RwHe8Q1R7AlB_error_code_cumsum,op_month_device_num_transform_log_system_transform_9RAS6RNfETj5_cumsum,op_month_device_num_transform_log_system_transform_9RAS6RNfETj5_error_code_cumsum,op_month_device_num_transform_log_system_transform_2UNHLdxlhIzv_cumsum,op_month_device_num_transform_log_system_transform_2UNHLdxlhIzv_error_code_cumsum,op_month_device_num_transform_url_cumunique,op_month_device_num_transform_url_op_diff_second1_mean,op_month_device_num_transform_url_op_diff_second1_std,op_month_device_num_transform_url_op_diff_second1_prod,op_month_device_num_transform_url_xxx.com/github_cumsum,op_month_device_num_transform_url_xxx.com/github_error_code_cumsum,op_month_device_num_transform_url_hr.xxx.com/_cumsum,op_month_device_num_transform_url_hr.xxx.com/_error_code_cumsum,op_month_device_num_transform_url_work.xxx.com/task_cumsum,op_month_device_num_transform_url_work.xxx.com/task_error_code_cumsum,op_month_device_num_transform_url_xxx.com/mail_cumsum,op_month_device_num_transform_url_xxx.com/mail_error_code_cumsum,op_month_device_num_transform_url_xxx.com/oa_cumsum,op_month_device_num_transform_url_xxx.com/oa_error_code_cumsum,op_month_device_num_transform_url_xxx.com/getVerifyCode_cumsum,op_month_device_num_transform_url_xxx.com/getVerifyCode_error_code_cumsum,op_month_device_num_transform_url_xxx.com/loginAuth_cumsum,op_month_device_num_transform_url_xxx.com/loginAuth_error_code_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/kdocs_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/kdocs_error_code_cumsum,op_month_device_num_transform_url_business.xxx.com/_cumsum,op_month_device_num_transform_url_business.xxx.com/_error_code_cumsum,op_month_device_num_transform_url_xxx.com/checkingin_cumsum,op_month_device_num_transform_url_xxx.com/checkingin_error_code_cumsum,op_month_device_num_transform_url_xxx.com/getLoginType_cumsum,op_month_device_num_transform_url_xxx.com/getLoginType_error_code_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_error_code_cumsum,op_month_device_num_transform_url_xxx.com/accounting_cumsum,op_month_device_num_transform_url_xxx.com/accounting_error_code_cumsum,op_month_device_num_transform_url_page_cumunique,op_month_device_num_transform_url_page_op_diff_second1_mean,op_month_device_num_transform_url_page_op_diff_second1_std,op_month_device_num_transform_url_page_op_diff_second1_prod,op_month_device_num_transform_url_page_github_cumsum,op_month_device_num_transform_url_page_github_error_code_cumsum,op_month_device_num_transform_url_page__cumsum,op_month_device_num_transform_url_page__error_code_cumsum,op_month_device_num_transform_url_page_task_cumsum,op_month_device_num_transform_url_page_task_error_code_cumsum,op_month_device_num_transform_url_page_mail_cumsum,op_month_device_num_transform_url_page_mail_error_code_cumsum,op_month_device_num_transform_url_page_oa_cumsum,op_month_device_num_transform_url_page_oa_error_code_cumsum,op_month_device_num_transform_url_page_getVerifyCode_cumsum,op_month_device_num_transform_url_page_getVerifyCode_error_code_cumsum,op_month_device_num_transform_url_page_loginAuth_cumsum,op_month_device_num_transform_url_page_loginAuth_error_code_cumsum,op_month_device_num_transform_url_page_kdocs_cumsum,op_month_device_num_transform_url_page_kdocs_error_code_cumsum,op_month_device_num_transform_url_page_checkingin_cumsum,op_month_device_num_transform_url_page_checkingin_error_code_cumsum,op_month_device_num_transform_url_page_getLoginType_cumsum,op_month_device_num_transform_url_page_getLoginType_error_code_cumsum,op_month_device_num_transform_url_page_download_cumsum,op_month_device_num_transform_url_page_download_error_code_cumsum,op_month_device_num_transform_url_page_accounting_cumsum,op_month_device_num_transform_url_page_accounting_error_code_cumsum,op_month_device_num_transform_url_sit_cumunique,op_month_device_num_transform_url_sit_op_diff_second1_mean,op_month_device_num_transform_url_sit_op_diff_second1_std,op_month_device_num_transform_url_sit_op_diff_second1_prod,op_month_device_num_transform_url_sit_xxx.com_cumsum,op_month_device_num_transform_url_sit_xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_hr.xxx.com_cumsum,op_month_device_num_transform_url_sit_hr.xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_work.xxx.com_cumsum,op_month_device_num_transform_url_sit_work.xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_business.xxx.com_cumsum,op_month_device_num_transform_url_sit_business.xxx.com_error_code_cumsum,hour,hour_sin,hour_cos,dayofweek,dayofweek_sin,dayofweek_cos,day,day_sin,day_cos
44477,44477,rd,chrome_93,chrome,win,win10,内网,200,深圳,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,,,,0,1,0,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1.0,0.0,,,,,,,,,,,1,,,1.0,1.0,0.0,,,,,,,,,,,,,,,1,,,1.0,1.0,0.0,,,,,,,,,1,,,1.0,1.0,0.0,,,1,,,...,0.0,,,,,,,,,,,,,,,,,,,1,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,48144.097561,60210.665583,5.648623999999999e+168,1.0,0.0,,,,,,,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428
45489,45489,rd,safari_13,safari,macos,macos_big_sur_11,内网,200,成都,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,603.0,6.401917,1.856598,1,1,0,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1.0,0.0,,,,,,,,,,,1,,,1.0,,,1.0,0.0,,,,,,,,,,,,,1,,,1.0,,,1.0,0.0,,,,,,,1,,,1.0,,,1.0,0.0,1,,,...,0.0,,,,,,,,,,,,,,,,,,,1,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,50344.564103,65831.591909,3.897362e+149,1.0,0.0,,,,,,,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428
45706,45706,hr,firefox_78,firefox,win,win7,内网,200,深圳,a5G25puBl9xj,hr.xxx.com/,1.0,hr.xxx.com,,,,,0,384.0,5.950643,1.783499,1,2,0,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,,,1.0,0.0,,,,,,,,,1,,,1.0,,,,,1.0,0.0,,,,,,,,,,,1,,,1.0,,,,,1.0,0.0,,,,,1,,,1.0,1.0,0.0,,,1,,,...,,1.0,0.0,,,,,,,,,,,,,,,,,1,42199.647059,44769.681976,7.108256e+68,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,42199.647059,44769.681976,7.108256e+68,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,1,42199.647059,44769.681976,7.108256e+68,,,1.0,0.0,,,,,,,3,0.707107,0.707107,4,-0.433884,-0.900969,7,0.988468,0.151428
45901,45901,rd,edge_93,edge,win,win10,内网,200,杭州,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,5318.0,8.578853,2.1493,2,1,0,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1.0,0.0,,,,,,,,,,,1,,,1.0,,,,,,,1.0,0.0,,,,,,,,,1,,,1.0,,,,,,,1.0,0.0,,,1,,,1.0,1.0,0.0,,,1,,,...,0.0,,,,,,,,,,,,,,,,,,,1,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,38517.446809,67366.099455,3.8445339999999997e+174,1.0,0.0,,,,,,,,,4,0.866025,0.5,4,-0.433884,-0.900969,7,0.988468,0.151428
43827,43827,sales,ie_9,ie,win,win10,内网,200,重庆,sW0whYIx8LFM,work.xxx.com/task,1.0,work.xxx.com,task,,,,0,2890.0,7.969012,2.07556,3,1,0,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,1,,,1.0,,,,,1.0,0.0,,,,,,,1,,,1.0,,,,,,,,,1.0,0.0,,,,,,,1,,,1.0,,,,,,,,,1.0,0.0,1,,,1.0,1.0,0.0,,,1,,,...,,,,1.0,0.0,,,,,,,,,,,,,,,1,36298.0,49621.220854,4771291000000.0,,,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,1,36298.0,49621.220854,4771291000000.0,,,,,1.0,0.0,,,,,,,,,,,,,,,,,,,1,36298.0,49621.220854,4771291000000.0,,,,,1.0,0.0,,,,,5,0.965926,0.258819,4,-0.433884,-0.900969,7,0.988468,0.151428


In [9]:
df_row_train = df[df[y_label].notna()].reset_index(drop=True)
df_row_val = df[df[y_label].isna()].reset_index(drop=True)

df_train, df_test, convert_cols = sp.transform_data_detail(df_row_train, df_row_val, y_label, excel_path=path_output_report)
df_train.head()

sheet05.可能为数值类型的object类型数据统计在/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification/results/01_原始数据探察_20221014.xlsx中已经存在，我们将对原文件进行覆盖
sheet06.数据预处理在/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification/results/01_原始数据探察_20221014.xlsx中已经存在，我们将对原文件进行覆盖


Unnamed: 0,id,department,browser_version,browser,os_version,op_city,log_system_transform,url,is_risk,url_sit,url_page,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups,system_system_op_times_groups_cumsum,system_system_op_times_groups_error_code_cumsum,system_system_op_times_groups_op_diff_second1_mean,system_system_op_times_groups_op_diff_second1_std,system_system_op_times_groups_op_diff_second1_prod,system_system_op_times_groups_ip_transform_op_diff_second1_mean,system_system_op_times_groups_ip_transform_op_diff_second1_std,system_system_op_times_groups_ip_transform_op_diff_second1_prod,system_system_op_times_groups_user_name_op_diff_second1_mean,system_system_op_times_groups_user_name_op_diff_second1_std,system_system_op_times_groups_user_name_op_diff_second1_prod,system_system_op_times_groups_device_num_transform_op_diff_second1_mean,system_system_op_times_groups_device_num_transform_op_diff_second1_std,system_system_op_times_groups_device_num_transform_op_diff_second1_prod,system_system_op_times_groups_department_op_diff_second1_mean,system_system_op_times_groups_department_op_diff_second1_std,system_system_op_times_groups_department_op_diff_second1_prod,system_system_op_times_groups_department_rd_cumsum,system_system_op_times_groups_department_rd_error_code_cumsum,system_system_op_times_groups_department_sales_cumsum,system_system_op_times_groups_department_sales_error_code_cumsum,system_system_op_times_groups_browser_version_op_diff_second1_mean,system_system_op_times_groups_browser_version_op_diff_second1_std,system_system_op_times_groups_browser_version_op_diff_second1_prod,system_system_op_times_groups_browser_version_edge_93_cumsum,system_system_op_times_groups_browser_version_edge_93_error_code_cumsum,system_system_op_times_groups_browser_version_chrome_90_cumsum,system_system_op_times_groups_browser_version_chrome_90_error_code_cumsum,system_system_op_times_groups_browser_op_diff_second1_mean,system_system_op_times_groups_browser_op_diff_second1_std,system_system_op_times_groups_browser_op_diff_second1_prod,system_system_op_times_groups_browser_chrome_cumsum,system_system_op_times_groups_browser_chrome_error_code_cumsum,system_system_op_times_groups_browser_edge_cumsum,system_system_op_times_groups_browser_edge_error_code_cumsum,system_system_op_times_groups_os_type_op_diff_second1_mean,system_system_op_times_groups_os_type_op_diff_second1_std,system_system_op_times_groups_os_type_op_diff_second1_prod,system_system_op_times_groups_os_type_win_cumsum,system_system_op_times_groups_os_type_win_error_code_cumsum,system_system_op_times_groups_os_version_op_diff_second1_mean,system_system_op_times_groups_os_version_op_diff_second1_std,system_system_op_times_groups_os_version_op_diff_second1_prod,system_system_op_times_groups_os_version_win10_cumsum,system_system_op_times_groups_os_version_win10_error_code_cumsum,system_system_op_times_groups_os_version_win7_cumsum,system_system_op_times_groups_os_version_win7_error_code_cumsum,system_system_op_times_groups_ip_type_op_diff_second1_mean,system_system_op_times_groups_ip_type_op_diff_second1_std,system_system_op_times_groups_ip_type_op_diff_second1_prod,system_system_op_times_groups_ip_type_内网_cumsum,system_system_op_times_groups_ip_type_内网_error_code_cumsum,system_system_op_times_groups_http_status_code_op_diff_second1_mean,system_system_op_times_groups_http_status_code_op_diff_second1_std,system_system_op_times_groups_http_status_code_op_diff_second1_prod,system_system_op_times_groups_http_status_code_200_cumsum,system_system_op_times_groups_op_city_op_diff_second1_mean,system_system_op_times_groups_op_city_op_diff_second1_std,system_system_op_times_groups_op_city_op_diff_second1_prod,system_system_op_times_groups_op_city_深圳_cumsum,system_system_op_times_groups_op_city_深圳_error_code_cumsum,system_system_op_times_groups_op_city_成都_cumsum,system_system_op_times_groups_op_city_成都_error_code_cumsum,system_system_op_times_groups_op_city_杭州_cumsum,system_system_op_times_groups_op_city_杭州_error_code_cumsum,system_system_op_times_groups_op_city_北京_cumsum,system_system_op_times_groups_op_city_北京_error_code_cumsum,system_system_op_times_groups_log_system_transform_op_diff_second1_mean,system_system_op_times_groups_log_system_transform_op_diff_second1_std,system_system_op_times_groups_log_system_transform_op_diff_second1_prod,system_system_op_times_groups_log_system_transform_nHrKgKdJ1Mzt_cumsum,system_system_op_times_groups_log_system_transform_nHrKgKdJ1Mzt_error_code_cumsum,system_system_op_times_groups_log_system_transform_dwS3cdn15GK4_cumsum,system_system_op_times_groups_log_system_transform_dwS3cdn15GK4_error_code_cumsum,system_system_op_times_groups_url_op_diff_second1_mean,system_system_op_times_groups_url_op_diff_second1_std,system_system_op_times_groups_url_op_diff_second1_prod,system_system_op_times_groups_url_xxx.com/github_cumsum,system_system_op_times_groups_url_xxx.com/github_error_code_cumsum,system_system_op_times_groups_url_wpsdoc.xxx.com/download_cumsum,system_system_op_times_groups_url_wpsdoc.xxx.com/download_error_code_cumsum,...,op_month_device_num_transform_error_code_cumsum,op_month_device_num_transform_op_diff_second1_mean,op_month_device_num_transform_op_diff_second1_std,op_month_device_num_transform_op_diff_second1_prod,op_month_device_num_transform_ip_transform_op_diff_second1_mean,op_month_device_num_transform_ip_transform_op_diff_second1_std,op_month_device_num_transform_ip_transform_op_diff_second1_prod,op_month_device_num_transform_user_name_op_diff_second1_mean,op_month_device_num_transform_user_name_op_diff_second1_std,op_month_device_num_transform_user_name_op_diff_second1_prod,op_month_device_num_transform_department_op_diff_second1_mean,op_month_device_num_transform_department_op_diff_second1_std,op_month_device_num_transform_department_op_diff_second1_prod,op_month_device_num_transform_department_rd_cumsum,op_month_device_num_transform_department_rd_error_code_cumsum,op_month_device_num_transform_department_sales_cumsum,op_month_device_num_transform_department_sales_error_code_cumsum,op_month_device_num_transform_browser_version_op_diff_second1_mean,op_month_device_num_transform_browser_version_op_diff_second1_std,op_month_device_num_transform_browser_version_op_diff_second1_prod,op_month_device_num_transform_browser_version_edge_93_cumsum,op_month_device_num_transform_browser_version_edge_93_error_code_cumsum,op_month_device_num_transform_browser_version_chrome_90_cumsum,op_month_device_num_transform_browser_version_chrome_90_error_code_cumsum,op_month_device_num_transform_browser_op_diff_second1_mean,op_month_device_num_transform_browser_op_diff_second1_std,op_month_device_num_transform_browser_op_diff_second1_prod,op_month_device_num_transform_browser_chrome_cumsum,op_month_device_num_transform_browser_chrome_error_code_cumsum,op_month_device_num_transform_browser_edge_cumsum,op_month_device_num_transform_browser_edge_error_code_cumsum,op_month_device_num_transform_os_type_op_diff_second1_mean,op_month_device_num_transform_os_type_op_diff_second1_std,op_month_device_num_transform_os_type_op_diff_second1_prod,op_month_device_num_transform_os_type_win_cumsum,op_month_device_num_transform_os_type_win_error_code_cumsum,op_month_device_num_transform_os_version_op_diff_second1_mean,op_month_device_num_transform_os_version_op_diff_second1_std,op_month_device_num_transform_os_version_op_diff_second1_prod,op_month_device_num_transform_os_version_win10_cumsum,op_month_device_num_transform_os_version_win10_error_code_cumsum,op_month_device_num_transform_os_version_win7_cumsum,op_month_device_num_transform_os_version_win7_error_code_cumsum,op_month_device_num_transform_ip_type_op_diff_second1_mean,op_month_device_num_transform_ip_type_op_diff_second1_std,op_month_device_num_transform_ip_type_op_diff_second1_prod,op_month_device_num_transform_ip_type_内网_cumsum,op_month_device_num_transform_ip_type_内网_error_code_cumsum,op_month_device_num_transform_http_status_code_op_diff_second1_mean,op_month_device_num_transform_http_status_code_op_diff_second1_std,op_month_device_num_transform_http_status_code_op_diff_second1_prod,op_month_device_num_transform_http_status_code_200_cumsum,op_month_device_num_transform_op_city_op_diff_second1_mean,op_month_device_num_transform_op_city_op_diff_second1_std,op_month_device_num_transform_op_city_op_diff_second1_prod,op_month_device_num_transform_op_city_深圳_cumsum,op_month_device_num_transform_op_city_深圳_error_code_cumsum,op_month_device_num_transform_op_city_成都_cumsum,op_month_device_num_transform_op_city_成都_error_code_cumsum,op_month_device_num_transform_op_city_杭州_cumsum,op_month_device_num_transform_op_city_杭州_error_code_cumsum,op_month_device_num_transform_op_city_北京_cumsum,op_month_device_num_transform_op_city_北京_error_code_cumsum,op_month_device_num_transform_log_system_transform_op_diff_second1_mean,op_month_device_num_transform_log_system_transform_op_diff_second1_std,op_month_device_num_transform_log_system_transform_op_diff_second1_prod,op_month_device_num_transform_log_system_transform_nHrKgKdJ1Mzt_cumsum,op_month_device_num_transform_log_system_transform_nHrKgKdJ1Mzt_error_code_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_cumsum,op_month_device_num_transform_log_system_transform_dwS3cdn15GK4_error_code_cumsum,op_month_device_num_transform_url_op_diff_second1_mean,op_month_device_num_transform_url_op_diff_second1_std,op_month_device_num_transform_url_op_diff_second1_prod,op_month_device_num_transform_url_xxx.com/github_cumsum,op_month_device_num_transform_url_xxx.com/github_error_code_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_cumsum,op_month_device_num_transform_url_wpsdoc.xxx.com/download_error_code_cumsum,op_month_device_num_transform_url_page_op_diff_second1_mean,op_month_device_num_transform_url_page_op_diff_second1_std,op_month_device_num_transform_url_page_op_diff_second1_prod,op_month_device_num_transform_url_page_github_cumsum,op_month_device_num_transform_url_page_github_error_code_cumsum,op_month_device_num_transform_url_page_download_cumsum,op_month_device_num_transform_url_page_download_error_code_cumsum,op_month_device_num_transform_url_sit_op_diff_second1_mean,op_month_device_num_transform_url_sit_op_diff_second1_std,op_month_device_num_transform_url_sit_op_diff_second1_prod,op_month_device_num_transform_url_sit_xxx.com_cumsum,op_month_device_num_transform_url_sit_xxx.com_error_code_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_cumsum,op_month_device_num_transform_url_sit_wpsdoc.xxx.com_error_code_cumsum,hour,hour_sin,hour_cos,dayofweek,dayofweek_sin,dayofweek_cos,day,day_sin,day_cos
0,44477,rd,chrome_93,chrome,win10,深圳,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,,,,0,1,0,,,1.0,,,1.0,,,1.0,,,1.0,,,1.0,1.0,0.0,,,,,1.0,,,,,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,1.0,1.0,,,1.0,1.0,0.0,,,,,,,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,...,0,37129.545455,55416.665714,1.0991990000000001e+188,38452.735849,56036.133473,2.584153e+181,36381.45283,55264.776208,7.660154e+179,36381.45283,55264.776208,7.660154e+179,1.0,0.0,,,37129.545455,55416.665714,1.0991990000000001e+188,,,,,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,,,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,,,36381.45283,55264.776208,7.660154e+179,1.0,0.0,39271.557692,56266.221127,5.495995999999999e+187,1.0,37129.545455,55416.665714,1.0991990000000001e+188,1.0,0.0,,,,,,,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,42634.933333,51507.725287,1.30122e+122,1.0,0.0,,,48144.097561,60210.665583,5.648623999999999e+168,1.0,0.0,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428
1,45489,rd,safari_13,safari,macos_big_sur_11,成都,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,603.0,6.401917,1.856598,1,1,0,,,1.0,,,1.0,,,1.0,,,1.0,,,1.0,1.0,0.0,,,,,1.0,,,,,,,1.0,,,,,,,1.0,,,,,1.0,,,,,,,1.0,1.0,0.0,,,1.0,1.0,,,1.0,,,1.0,0.0,,,,,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,...,0,46443.636364,62987.038358,3.1406359999999997e+169,48602.142857,63685.848336,2.936996e+163,48849.0,65576.934888,2.217189e+152,48849.0,65576.934888,2.217189e+152,1.0,0.0,,,46443.636364,62987.038358,3.1406359999999997e+169,,,,,46443.636364,62987.038358,3.1406359999999997e+169,,,,,46443.636364,62987.038358,3.1406359999999997e+169,,,46443.636364,62987.038358,3.1406359999999997e+169,,,,,48849.0,65576.934888,2.217189e+152,1.0,0.0,46443.636364,62987.038358,3.1406359999999997e+169,1.0,46443.636364,62987.038358,3.1406359999999997e+169,,,1.0,0.0,,,,,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,60340.166667,71604.782837,4.378261e+120,1.0,0.0,,,50344.564103,65831.591909,3.897362e+149,1.0,0.0,,,2,0.5,0.866025,4,-0.433884,-0.900969,7,0.988468,0.151428
2,45706,hr,firefox_78,firefox,win7,深圳,a5G25puBl9xj,hr.xxx.com/,1.0,hr.xxx.com,,,,,0,384.0,5.950643,1.783499,1,2,0,,,1.0,,,1.0,,,1.0,,,1.0,,,1.0,,,,,,,1.0,,,,,,,1.0,,,,,,,1.0,1.0,0.0,,,1.0,,,1.0,0.0,,,1.0,2.0,0.0,,,1.0,2.0,,,1.0,1.0,0.0,,,,,,,,,1.0,,,,,,,1.0,,,,,...,0,30921.742424,49560.274727,5.0111590000000005e+213,31367.538462,49812.427023,2.576431e+210,29931.2,51079.557818,1.505465e+187,29931.2,51079.557818,1.505465e+187,,,,,30921.742424,49560.274727,5.0111590000000005e+213,,,,,30921.742424,49560.274727,5.0111590000000005e+213,,,,,30921.742424,49560.274727,5.0111590000000005e+213,1.0,0.0,30921.742424,49560.274727,5.0111590000000005e+213,,,1.0,0.0,29931.2,51079.557818,1.505465e+187,1.0,0.0,31887.953125,50028.986631,5.567954e+212,1.0,30921.742424,49560.274727,5.0111590000000005e+213,1.0,0.0,,,,,,,42199.647059,44769.681976,7.108256e+68,,,,,42199.647059,44769.681976,7.108256e+68,,,,,42199.647059,44769.681976,7.108256e+68,,,,,42199.647059,44769.681976,7.108256e+68,,,,,3,0.707107,0.707107,4,-0.433884,-0.900969,7,0.988468,0.151428
3,45901,rd,edge_93,edge,win10,杭州,nHrKgKdJ1Mzt,xxx.com/github,1.0,xxx.com,github,,,,0,5318.0,8.578853,2.1493,2,1,0,,,1.0,,,1.0,,,1.0,,,1.0,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,,,1.0,,,1.0,0.0,,,1.0,1.0,0.0,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,1.0,1.0,,,1.0,,,,,1.0,0.0,,,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,...,0,29992.328358,59448.52142,9.301113000000001e+204,30844.4,60165.310261,1.776476e+198,31518.435484,61527.749882,8.395597999999999e+185,31518.435484,61527.749882,8.395597999999999e+185,1.0,0.0,,,29992.328358,59448.52142,9.301113000000001e+204,1.0,0.0,,,29992.328358,59448.52142,9.301113000000001e+204,,,1.0,0.0,29992.328358,59448.52142,9.301113000000001e+204,1.0,0.0,29992.328358,59448.52142,9.301113000000001e+204,1.0,0.0,,,31518.435484,61527.749882,8.395597999999999e+185,1.0,0.0,30446.742424,59786.704706,9.301113000000001e+204,1.0,29992.328358,59448.52142,9.301113000000001e+204,,,,,1.0,0.0,,,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,35702.933333,55414.856364,2.986052e+108,1.0,0.0,,,38517.446809,67366.099455,3.8445339999999997e+174,1.0,0.0,,,4,0.866025,0.5,4,-0.433884,-0.900969,7,0.988468,0.151428
4,43827,sales,ie_9,ie,win10,重庆,sW0whYIx8LFM,work.xxx.com/task,1.0,work.xxx.com,task,,,,0,2890.0,7.969012,2.07556,3,1,0,,,1.0,,,1.0,,,1.0,,,1.0,,,1.0,,,1.0,0.0,,,1.0,,,,,,,1.0,,,,,,,1.0,1.0,0.0,,,1.0,1.0,0.0,,,,,1.0,1.0,0.0,,,1.0,1.0,,,1.0,,,,,,,,,,,1.0,,,,,,,1.0,,,,,...,0,51665.5,52774.068487,2.208497e+162,51665.5,52774.068487,2.208497e+162,52821.432432,53012.083656,2.4825730000000003e+158,52821.432432,53012.083656,2.4825730000000003e+158,,,1.0,0.0,51665.5,52774.068487,2.208497e+162,,,,,51665.5,52774.068487,2.208497e+162,,,,,51665.5,52774.068487,2.208497e+162,1.0,0.0,51665.5,52774.068487,2.208497e+162,1.0,0.0,,,52821.432432,53012.083656,2.4825730000000003e+158,1.0,0.0,51665.5,52774.068487,2.208497e+162,1.0,22356.4,35448.820154,2.219075e+19,,,,,,,,,36298.0,49621.220854,4771291000000.0,,,,,36298.0,49621.220854,4771291000000.0,,,,,36298.0,49621.220854,4771291000000.0,,,,,36298.0,49621.220854,4771291000000.0,,,,,5,0.965926,0.258819,4,-0.433884,-0.900969,7,0.988468,0.151428


In [10]:
# df_train = df[df[y_label].notna()].reset_index(drop=True)
# df_test = df[df[y_label].isna()].reset_index(drop=True)

df_train_extend = df_train[[y_label,'id']]
df_test_extend = df_test[[y_label,'id']]

In [11]:
df_train = df_train.fillna(-999)
df_test = df_test.fillna(-999)

### null importance特征筛选

In [12]:
feats, categorical_feats = get_null_importance(df_train.drop(columns=[y_label,'id']).copy(),
                                               df_train[y_label].copy(), 
                                               thresholds=15)

In [13]:

object_feats = ['ip_transform', 'user_name', 'device_num_transform']

df_train = pd.concat([df_train_info[object_feats], df_train[feats], df_train_extend], axis=1)
df_test = pd.concat([df_test[feats], df_test_extend], axis=1)

## modeling

In [14]:
feats = df_train.columns.drop(['id', y_label]+object_feats)
feats = feats.drop(categorical_feats)
#feats 

In [25]:
feats = feats_importance.sort_values('importance', ascending=False)[:20]['name'].values

In [26]:

import time
from sklearn.metrics import roc_auc_score as auc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold

In [27]:
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'verbose': -1,
    'seed': 2222,
    'n_jobs': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 4,
    # 'min_child_weight': 10,
}

In [28]:
fold_num = 5
seeds = [2022]
oof = np.zeros(len(df_train))
importance = 0
pred_y = pd.DataFrame()
score = []
for seed in seeds:
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    # kf = KFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[y_label])):
        print('-----------', fold)
        train = lgb.Dataset(df_train.loc[train_idx, feats],
                            df_train.loc[train_idx, y_label],
                           # categorical_feature=categorical_feats
                           )
        val = lgb.Dataset(df_train.loc[val_idx, feats],
                          df_train.loc[val_idx, y_label],
                          #categorical_feature=categorical_feats
                         )
        model = lgb.train(params, train, valid_sets=[val], 
                          num_boost_round=20000, early_stopping_rounds=100)

        oof[val_idx] += model.predict(df_train.loc[val_idx, feats]) / len(seeds)
        pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
        importance += model.feature_importance(importance_type='gain') / fold_num
        score.append(auc(df_train.loc[val_idx, y_label], model.predict(df_train.loc[val_idx, feats])))
feats_importance = pd.DataFrame()
feats_importance['name'] = feats
feats_importance['importance'] = importance
display(feats_importance.sort_values('importance', ascending=False)[:30])

df_train['oof'] = oof
display(np.mean(score), np.std(score))

score = np.mean(score)
df_test[y_label] = pred_y.mean(axis=1).values
df_test = df_test.sort_values('id').reset_index(drop=True)

sub = pd.read_csv(path_sample_submission)
sub[y_label] = df_test[y_label].values
sub.to_csv(os.path.join(path_results_jupyter,time.strftime('lgb_%Y%m%d%H%M_')+'%.5f.csv'%score), index=False)

----------- 0
[1]	valid_0's auc: 0.936586
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.939716
[3]	valid_0's auc: 0.93976
[4]	valid_0's auc: 0.940048
[5]	valid_0's auc: 0.941734
[6]	valid_0's auc: 0.941782
[7]	valid_0's auc: 0.941785
[8]	valid_0's auc: 0.942037
[9]	valid_0's auc: 0.942149
[10]	valid_0's auc: 0.942136
[11]	valid_0's auc: 0.942213
[12]	valid_0's auc: 0.942222
[13]	valid_0's auc: 0.942443
[14]	valid_0's auc: 0.94248
[15]	valid_0's auc: 0.942446
[16]	valid_0's auc: 0.942418
[17]	valid_0's auc: 0.942536
[18]	valid_0's auc: 0.942524
[19]	valid_0's auc: 0.94221
[20]	valid_0's auc: 0.942237
[21]	valid_0's auc: 0.942115
[22]	valid_0's auc: 0.942224
[23]	valid_0's auc: 0.942211
[24]	valid_0's auc: 0.9419
[25]	valid_0's auc: 0.941959
[26]	valid_0's auc: 0.941933
[27]	valid_0's auc: 0.942016
[28]	valid_0's auc: 0.942029
[29]	valid_0's auc: 0.941954
[30]	valid_0's auc: 0.941853
[31]	valid_0's auc: 0.941852
[32]	valid_0's auc: 0.941714
[33]	valid

[197]	valid_0's auc: 0.941572
[198]	valid_0's auc: 0.941439
[199]	valid_0's auc: 0.941423
[200]	valid_0's auc: 0.941273
[201]	valid_0's auc: 0.941237
[202]	valid_0's auc: 0.941348
[203]	valid_0's auc: 0.941346
[204]	valid_0's auc: 0.941349
[205]	valid_0's auc: 0.941255
[206]	valid_0's auc: 0.941302
[207]	valid_0's auc: 0.94116
[208]	valid_0's auc: 0.941086
[209]	valid_0's auc: 0.941054
[210]	valid_0's auc: 0.94119
[211]	valid_0's auc: 0.941236
[212]	valid_0's auc: 0.941283
Early stopping, best iteration is:
[112]	valid_0's auc: 0.942404
----------- 2
[1]	valid_0's auc: 0.933272
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.932652
[3]	valid_0's auc: 0.936373
[4]	valid_0's auc: 0.936825
[5]	valid_0's auc: 0.936545
[6]	valid_0's auc: 0.937309
[7]	valid_0's auc: 0.937251
[8]	valid_0's auc: 0.936943
[9]	valid_0's auc: 0.937244
[10]	valid_0's auc: 0.937205
[11]	valid_0's auc: 0.937262
[12]	valid_0's auc: 0.937267
[13]	valid_0's auc: 0.936827
[14]	valid_0'

[117]	valid_0's auc: 0.942522
[118]	valid_0's auc: 0.94259
[119]	valid_0's auc: 0.942563
[120]	valid_0's auc: 0.942572
[121]	valid_0's auc: 0.942654
[122]	valid_0's auc: 0.942565
[123]	valid_0's auc: 0.942521
[124]	valid_0's auc: 0.942902
[125]	valid_0's auc: 0.942964
[126]	valid_0's auc: 0.943443
[127]	valid_0's auc: 0.943383
[128]	valid_0's auc: 0.943239
[129]	valid_0's auc: 0.943145
[130]	valid_0's auc: 0.943001
[131]	valid_0's auc: 0.942636
[132]	valid_0's auc: 0.942661
[133]	valid_0's auc: 0.94266
[134]	valid_0's auc: 0.942681
[135]	valid_0's auc: 0.942715
[136]	valid_0's auc: 0.942716
[137]	valid_0's auc: 0.942513
[138]	valid_0's auc: 0.942888
[139]	valid_0's auc: 0.942741
[140]	valid_0's auc: 0.942771
Early stopping, best iteration is:
[40]	valid_0's auc: 0.944809
----------- 4
[1]	valid_0's auc: 0.934583
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.938127
[3]	valid_0's auc: 0.938183
[4]	valid_0's auc: 0.938161
[5]	valid_0's auc: 0.93909
[6]

Unnamed: 0,name,importance
0,system_system_op_times_groups_http_status_code...,83245.377554
1,system_system_op_times_groups_http_status_code...,28725.900483
2,system_op_month_ip_transform_op_diff_second1_std,27451.632842
3,op_times_groups_ip_transform_ip_type_内网_cumsum,18985.689243
4,op_month_ip_transform_op_diff_second1_std,12803.728352
7,system_system_op_times_groups_http_status_code...,9339.703713
5,op_times_groups_ip_transform_log_system_transf...,4941.74904
6,hour_cos,4603.206085
15,system_system_op_times_groups_os_type_op_diff_...,2155.340045
9,system_op_days_http_status_code_200_cumsum,1898.285466


0.9425855631286975

0.002010898620962687

In [29]:
feats_importance.sort_values('importance', ascending=False)[:50]

Unnamed: 0,name,importance
0,system_system_op_times_groups_http_status_code...,83245.377554
1,system_system_op_times_groups_http_status_code...,28725.900483
2,system_op_month_ip_transform_op_diff_second1_std,27451.632842
3,op_times_groups_ip_transform_ip_type_内网_cumsum,18985.689243
4,op_month_ip_transform_op_diff_second1_std,12803.728352
7,system_system_op_times_groups_http_status_code...,9339.703713
5,op_times_groups_ip_transform_log_system_transf...,4941.74904
6,hour_cos,4603.206085
15,system_system_op_times_groups_os_type_op_diff_...,2155.340045
9,system_op_days_http_status_code_200_cumsum,1898.285466


In [30]:
feats_importance.sort_values('importance', ascending=False)['name'].values

array(['system_system_op_times_groups_http_status_code_op_diff_second1_prod',
       'system_system_op_times_groups_http_status_code_200_cumsum',
       'system_op_month_ip_transform_op_diff_second1_std',
       'op_times_groups_ip_transform_ip_type_内网_cumsum',
       'op_month_ip_transform_op_diff_second1_std',
       'system_system_op_times_groups_http_status_code_op_diff_second1_std',
       'op_times_groups_ip_transform_log_system_transform_dwS3cdn15GK4_cumsum',
       'hour_cos',
       'system_system_op_times_groups_os_type_op_diff_second1_prod',
       'system_op_days_http_status_code_200_cumsum',
       'op_days_ip_transform_http_status_code_op_diff_second1_std',
       'system_op_days_ip_type_内网_cumsum',
       'op_month_ip_transform_op_city_op_diff_second1_prod',
       'system_op_days_ip_type_内网_error_code_cumsum',
       'op_times_groups_ip_transform_http_status_code_op_diff_second1_std',
       'op_month_ip_transform_http_status_code_op_diff_second1_mean',
       'system_s

In [None]:
df_train_info['oof']=df_train['oof']

df_train_info[abs(df_train_info['is_risk']-df_train_info['oof'])>0.7]['device_num_transform'].value_counts()

In [None]:
df_train_info[df_train_info['device_num_transform']=='0dV6LzVsv7pW']