In [1]:
import os
import re
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns


import scorpyo as sp
from null_importance import get_null_importance
from time_sequence_feats import get_time_base, get_sequence_statis, get_sequence_groupby_statis

from gensim.models import word2vec

import warnings
warnings.filterwarnings("ignore")

# Raise the notebook display limits to 320 rows / 100 columns.
# The fully-qualified "display." keys are used on purpose: set_option treats its
# key as a pattern, and on newer pandas releases the bare 'max_rows' /
# 'max_columns' patterns also match the 'styler.render.*' options, which makes
# the call fail with OptionError.
pd.set_option('display.max_rows', 320, 'display.max_columns', 100)

In [2]:
# Project root. Set the CCF_PROJECT_DIR environment variable to run the notebook
# from another location; when it is unset the original author's path is used,
# so existing setups keep working unchanged.
path_project = os.environ.get(
    'CCF_PROJECT_DIR',
    r'/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification',
)

# Sub-directories of the project root.
path_row_data = os.path.join(path_project, 'row_data')
path_new_data = os.path.join(path_project, 'new_data')
path_results  = os.path.join(path_project, 'results')
path_results_jupyter  = os.path.join(path_results, 'jupyter')

# Input files inside row_data.
path_train = os.path.join(path_row_data, 'train.csv')
path_test  = os.path.join(path_row_data, 'evaluation_public.csv')
path_sample_submission = os.path.join(path_row_data, 'submit_example.csv')

# Derived train/test tables inside new_data.
path_new_train = os.path.join(path_new_data, 'train_lightgbm_20221014.csv')
path_new_test  = os.path.join(path_new_data, 'test_lightgbm_20221014.csv')

# Report file inside results.
path_output_report = os.path.join(path_results, '01_原始数据探察_20221014.xlsx')

# Name of the target column; rows are later split on whether it is null.
y_label = "is_risk"

In [3]:
# Read the training CSV and the public evaluation CSV, in that order, with
# pandas' default parsing options.
df_row_train, df_row_val = (
    pd.read_csv(csv_path) for csv_path in (path_train, path_test)
)

In [None]:
# Stack train and evaluation rows into one frame and order it by the raw
# op_datetime values (the column itself is left unparsed in this cell).
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)
df = df.sort_values(by='op_datetime')

# Parsed timestamps, kept as a local Series instead of temporary columns.
op_time = pd.to_datetime(df['op_datetime'])

# Per device: seconds since that device's previous row and the row before it.
# The first rows of each device have no predecessor and come out as NaN.
device_times = op_time.groupby(df['device_num_transform'])
df['op_diff_second1'] = (op_time - device_times.shift(1)).dt.total_seconds()
df['op_diff_second2'] = (op_time - device_times.shift(2)).dt.total_seconds()

# Whole system: seconds since the previous row / the row before it in the
# sorted frame, regardless of device.
df['system_op_diff_second1'] = (op_time - op_time.shift(1)).dt.total_seconds()
df['system_op_diff_second2'] = (op_time - op_time.shift(2)).dt.total_seconds()

df.head()

In [None]:
# Rebuild the combined frame from the raw inputs (this replaces whatever `df`
# held before) and order it by op_datetime.
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)
# NOTE(review): the sort runs on the raw, unparsed op_datetime values; confirm
# that their textual order is also chronological.
df = df.sort_values(by='op_datetime')

# Gap thresholds in seconds: a gap larger than this starts a new group.
DEVICE_GAP_SECONDS = 10
SYSTEM_GAP_SECONDS = 400

# Parse the operation timestamp and derive the calendar day as 'YYYY-MM-DD'.
df['op_datetime'] = pd.to_datetime(df['op_datetime'])
df['op_days'] = df['op_datetime'].dt.strftime('%Y-%m-%d')

op_time = df['op_datetime']
device_key = df['device_num_transform']

# Per device: seconds since that device's previous row and the row before it
# (NaN where no such earlier row exists).
device_times = op_time.groupby(device_key)
df['op_diff_second1'] = (op_time - device_times.shift(1)).dt.total_seconds()
df['op_diff_second2'] = (op_time - device_times.shift(2)).dt.total_seconds()

# Per device: running count of gaps longer than DEVICE_GAP_SECONDS, i.e. an
# index of that device's activity bursts.
# GroupBy.cumsum keeps the frame's own index. The previous
# groupby(...).apply(lambda x: x.cumsum()) form returns a group-key-prefixed
# index on pandas 2.x, which cannot be assigned back as a column.
device_gap_flag = (df['op_diff_second1'] > DEVICE_GAP_SECONDS).astype(int)
df['op_times_groups'] = device_gap_flag.groupby(device_key).cumsum()

# Whole system: seconds since the previous row / the row before it in the
# sorted frame, regardless of device.
df['system_op_diff_second1'] = (op_time - op_time.shift(1)).dt.total_seconds()
df['system_op_diff_second2'] = (op_time - op_time.shift(2)).dt.total_seconds()

# Whole system: running count of gaps longer than SYSTEM_GAP_SECONDS.
df['system_op_times_groups'] = (df['system_op_diff_second1'] > SYSTEM_GAP_SECONDS).cumsum()

# No temporary columns were added, so nothing has to be dropped here.
# system_op_diff_second1 is deliberately kept: the sequence-statistics cell
# that follows passes col='system_op_diff_second1' to its helpers, and
# dropping the column here (as this cell used to do) leaves nothing to read.

In [None]:
# Categorical columns handed to the grouped sequence-statistics helper.
cate_cols = [
    'ip_transform', 'user_name', 'device_num_transform', 'department',
    'browser_version', 'browser', 'os_type', 'os_version',
    'ip_type', 'http_status_code', 'op_city', 'log_system_transform', 'url',
]

# Settings shared by both sequence helpers. The helpers come from the
# project-local time_sequence_feats module; the meaning of n/freq is defined
# there (presumably window count and step — confirm against that module).
_seq_settings = dict(col='system_op_diff_second1', n=5, freq=3)

df = get_time_base(df, cols='op_datetime')
df = get_sequence_statis(df, **_seq_settings)
df = get_sequence_groupby_statis(df, cate_cols=cate_cols, **_seq_settings)
# The same two calls for col='system_op_diff_second2' are intentionally disabled:
#df = get_sequence_statis(df, col='system_op_diff_second2', n=5, freq=3 )
#df = get_sequence_groupby_statis(df, col='system_op_diff_second2',cate_cols= cate_cols, n=5, freq=3)
df.head()

In [None]:

# System-level running counters.
# For every time-window column in time_feats, the features below restart inside
# each distinct value of that column and accumulate in row order.
time_feats = ['system_op_times_groups', 'op_days', 'op_month']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url']

# Scratch columns; every name ends in "helper" and is removed at the bottom.
# Constant 1, so its cumulative sum is a running row count.
df['helper'] = 1
# 1 when the status code is one of the listed error codes, else 0.
df['http_status_code_helper'] = df['http_status_code'].isin([400, 500, 502, 404]).astype(int)

for time_col in time_feats:
    by_time = df.groupby([time_col])
    # Rows seen so far inside the current time window.
    df['system_{}_cumsum'.format(time_col)] = by_time['helper'].cumsum()
    # Error-code rows seen so far inside the current time window.
    df['system_{}_error_code_cumsum'.format(time_col)] = by_time['http_status_code_helper'].cumsum()

    for cate_col in cate_feats:
        # Flag the first row of every (time window, category value) pair. Rows
        # with a missing key are never flagged, matching groupby's default of
        # ignoring null keys.
        first_seen = (
            ~df.duplicated([time_col, cate_col])
            & df[[time_col, cate_col]].notna().all(axis=1)
        ).astype(int)
        # Distinct values of cate_col seen so far inside the time window.
        # The flags must be cumulated per time window only: cumulating inside
        # (time window, value) groups, as before, sums exactly one flag per
        # group and yields a constant 1.
        df['system_{}_{}_cumunique'.format(time_col, cate_col)] = first_seen.groupby(df[time_col]).cumsum()

        # Per-value counters are skipped for these three identifier columns.
        if cate_col not in ['ip_transform', 'user_name', 'device_num_transform']:
            for value in df[cate_col].unique():
                # Only rows carrying this value take part; all other rows get
                # NaN when the result is aligned back onto df.
                value_by_time = df[df[cate_col] == value].groupby([time_col])

                # Rows with this value seen so far inside the time window.
                df['system_{}_{}_{}_cumsum'.format(time_col, cate_col, value)] = value_by_time['helper'].cumsum()
                # Error-code rows with this value seen so far inside the time window.
                df['system_{}_{}_{}_error_code_cumsum'.format(time_col, cate_col, value)] = value_by_time['http_status_code_helper'].cumsum()

# Remove the scratch columns again.
remove_cols = [x for x in df.columns if x.endswith('helper')]

df = df.drop(columns=remove_cols)

In [None]:

# Per-entity running counters.
# For every (time window, entity) pair the features below restart inside each
# distinct pair of values and accumulate in row order.
time_feats = ['op_times_groups', 'op_days', 'op_month']

# Entity columns that, together with a time window, define the groups.
object_feats = ['ip_transform', 'user_name', 'device_num_transform']

cate_feats = ['ip_transform', 'user_name', 'device_num_transform', 'department', 'browser_version', 'browser', 'os_type','os_version',
              'ip_type','http_status_code', 'op_city', 'log_system_transform', 'url']

# Scratch columns; every name ends in "helper" and is removed at the bottom.
# Constant 1, so its cumulative sum is a running row count.
df['helper'] = 1
# 1 when the status code is one of the listed error codes, else 0.
df['http_status_code_helper'] = df['http_status_code'].isin([400, 500, 502, 404]).astype(int)


for time_col in time_feats:
    for entity_col in object_feats:
        by_entity = df.groupby([time_col, entity_col])
        # Rows / error-code rows seen so far for this entity in this time window.
        df['{}_{}_cumsum'.format(time_col, entity_col)] = by_entity['helper'].cumsum()
        df['{}_{}_error_code_cumsum'.format(time_col, entity_col)] = by_entity['http_status_code_helper'].cumsum()

        for cate_col in cate_feats:
            if cate_col == entity_col:
                continue
            group_keys = [time_col, entity_col, cate_col]
            # Flag the first row of every (time window, entity, category value)
            # triple. Rows with a missing key are never flagged, matching
            # groupby's default of ignoring null keys.
            first_seen = (
                ~df.duplicated(group_keys)
                & df[group_keys].notna().all(axis=1)
            ).astype(int)
            # Distinct values of cate_col seen so far for this entity in this
            # time window. The flags must be cumulated per (time window, entity):
            # cumulating inside the full triple, as before, sums exactly one
            # flag per group and yields a constant 1.
            df['{}_{}_{}_cumunique'.format(time_col, entity_col, cate_col)] = (
                first_seen.groupby([df[time_col], df[entity_col]]).cumsum()
            )

            # Per-value counters are skipped for the three identifier columns.
            if cate_col not in ['ip_transform', 'user_name', 'device_num_transform']:
                for value in df[cate_col].unique():
                    # Only rows carrying this value take part; all other rows
                    # get NaN when the result is aligned back onto df.
                    value_by_entity = df[df[cate_col] == value].groupby([time_col, entity_col])
                    df['{}_{}_{}_{}_cumsum'.format(time_col, entity_col, cate_col, value)] = value_by_entity['helper'].cumsum()
                    df['{}_{}_{}_{}_error_code_cumsum'.format(time_col, entity_col, cate_col, value)] = value_by_entity['http_status_code_helper'].cumsum()

# Remove the scratch columns again.
remove_cols = [x for x in df.columns if x.endswith('helper')]

df = df.drop(columns=remove_cols)

In [None]:
# Directory holding one embedding text file per categorical column.
path_embeding = os.path.join(path_new_data, 'corpus_txt')

# Key columns brought in by the merges; they duplicate the left-hand keys and
# are dropped again below.
res_cols = []
for col in cate_feats:
    emb_file = os.path.join(path_embeding, 'models_{}_embedding.txt'.format(col))
    key_col = '{}_0'.format(col)

    # Space-separated file without a header row; its first line is skipped.
    # Field 0 of every remaining line is used as the join key.
    df_tmp = pd.read_csv(emb_file, skiprows=1, header=None, sep=' ')
    # Positional column labels 0, 1, 2, ... become '<col>_0', '<col>_1', ...
    df_tmp = df_tmp.rename(columns=lambda position: '{}_{}'.format(col, position))

    # Left join keeps every row of df; values without an embedding get NaN.
    df = df.merge(df_tmp, how='left', left_on=col, right_on=key_col)
    res_cols.append(key_col)


# Drop the raw categorical columns, the timestamp/month columns and the merge keys.
remove_cols = list(cate_feats) + ['op_datetime', 'op_month'] + res_cols

df = df.drop(columns=remove_cols)

In [None]:
# Rows that carry a label form the training set; rows with a null label form
# the test set. Both get a fresh 0..n-1 index.
has_label = df[y_label].notna()
df_train = df.loc[has_label].reset_index(drop=True)
df_test = df.loc[~has_label].reset_index(drop=True)

In [None]:
# Feature-name list 1: time-of-day fields, op_diff_second1 and windowed
# statistics of system_op_diff_second1.
# NOTE(review): presumably a hand-selected subset of the columns produced by
# the time/sequence helper cell — confirm that every name exists in df.
feats1 = [
    'op_datetime_hour',
    'median_user_name_system_op_diff_second1_15',
    'min_department_system_op_diff_second1_15',
    'min_log_system_transform_system_op_diff_second1_3',
    'skew_user_name_system_op_diff_second1_3',
    'median_op_city_system_op_diff_second1_6',
    'max_device_num_transform_system_op_diff_second1_9',
    'median_op_city_system_op_diff_second1_15',
    'skew_ip_transform_system_op_diff_second1_3',
    'min_op_city_system_op_diff_second1_15',
    'max_user_name_system_op_diff_second1_3',
    'median_browser_system_op_diff_second1_15',
    'min_device_num_transform_system_op_diff_second1_6',
    'max_browser_version_system_op_diff_second1_9',
    'median_ip_transform_system_op_diff_second1_12',
    'skew_log_system_transform_system_op_diff_second1_9',
    'min_url_system_op_diff_second1_9',
    'skew_os_type_system_op_diff_second1_6',
    'skew_log_system_transform_system_op_diff_second1_3',
    'min_url_system_op_diff_second1_15',
    'median_device_num_transform_system_op_diff_second1_6',
    'skew_ip_type_system_op_diff_second1_6',
    'min_user_name_system_op_diff_second1_6',
    'max_department_system_op_diff_second1_9',
    'kurt_department_system_op_diff_second1_15',
    'min_browser_system_op_diff_second1_15',
    'median_browser_version_system_op_diff_second1_6',
    'skew_http_status_code_system_op_diff_second1_15',
    'min_department_system_op_diff_second1_6',
    'skew_log_system_transform_system_op_diff_second1_15',
    'median_device_num_transform_system_op_diff_second1_3',
    'median_department_system_op_diff_second1_15',
    'kurt_ip_type_system_op_diff_second1_12',
    'max_device_num_transform_system_op_diff_second1_6',
    'avg_user_name_system_op_diff_second1_9',
    'op_diff_second1',
    'kurt_ip_transform_system_op_diff_second1_6',
    'skew_http_status_code_system_op_diff_second1_3',
    'op_datetime_dayofweek_sin',
    'avg_ip_transform_system_op_diff_second1_6',
    'std_browser_version_system_op_diff_second1_6',
    'kurt_http_status_code_system_op_diff_second1_12',
    'max_log_system_transform_system_op_diff_second1_15',
    'skew_url_system_op_diff_second1_6',
    'kurt_http_status_code_system_op_diff_second1_9',
    'min_op_city_system_op_diff_second1_6',
    'max_ip_transform_system_op_diff_second1_3',
    'skew_department_system_op_diff_second1_9',
    'median_user_name_system_op_diff_second1_9',
    'min_device_num_transform_system_op_diff_second1_15',
    'min_url_system_op_diff_second1_6',
    'op_datetime_hour_cos',
    'min_ip_type_system_op_diff_second1_6',
    'avg_browser_system_op_diff_second1_9',
    'kurt_url_system_op_diff_second1_6',
    'min_http_status_code_system_op_diff_second1_12',
    'avg_ip_type_system_op_diff_second1_3',
    'kurt_http_status_code_system_op_diff_second1_6',
    'min_op_city_system_op_diff_second1_3',
    'kurt_department_system_op_diff_second1_6',
    'avg_os_type_system_op_diff_second1_15',
    'skew_log_system_transform_system_op_diff_second1_6',
    'skew_browser_version_system_op_diff_second1_9',
    'max_op_city_system_op_diff_second1_9',
    'kurt_url_system_op_diff_second1_15',
    'min_browser_system_op_diff_second1_3',
    'skew_department_system_op_diff_second1_12',
    'avg_http_status_code_system_op_diff_second1_15',
    'avg_url_system_op_diff_second1_6',
    'min_ip_type_system_op_diff_second1_15',
]

In [None]:
# Feature-name list 2: cumulative-count columns. The names follow the
# '..._cumsum' / '..._error_code_cumsum' patterns built by the two
# cumulative-feature cells (with and without the 'system_' prefix).
# NOTE(review): presumably a hand-selected subset — confirm that every name
# exists in df.
feats2 = [
    'op_month_ip_transform_browser_version_edge_93_cumsum',
    'system_system_op_times_groups_browser_version_edge_93_cumsum',
    'op_times_groups_device_num_transform_op_city_成都_cumsum',
    'op_month_ip_transform_op_city_成都_cumsum',
    'system_op_days_log_system_transform_nHrKgKdJ1Mzt_cumsum',
    'system_op_days_browser_chrome_cumsum',
    'system_system_op_times_groups_op_city_深圳_cumsum',
    'op_days_user_name_browser_version_edge_93_cumsum',
    'op_times_groups_device_num_transform_url_wpsdoc.xxx.com/download_cumsum',
    'system_op_days_ip_type_内网_error_code_cumsum',
    'op_month_user_name_http_status_code_200_error_code_cumsum',
    'system_op_days_os_version_win10_error_code_cumsum',
    'op_times_groups_device_num_transform_department_rd_cumsum',
    'op_days_ip_transform_department_sales_cumsum',
    'op_month_device_num_transform_browser_version_chrome_90_error_code_cumsum',
    'op_month_ip_transform_op_city_深圳_cumsum',
    'op_times_groups_device_num_transform_browser_edge_cumsum',
    'op_month_ip_transform_op_city_北京_cumsum',
    'op_month_device_num_transform_op_city_北京_error_code_cumsum',
    'system_op_days_ip_type_内网_cumsum',
    'op_times_groups_user_name_browser_chrome_cumsum',
    'op_month_user_name_op_city_杭州_error_code_cumsum',
    'op_month_device_num_transform_cumsum',
    'op_days_ip_transform_cumsum',
    'op_days_user_name_ip_type_内网_cumsum',
    'op_month_ip_transform_cumsum',
    'system_op_days_op_city_杭州_cumsum',
    'op_month_user_name_op_city_深圳_error_code_cumsum',
    'op_days_device_num_transform_op_city_成都_cumsum',
    'system_system_op_times_groups_log_system_transform_nHrKgKdJ1Mzt_cumsum',
    'op_days_ip_transform_url_xxx.com/github_cumsum',
    'op_month_device_num_transform_ip_type_内网_error_code_cumsum',
    'op_days_user_name_department_rd_error_code_cumsum',
    'op_days_ip_transform_op_city_北京_cumsum',
    'op_days_device_num_transform_op_city_杭州_cumsum',
    'system_system_op_times_groups_op_city_杭州_cumsum',
    'system_op_days_op_city_北京_cumsum',
    'system_system_op_times_groups_browser_chrome_cumsum',
    'system_system_op_times_groups_http_status_code_200_cumsum',
    'system_op_days_op_city_深圳_cumsum',
    'op_days_user_name_browser_chrome_cumsum',
    'op_month_ip_transform_ip_type_内网_cumsum',
    'system_system_op_times_groups_department_rd_error_code_cumsum',
    'op_days_device_num_transform_op_city_深圳_cumsum',
    'op_days_ip_transform_http_status_code_200_cumsum',
]

In [None]:
# Feature-name list 3: embedding columns, named '<column>_<index>' exactly as
# the embedding-merge cell labels them. For each categorical column only the
# listed indices are used; index 0 is the merge key and never appears.
# NOTE(review): presumably a hand-selected subset — confirm that every name
# exists in df.
_kept_embedding_dims = [
    ('ip_transform', (2, 3, 4, 5)),
    ('user_name', (1, 3, 4)),
    ('device_num_transform', (1, 2, 3, 5)),
    ('department', (1, 2, 3, 4, 5)),
    ('browser_version', (2, 3, 5)),
    ('browser', (1, 3)),
    ('os_version', (1, 2, 3, 4, 5)),
    ('op_city', (3, 4)),
    ('log_system_transform', (1,)),
    ('url', (1, 2, 4, 5)),
]

feats3 = [
    '{}_{}'.format(column_name, dim)
    for column_name, dims in _kept_embedding_dims
    for dim in dims
]