In [2]:
import os
import re
import pickle as pkl
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import scorpyo as sp

from null_importance import get_null_importance

from gensim.models import word2vec


# Use fully-qualified option names. The short patterns 'max_rows' / 'max_columns' are
# resolved by regex search and can match more than one registered option in newer
# pandas releases (for example the Styler render options), which raises OptionError.
pd.set_option('display.max_rows', 500, 'display.max_columns', 200)

In [3]:
# Project root. Set the CCF_PROJECT_DIR environment variable to run the notebook on
# another machine; when it is not set, the original author's location is used.
path_project = os.environ.get(
    'CCF_PROJECT_DIR',
    r'/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification',
)

# Directory layout under the project root
path_row_data = os.path.join(path_project, 'row_data')
path_new_data = os.path.join(path_project, 'new_data')
path_results  = os.path.join(path_project, 'results')
path_results_jupyter  = os.path.join(path_results, 'jupyter')

# Input files inside row_data
path_train = os.path.join(path_row_data, 'train.csv')
path_test  = os.path.join(path_row_data, 'evaluation_public.csv')
path_sample_submission = os.path.join(path_row_data, 'submit_example.csv')


# Derived data sets inside new_data
path_new_train = os.path.join(path_new_data, 'train_lightgbm_20221014.csv')
path_new_test  = os.path.join(path_new_data, 'test_lightgbm_20221014.csv')

# Report file inside results
path_output_report = os.path.join(path_results, '01_原始数据探察_20221014.xlsx')

# Label column name (presumably the prediction target -- confirm against the training data).
y_label = "is_risk"

In [4]:
# Load the training file and the public evaluation file.
df_row_train = sp.read_data(path_train)
df_row_val  = sp.read_data(path_test)

# Split every URL on '/' and keep the first piece as `url_sit` and the second piece as
# `url_page`. The `.str` accessor yields NaN instead of raising when a URL is missing
# or contains no '/', and the loop removes the duplicated train/val code.
for _frame in (df_row_train, df_row_val):
    _url_parts = _frame['url'].str.split('/')
    _frame['url_sit'] = _url_parts.str[0]
    _frame['url_page'] = _url_parts.str[1]



# sort_values already returns a new frame, so no explicit copy is required first.
df_train_info = df_row_train.sort_values(by='op_datetime')

In [5]:
# Thresholds (in seconds) above which a gap starts a new group.
DEVICE_GAP_SECONDS = 10
SYSTEM_GAP_SECONDS = 400

# Stack the training and evaluation rows into one frame.
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)

# Authentication timestamp. Parse it before sorting so rows are ordered chronologically
# rather than by string comparison; the stable sort keeps tied timestamps in a
# reproducible order.
df['op_datetime'] = pd.to_datetime(df['op_datetime'])
df = df.sort_values(by='op_datetime', kind='stable')

# Calendar day of each row as a 'YYYY-MM-DD' string.
df['op_days'] = df['op_datetime'].dt.strftime('%Y-%m-%d')

# Per device: seconds between this authentication and the previous one from the same
# device (NaN on a device's first row).
_prev_device_op = df.groupby('device_num_transform')['op_datetime'].shift(1)
df['op_diff_second1'] = (df['op_datetime'] - _prev_device_op).dt.total_seconds()
# np.log does not raise on non-positive input; such rows become -inf / NaN.
df['op_diff_second1_log'] = np.log(df['op_diff_second1'])
df['op_diff_second1_log_log'] = np.log(df['op_diff_second1_log'])

# Per device: running count of gaps longer than DEVICE_GAP_SECONDS (the original comment
# describes this as the customer's n-th login). GroupBy.cumsum keeps the frame's index,
# whereas groupby(...).apply(lambda x: x.cumsum()) can return a group-keyed index that
# no longer aligns with `df` on assignment.
_device_gap_flag = df['op_diff_second1'] > DEVICE_GAP_SECONDS
df['op_times_groups'] = _device_gap_flag.groupby(df['device_num_transform']).cumsum()

# System level: seconds since the previous row of the whole (time-sorted) frame.
df['system_op_diff_second1'] = df['op_datetime'].diff().dt.total_seconds()

df['system_op_diff_second1_log'] = np.log(df['system_op_diff_second1'])
df['system_op_diff_second1_log_log'] = np.log(df['system_op_diff_second1_log'])
# System level: running count of gaps longer than SYSTEM_GAP_SECONDS across all rows.
df['system_op_times_groups'] = (df['system_op_diff_second1'] > SYSTEM_GAP_SECONDS).cumsum()

In [6]:
# Preview the first rows of `df` after the feature columns have been added.
df.head()

Unnamed: 0,id,user_name,department,ip_transform,device_num_transform,browser_version,browser,os_type,os_version,op_datetime,ip_type,http_status_code,op_city,log_system_transform,url,op_month,is_risk,url_sit,url_page,op_days,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups
44477,44477,xiongkai3397,rd,6H1iPLgBB,GCgxrFb69up7,chrome_93,chrome,win,win10,2022-01-07 02:44:29,内网,200,深圳,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,,,,0
45489,45489,zhengguiying7117,rd,0mjaEf4SB,8ftsXFm5I1Ej,safari_13,safari,macos,macos_big_sur_11,2022-01-07 02:54:32,内网,200,成都,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,603.0,6.401917,1.856598,1
45706,45706,yuanjun5870,hr,1Vk2kEa4X,W1Cstajd8x1s,firefox_78,firefox,win,win7,2022-01-07 03:00:56,内网,200,深圳,a5G25puBl9xj,hr.xxx.com/,2022-01,1.0,hr.xxx.com,,2022-01-07,,,,0,384.0,5.950643,1.783499,1
45901,45901,zhoutingting3694,rd,4Wj6uxLx3,H8NAVsdws95G,edge_93,edge,win,win10,2022-01-07 04:29:34,内网,200,杭州,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,5318.0,8.578853,2.1493,2
43827,43827,yanglin6562,sales,eK12oQmm8,GnkVqPSy5nnl,ie_9,ie,win,win10,2022-01-07 05:17:44,内网,200,重庆,sW0whYIx8LFM,work.xxx.com/task,2022-01,1.0,work.xxx.com,task,2022-01-07,,,,0,2890.0,7.969012,2.07556,3


## 生成词向量

In [7]:
# Column whose values are joined into whitespace-separated documents by the next cell.
col = 'ip_transform'

In [8]:

# One document per calendar day: the day's `col` values joined with single spaces.
df_cols = df.groupby(['op_days'])[col].agg(" ".join)
df_cols.head()

op_days
2022-01-07    6H1iPLgBB 0mjaEf4SB 1Vk2kEa4X 4Wj6uxLx3 eK12oQ...
2022-01-08    yS1GaXpf3 4RwkugApp 7mXXPk8zL l5GL3bnl4 eK12oQ...
2022-01-09    edI04wZaT yS1GaXpf3 5eXRfNKRw G8OjKvHJz vqas26...
2022-01-10    3lOkeiuhp 73vwxAwdI DC2YYlNvE 7mXXPk8zL 5Si5Wm...
2022-01-11    H0TKapkPL M1JrVV1od 2GkUZeD9D TUrmuPTg4 VfRbHv...
Name: ip_transform, dtype: object

In [10]:
# Preview the daily documents again (same call as at the end of the previous cell).
df_cols.head()

op_days
2022-01-07    6H1iPLgBB 0mjaEf4SB 1Vk2kEa4X 4Wj6uxLx3 eK12oQ...
2022-01-08    yS1GaXpf3 4RwkugApp 7mXXPk8zL l5GL3bnl4 eK12oQ...
2022-01-09    edI04wZaT yS1GaXpf3 5eXRfNKRw G8OjKvHJz vqas26...
2022-01-10    3lOkeiuhp 73vwxAwdI DC2YYlNvE 7mXXPk8zL 5Si5Wm...
2022-01-11    H0TKapkPL M1JrVV1od 2GkUZeD9D TUrmuPTg4 VfRbHv...
Name: ip_transform, dtype: object

In [11]:
# Replace any missing daily document with an empty string. Rebinding the name instead
# of using inplace=True avoids mutating the object in place and keeps the cell re-runnable.
df_cols = df_cols.fillna('')
df_cols.head()

op_days
2022-01-07    6H1iPLgBB 0mjaEf4SB 1Vk2kEa4X 4Wj6uxLx3 eK12oQ...
2022-01-08    yS1GaXpf3 4RwkugApp 7mXXPk8zL l5GL3bnl4 eK12oQ...
2022-01-09    edI04wZaT yS1GaXpf3 5eXRfNKRw G8OjKvHJz vqas26...
2022-01-10    3lOkeiuhp 73vwxAwdI DC2YYlNvE 7mXXPk8zL 5Si5Wm...
2022-01-11    H0TKapkPL M1JrVV1od 2GkUZeD9D TUrmuPTg4 VfRbHv...
Name: ip_transform, dtype: object

In [12]:
# `df_res` is a second name for the same object as `df_cols`; no copy is made here.
df_res = df_cols
df_res.head()

op_days
2022-01-07    6H1iPLgBB 0mjaEf4SB 1Vk2kEa4X 4Wj6uxLx3 eK12oQ...
2022-01-08    yS1GaXpf3 4RwkugApp 7mXXPk8zL l5GL3bnl4 eK12oQ...
2022-01-09    edI04wZaT yS1GaXpf3 5eXRfNKRw G8OjKvHJz vqas26...
2022-01-10    3lOkeiuhp 73vwxAwdI DC2YYlNvE 7mXXPk8zL 5Si5Wm...
2022-01-11    H0TKapkPL M1JrVV1od 2GkUZeD9D TUrmuPTg4 VfRbHv...
Name: ip_transform, dtype: object

In [13]:
# Number of whitespace-separated tokens in each document, computed with the vectorised
# string accessor instead of a Python-level apply.
df_num = df_res.str.split().str.len()
df_num

op_days
2022-01-07    513
2022-01-08     36
2022-01-09     34
2022-01-10    542
2022-01-11    624
2022-01-12    630
2022-01-13    562
2022-01-14    613
2022-01-15     40
2022-01-16     42
2022-01-17    653
2022-01-18    495
2022-01-19    645
2022-01-20    580
2022-01-21    612
2022-01-22     35
2022-01-23     35
2022-01-24    582
2022-01-25    544
2022-01-26    549
2022-01-27    647
2022-01-28    631
2022-01-29    669
2022-01-30    575
2022-01-31     34
2022-02-01     31
2022-02-02     35
2022-02-03     38
2022-02-04     44
2022-02-05     33
2022-02-06     43
2022-02-07    668
2022-02-08    535
2022-02-09    553
2022-02-10    607
2022-02-11    544
2022-02-12     43
2022-02-13     23
2022-02-14    576
2022-02-15    598
2022-02-16    589
2022-02-17    603
2022-02-18    616
2022-02-19     34
2022-02-20     37
2022-02-21    604
2022-02-22    605
2022-02-23    563
2022-02-24    647
2022-02-25    599
2022-02-26     32
2022-02-27     44
2022-02-28    638
2022-03-01    611
2022-03-02    709
20

In [14]:
# Keep only the documents that contain more than 3 tokens.
has_enough_tokens = df_num > 3
df_res = df_res[has_enough_tokens]
df_res.head()

op_days
2022-01-07    6H1iPLgBB 0mjaEf4SB 1Vk2kEa4X 4Wj6uxLx3 eK12oQ...
2022-01-08    yS1GaXpf3 4RwkugApp 7mXXPk8zL l5GL3bnl4 eK12oQ...
2022-01-09    edI04wZaT yS1GaXpf3 5eXRfNKRw G8OjKvHJz vqas26...
2022-01-10    3lOkeiuhp 73vwxAwdI DC2YYlNvE 7mXXPk8zL 5Si5Wm...
2022-01-11    H0TKapkPL M1JrVV1od 2GkUZeD9D TUrmuPTg4 VfRbHv...
Name: ip_transform, dtype: object

In [15]:
# Write the corpus file: one document per line, without index and without header row.
path_corpus_file = os.path.join(path_new_data, 'corpus.csv')
df_res.to_csv(path_corpus_file, sep='\t', header=False, index=False)

In [16]:
def train_embedding(path_corpus, path_save_models, path_save_txt, col):
    """
    Train a Word2Vec model on a tokenised corpus file and save the model and its vectors.

    :param path_corpus: path of the corpus file; one document per line, tokens separated by whitespace
    :param path_save_models: directory that receives the gensim model file 'models_<col>.model'
    :param path_save_txt: directory that receives the vectors in word2vec text format,
                          'models_<col>_embedding.txt'
    :param col: label used in the two output file names and in the completion message
    :return: None
    """
    # The corpus is written one document per line, so read it line by line.
    # Text8Corpus reads the file as one continuous token stream and ignores line
    # breaks, which merges neighbouring documents into the same training sentence.
    sentences = word2vec.LineSentence(path_corpus)
    # Training (sg=1 selects skip-gram)
    model = word2vec.Word2Vec(sentences, sg=1, vector_size=5, window=12, min_count=1,
                              hs=0,  workers=10, epochs=10)

    # Create the output directories up front so saving does not fail on a fresh project tree.
    os.makedirs(path_save_models, exist_ok=True)
    os.makedirs(path_save_txt, exist_ok=True)

    # Save the full model and a plain-text export of the vectors
    path_embedding_model = os.path.join(path_save_models, 'models_{}.model'.format(str(col)))
    path_embedding_vocab = os.path.join(path_save_txt, 'models_{}_embedding.txt'.format(str(col)))

    model.save(path_embedding_model)
    model.wv.save_word2vec_format(path_embedding_vocab)
    print('词向量训练完成：{}'.format(str(col)))

In [17]:
# Train embeddings on the corpus file written above and save them under new_data.
# NOTE(review): the corpus cells in this notebook build the documents from
# col = 'ip_transform', while the label passed here is 'device_num_transform';
# confirm which name the output files should carry.
embedding_corpus_file = os.path.join(path_new_data, 'corpus.csv')
embedding_models_dir = os.path.join(path_new_data, 'corpus_models')
embedding_txt_dir = os.path.join(path_new_data, 'corpus_txt')
train_embedding(
    path_corpus=embedding_corpus_file,
    path_save_models=embedding_models_dir,
    path_save_txt=embedding_txt_dir,
    col='device_num_transform',
)

词向量训练完成：device_num_transform


In [18]:
# Read the exported vectors back. The path is built from `path_new_data` instead of
# repeating the absolute project path a second time.
path_embedding_txt = os.path.join(
    path_new_data, 'corpus_txt', 'models_device_num_transform_embedding.txt')
# skiprows=1 drops the first line of the word2vec text format (vocabulary size and
# vector size); every following line is a token followed by its vector components.
df_tmp = pd.read_csv(path_embedding_txt, skiprows=1, header=None, sep=' ')
df_tmp

Unnamed: 0,0,1,2,3,4,5
0,u9diCFdYZ,1.191250,0.884346,0.960100,-0.587945,-0.437067
1,w2CfuqTz3,-0.310487,0.538982,1.129704,-1.143607,0.040968
2,YBCE8ld50,0.031544,1.011728,-0.039371,-1.519553,-0.246772
3,5KbVyNsBf,0.134305,0.212648,0.355266,-1.386184,-0.776587
4,FQND8WWo5,0.745319,0.974483,0.725129,-0.819523,-1.404120
...,...,...,...,...,...,...
3105,4XpDhY9N8,0.447430,0.264440,0.598619,-1.016775,-0.057525
3106,b8PyvUL9p,0.446785,0.555997,0.494677,-0.910030,-0.297620
3107,6CzA6Vd7a,0.249200,0.495751,0.629262,-0.913831,-0.121946
3108,6zzVVzNbn,0.279105,0.324233,0.421833,-1.026533,-0.260341


In [None]:
# Build the embedding matrix for this project's vocabulary: rows start as random
# values and are overwritten for every word found in the pre-trained vector file.
# NOTE(review): `word2id`, `config` and `path_row_embedding` are not defined in the
# visible cells of this notebook -- confirm where they are expected to come from.
embedding_dim = int(config['train_test_settings']['embedding_dim'])
embeddings = np.random.rand(len(word2id), embedding_dim)
# The context manager closes the file even if a line fails to parse.
with open(path_row_embedding, 'r', encoding='UTF-8') as f:
    for i, line in enumerate(f):
        if i == 0:  # the first line is treated as a header and always skipped
            continue
        lin = line.strip().split(" ")  # pre-trained vector line: word followed by its values
        if lin[0] in word2id:
            idx = word2id[lin[0]]
            emb = [float(x) for x in lin[1: embedding_dim + 1]]
            embeddings[idx] = np.asarray(emb, dtype="float32")

In [26]:
# Preview the first 10 documents.
# NOTE(review): the stored output of this cell is indexed by device_num_transform and
# holds URL strings, while the cells above build df_res per op_days from `col`;
# the output appears to come from a different kernel state -- re-run to confirm.
df_res.head(10)

device_num_transform
01GOhHDp463z    xxx.com/mail xxx.com/loginAuth xxx.com/github ...
04F4iKnBDqb6    xxx.com/mail wpsdoc.xxx.com/kdocs work.xxx.com...
04MrZwVR5Dh4                                 xxx.com/getLoginType
05DiseR8wyyh                                 xxx.com/getLoginType
05SghMEzzxQQ                                xxx.com/getVerifyCode
09qlLew78XXp                                 xxx.com/getLoginType
0AG7bOroVGrh                                xxx.com/getVerifyCode
0B1EZlyCIjEA                                xxx.com/getVerifyCode
0BJZnGywzJAc                                 xxx.com/getLoginType
0DSr7NTlrmpE                                 xxx.com/getLoginType
dtype: object

In [29]:
# Token counts that are greater than 3.
# NOTE(review): the stored output of this cell is indexed by device_num_transform,
# not by op_days as produced by the cells above -- re-run to confirm.
df_num[df_num>3]

device_num_transform
01GOhHDp463z    410
04F4iKnBDqb6    393
0L5KlIYwu0Yy    423
0bdar6YzF7TR    321
0dV6LzVsv7pW    477
0jQzrW0ob3tw    378
0vwTFOBrbvIs    383
19sK60AbnGgR    422
1BmyW1drMraT    374
1NnRdFzp9NWJ    368
1SRtvCb9DNFr    412
1baNbqxMWcCu    409
1gUBfR9p6HLE    363
1lTglzPf2SLB    423
21DcH4wj4tEf    338
2AECzhJLRR4Y    385
2EmjEhrepKLJ    436
2PQLbdJAk4QW    306
2ZOGB1eOX3vU    419
2pOKemyYTkvI    369
2u1rhQmiknmk    386
36AnzTaMQafK    386
39OE3BHMPdqV    345
3ANg3oEpqSxD    355
3nvH2PHqvZrr    311
3rYjJlAh0Q1x    327
3wDqyLqvVCn1    433
42y4i2PwNyBY    361
44BDMyay4hg1    394
4BWxjoSreaOm    468
4ERrLGUd7rMh    361
4G2Bw0Csw5lz    413
4Jv2QioonsJM    318
4Qy11wvgpGFs    418
4SXmah16O6AH    403
5DmlITfRNR36    445
5Qli6KaoYmgU    356
5SgCho991yJr    388
5nWnCuken0p8    384
5yApVeSk7drn    375
651TrktciC9i    409
6MwEE0egsCb7    300
6NRAoXZogVDX    403
6T3i25megG6k    364
6YuqJIvp6wuX    409
6m7Wp9p29RXI    377
6ypVTOMezZPI    339
6zUqVT1ygOEZ    443
72IsjEv4ThEB    334

In [None]:

    # NOTE(review): scratch fragment repeating two lines from the body of
    # get_corpus(config, col, freq) in this notebook. `df_user_create` is only assigned
    # inside that function, and the leading indentation is an unexpected indent at cell
    # level, so this cell cannot run as written.
    df_res = df_user_create.apply(lambda x: ' '.join(list(filter(None, list(x)))), axis=1)
    df_num = df_res.apply(lambda x: len(x.split()))

In [None]:
# Draft of the per-day / per-device aggregation.
# Fixes: dropna takes `subset=` (it has no `columns=` keyword), the column variable is
# `col` (`cols` is not defined), and the dangling assignment is completed with
# fillna(''), the step that follows unstack() in get_corpus(config, col, freq).
# The filtered rows get their own name so `df` itself is not overwritten.
df_valid = df.dropna(subset=[col])
df_cols = df_valid.groupby(['op_days','device_num_transform'])[col].agg(" ".join)
df_cols = df_cols.unstack()
df_cols = df_cols.fillna('')

In [None]:
def get_corpus(df, col, path_save, freq=0.95):
    """
    Build a corpus for word-vector training and write it to a text file.

    Rows with a missing `col` value are dropped. The remaining values are joined with
    single spaces per (op_days, device_num_transform) pair, and the non-empty pieces of
    each op_days row are joined into one output line.

    :param df: frame containing the columns 'op_days', 'device_num_transform' and `col`
    :param col: name of the string column whose values become the tokens
    :param path_save: path of the text file to write (one line per op_days value,
                      no index, no header)
    :param freq: not used by this function; kept so existing calls stay valid
    :return: None
    """
    # dropna takes `subset=`; the draft's `columns=[cols]` used an invalid keyword and
    # an undefined name.
    df_valid = df.dropna(subset=[col])
    df_cols = df_valid.groupby(['op_days', 'device_num_transform'])[col].agg(" ".join)
    # Rows: op_days, columns: device_num_transform; missing pairs become ''.
    df_cols = df_cols.unstack().fillna('')

    # filter(None, ...) skips the empty cells so no double spaces are produced.
    df_res = df_cols.apply(lambda row: ' '.join(filter(None, row)), axis=1)
    df_res.to_csv(path_save, index=False, header=False, sep='\t')

In [5]:

def get_corpus(config, col, freq=0.95):
    """
    Build the word-vector training corpus for one column and write it to disk.

    Nothing is written when the output file already exists.

    NOTE(review): this notebook also contains a get_corpus(df, col, path_save, freq)
    definition; both share the same name, so the cell executed last is the one in effect.

    :param freq: only used to form the input file name '<col>_corpus_<int(freq*100)>.csv'
                 and the log messages
    :param config: nested mapping; config['path_pipeline_settings'] must provide
                   'path_pipeline_col_click' (input directory) and 'path_col_corpus'
                   (output directory)
    :param col: name of the column to read; also used in the input and output file names
    :return: None
    """
    pipeline_paths = config['path_pipeline_settings']
    freq_tag = str(int(freq * 100))
    path_col = os.path.join(pipeline_paths['path_pipeline_col_click'],
                            '{}_corpus_{}.csv'.format(col, freq_tag))
    path_save = os.path.join(pipeline_paths['path_col_corpus'],
                             '{}_corpus.txt'.format(col))
    if os.path.exists(path_save):
        logger.debug('03 训练语料已存在： {}_corpus_{}.csv'.format(col, freq_tag))
        return

    # r'\N' is read as a missing value; rows with any missing field are dropped.
    df_col = pd.read_csv(path_col, na_values=r'\N', dtype={col: str}, usecols=['time', 'user_id', col])
    df_col = df_col.dropna()

    # Rows: user_id, columns: time; each cell holds that pair's values joined by spaces.
    df_user_create = df_col.groupby(['user_id', 'time'])[col].agg(' '.join).unstack()
    # Keep the time labels 1..91 in order. reindex adds a missing label as an empty
    # column, where selecting with a plain list raises KeyError.
    df_user_create = df_user_create.reindex(columns=list(range(1, 92))).fillna('')

    # One line per user: the non-empty cells joined with single spaces.
    # The original kept a disabled minimum-length filter (more than 7 tokens) here;
    # no length filtering is applied.
    df_res = df_user_create.apply(lambda row: ' '.join(filter(None, row)), axis=1)

    df_res.to_csv(path_save, index=False, header=False, sep='\t')

    logger.debug('03 训练语料已生成： {}_corpus_{}.csv'.format(col, freq_tag))


Unnamed: 0,id,user_name,department,ip_transform,device_num_transform,browser_version,browser,os_type,os_version,op_datetime,ip_type,http_status_code,op_city,log_system_transform,url,op_month,is_risk,url_sit,url_page,op_days,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups
44477,44477,xiongkai3397,rd,6H1iPLgBB,GCgxrFb69up7,chrome_93,chrome,win,win10,2022-01-07 02:44:29,内网,200,深圳,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,,,,0
45489,45489,zhengguiying7117,rd,0mjaEf4SB,8ftsXFm5I1Ej,safari_13,safari,macos,macos_big_sur_11,2022-01-07 02:54:32,内网,200,成都,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,603.0,6.401917,1.856598,1
45706,45706,yuanjun5870,hr,1Vk2kEa4X,W1Cstajd8x1s,firefox_78,firefox,win,win7,2022-01-07 03:00:56,内网,200,深圳,a5G25puBl9xj,hr.xxx.com/,2022-01,1.0,hr.xxx.com,,2022-01-07,,,,0,384.0,5.950643,1.783499,1
45901,45901,zhoutingting3694,rd,4Wj6uxLx3,H8NAVsdws95G,edge_93,edge,win,win10,2022-01-07 04:29:34,内网,200,杭州,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,5318.0,8.578853,2.1493,2
43827,43827,yanglin6562,sales,eK12oQmm8,GnkVqPSy5nnl,ie_9,ie,win,win10,2022-01-07 05:17:44,内网,200,重庆,sW0whYIx8LFM,work.xxx.com/task,2022-01,1.0,work.xxx.com,task,2022-01-07,,,,0,2890.0,7.969012,2.07556,3


In [None]:
# Show `df` as the cell output.
df