In [1]:
import os
import re
import pickle as pkl
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import scorpyo as sp

from null_importance import get_null_importance

from gensim.models import word2vec


# Show up to 500 rows / 200 columns when a frame is displayed.
# Fully qualified keys are required: the short patterns 'max_rows' / 'max_columns'
# also match 'styler.render.max_rows' / 'styler.render.max_columns' on newer pandas
# and make set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 500, 'display.max_columns', 200)

In [2]:
# Root folder of the project; every other path below is derived from it.
path_project = r'/Users/liliangshan/workspace/python/01_datasets/ccf_system_access_risk_identification'

# Sub-directories of the project root.
path_row_data, path_new_data, path_results = (
    os.path.join(path_project, folder)
    for folder in ('row_data', 'new_data', 'results')
)
path_results_jupyter = os.path.join(path_results, 'jupyter')

# Files located in the 'row_data' folder.
path_train, path_test, path_sample_submission = (
    os.path.join(path_row_data, file_name)
    for file_name in ('train.csv', 'evaluation_public.csv', 'submit_example.csv')
)

# Files located in the 'new_data' folder.
path_new_train, path_new_test = (
    os.path.join(path_new_data, file_name)
    for file_name in ('train_lightgbm_20221014.csv', 'test_lightgbm_20221014.csv')
)

# Report workbook located in the results folder.
path_output_report = os.path.join(path_results, '01_原始数据探察_20221014.xlsx')

# Presumably the name of the target column -- TODO confirm against the training data.
y_label = "is_risk"

In [3]:
# Read the train and evaluation tables with scorpyo's reader.
df_row_train = sp.read_data(path_train)
df_row_val = sp.read_data(path_test)

# Split every 'url' on '/' and keep the first two pieces as their own columns:
# 'url_sit' is piece 0 and 'url_page' is piece 1.
# A url without any '/' raises IndexError here, exactly as before.
for frame in (df_row_train, df_row_val):
    url_pieces = frame['url'].map(lambda url: url.split('/'))
    frame['url_sit'] = url_pieces.map(lambda pieces: pieces[0])
    frame['url_page'] = url_pieces.map(lambda pieces: pieces[1])

# Independent copy of the training table ordered by 'op_datetime'.
df_train_info = df_row_train.sort_values(by='op_datetime').copy()

In [4]:
# Stack the train and evaluation rows so the time-gap features below are computed
# over both tables at once, then order the rows by 'op_datetime'.
# NOTE(review): the sort runs on the raw column before it is parsed to datetime, so
# the order is chronological only if the raw values already sort in time order -- confirm.
df = pd.concat([df_row_train, df_row_val]).reset_index(drop=True)
df = df.sort_values(by='op_datetime')

# Authentication timestamp as a datetime column.
df['op_datetime'] = pd.to_datetime(df['op_datetime'])
# Calendar day of each record, formatted as 'YYYY-MM-DD'.
df['op_days'] = df['op_datetime'].dt.strftime('%Y-%m-%d')

# Per device: seconds elapsed since the same device's previous record
# (NaN for the first record of each device).
device_prev_datetime = df.groupby('device_num_transform')['op_datetime'].shift(1)
df['op_diff_second1'] = (df['op_datetime'] - device_prev_datetime).dt.total_seconds()
# Log and double-log of the gap; a gap of 0 gives -inf and a negative log gives NaN.
df['op_diff_second1_log'] = np.log(df['op_diff_second1'])
df['op_diff_second1_log_log'] = np.log(df['op_diff_second1_log'])

# Per-device group counter: it increases by one every time the gap to the previous
# record of that device exceeds 10 seconds.
# GroupBy.cumsum keeps the original row index. The former
# groupby(...).apply(lambda x: x.cumsum()) returns a group-keyed MultiIndex on
# pandas >= 2.0, which cannot be assigned back as a column.
device_gap_flag = (df['op_diff_second1'] > 10).astype(int)
df['op_times_groups'] = device_gap_flag.groupby(df['device_num_transform']).cumsum()

# System level: seconds elapsed since the previous record of any device.
system_prev_datetime = df['op_datetime'].shift(1)
df['system_op_diff_second1'] = (df['op_datetime'] - system_prev_datetime).dt.total_seconds()

df['system_op_diff_second1_log'] = np.log(df['system_op_diff_second1'])
df['system_op_diff_second1_log_log'] = np.log(df['system_op_diff_second1_log'])
# System-wide group counter: it increases by one whenever more than 400 seconds
# passed since the previous record.
df['system_op_times_groups'] = (df['system_op_diff_second1'] > 400).cumsum()

# The shifted timestamps and boolean flags live in local variables, so no temporary
# helper columns have to be dropped from df afterwards.

## 生成词向量

In [5]:
# Column whose values are grouped and joined into text in the following cells.
col = 'url'

In [6]:

# For every (device, day) pair, join the values of `col` into one space-separated string.
df_cols = (
    df.groupby(['device_num_transform', 'op_days'])[col]
      .agg(lambda values: " ".join(values))
)
df_cols.head()

device_num_transform  op_days   
01GOhHDp463z          2022-01-07    xxx.com/mail xxx.com/loginAuth xxx.com/github ...
                      2022-01-10                                       xxx.com/github
                      2022-01-11                                       xxx.com/github
                      2022-01-12                                         xxx.com/mail
                      2022-01-13    xxx.com/github xxx.com/loginAuth xxx.com/githu...
Name: url, dtype: object

In [7]:
# Pivot the innermost index level ('op_days') into columns: one row per device,
# one column per day, NaN where a device has no record on that day.
df_cols = df_cols.unstack(level='op_days')

In [8]:
# Preview the first rows of the device-by-day table.
df_cols.head()

op_days,2022-01-07,2022-01-08,2022-01-09,2022-01-10,2022-01-11,2022-01-12,2022-01-13,2022-01-14,2022-01-15,2022-01-16,2022-01-17,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-22,2022-01-23,2022-01-24,2022-01-25,2022-01-26,2022-01-27,2022-01-28,2022-01-29,2022-01-30,2022-01-31,2022-02-01,2022-02-02,2022-02-03,2022-02-04,2022-02-05,2022-02-06,2022-02-07,2022-02-08,2022-02-09,2022-02-10,2022-02-11,2022-02-12,2022-02-13,2022-02-14,2022-02-15,2022-02-16,2022-02-17,2022-02-18,2022-02-19,2022-02-20,2022-02-21,2022-02-22,2022-02-23,2022-02-24,2022-02-25,2022-02-26,2022-02-27,2022-02-28,2022-03-01,2022-03-02,2022-03-03,2022-03-04,2022-03-05,2022-03-06,2022-03-07,2022-03-08,2022-03-09,2022-03-10,2022-03-11,2022-03-12,2022-03-13,2022-03-14,2022-03-15,2022-03-16,2022-03-17,2022-03-18,2022-03-19,2022-03-20,2022-03-21,2022-03-22,2022-03-23,2022-03-24,2022-03-25,2022-03-26,2022-03-27,2022-03-28,2022-03-29,2022-03-30,2022-03-31,2022-04-01,2022-04-02,2022-04-03,2022-04-04,2022-04-05,2022-04-06,2022-04-07,2022-04-08,2022-04-09,2022-04-10,2022-04-11,2022-04-12,2022-04-13,2022-04-14,2022-04-15,2022-04-16,2022-04-17,2022-04-18,2022-04-19,2022-04-20,2022-04-21,2022-04-22,2022-04-23,2022-04-24,2022-04-25,2022-04-26,2022-04-27,2022-04-28,2022-04-29,2022-04-30,2022-05-01,2022-05-02,2022-05-03,2022-05-04,2022-05-05,2022-05-06,2022-05-07,2022-05-08,2022-05-09,2022-05-10,2022-05-11,2022-05-12,2022-05-13,2022-05-14,2022-05-15,2022-05-16,2022-05-17,2022-05-18,2022-05-19,2022-05-20,2022-05-21,2022-05-22,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27,2022-05-28,2022-05-29,2022-05-30,2022-05-31,2022-06-01,2022-06-02,2022-06-03,2022-06-04,2022-06-05,2022-06-06,2022-06-07,2022-06-08,2022-06-09,2022-06-10,2022-06-11,2022-06-12,2022-06-13,2022-06-14,2022-06-15,2022-06-16,2022-06-17,2022-06-18,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28,2022-06-29,2022-06-30
device_num_transform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 
99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1
01GOhHDp463z,xxx.com/mail xxx.com/loginAuth xxx.com/github ...,,,xxx.com/github,xxx.com/github,xxx.com/mail,xxx.com/github xxx.com/loginAuth xxx.com/githu...,xxx.com/oa,,xxx.com/github xxx.com/github,xxx.com/github xxx.com/oa,xxx.com/mail wpsdoc.xxx.com/kdocs,xxx.com/github xxx.com/mail work.xxx.com/task ...,xxx.com/mail work.xxx.com/task xxx.com/github,xxx.com/github,,xxx.com/github,work.xxx.com/task xxx.com/github,xxx.com/mail work.xxx.com/task,,work.xxx.com/task xxx.com/github work.xxx.com/...,xxx.com/github xxx.com/mail xxx.com/mail,xxx.com/loginAuth xxx.com/mail xxx.com/loginAu...,xxx.com/github xxx.com/mail xxx.com/mail,,xxx.com/mail,,,,,xxx.com/github,,xxx.com/mail xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/mail xxx.com/github xxx.com/github xxx...,xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/gi...,,,xxx.com/github,,xxx.com/mail,xxx.com/github wpsdoc.xxx.com/kdocs wpsdoc.xxx...,work.xxx.com/task,,wpsdoc.xxx.com/kdocs,work.xxx.com/task xxx.com/github wpsdoc.xxx.co...,xxx.com/mail work.xxx.com/task,work.xxx.com/task xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github,,,,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/github,work.xxx.com/task,xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/oa xxx.com/loginAuth xx...,xxx.com/github,,xxx.com/github,,xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/mail xxx.com/github xxx...,work.xxx.com/task,,,xxx.com/github,,xxx.com/github xxx.com/mail xxx.com/loginAuth ...,xxx.com/github work.xxx.com/task work.xxx.com/...,xxx.com/loginAuth xxx.com/mail xxx.com/oa work...,work.xxx.com/task,,work.xxx.com/task work.xxx.com/task xxx.com/gi...,xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/mail xxx.com/github,xxx.com/github work.xxx.com/task work.xxx.com/...,xxx.com/github xxx.com/github,,,xxx.com/github xxx.com/github work.xxx.com/tas...,xxx.com/github,xxx.com/mail work.xxx.com/task,,xxx.com/github,xxx.com/github work.xxx.com/task 
xxx.com/mail,,,,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/kdocs,work.xxx.com/task xxx.com/github,xxx.com/mail wpsdoc.xxx.com/kdocs wpsdoc.xxx.c...,,,xxx.com/oa xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,work.xxx.com/task xxx.com/mail xxx.com/github ...,xxx.com/mail xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,,,xxx.com/github work.xxx.com/task xxx.com/githu...,xxx.com/oa,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github,xxx.com/mail xxx.com/github xxx.com/github xxx...,,xxx.com/github xxx.com/github xxx.com/oa,xxx.com/github xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/github,xxx.com/oa xxx.com/github work.xxx.com/task xx...,,xxx.com/oa,,wpsdoc.xxx.com/kdocs,,xxx.com/github,xxx.com/mail,work.xxx.com/task,,xxx.com/github xxx.com/github,xxx.com/github xxx.com/github,xxx.com/mail xxx.com/github xxx.com/github xxx...,work.xxx.com/task xxx.com/github xxx.com/github,xxx.com/github,,,xxx.com/mail xxx.com/github xxx.com/oa xxx.com...,xxx.com/oa xxx.com/github xxx.com/github,,xxx.com/github wpsdoc.xxx.com/kdocs,xxx.com/loginAuth xxx.com/github xxx.com/mail ...,,,xxx.com/mail xxx.com/mail xxx.com/github work....,xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github xxx.com/github w...,xxx.com/github xxx.com/github xxx.com/loginAut...,xxx.com/mail xxx.com/mail,,,xxx.com/mail xxx.com/github work.xxx.com/task ...,xxx.com/github xxx.com/oa,xxx.com/oa xxx.com/loginAuth work.xxx.com/task,xxx.com/github xxx.com/mail xxx.com/github,xxx.com/github,xxx.com/github,,xxx.com/github work.xxx.com/task xxx.com/github,xxx.com/github xxx.com/github work.xxx.com/tas...,xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/lo...,xxx.com/github work.xxx.com/task,work.xxx.com/task wpsdoc.xxx.com/kdocs wpsdoc....,,,xxx.com/mail xxx.com/github,xxx.com/github xxx.com/github,xxx.com/github work.xxx.com/task xxx.com/mail ...,,xxx.com/github work.xxx.com/task 
xxx.com/githu...,,xxx.com/github,xxx.com/oa xxx.com/github xxx.com/oa,xxx.com/mail,xxx.com/mail xxx.com/github,xxx.com/oa xxx.com/github xxx.com/github xxx.c...,xxx.com/oa xxx.com/github xxx.com/github xxx.c...,xxx.com/oa,,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/loginAuth xxx.com/mail xxx.com/github ...,,xxx.com/github
04F4iKnBDqb6,xxx.com/mail wpsdoc.xxx.com/kdocs work.xxx.com...,,,,xxx.com/loginAuth xxx.com/github,work.xxx.com/task xxx.com/github xxx.com/login...,xxx.com/loginAuth work.xxx.com/task,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github,wpsdoc.xxx.com/kdocs,xxx.com/loginAuth wpsdoc.xxx.com/kdocs,wpsdoc.xxx.com/kdocs xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/kdocs xxx....,xxx.com/github xxx.com/github xxx.com/loginAut...,xxx.com/loginAuth xxx.com/github xxx.com/github,,,wpsdoc.xxx.com/kdocs xxx.com/github xxx.com/oa...,xxx.com/github xxx.com/mail,,xxx.com/mail xxx.com/github xxx.com/github xxx...,xxx.com/github,xxx.com/github wpsdoc.xxx.com/kdocs wpsdoc.xxx...,xxx.com/github wpsdoc.xxx.com/kdocs wpsdoc.xxx...,,,,,,,,wpsdoc.xxx.com/kdocs,xxx.com/mail xxx.com/loginAuth work.xxx.com/ta...,xxx.com/github work.xxx.com/task,xxx.com/github work.xxx.com/task xxx.com/mail ...,,,,xxx.com/github xxx.com/github xxx.com/mail xxx...,work.xxx.com/task xxx.com/github wpsdoc.xxx.co...,xxx.com/github wpsdoc.xxx.com/kdocs,xxx.com/github,,,,xxx.com/github xxx.com/mail wpsdoc.xxx.com/kdo...,xxx.com/github xxx.com/oa,xxx.com/github xxx.com/mail xxx.com/github xxx...,xxx.com/github xxx.com/loginAuth xxx.com/github,,,,xxx.com/oa xxx.com/github work.xxx.com/task wo...,xxx.com/github xxx.com/mail xxx.com/mail work....,xxx.com/mail xxx.com/loginAuth work.xxx.com/ta...,xxx.com/oa xxx.com/oa xxx.com/github xxx.com/g...,xxx.com/mail xxx.com/mail,,,,wpsdoc.xxx.com/kdocs xxx.com/github,xxx.com/github xxx.com/mail xxx.com/mail,xxx.com/github xxx.com/loginAuth xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,,,xxx.com/mail xxx.com/github,xxx.com/mail xxx.com/github xxx.com/github xxx...,work.xxx.com/task work.xxx.com/task xxx.com/oa...,xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/gi...,xxx.com/loginAuth work.xxx.com/task,,,xxx.com/github,wpsdoc.xxx.com/kdocs xxx.com/mail wpsdoc.xxx.c...,xxx.com/github xxx.com/github xxx.com/mail xxx...,xxx.com/mail xxx.com/mail 
xxx.com/github xxx.c...,xxx.com/github,,xxx.com/github,work.xxx.com/task xxx.com/github,xxx.com/github wpsdoc.xxx.com/kdocs,xxx.com/loginAuth xxx.com/github xxx.com/githu...,xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/mail xxx.com/mail xxx.com/github,xxx.com/oa,,,wpsdoc.xxx.com/kdocs xxx.com/github work.xxx.c...,xxx.com/oa xxx.com/loginAuth xxx.com/github xx...,wpsdoc.xxx.com/kdocs xxx.com/github xxx.com/mail,,,xxx.com/github,work.xxx.com/task xxx.com/oa,xxx.com/github,xxx.com/github xxx.com/github xxx.com/github,xxx.com/loginAuth work.xxx.com/task,,xxx.com/github,xxx.com/loginAuth wpsdoc.xxx.com/kdocs xxx.com...,wpsdoc.xxx.com/kdocs xxx.com/github xxx.com/ma...,wpsdoc.xxx.com/kdocs,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/github xxx.com/github,,xxx.com/github,,xxx.com/github xxx.com/github,xxx.com/mail xxx.com/github,xxx.com/mail work.xxx.com/task xxx.com/mail xx...,,,,,,,,xxx.com/mail xxx.com/github xxx.com/github xxx...,xxx.com/github work.xxx.com/task xxx.com/oa,,xxx.com/github,xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs xxx.com/github work.xxx.c...,xxx.com/loginAuth xxx.com/github xxx.com/github,work.xxx.com/task wpsdoc.xxx.com/kdocs wpsdoc....,,,xxx.com/github,xxx.com/loginAuth xxx.com/github xxx.com/github,xxx.com/github xxx.com/github,xxx.com/oa,xxx.com/github xxx.com/github xxx.com/github,,,,xxx.com/github work.xxx.com/task xxx.com/githu...,xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github,work.xxx.com/task xxx.com/github,,,,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github xxx.com/oa,xxx.com/github work.xxx.com/task,,,,xxx.com/github,xxx.com/github xxx.com/github xxx.com/github,xxx.com/mail xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/github,,,,,xxx.com/github xxx.com/github xxx.com/mail,work.xxx.com/task,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github xxx.com/github,xxx.com/oa xxx.com/github,,,xxx.com/github xxx.com/oa,xxx.com/github 
xxx.com/github,xxx.com/github,xxx.com/github,,,xxx.com/github,wpsdoc.xxx.com/kdocs,xxx.com/github xxx.com/loginAuth xxx.com/mail ...,xxx.com/mail xxx.com/github xxx.com/oa xxx.com...
04MrZwVR5Dh4,,,,,,,,,,,,,,,xxx.com/getLoginType,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
05DiseR8wyyh,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,xxx.com/getLoginType,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
05SghMEzzxQQ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,xxx.com/getVerifyCode,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
# Replace missing device/day cells with empty strings so every cell is a string.
df_cols = df_cols.fillna('')
df_cols.head()

op_days,2022-01-07,2022-01-08,2022-01-09,2022-01-10,2022-01-11,2022-01-12,2022-01-13,2022-01-14,2022-01-15,2022-01-16,2022-01-17,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-22,2022-01-23,2022-01-24,2022-01-25,2022-01-26,2022-01-27,2022-01-28,2022-01-29,2022-01-30,2022-01-31,2022-02-01,2022-02-02,2022-02-03,2022-02-04,2022-02-05,2022-02-06,2022-02-07,2022-02-08,2022-02-09,2022-02-10,2022-02-11,2022-02-12,2022-02-13,2022-02-14,2022-02-15,2022-02-16,2022-02-17,2022-02-18,2022-02-19,2022-02-20,2022-02-21,2022-02-22,2022-02-23,2022-02-24,2022-02-25,2022-02-26,2022-02-27,2022-02-28,2022-03-01,2022-03-02,2022-03-03,2022-03-04,2022-03-05,2022-03-06,2022-03-07,2022-03-08,2022-03-09,2022-03-10,2022-03-11,2022-03-12,2022-03-13,2022-03-14,2022-03-15,2022-03-16,2022-03-17,2022-03-18,2022-03-19,2022-03-20,2022-03-21,2022-03-22,2022-03-23,2022-03-24,2022-03-25,2022-03-26,2022-03-27,2022-03-28,2022-03-29,2022-03-30,2022-03-31,2022-04-01,2022-04-02,2022-04-03,2022-04-04,2022-04-05,2022-04-06,2022-04-07,2022-04-08,2022-04-09,2022-04-10,2022-04-11,2022-04-12,2022-04-13,2022-04-14,2022-04-15,2022-04-16,2022-04-17,2022-04-18,2022-04-19,2022-04-20,2022-04-21,2022-04-22,2022-04-23,2022-04-24,2022-04-25,2022-04-26,2022-04-27,2022-04-28,2022-04-29,2022-04-30,2022-05-01,2022-05-02,2022-05-03,2022-05-04,2022-05-05,2022-05-06,2022-05-07,2022-05-08,2022-05-09,2022-05-10,2022-05-11,2022-05-12,2022-05-13,2022-05-14,2022-05-15,2022-05-16,2022-05-17,2022-05-18,2022-05-19,2022-05-20,2022-05-21,2022-05-22,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27,2022-05-28,2022-05-29,2022-05-30,2022-05-31,2022-06-01,2022-06-02,2022-06-03,2022-06-04,2022-06-05,2022-06-06,2022-06-07,2022-06-08,2022-06-09,2022-06-10,2022-06-11,2022-06-12,2022-06-13,2022-06-14,2022-06-15,2022-06-16,2022-06-17,2022-06-18,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28,2022-06-29,2022-06-30
device_num_transform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 
99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1
01GOhHDp463z,xxx.com/mail xxx.com/loginAuth xxx.com/github ...,,,xxx.com/github,xxx.com/github,xxx.com/mail,xxx.com/github xxx.com/loginAuth xxx.com/githu...,xxx.com/oa,,xxx.com/github xxx.com/github,xxx.com/github xxx.com/oa,xxx.com/mail wpsdoc.xxx.com/kdocs,xxx.com/github xxx.com/mail work.xxx.com/task ...,xxx.com/mail work.xxx.com/task xxx.com/github,xxx.com/github,,xxx.com/github,work.xxx.com/task xxx.com/github,xxx.com/mail work.xxx.com/task,,work.xxx.com/task xxx.com/github work.xxx.com/...,xxx.com/github xxx.com/mail xxx.com/mail,xxx.com/loginAuth xxx.com/mail xxx.com/loginAu...,xxx.com/github xxx.com/mail xxx.com/mail,,xxx.com/mail,,,,,xxx.com/github,,xxx.com/mail xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/mail xxx.com/github xxx.com/github xxx...,xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/gi...,,,xxx.com/github,,xxx.com/mail,xxx.com/github wpsdoc.xxx.com/kdocs wpsdoc.xxx...,work.xxx.com/task,,wpsdoc.xxx.com/kdocs,work.xxx.com/task xxx.com/github wpsdoc.xxx.co...,xxx.com/mail work.xxx.com/task,work.xxx.com/task xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github,,,,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/github,work.xxx.com/task,xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/oa xxx.com/loginAuth xx...,xxx.com/github,,xxx.com/github,,xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/mail xxx.com/github xxx...,work.xxx.com/task,,,xxx.com/github,,xxx.com/github xxx.com/mail xxx.com/loginAuth ...,xxx.com/github work.xxx.com/task work.xxx.com/...,xxx.com/loginAuth xxx.com/mail xxx.com/oa work...,work.xxx.com/task,,work.xxx.com/task work.xxx.com/task xxx.com/gi...,xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/mail xxx.com/github,xxx.com/github work.xxx.com/task work.xxx.com/...,xxx.com/github xxx.com/github,,,xxx.com/github xxx.com/github work.xxx.com/tas...,xxx.com/github,xxx.com/mail work.xxx.com/task,,xxx.com/github,xxx.com/github work.xxx.com/task 
xxx.com/mail,,,,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/kdocs,work.xxx.com/task xxx.com/github,xxx.com/mail wpsdoc.xxx.com/kdocs wpsdoc.xxx.c...,,,xxx.com/oa xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,work.xxx.com/task xxx.com/mail xxx.com/github ...,xxx.com/mail xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,,,xxx.com/github work.xxx.com/task xxx.com/githu...,xxx.com/oa,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github,xxx.com/mail xxx.com/github xxx.com/github xxx...,,xxx.com/github xxx.com/github xxx.com/oa,xxx.com/github xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/github,xxx.com/oa xxx.com/github work.xxx.com/task xx...,,xxx.com/oa,,wpsdoc.xxx.com/kdocs,,xxx.com/github,xxx.com/mail,work.xxx.com/task,,xxx.com/github xxx.com/github,xxx.com/github xxx.com/github,xxx.com/mail xxx.com/github xxx.com/github xxx...,work.xxx.com/task xxx.com/github xxx.com/github,xxx.com/github,,,xxx.com/mail xxx.com/github xxx.com/oa xxx.com...,xxx.com/oa xxx.com/github xxx.com/github,,xxx.com/github wpsdoc.xxx.com/kdocs,xxx.com/loginAuth xxx.com/github xxx.com/mail ...,,,xxx.com/mail xxx.com/mail xxx.com/github work....,xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github xxx.com/github w...,xxx.com/github xxx.com/github xxx.com/loginAut...,xxx.com/mail xxx.com/mail,,,xxx.com/mail xxx.com/github work.xxx.com/task ...,xxx.com/github xxx.com/oa,xxx.com/oa xxx.com/loginAuth work.xxx.com/task,xxx.com/github xxx.com/mail xxx.com/github,xxx.com/github,xxx.com/github,,xxx.com/github work.xxx.com/task xxx.com/github,xxx.com/github xxx.com/github work.xxx.com/tas...,xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/lo...,xxx.com/github work.xxx.com/task,work.xxx.com/task wpsdoc.xxx.com/kdocs wpsdoc....,,,xxx.com/mail xxx.com/github,xxx.com/github xxx.com/github,xxx.com/github work.xxx.com/task xxx.com/mail ...,,xxx.com/github work.xxx.com/task 
xxx.com/githu...,,xxx.com/github,xxx.com/oa xxx.com/github xxx.com/oa,xxx.com/mail,xxx.com/mail xxx.com/github,xxx.com/oa xxx.com/github xxx.com/github xxx.c...,xxx.com/oa xxx.com/github xxx.com/github xxx.c...,xxx.com/oa,,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/loginAuth xxx.com/mail xxx.com/github ...,,xxx.com/github
04F4iKnBDqb6,xxx.com/mail wpsdoc.xxx.com/kdocs work.xxx.com...,,,,xxx.com/loginAuth xxx.com/github,work.xxx.com/task xxx.com/github xxx.com/login...,xxx.com/loginAuth work.xxx.com/task,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github,wpsdoc.xxx.com/kdocs,xxx.com/loginAuth wpsdoc.xxx.com/kdocs,wpsdoc.xxx.com/kdocs xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/kdocs xxx....,xxx.com/github xxx.com/github xxx.com/loginAut...,xxx.com/loginAuth xxx.com/github xxx.com/github,,,wpsdoc.xxx.com/kdocs xxx.com/github xxx.com/oa...,xxx.com/github xxx.com/mail,,xxx.com/mail xxx.com/github xxx.com/github xxx...,xxx.com/github,xxx.com/github wpsdoc.xxx.com/kdocs wpsdoc.xxx...,xxx.com/github wpsdoc.xxx.com/kdocs wpsdoc.xxx...,,,,,,,,wpsdoc.xxx.com/kdocs,xxx.com/mail xxx.com/loginAuth work.xxx.com/ta...,xxx.com/github work.xxx.com/task,xxx.com/github work.xxx.com/task xxx.com/mail ...,,,,xxx.com/github xxx.com/github xxx.com/mail xxx...,work.xxx.com/task xxx.com/github wpsdoc.xxx.co...,xxx.com/github wpsdoc.xxx.com/kdocs,xxx.com/github,,,,xxx.com/github xxx.com/mail wpsdoc.xxx.com/kdo...,xxx.com/github xxx.com/oa,xxx.com/github xxx.com/mail xxx.com/github xxx...,xxx.com/github xxx.com/loginAuth xxx.com/github,,,,xxx.com/oa xxx.com/github work.xxx.com/task wo...,xxx.com/github xxx.com/mail xxx.com/mail work....,xxx.com/mail xxx.com/loginAuth work.xxx.com/ta...,xxx.com/oa xxx.com/oa xxx.com/github xxx.com/g...,xxx.com/mail xxx.com/mail,,,,wpsdoc.xxx.com/kdocs xxx.com/github,xxx.com/github xxx.com/mail xxx.com/mail,xxx.com/github xxx.com/loginAuth xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,,,xxx.com/mail xxx.com/github,xxx.com/mail xxx.com/github xxx.com/github xxx...,work.xxx.com/task work.xxx.com/task xxx.com/oa...,xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/gi...,xxx.com/loginAuth work.xxx.com/task,,,xxx.com/github,wpsdoc.xxx.com/kdocs xxx.com/mail wpsdoc.xxx.c...,xxx.com/github xxx.com/github xxx.com/mail xxx...,xxx.com/mail xxx.com/mail 
xxx.com/github xxx.c...,xxx.com/github,,xxx.com/github,work.xxx.com/task xxx.com/github,xxx.com/github wpsdoc.xxx.com/kdocs,xxx.com/loginAuth xxx.com/github xxx.com/githu...,xxx.com/github,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/mail xxx.com/mail xxx.com/github,xxx.com/oa,,,wpsdoc.xxx.com/kdocs xxx.com/github work.xxx.c...,xxx.com/oa xxx.com/loginAuth xxx.com/github xx...,wpsdoc.xxx.com/kdocs xxx.com/github xxx.com/mail,,,xxx.com/github,work.xxx.com/task xxx.com/oa,xxx.com/github,xxx.com/github xxx.com/github xxx.com/github,xxx.com/loginAuth work.xxx.com/task,,xxx.com/github,xxx.com/loginAuth wpsdoc.xxx.com/kdocs xxx.com...,wpsdoc.xxx.com/kdocs xxx.com/github xxx.com/ma...,wpsdoc.xxx.com/kdocs,wpsdoc.xxx.com/kdocs wpsdoc.xxx.com/download w...,xxx.com/github xxx.com/github,,xxx.com/github,,xxx.com/github xxx.com/github,xxx.com/mail xxx.com/github,xxx.com/mail work.xxx.com/task xxx.com/mail xx...,,,,,,,,xxx.com/mail xxx.com/github xxx.com/github xxx...,xxx.com/github work.xxx.com/task xxx.com/oa,,xxx.com/github,xxx.com/github xxx.com/github,wpsdoc.xxx.com/kdocs xxx.com/github work.xxx.c...,xxx.com/loginAuth xxx.com/github xxx.com/github,work.xxx.com/task wpsdoc.xxx.com/kdocs wpsdoc....,,,xxx.com/github,xxx.com/loginAuth xxx.com/github xxx.com/github,xxx.com/github xxx.com/github,xxx.com/oa,xxx.com/github xxx.com/github xxx.com/github,,,,xxx.com/github work.xxx.com/task xxx.com/githu...,xxx.com/github xxx.com/mail,xxx.com/github xxx.com/github,work.xxx.com/task xxx.com/github,,,,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github xxx.com/oa,xxx.com/github work.xxx.com/task,,,,xxx.com/github,xxx.com/github xxx.com/github xxx.com/github,xxx.com/mail xxx.com/loginAuth xxx.com/github,xxx.com/github xxx.com/github,,,,,xxx.com/github xxx.com/github xxx.com/mail,work.xxx.com/task,xxx.com/github xxx.com/github xxx.com/github x...,xxx.com/github xxx.com/github,xxx.com/oa xxx.com/github,,,xxx.com/github xxx.com/oa,xxx.com/github 
xxx.com/github,xxx.com/github,xxx.com/github,,,xxx.com/github,wpsdoc.xxx.com/kdocs,xxx.com/github xxx.com/loginAuth xxx.com/mail ...,xxx.com/mail xxx.com/github xxx.com/oa xxx.com...
04MrZwVR5Dh4,,,,,,,,,,,,,,,xxx.com/getLoginType,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
05DiseR8wyyh,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,xxx.com/getLoginType,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
05SghMEzzxQQ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,xxx.com/getVerifyCode,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Row by row, join the non-empty daily strings (in column order) into one
# space-separated string per device.
df_res = df_cols.apply(
    lambda row: ' '.join(cell for cell in row if cell),
    axis=1,
)
df_res

device_num_transform
01GOhHDp463z    xxx.com/mail xxx.com/loginAuth xxx.com/github ...
04F4iKnBDqb6    xxx.com/mail wpsdoc.xxx.com/kdocs work.xxx.com...
04MrZwVR5Dh4                                 xxx.com/getLoginType
05DiseR8wyyh                                 xxx.com/getLoginType
05SghMEzzxQQ                                xxx.com/getVerifyCode
                                      ...                        
zOb9NlDd0LNL                                xxx.com/getVerifyCode
zZaMUpAM22FA                                 xxx.com/getLoginType
zavzUQni8jCv                                    xxx.com/loginAuth
znuxvoiO6WKQ                                 xxx.com/getLoginType
zv5gASBc2N4X                                xxx.com/getVerifyCode
Length: 1162, dtype: object

In [11]:
# Number of whitespace-separated tokens in each device's string.
df_num = df_res.str.split().str.len()
df_num

device_num_transform
01GOhHDp463z    410
04F4iKnBDqb6    393
04MrZwVR5Dh4      1
05DiseR8wyyh      1
05SghMEzzxQQ      1
               ... 
zOb9NlDd0LNL      1
zZaMUpAM22FA      1
zavzUQni8jCv      1
znuxvoiO6WKQ      1
zv5gASBc2N4X      1
Length: 1162, dtype: int64

In [12]:
# Keep only the devices whose string has more than 3 tokens.
df_res = df_res.loc[df_num.gt(3)]
df_res.head()

device_num_transform
01GOhHDp463z    xxx.com/mail xxx.com/loginAuth xxx.com/github ...
04F4iKnBDqb6    xxx.com/mail wpsdoc.xxx.com/kdocs work.xxx.com...
0L5KlIYwu0Yy    xxx.com/github wpsdoc.xxx.com/kdocs xxx.com/ma...
0bdar6YzF7TR    xxx.com/checkingin hr.xxx.com/ hr.xxx.com/ xxx...
0dV6LzVsv7pW    xxx.com/github xxx.com/oa wpsdoc.xxx.com/kdocs...
dtype: object

In [13]:
# Write the filtered strings to 'corpus.csv', one per line, without index or header.
df_res.to_csv(
    os.path.join(path_new_data, 'corpus.csv'),
    sep='\t',
    header=False,
    index=False,
)

In [18]:
def train_embedding(path_corpus, path_save_models, path_save_txt, col):
    """
    Train a skip-gram word2vec model on a pre-tokenised corpus and save it to disk.

    :param path_corpus: path of the corpus file, one whitespace-separated sentence per line
    :param path_save_models: directory that receives ``models_<col>.model``
    :param path_save_txt: directory that receives ``models_<col>_embedding.txt``
    :param col: label used to build the two output file names
    :return: None
    """
    # The corpus holds one sentence per line. LineSentence keeps each line as its own
    # sentence; Text8Corpus treats the whole file as a single token stream, so the
    # context window would span the end of one line and the start of the next.
    sentences = word2vec.LineSentence(path_corpus)
    # Skip-gram (sg=1) with negative sampling (hs=0); min_count=1 keeps every token.
    model = word2vec.Word2Vec(sentences, sg=1, vector_size=5, window=12, min_count=1,
                              hs=0, workers=10, epochs=10)

    # Create the output directories up front so saving does not fail on a fresh checkout.
    os.makedirs(path_save_models, exist_ok=True)
    os.makedirs(path_save_txt, exist_ok=True)

    path_embedding_model = os.path.join(path_save_models, 'models_{}.model'.format(str(col)))
    path_embedding_vocab = os.path.join(path_save_txt, 'models_{}_embedding.txt'.format(str(col)))

    # Save the full model and, separately, the vectors in word2vec text format.
    model.save(path_embedding_model)
    model.wv.save_word2vec_format(path_embedding_vocab)
    print('词向量训练完成：{}'.format(str(col)))

In [24]:
# Train the embeddings on corpus.csv; the model and the text export go to sibling folders.
embedding_paths = {
    'path_corpus': os.path.join(path_new_data, 'corpus.csv'),
    'path_save_models': os.path.join(path_new_data, 'corpus_models'),
    'path_save_txt': os.path.join(path_new_data, 'corpus_txt'),
}
train_embedding(col='device_num_transform', **embedding_paths)

词向量训练完成：device_num_transform


In [25]:
# Load the exported embedding text file. The path is built from path_new_data instead
# of repeating the absolute user-specific location.
path_embedding_txt = os.path.join(path_new_data, 'corpus_txt',
                                  'models_device_num_transform_embedding.txt')
# The first line is skipped; the remaining lines are space-separated (token, then vector).
df_tmp = pd.read_csv(path_embedding_txt, skiprows=1, header=None, sep=' ')
df_tmp

Unnamed: 0,0,1,2,3,4,5
0,xxx.com/github,0.372258,-0.109154,0.838936,-0.696688,-0.470712
1,wpsdoc.xxx.com/download,0.032606,-0.594723,0.087753,-0.636999,-0.691368
2,xxx.com/mail,-0.16487,-0.815885,0.099441,-0.390733,-0.795901
3,work.xxx.com/task,-0.050692,-0.610392,0.22983,-0.714795,-0.653477
4,business.xxx.com/,1.130417,-0.261384,-0.789448,-1.736327,-1.130289
5,wpsdoc.xxx.com/kdocs,0.023941,-0.726561,0.008695,-0.478485,-0.75325
6,xxx.com/loginAuth,-0.07038,-0.705889,0.146878,-0.437643,-0.728691
7,xxx.com/oa,-0.162362,-0.831574,-0.039809,-0.394129,-0.879894
8,xxx.com/checkingin,-1.0423,0.347731,-0.085625,0.438875,-2.706073
9,xxx.com/accounting,1.61995,-0.605472,0.112955,1.702297,-1.913541


In [None]:
# Build the embedding matrix for this project's vocabulary.
# NOTE(review): `word2id`, `config` and `path_row_embedding` are not assigned in the
# visible cells -- this cell presumably comes from another project; confirm before running.
embedding_dim = int(config['train_test_settings']['embedding_dim'])
# Start from random rows; rows whose token appears in the file are overwritten below.
embeddings = np.random.rand(len(word2id), embedding_dim)
# `with` closes the file even if a line fails to parse.
with open(path_row_embedding, 'r', encoding='UTF-8') as f:
    for i, line in enumerate(f):
        if i == 0:  # the first line is treated as a header and skipped
            continue
        lin = line.strip().split(" ")  # token followed by its pre-trained vector
        if lin[0] in word2id:
            idx = word2id[lin[0]]
            emb = [float(x) for x in lin[1: embedding_dim + 1]]
            embeddings[idx] = np.asarray(emb, dtype="float32")

In [26]:
# Preview the first ten rows of df_res.
df_res.iloc[:10]

device_num_transform
01GOhHDp463z    xxx.com/mail xxx.com/loginAuth xxx.com/github ...
04F4iKnBDqb6    xxx.com/mail wpsdoc.xxx.com/kdocs work.xxx.com...
04MrZwVR5Dh4                                 xxx.com/getLoginType
05DiseR8wyyh                                 xxx.com/getLoginType
05SghMEzzxQQ                                xxx.com/getVerifyCode
09qlLew78XXp                                 xxx.com/getLoginType
0AG7bOroVGrh                                xxx.com/getVerifyCode
0B1EZlyCIjEA                                xxx.com/getVerifyCode
0BJZnGywzJAc                                 xxx.com/getLoginType
0DSr7NTlrmpE                                 xxx.com/getLoginType
dtype: object

In [29]:
# Token counts of the rows that have more than 3 tokens.
df_num.loc[df_num > 3]

device_num_transform
01GOhHDp463z    410
04F4iKnBDqb6    393
0L5KlIYwu0Yy    423
0bdar6YzF7TR    321
0dV6LzVsv7pW    477
0jQzrW0ob3tw    378
0vwTFOBrbvIs    383
19sK60AbnGgR    422
1BmyW1drMraT    374
1NnRdFzp9NWJ    368
1SRtvCb9DNFr    412
1baNbqxMWcCu    409
1gUBfR9p6HLE    363
1lTglzPf2SLB    423
21DcH4wj4tEf    338
2AECzhJLRR4Y    385
2EmjEhrepKLJ    436
2PQLbdJAk4QW    306
2ZOGB1eOX3vU    419
2pOKemyYTkvI    369
2u1rhQmiknmk    386
36AnzTaMQafK    386
39OE3BHMPdqV    345
3ANg3oEpqSxD    355
3nvH2PHqvZrr    311
3rYjJlAh0Q1x    327
3wDqyLqvVCn1    433
42y4i2PwNyBY    361
44BDMyay4hg1    394
4BWxjoSreaOm    468
4ERrLGUd7rMh    361
4G2Bw0Csw5lz    413
4Jv2QioonsJM    318
4Qy11wvgpGFs    418
4SXmah16O6AH    403
5DmlITfRNR36    445
5Qli6KaoYmgU    356
5SgCho991yJr    388
5nWnCuken0p8    384
5yApVeSk7drn    375
651TrktciC9i    409
6MwEE0egsCb7    300
6NRAoXZogVDX    403
6T3i25megG6k    364
6YuqJIvp6wuX    409
6m7Wp9p29RXI    377
6ypVTOMezZPI    339
6zUqVT1ygOEZ    443
72IsjEv4ThEB    334

In [None]:

    # NOTE(review): scratch fragment. Both lines are indented although this cell has no
    # enclosing block, and `df_user_create` is only assigned inside get_corpus(config, ...)
    # in the visible cells -- this looks like a paste from that function body; confirm
    # whether the cell should be dedented/adapted or removed.
    # Join the non-empty cells of each row into one space-separated string.
    df_res = df_user_create.apply(lambda x: ' '.join(list(filter(None, list(x)))), axis=1)
    # Number of whitespace-separated tokens in each joined string.
    df_num = df_res.apply(lambda x: len(x.split()))

In [None]:
# NOTE(review): unfinished draft cell (never executed).
# NOTE(review): pandas' DataFrame.dropna accepts `subset=`, not `columns=`, and `cols` is
# not assigned in the visible cells (the next line uses `col`) -- presumably
# `dropna(subset=[col])` was intended; confirm.
df = df.dropna(columns=[cols])
# Join the `col` values of each (op_days, device_num_transform) group with single spaces.
df_cols = df.groupby(['op_days','device_num_transform'])[col].agg(lambda x: " ".join(list(x)))
# Pivot one index level of the grouped result into columns.
df_cols = df_cols.unstack()
# NOTE(review): the assignment below has no right-hand side, so this cell does not parse.
df_cols = 

In [None]:
def get_corpus(df, col, path_save, freq=0.95):
    """
    Generate the corpus used to train word vectors.

    Each device contributes one line: its ``col`` values, joined with single spaces,
    day by day in ascending ``op_days`` order. Within a day the values keep the row
    order of ``df``. Rows whose ``col`` is missing are ignored.

    :param df: frame with ``op_days``, ``device_num_transform`` and ``col`` columns
    :param col: name of the string column whose values become the corpus tokens
    :param path_save: file the corpus is written to (one line per device, no header)
    :param freq: currently unused; kept so existing call signatures keep working
    :return: Series indexed by ``device_num_transform`` holding the written lines
    """
    # `subset` restricts the null check to `col`; the caller's frame is not modified.
    df_valid = df.dropna(subset=[col])
    df_cols = df_valid.groupby(['op_days', 'device_num_transform'])[col].agg(
        lambda x: " ".join(list(x)))
    # Devices become rows and days become columns; days without activity become ''.
    df_cols = df_cols.unstack(level='op_days').fillna('')
    # Concatenate the per-day strings of every device, skipping the empty days.
    df_res = df_cols.apply(lambda x: ' '.join(filter(None, x)), axis=1)
    df_res.to_csv(path_save, index=False, header=False, sep='\t')
    return df_res

In [5]:

def get_corpus(config, col, freq=0.95):
    """
    Generate the corpus used to train word vectors for ``col``.

    Reads ``<col>_corpus_<int(freq * 100)>.csv`` from
    ``config['path_pipeline_settings']['path_pipeline_col_click']``, joins each
    user's ``col`` values over the ``time`` values 1 through 91 into one
    space-separated line, and writes one line per user to ``<col>_corpus.txt`` under
    ``config['path_pipeline_settings']['path_col_corpus']``. Returns immediately when
    that output file already exists.

    :param freq: fraction that appears in the input file name as ``int(freq * 100)``
    :param config: nested mapping providing the two ``path_pipeline_settings`` entries
    :param col: name of the column to turn into corpus tokens (read as ``str``)
    :return: None
    """
    freq_tag = str(int(freq * 100))
    path_col = os.path.join(config['path_pipeline_settings']['path_pipeline_col_click'],
                            '{}_corpus_{}.csv'.format(col, freq_tag))
    path_save = os.path.join(config['path_pipeline_settings']['path_col_corpus'],
                             '{}_corpus.txt'.format(col))
    if os.path.exists(path_save):
        logger.debug('03 训练语料已存在： {}_corpus_{}.csv'.format(col, freq_tag))
        return

    # The literal \N is read as a missing value; rows with any missing field are dropped.
    df_col = pd.read_csv(path_col, na_values=r'\N', dtype={col: str}, usecols=['time', 'user_id', col])
    df_col = df_col.dropna()
    # One row per user and one column per `time` value; each cell holds the joined tokens.
    df_user_create = df_col.groupby(['user_id', 'time'])[col].agg(lambda x: ' '.join(list(x)))
    df_user_create = df_user_create.unstack()
    # Select `time` columns 1..91 in order; (user, time) pairs without rows become ''.
    df_user_create = df_user_create[list(range(1, 92))].fillna('')

    # Concatenate the non-empty cells of each user into a single line.
    # No minimum-length filter is applied (the original had it disabled), so the
    # per-row token count is no longer computed.
    df_res = df_user_create.apply(lambda x: ' '.join(list(filter(None, list(x)))), axis=1)

    df_res.to_csv(path_save, index=False, header=False, sep='\t')

    logger.debug('03 训练语料已生成： {}_corpus_{}.csv'.format(col, freq_tag))


Unnamed: 0,id,user_name,department,ip_transform,device_num_transform,browser_version,browser,os_type,os_version,op_datetime,ip_type,http_status_code,op_city,log_system_transform,url,op_month,is_risk,url_sit,url_page,op_days,op_diff_second1,op_diff_second1_log,op_diff_second1_log_log,op_times_groups,system_op_diff_second1,system_op_diff_second1_log,system_op_diff_second1_log_log,system_op_times_groups
44477,44477,xiongkai3397,rd,6H1iPLgBB,GCgxrFb69up7,chrome_93,chrome,win,win10,2022-01-07 02:44:29,内网,200,深圳,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,,,,0
45489,45489,zhengguiying7117,rd,0mjaEf4SB,8ftsXFm5I1Ej,safari_13,safari,macos,macos_big_sur_11,2022-01-07 02:54:32,内网,200,成都,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,603.0,6.401917,1.856598,1
45706,45706,yuanjun5870,hr,1Vk2kEa4X,W1Cstajd8x1s,firefox_78,firefox,win,win7,2022-01-07 03:00:56,内网,200,深圳,a5G25puBl9xj,hr.xxx.com/,2022-01,1.0,hr.xxx.com,,2022-01-07,,,,0,384.0,5.950643,1.783499,1
45901,45901,zhoutingting3694,rd,4Wj6uxLx3,H8NAVsdws95G,edge_93,edge,win,win10,2022-01-07 04:29:34,内网,200,杭州,nHrKgKdJ1Mzt,xxx.com/github,2022-01,1.0,xxx.com,github,2022-01-07,,,,0,5318.0,8.578853,2.1493,2
43827,43827,yanglin6562,sales,eK12oQmm8,GnkVqPSy5nnl,ie_9,ie,win,win10,2022-01-07 05:17:44,内网,200,重庆,sW0whYIx8LFM,work.xxx.com/task,2022-01,1.0,work.xxx.com,task,2022-01-07,,,,0,2890.0,7.969012,2.07556,3


In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()