In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc
import json

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 200)
pd.set_option('float_format', lambda x: '%.3f' % x)
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

In [2]:
train = pd.read_csv('raw_data/train_dataset.csv', sep='\t')
print(train.shape)
train.head()

(15016, 19)


Unnamed: 0,session_id,op_date,user_name,action,auth_type,ip,ip_location_type_keyword,ip_risk_level,location,client_type,browser_source,device_model,os_type,os_version,browser_type,browser_version,bus_system_code,op_target,risk_label
0,access:test_d:20180101111639:bBp1,2018/1/1 11:16,test_d,login,otp,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,chrome,chrome 90,coremail,management,0
1,access:test_d:20180101121524:OBSg,2018/1/1 12:15,test_d,login,qr,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,order-mgnt,sales,0
2,access:test_d:20180101151333:BpQN,2018/1/1 15:13,test_d,login,qr,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,chrome,chrome 90,order-mgnt,sales,0
3,access:test_d:20180101124502:hYQm,2018/1/1 12:45,test_d,sso,,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,oa,management,0
4,access:test_d:20180101202749:FkDK,2018/1/1 20:27,test_d,sso,,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,order-mgnt,sales,0


In [3]:
test = pd.read_csv('raw_data/test_dataset.csv', sep='\t')
print(test.shape)
test.head()

(10000, 18)


Unnamed: 0,session_id,op_date,user_name,action,auth_type,ip,ip_location_type_keyword,ip_risk_level,location,client_type,browser_source,device_model,os_type,os_version,browser_type,browser_version,bus_system_code,op_target
0,access:test_c:20191023212545:H2in,2019/10/23 21:25,test_c,sso,,27.10.135.254,代理IP,3级,"{""first_lvl"":""重庆"",""sec_lvl"":""重庆市"",""third_lvl"":...",web,desktop,macbook,macOS,macOS Big Sur 11,safari,safari 13,order-mgnt,sales
1,access:test_c:20191023095634:ylxO,2019/10/23 9:56,test_c,sso,,27.10.135.254,代理IP,3级,"{""first_lvl"":""重庆"",""sec_lvl"":""重庆市"",""third_lvl"":...",web,desktop,macbook,macOS,macOS Big Sur 11,safari,safari 13,order-mgnt,sales
2,access:test_c:20191023104233:tc9Y,2019/10/23 10:42,test_c,login,sms,27.10.135.254,代理IP,3级,"{""first_lvl"":""重庆"",""sec_lvl"":""重庆市"",""third_lvl"":...",web,desktop,macbook,macOS,macOS Big Sur 11,safari,safari 13,order-mgnt,sales
3,access:test_c:20191023142416:8rjC,2019/10/23 14:24,test_c,sso,,27.10.135.254,代理IP,3级,"{""first_lvl"":""重庆"",""sec_lvl"":""重庆市"",""third_lvl"":...",web,desktop,macbook,macOS,macOS Big Sur 11,safari,safari 13,coremail,management
4,access:test_c:20191023210513:cOCi,2019/10/23 21:05,test_c,sso,,27.10.135.254,代理IP,3级,"{""first_lvl"":""重庆"",""sec_lvl"":""重庆市"",""third_lvl"":...",web,desktop,macbook,macOS,macOS Big Sur 11,safari,safari 13,reimbursement,finance


In [4]:
train['risk_label'].value_counts(dropna=False)

0    12076
1     2940
Name: risk_label, dtype: int64

In [5]:
for f in ['user_name', 'action', 'auth_type', 'ip',
          'ip_location_type_keyword', 'ip_risk_level', 'location', 'client_type',
          'browser_source', 'device_model', 'os_type', 'os_version',
          'browser_type', 'browser_version', 'bus_system_code', 'op_target']:
    for v in train[f].unique():
        print(f, v, train[train[f] == v]['risk_label'].mean())
    print('='*50)

user_name test_d 0.190810465858328
user_name test_c 0.2004201680672269
user_name test_a 0.19375305026842363
user_name test_b 0.20043763676148796
user_name test_g 0.195578231292517
user_name test_e 0.1988888888888889
user_name test_f 0.19234116623150566
action login 0.1932896671567972
action sso 0.19827471798274718
auth_type otp 0.19203491543917076
auth_type qr 0.1888772298006296
auth_type nan nan
auth_type sms 0.19239013933547697
auth_type pwd 0.19989339019189764
ip 192.168.100.101 0.19682539682539682
ip 14.196.145.66 0.18600867678958785
ip 27.10.135.254 0.1939799331103679
ip 192.168.100.103 0.20709105560032232
ip 192.168.0.100 0.18235294117647058
ip_location_type_keyword 内网 0.19747828991315966
ip_location_type_keyword 家庭宽带 0.18600867678958785
ip_location_type_keyword 代理IP 0.1939799331103679
ip_risk_level 1级 0.19792024750773463
ip_risk_level 2级 0.18543956043956045
ip_risk_level 3级 0.1939799331103679
location {"first_lvl":"成都分公司","sec_lvl":"9楼","third_lvl":"销售部"} 0.19682539682539682
loc

In [6]:
data = pd.concat([train, test])
print(data.shape)

(25016, 19)


In [7]:
data['location_first_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['first_lvl'])
data['location_sec_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['sec_lvl'])
data['location_third_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['third_lvl'])

data.drop(['client_type', 'browser_source'], axis=1, inplace=True)
data['auth_type'].fillna('__NaN__', inplace=True)

for col in tqdm(['user_name', 'action', 'auth_type', 'ip', 
                 'ip_location_type_keyword', 'ip_risk_level', 'location', 'device_model',
                 'os_type', 'os_version', 'browser_type', 'browser_version',
                 'bus_system_code', 'op_target', 'location_first_lvl', 'location_sec_lvl', 
                 'location_third_lvl']):
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col])

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




In [8]:
data['op_date'] = pd.to_datetime(data['op_date'])
data['op_ts'] = data["op_date"].values.astype(np.int64) // 10 ** 9
data = data.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)
data['last_ts'] = data.groupby(['user_name'])['op_ts'].shift(1)
data['ts_diff1'] = data['op_ts'] - data['last_ts']

In [9]:
for f in ['ip', 'location', 'device_model', 'os_version', 'browser_version']:
    data[f'user_{f}_nunique'] = data.groupby(['user_name'])[f].transform('nunique')

In [10]:
for method in ['mean', 'max', 'min', 'std']:
    data[f'ts_diff1_{method}'] = data.groupby('user_name')['ts_diff1'].transform(method)

In [11]:
train = data[data['risk_label'].notna()]
test = data[data['risk_label'].isna()]

print(train.shape, test.shape)

(15016, 32) (10000, 32)


In [12]:
ycol = 'risk_label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'session_id', 'op_date', 'last_ts'], train.columns))

model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.1,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=0.,
                           reg_lambda=0.,
                           random_state=1983,
                           is_unbalance=True,
                           metric='auc')


oof = []
prediction = test[['session_id']]
prediction[ycol] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1983)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['session_id', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test[:, 1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()
    
    
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	train's auc: 0.648341	valid's auc: 0.51527


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	train's auc: 0.563305	valid's auc: 0.524088


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	train's auc: 0.602498	valid's auc: 0.517259


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	train's auc: 0.576999	valid's auc: 0.512423


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	train's auc: 0.656338	valid's auc: 0.522001


Unnamed: 0,column,importance
0,ts_diff1,50.4
1,op_ts,41.2
2,bus_system_code,20.0
3,auth_type,16.4
4,browser_version,9.0
5,ts_diff1_mean,6.8
6,user_name,6.6
7,op_target,6.4
8,location_third_lvl,5.6
9,ip,5.4


In [13]:
df_oof = pd.concat(oof)
print('roc_auc_score', roc_auc_score(df_oof[ycol], df_oof['pred']))

roc_auc_score 0.5070717091076244


In [14]:
prediction['id'] = range(len(prediction))
prediction['id'] = prediction['id'] + 1
prediction = prediction[['id', 'risk_label']].copy()
prediction.columns = ['id', 'ret']
prediction.head()

Unnamed: 0,id,ret
6147,1,0.324
6148,2,0.32
6149,3,0.324
6150,4,0.335
6151,5,0.32


In [15]:
# prediction['rank'] = prediction['risk_label'].rank()
# prediction['ret'] = 0
# prediction.loc[prediction['rank'] <= int(prediction.shape[0] * train['risk_label'].mean()), 'ret'] = 1

# prediction = prediction[['session_id', 'ret']].copy()
# prediction.columns = ['id', 'ret']
# prediction['id'] = range(len(prediction))
# prediction['id'] = prediction['id'] + 1
# prediction.head()

In [16]:
# print(prediction['ret'].value_counts())
prediction.to_csv('bottomline.csv', index=False)