In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
train_data_path = 'data/train.csv'
test_data_path = 'data/evaluation_public.csv'
submission_path = 'data/submit_example.csv'

In [3]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
print(f'train_data.shape = {train_data.shape}, test_data.shape = {test_data.shape}')

train_data.shape = (47660, 17), test_data.shape = (25710, 16)


In [4]:
train_data['is_risk'].value_counts()

0    39964
1     7696
Name: is_risk, dtype: int64

In [5]:
train_data.head()

Unnamed: 0,id,user_name,department,ip_transform,device_num_transform,browser_version,browser,os_type,os_version,op_datetime,ip_type,http_status_code,op_city,log_system_transform,url,op_month,is_risk
0,0,guojianping9672,rd,GVhZtW4i1,rqRxAjAL1RYC,firefox_78,firefox,win,win10,2022-01-18 19:10:41,内网,200,成都,2umVQwhiiwNJ,xxx.com/mail,2022-01,0
1,1,yangtao1740,sales,l3MuTMPoQ,iKPTa3su50y7,chrome_93,chrome,win,win11,2022-04-01 17:04:00,内网,200,深圳,RwHe8Q1R7AlB,business.xxx.com/,2022-04,0
2,2,wangying9098,rd,4uHWcskWv,1baNbqxMWcCu,ie_11,ie,win,win10,2022-03-01 15:53:49,内网,200,成都,dwS3cdn15GK4,wpsdoc.xxx.com/kdocs,2022-03,0
3,3,liguixiang3860,rd,mQh3NwtY7,C04Llg4lKl4C,edge_93,edge,win,win10,2022-02-07 19:46:25,内网,200,北京,nHrKgKdJ1Mzt,xxx.com/github,2022-02,0
4,4,guanyu9205,sales,C2QtgDKAZ,kSscjiRSz1aD,edge_93,edge,win,win10,2022-04-12 10:05:19,内网,200,成都,RwHe8Q1R7AlB,business.xxx.com/,2022-04,0


In [6]:
train_data.groupby(['user_name']).agg({
    'device_num_transform': pd.Series.nunique, 
    'ip_transform': pd.Series.nunique,
    'browser_version': pd.Series.nunique, 
    'browser': pd.Series.nunique,
    'os_type': pd.Series.nunique, 
    'os_version': pd.Series.nunique,
    'ip_type': pd.Series.nunique,
    'http_status_code': pd.Series.nunique, 
    'op_city': pd.Series.nunique,
    'log_system_transform': pd.Series.nunique, 
    'url': pd.Series.nunique,
})

Unnamed: 0_level_0,device_num_transform,ip_transform,browser_version,browser,os_type,os_version,ip_type,http_status_code,op_city,log_system_transform,url
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
baojianhua2916,1,11,1,1,1,1,1,3,1,5,6
caili5590,1,8,1,1,1,1,1,3,1,5,6
caohui3132,1,7,1,1,1,1,1,3,1,6,7
caoyu4082,1,14,1,1,1,1,1,3,1,5,6
cendandan2851,1,7,1,1,1,1,1,3,1,5,6
...,...,...,...,...,...,...,...,...,...,...,...
zhaoshuhua2855,1,5,1,1,1,1,1,3,1,5,6
zhaoxiang7127,1,12,1,1,1,1,1,3,1,5,6
zhengguiying7117,1,7,1,1,1,1,1,2,1,5,6
zhoutingting3694,1,5,1,1,1,1,1,3,1,5,6


In [7]:
# from: https://zhuanlan.zhihu.com/p/463778333
test_data['is_risk'] = -1
data = pd.concat([train_data, test_data])
data['op_datetime'] = pd.to_datetime(data['op_datetime'])
# day, hour, minute
data['timestamp'] = data["op_datetime"].values.astype(np.int64) // 10 ** 9
data['day'] = data['op_datetime'].dt.day
data['hour'] = data['op_datetime'].dt.hour
data['minute'] = data['op_datetime'].dt.minute

data['day_sin'] = np.sin(2 * np.pi * data['day']/24.0) 
data['day_cos'] = np.cos(2 * np.pi * data['day']/24.0)
data['hour_sin'] = np.sin(2 * np.pi * data['hour']/24.0) 
data['hour_cos'] = np.cos(2 * np.pi * data['hour']/24.0)
data['minute_sin'] = np.sin(2 * np.pi * data['minute']/60.0) 
data['minute_cos'] = np.cos(2 * np.pi * data['minute']/60.0)

In [8]:
data.head()

Unnamed: 0,id,user_name,department,ip_transform,device_num_transform,browser_version,browser,os_type,os_version,op_datetime,...,timestamp,day,hour,minute,day_sin,day_cos,hour_sin,hour_cos,minute_sin,minute_cos
0,0,guojianping9672,rd,GVhZtW4i1,rqRxAjAL1RYC,firefox_78,firefox,win,win10,2022-01-18 19:10:41,...,1642533041,18,19,10,-1.0,-1.83697e-16,-0.965926,0.258819,0.866025,0.5
1,1,yangtao1740,sales,l3MuTMPoQ,iKPTa3su50y7,chrome_93,chrome,win,win11,2022-04-01 17:04:00,...,1648832640,1,17,4,0.258819,0.9659258,-0.965926,-0.258819,0.406737,0.913545
2,2,wangying9098,rd,4uHWcskWv,1baNbqxMWcCu,ie_11,ie,win,win10,2022-03-01 15:53:49,...,1646150029,1,15,53,0.258819,0.9659258,-0.707107,-0.707107,-0.669131,0.743145
3,3,liguixiang3860,rd,mQh3NwtY7,C04Llg4lKl4C,edge_93,edge,win,win10,2022-02-07 19:46:25,...,1644263185,7,19,46,0.9659258,-0.258819,-0.965926,0.258819,-0.994522,0.104528
4,4,guanyu9205,sales,C2QtgDKAZ,kSscjiRSz1aD,edge_93,edge,win,win10,2022-04-12 10:05:19,...,1649757919,12,10,5,1.224647e-16,-1.0,0.5,-0.866025,0.5,0.866025


In [9]:
cat_columns = [
    'user_name', 'department', 'ip_transform', 'device_num_transform',
    'browser_version', 'browser', 'os_type', 'os_version',
    'ip_type', 'http_status_code', 'op_city', 'log_system_transform', 'url'
]

In [10]:
record = dict()
res = dict()
data = data.sort_values(by=['user_name', 'timestamp']).reset_index(drop=True)
for idx, row in tqdm(data.iterrows()):
    user_name = row['user_name']
    for col in cat_columns:
        key = str(user_name) + "_" + str(row[col])
        
        if key not in record:
            record[key] = [row['timestamp']]
        else:
            record[key].append(row['timestamp'])
        for idx in range(1, 4):
            column = f'user_name_{col}_diff_{idx}'
            if column not in res:
                res[column] = [0]
            else:
                if len(record[key]) < idx + 1:
                    res[column].append(0)
                else:
                    res[column].append(row['timestamp'] - record[key][-(idx+1)])
for key in res.keys():
    data[key] = res[key]

73370it [00:16, 4553.81it/s]


In [11]:
num_columns = [col for col in data.columns if col not in cat_columns and 
                col not in ['id', 'op_datetime', 'op_month', 'timestamp', 'is_risk']]
target = 'is_risk'
feature = cat_columns + num_columns

In [12]:
for col in cat_columns:
    lab = LabelEncoder()
    data[col] = lab.fit_transform(data[col])

In [13]:
x_train = data[(data['is_risk'] != -1) & (data['op_month'] != '2022-04')][feature]
y_train = data[(data['is_risk'] != -1) & (data['op_month'] != '2022-04')][target]
x_val = data[(data['is_risk'] != -1) & (data['op_month'] == '2022-04')][feature]
y_val = data[(data['is_risk'] != -1) & (data['op_month'] == '2022-04')][target]

In [14]:
x_test = data[data['is_risk'] == -1][feature]
train = data[data['is_risk'] != -1][feature]
label = data[data['is_risk'] != -1][target]

In [15]:
data['is_risk'].value_counts()

 0    39964
-1    25710
 1     7696
Name: is_risk, dtype: int64

In [16]:
def model_train(model, model_name, kfold=5):
    oof_preds = np.zeros((train_data.shape[0]))
    test_preds = np.zeros(test_data.shape[0])
    skf = StratifiedKFold(n_splits=kfold, shuffle=True)
    print(f'Model = {model_name}')
    for k, (train_index, test_index) in enumerate(skf.split(train, label)):
        x_train, x_val = train.iloc[train_index, :], train.iloc[test_index, :]
        y_train, y_val = label.iloc[train_index], label.iloc[test_index]

        model.fit(x_train, y_train)

        y_pred = model.predict_proba(x_val)[:,1]
        oof_preds[test_index] = y_pred.ravel()
        auc = roc_auc_score(y_val, y_pred)
        print("KFold = %d, val_auc = %.4f" % (k, auc))
        test_fold_preds = model.predict_proba(x_test)[:, 1]
        test_preds += test_fold_preds.ravel()
    print("Overall Model = %s, AUC = %.4f" % (model_name, roc_auc_score(label, oof_preds)))
    return test_preds / kfold

In [17]:
xgbc = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=25, 
    max_depth=6, 
    learning_rate=0.1
)
xgbc_test_preds = model_train(xgbc, "XGBClassifier", 10)

Model = XGBClassifier
KFold = 0, val_auc = 0.9143
KFold = 1, val_auc = 0.9378
KFold = 2, val_auc = 0.9156
KFold = 3, val_auc = 0.9179
KFold = 4, val_auc = 0.9187
KFold = 5, val_auc = 0.9268
KFold = 6, val_auc = 0.9242
KFold = 7, val_auc = 0.9191
KFold = 8, val_auc = 0.9184
KFold = 9, val_auc = 0.9230
Overall Model = XGBClassifier, AUC = 0.9207


In [18]:
gbm = LGBMClassifier(
    objective='binary',
    num_leaves=35, 
    learning_rate=0.1, 
    n_estimators=100, 
    metrics='auc'
)
gbm_test_preds = model_train(gbm, "LGBMClassifier", 10)

Model = LGBMClassifier
KFold = 0, val_auc = 0.9177
KFold = 1, val_auc = 0.9293
KFold = 2, val_auc = 0.9221
KFold = 3, val_auc = 0.9282
KFold = 4, val_auc = 0.9249
KFold = 5, val_auc = 0.9270
KFold = 6, val_auc = 0.9286
KFold = 7, val_auc = 0.9376
KFold = 8, val_auc = 0.9316
KFold = 9, val_auc = 0.9353
Overall Model = LGBMClassifier, AUC = 0.9280


In [19]:
cbc = CatBoostClassifier(
    iterations=100, 
    depth=10, 
    learning_rate=0.1, 
    loss_function='Logloss',
    verbose=0
)
cbc_test_preds = model_train(cbc, "CatBoostClassifier", 10)

Model = CatBoostClassifier
KFold = 0, val_auc = 0.9335
KFold = 1, val_auc = 0.9380
KFold = 2, val_auc = 0.9274
KFold = 3, val_auc = 0.9152
KFold = 4, val_auc = 0.9197
KFold = 5, val_auc = 0.9330
KFold = 6, val_auc = 0.9229
KFold = 7, val_auc = 0.9274
KFold = 8, val_auc = 0.9305
KFold = 9, val_auc = 0.9254
Overall Model = CatBoostClassifier, AUC = 0.9271


In [20]:
preds = (xgbc_test_preds + gbm_test_preds + cbc_test_preds) / 3

In [21]:
submission = pd.DataFrame({
    'id': data[data['is_risk'] == -1]['id'],
    'is_risk': np.array(preds)
}).sort_values(by=['id']).reset_index(drop=True)

In [22]:
submission.to_csv('submission.csv', index=False)