In [46]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm.notebook import tqdm
from sklearn import preprocessing
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from gensim.models.word2vec import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
from pyod.models.iforest import IForest
import gc
import warnings
warnings.simplefilter('ignore')

In [47]:
# Read the labelled training log, the unlabelled evaluation log and the
# sample submission file from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('evaluation_public.csv')
sub = pd.read_csv('submit_sample.csv')

# Stack both logs into one frame so that every feature below is computed
# identically for train and test rows.
data = pd.concat([train, test], ignore_index=True)
data['op_datetime'] = pd.to_datetime(data['op_datetime'])

# Order each user's operations chronologically.
sort_keys = ['user_name', 'op_datetime']
data = data.sort_values(by=sort_keys).reset_index(drop=True)

In [48]:
# Minute of the hour, taken from the datetime accessor instead of slicing the
# timestamp's string form: the string slice picks the wrong characters as soon
# as a timestamp carries fractional seconds, and raises on missing values.
data['min'] = data['op_datetime'].dt.minute

# Cyclical encoding of the minute so that minute 59 and minute 0 end up close
# to each other in feature space.
data['min_sin'] = np.sin(data['min'] / 60 * 2 * np.pi)
data['min_cos'] = np.cos(data['min'] / 60 * 2 * np.pi)

In [50]:
# Calendar components of the operation timestamp.
op_dt = data['op_datetime'].dt
data['hour'] = op_dt.hour
data['weekday'] = op_dt.weekday
data['year'] = op_dt.year
data['month'] = op_dt.month
data['day'] = op_dt.day
data['dayofyear'] = op_dt.dayofyear
# `Series.dt.weekofyear` is deprecated and no longer exists in pandas 2.x;
# `isocalendar().week` yields the same ISO week number. The cast keeps a plain
# integer column (assumes op_datetime has no missing values -- TODO confirm).
data['weekofyear'] = op_dt.isocalendar().week.astype('int64')

# Timestamp in whole seconds (nanosecond integer value divided by 1e9).
data['op_ts'] = data["op_datetime"].values.astype(np.int64) // 10 ** 9

# Each user's rows must be in chronological order before shifting. A single
# sort is enough; the earlier duplicate sort on the same keys was redundant.
data = data.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)

# Timestamps of the user's 1st/2nd/3rd previous operation (NaN when the user
# has fewer prior rows), followed by the elapsed seconds since each of them.
# Two separate loops keep the column order: last_ts* first, then ts_diff*.
lag_suffixes = {1: '', 2: '2', 3: '3'}
user_ts = data.groupby(['user_name'])['op_ts']
for lag, suffix in lag_suffixes.items():
    data['last_ts' + suffix] = user_ts.shift(lag)
for suffix in lag_suffixes.values():
    data['ts_diff' + suffix] = data['op_ts'] - data['last_ts' + suffix]

In [51]:
# Columns that are replaced in place by integer codes. The encoder is fitted on
# the combined train+test frame, so both parts share one code space.
label_encoded_cols = [
    'user_name', 'department', 'ip_transform', 'device_num_transform',
    'browser_version', 'browser', 'os_type', 'os_version',
    'ip_type', 'http_status_code', 'op_city', 'log_system_transform', 'url',
]

for col in tqdm(label_encoded_cols):
    le = LabelEncoder().fit(data[col])
    data[col] = le.transform(data[col])

  0%|          | 0/13 [00:00<?, ?it/s]

In [63]:
# Rows with a known `is_risk` label form the training set; rows where the
# label is missing form the test set.
has_label = data['is_risk'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

# Columns that are not fed to the model: identifiers, the target, raw
# timestamps and several calendar fields.
excluded_cols = {
    'id', 'is_risk', 'ip_type', 'op_month', 'op_datetime', 'dayofyear', 'weekofyear',
    'month', 'op_ts', 'year', 'last_ts', 'last_ts2', 'last_ts3', 'day',
}
feas = [col for col in train.columns.tolist() if col not in excluded_cols]

x_train, y_train = train[feas], train['is_risk']
x_test = test[feas]

In [77]:
# Sanity check: (rows, feature columns) of the training feature matrix.
x_train.shape

(47660, 20)

In [73]:
def cv_model(clf, train_x, train_y, test_x, folds=5, seed=1111):
    """Train a binary LightGBM-style model with stratified K-fold CV.

    Parameters
    ----------
    clf : module-like
        Object exposing ``Dataset`` and ``train`` (this notebook passes the
        ``lightgbm`` module). If it also exposes ``early_stopping`` and
        ``log_evaluation``, those callbacks are used; otherwise the legacy
        ``verbose_eval`` / ``early_stopping_rounds`` keywords are passed.
    train_x : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    train_y : array-like
        Binary labels aligned row-by-row with ``train_x``.
    test_x : pandas.DataFrame
        Test features with the same columns as ``train_x``.
    folds : int, default 5
        Number of stratified folds.
    seed : int, default 1111
        Seed for the fold shuffle and for the model.

    Returns
    -------
    tuple
        ``(oof_pred, test_pred, Feass)``: out-of-fold predictions for every
        training row, the mean of the per-fold test predictions, and a
        DataFrame of per-fold feature importances stacked row-wise with
        columns ``'feas'`` and ``'sorce'`` (column names kept as they were so
        existing consumers keep working).
    """
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # Work on a plain array so fold indices are always positional, whatever
    # index the caller's Series carries.
    y_values = np.asarray(train_y)
    feature_names = train_x.columns.tolist()

    oof_pred = np.zeros(train_x.shape[0])
    cv_scores = []
    fold_test_preds = []
    fold_importances = []

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'tree_learner': 'serial',
        'metric': 'AUC',
        'min_child_weight': 4,
        'num_leaves': 2 ** 4,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.02,
        'seed': seed,
        # The original passed both `nthread` (32) and `n_jobs` (8), which are
        # aliases of the same setting; only one can take effect. A single
        # explicit value is used instead -- tune it to the machine.
        'num_threads': 8,
        'verbose': -1,
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, y_values)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_values[train_index], y_values[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # Newer LightGBM releases dropped the `verbose_eval` and
        # `early_stopping_rounds` keywords in favour of callbacks. Callbacks
        # are rebuilt for every fold because they keep per-run state.
        if hasattr(clf, 'early_stopping') and hasattr(clf, 'log_evaluation'):
            fit_kwargs = {'callbacks': [clf.early_stopping(200), clf.log_evaluation(200)]}
        else:
            fit_kwargs = {'verbose_eval': 200, 'early_stopping_rounds': 200}

        model = clf.train(
            params,
            train_matrix,
            num_boost_round=2000,
            valid_sets=[train_matrix, valid_matrix],
            categorical_feature=[],
            **fit_kwargs
        )

        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        fold_test_preds.append(model.predict(test_x, num_iteration=model.best_iteration))

        importance = model.feature_importance()
        fold_importances.append(pd.DataFrame({'feas': feature_names, 'sorce': importance}))
        print(list(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True))[:20])

        oof_pred[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    # Average the per-fold test predictions, and stack the importances once
    # instead of growing a frame inside the loop.
    test_pred = np.mean(fold_test_preds, axis=0)
    Feass = pd.concat(fold_importances, axis=0)

    print("score_list:", cv_scores)
    print("score_mean:", np.mean(cv_scores))
    print("score_std:", np.std(cv_scores))

    return oof_pred, test_pred, Feass

# Run the cross-validated training with the lightgbm module as the backend.
# The call returns out-of-fold train predictions, test predictions and the
# per-fold feature-importance table.
lgb_train, lgb_test, Feass = cv_model(lgb,x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.921324	valid_1's auc: 0.901206
[400]	training's auc: 0.932121	valid_1's auc: 0.903044
[600]	training's auc: 0.940511	valid_1's auc: 0.904142
[800]	training's auc: 0.947135	valid_1's auc: 0.905227
[1000]	training's auc: 0.952392	valid_1's auc: 0.905614
[1200]	training's auc: 0.956827	valid_1's auc: 0.906144
[1400]	training's auc: 0.960944	valid_1's auc: 0.906337
[1600]	training's auc: 0.963982	valid_1's auc: 0.906249
Early stopping, best iteration is:
[1424]	training's auc: 0.961305	valid_1's auc: 0.90652
[('ip_transform', 2552), ('ts_diff', 2410), ('ts_diff3', 2060), ('hour', 2039), ('ts_diff2', 1880), ('device_num_transform', 1664), ('user_name', 1569), ('min', 1332), ('min_cos', 1051), ('min_sin', 987), ('weekday', 939), ('op_city', 512), ('url', 502), ('browser_version', 449), ('department', 416), ('http_status_code', 406),

In [74]:
# Inspect the test-set predictions returned by cv_model.
lgb_test

array([0.19835248, 0.06393448, 0.01864974, ..., 0.01557806, 0.04257049,
       0.03008062])

In [75]:
# Build the submission by joining predictions to the sample file on `id`.
#
# `lgb_test` is ordered like the rows of `test` (and therefore `x_test`).
# `test` is deliberately NOT re-sorted here: re-binding it to an id-sorted copy
# made this cell unsafe to run twice, because the second run would attach the
# predictions to rows that had already been re-ordered.
sub.columns = ['id', 'is_risk']
test['is_risk'] = lgb_test

# Look predictions up by id instead of relying on `sub` being in ascending id
# order (assumes ids are unique within `test` -- TODO confirm).
pred_by_id = test.set_index('id')['is_risk']
sub['is_risk'] = sub['id'].map(pred_by_id).values
if sub['is_risk'].isnull().any():
    raise ValueError('some ids in the sample submission have no prediction')

sub.to_csv('base_0905_0.91719.csv', index = False)

In [76]:
# Display the submission frame that was written to disk.
sub

Unnamed: 0,id,is_risk
0,0,0.198352
1,1,0.063934
2,2,0.018650
3,3,0.414391
4,4,0.254039
...,...,...
25705,25705,0.020595
25706,25706,0.016024
25707,25707,0.015578
25708,25708,0.042570
