In [42]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)

In [84]:
train_err  = pd.read_csv('../preprocessed data/new_train_err.csv', parse_dates=['time'])
display(train_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,2020-11-01 02:56:16,model_3,05.15.2138,15,1
1,10000,2020-11-01 03:03:09,model_3,05.15.2138,12,1
2,10000,2020-11-01 03:03:09,model_3,05.15.2138,11,1
3,10000,2020-11-01 05:05:14,model_3,05.15.2138,16,1
4,10000,2020-11-01 05:05:15,model_3,05.15.2138,4,0


In [85]:
id_error = train_err[['user_id','errtype']].values
id_error

array([[10000,    15],
       [10000,    12],
       [10000,    11],
       ...,
       [24999,     4],
       [24999,     4],
       [24999,    15]], dtype=int64)

In [86]:
# 데이터 설명을 확인하면
# ueser_id가 10000부터 24999까지 총 15000개가 연속적으로 존재.
display(train_err.head())
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,2020-11-01 02:56:16,model_3,05.15.2138,15,1
1,10000,2020-11-01 03:03:09,model_3,05.15.2138,12,1
2,10000,2020-11-01 03:03:09,model_3,05.15.2138,11,1
3,10000,2020-11-01 05:05:14,model_3,05.15.2138,16,1
4,10000,2020-11-01 05:05:15,model_3,05.15.2138,4,0


In [87]:
error = np.zeros((train_user_number,42))

In [88]:
for person_idx, err in tqdm(id_error):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

100%|██████████| 15368001/15368001 [00:26<00:00, 585475.86it/s]


(15000, 42)

In [89]:
train_prob = pd.read_csv('../preprocessed data/new_train_problem.csv', parse_dates=['time'])

In [90]:
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-train_user_id_min] = 1 

In [91]:
train = pd.DataFrame(error)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
0,0.0,0.0,8.0,104.0,0.0,1.0,1.0,0.0,0.0,7.0,15.0,16.0,1.0,10.0,59.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,48.0,1.0,1.0,0.0,0.0,0.0,10.0,11.0,1.0,8.0,143.0,128.0,0.0,4.0,1.0,3.0,1.0,756.0,751.0,5.0,1.0,22.0,0.0,0.0,0.0,0.0,250.0,0.0,10.0,18.0,0.0,1.0,1.0,0.0,0.0,113.0,56.0,1.0
2,0.0,0.0,2.0,131.0,1.0,2.0,1.0,0.0,0.0,1.0,13.0,14.0,1.0,4.0,52.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,25.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,45.0,30.0,1.0,0.0,0.0,0.0,0.0,28.0,19.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,65.0,0.0,8.0,0.0,0.0,1.0,1.0,2.0,0.0,17.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0,16.0,19.0,3.0,5.0,143.0,91.0,0.0,0.0,0.0,0.0,0.0,140.0,119.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,176.0,0.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,2.0,5.0,5.0,0.0,0.0,0.0,5.0,10.0,5.0,8.0,26.0,18.0,0.0,0.0,0.0,0.0,0.0,17.0,12.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,31.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.0,4.0
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,0.0,0.0,0.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,16.0,18.0,3.0,1.0,181.0,138.0,0.0,0.0,0.0,0.0,0.0,50.0,29.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,223.0,0.0,16.0,12.0,0.0,1.0,1.0,0.0,0.0,58.0,7.0,5.0
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,15.0,0.0,1.0,51.0,12.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,14.0,0.0,15.0,2.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,0.0


In [92]:
change_model =  train_err[['user_id', 'model_nm']]
change_model = change_model.drop_duplicates(subset=None, keep='first')
change_model

Unnamed: 0,user_id,model_nm
0,10000,model_3
316,10001,model_2
2661,10002,model_3
2966,10003,model_2
3263,10004,model_0
...,...,...
15366278,24995,model_2
15366469,24996,model_3
15366473,24997,model_0
15367291,24998,model_0


In [93]:
user_model_num = change_model.user_id.value_counts()
user_has_2models = list(user_model_num.loc[user_model_num > 1].to_frame().index)

len(user_has_2models)

703

In [94]:
user_2models = np.zeros(15000)
user_2models[np.array(user_has_2models)-train_user_id_min] = 1 
user_2models

array([0., 0., 0., ..., 0., 0., 0.])

In [95]:
user_2models[8525]

1.0

In [96]:
train['user_2models'] = user_2models

In [97]:
train.iloc[8525, -1]

1.0

In [98]:
# period
train_err['day'] = train_err.time.dt.day
train_err['hour'] = train_err.time.dt.hour

user_day_df = train_err.groupby('user_id')['day'].unique().to_frame().reset_index()
user_day_df['period'] = 0 

for i in range(len(user_day_df.index)):
   user_day_df['period'][i] = len(user_day_df['day'][i])

user_day_df

Unnamed: 0,user_id,day,period
0,10000,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
1,10001,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
2,10002,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",29
3,10003,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
4,10004,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
...,...,...,...
14995,24995,"[20, 21, 22, 23, 25, 26, 27, 28, 29, 30]",10
14996,24996,[30],1
14997,24997,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
14998,24998,"[1, 3, 5, 7, 9, 10, 11, 13, 14, 15, 16, 18, 19...",21


In [99]:
user_day_df.index = user_day_df['user_id'] - 10000
user_day_df.drop(columns=['user_id', 'day'], inplace=True)

user_day_df

Unnamed: 0_level_0,period
user_id,Unnamed: 1_level_1
0,30
1,30
2,29
3,30
4,30
...,...
14995,10
14996,1
14997,30
14998,21


In [101]:
user_day = user_day_df.period.to_numpy()

In [102]:
train['period'] = user_day
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,user_2models,period
0,0.0,0.0,8.0,104.0,0.0,1.0,1.0,0.0,0.0,7.0,15.0,16.0,1.0,10.0,59.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30
1,0.0,0.0,0.0,0.0,48.0,1.0,1.0,0.0,0.0,0.0,10.0,11.0,1.0,8.0,143.0,128.0,0.0,4.0,1.0,3.0,1.0,756.0,751.0,5.0,1.0,22.0,0.0,0.0,0.0,0.0,250.0,0.0,10.0,18.0,0.0,1.0,1.0,0.0,0.0,113.0,56.0,1.0,0.0,30
2,0.0,0.0,2.0,131.0,1.0,2.0,1.0,0.0,0.0,1.0,13.0,14.0,1.0,4.0,52.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,25.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29
3,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,45.0,30.0,1.0,0.0,0.0,0.0,0.0,28.0,19.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,65.0,0.0,8.0,0.0,0.0,1.0,1.0,2.0,0.0,17.0,1.0,0.0,0.0,30
4,0.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0,16.0,19.0,3.0,5.0,143.0,91.0,0.0,0.0,0.0,0.0,0.0,140.0,119.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,176.0,0.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,0.0,2.0,0.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,2.0,5.0,5.0,0.0,0.0,0.0,5.0,10.0,5.0,8.0,26.0,18.0,0.0,0.0,0.0,0.0,0.0,17.0,12.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,31.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.0,4.0,0.0,10
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14997,0.0,0.0,0.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,16.0,18.0,3.0,1.0,181.0,138.0,0.0,0.0,0.0,0.0,0.0,50.0,29.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,223.0,0.0,16.0,12.0,0.0,1.0,1.0,0.0,0.0,58.0,7.0,5.0,0.0,30
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,15.0,0.0,1.0,51.0,12.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,14.0,0.0,15.0,2.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,21


In [None]:
train = train.to_numpy()

In [105]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y

train_x = train
train_y = problem
del error, problem
print(train_x.shape)
print(train_y.shape)

(15000, 44)
(15000,)


In [106]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4334
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.781217	valid_0's pr_auc: 0.793598
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4377
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
Training until validation scores don't improve for 3 rounds
Early stopping

In [107]:
print(np.mean(auc_scores))

0.7941299353180167


In [108]:
train_x = pd.DataFrame(train_x)
train_y = pd.DataFrame(train_y)
train_y.columns = ['problem']
train = pd.concat([train_x,train_y], axis =1)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,problem
0,0.0,0.0,8.0,104.0,0.0,1.0,1.0,0.0,0.0,7.0,15.0,16.0,1.0,10.0,59.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0
1,0.0,0.0,0.0,0.0,48.0,1.0,1.0,0.0,0.0,0.0,10.0,11.0,1.0,8.0,143.0,128.0,0.0,4.0,1.0,3.0,1.0,756.0,751.0,5.0,1.0,22.0,0.0,0.0,0.0,0.0,250.0,0.0,10.0,18.0,0.0,1.0,1.0,0.0,0.0,113.0,56.0,1.0,0.0,30.0,1.0
2,0.0,0.0,2.0,131.0,1.0,2.0,1.0,0.0,0.0,1.0,13.0,14.0,1.0,4.0,52.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,25.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0
3,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,45.0,30.0,1.0,0.0,0.0,0.0,0.0,28.0,19.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,65.0,0.0,8.0,0.0,0.0,1.0,1.0,2.0,0.0,17.0,1.0,0.0,0.0,30.0,0.0
4,0.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0,16.0,19.0,3.0,5.0,143.0,91.0,0.0,0.0,0.0,0.0,0.0,140.0,119.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,176.0,0.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,0.0,2.0,0.0,30.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,2.0,5.0,5.0,0.0,0.0,0.0,5.0,10.0,5.0,8.0,26.0,18.0,0.0,0.0,0.0,0.0,0.0,17.0,12.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,31.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.0,4.0,0.0,10.0,0.0
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14997,0.0,0.0,0.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,16.0,18.0,3.0,1.0,181.0,138.0,0.0,0.0,0.0,0.0,0.0,50.0,29.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,223.0,0.0,16.0,12.0,0.0,1.0,1.0,0.0,0.0,58.0,7.0,5.0,0.0,30.0,1.0
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,15.0,0.0,1.0,51.0,12.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,14.0,0.0,15.0,2.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,21.0,1.0


In [109]:
#clf = setup(data = train, target = "problem") 
clf = setup(train, target = 'problem', train_size = 0.85, silent=True)

Unnamed: 0,Description,Value
0,session_id,8595
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 45)"
5,Missing Values,False
6,Numeric Features,43
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [110]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7937,0.8113,0.5141,0.7954,0.6244,0.4911,0.5134,0.073
gbc,Gradient Boosting Classifier,0.7914,0.8108,0.4781,0.8221,0.6045,0.476,0.5083,0.304
rf,Random Forest Classifier,0.788,0.8081,0.4927,0.7938,0.6079,0.4733,0.4987,0.231
et,Extra Trees Classifier,0.7897,0.8059,0.4937,0.7995,0.6102,0.4771,0.5032,0.194
ada,Ada Boost Classifier,0.7826,0.7977,0.4767,0.7878,0.5939,0.4575,0.4845,0.081
lda,Linear Discriminant Analysis,0.7655,0.7567,0.3592,0.8535,0.5054,0.3835,0.4454,0.026
qda,Quadratic Discriminant Analysis,0.7722,0.7551,0.429,0.7936,0.5567,0.4213,0.4575,0.018
nb,Naive Bayes,0.7411,0.7479,0.3091,0.7844,0.443,0.3138,0.3717,0.015
lr,Logistic Regression,0.7496,0.7401,0.3759,0.7484,0.5001,0.3569,0.3946,0.532
dt,Decision Tree Classifier,0.6975,0.6615,0.5541,0.5463,0.5499,0.3222,0.3224,0.033


In [111]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7859,0.7978,0.4759,0.802,0.5973,0.4641,0.4935
1,0.8031,0.8231,0.5159,0.8299,0.6362,0.5112,0.5383
2,0.7937,0.8235,0.4924,0.8168,0.6144,0.4851,0.5141
3,0.7918,0.8065,0.4736,0.8292,0.6028,0.4756,0.5099
4,0.7929,0.8159,0.4918,0.8132,0.6129,0.483,0.5114
Mean,0.7935,0.8134,0.4899,0.8182,0.6127,0.4838,0.5134
Std,0.0056,0.0099,0.0151,0.0105,0.0133,0.0155,0.0144


In [112]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7925,0.8334,0.4906,0.8079,0.6105,0.4804,0.5081


In [113]:
final_model = finalize_model(blended)

In [114]:
submission = pd.read_csv(r"C:\Users\gmlkd\data\시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회_data/sample_submission.csv")
submission

Unnamed: 0,user_id,problem
0,30000,0
1,30001,0
2,30002,0
3,30003,0
4,30004,0
...,...,...
14994,44994,0
14995,44995,0
14996,44996,0
14997,44997,0


In [115]:
submission.shape

(14999, 2)

In [116]:
test_err  = pd.read_csv('../preprocessed data/new_test_err.csv', parse_dates=['time'])
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,2020-11-01 03:02:27,model_1,04.16.3553,31,1
1,30000,2020-11-01 03:02:27,model_1,04.16.3553,33,2
2,30000,2020-11-01 03:02:28,model_1,04.16.3553,15,1
3,30000,2020-11-01 03:02:56,model_1,04.16.3553,22,1
4,30000,2020-11-01 03:03:00,model_1,04.16.3553,11,1


In [117]:
id_error = test_err[['user_id','errtype']].values
id_error

array([[30000,    31],
       [30000,    33],
       [30000,    15],
       ...,
       [44998,    15],
       [44998,    16],
       [44998,    31]], dtype=int64)

In [118]:
# 데이터 설명을 확인하면
# ueser_id가 10000부터 24999까지 총 15000개가 연속적으로 존재.
display(train_err.head())
train_user_id_max = 44998
train_user_id_min = 30000
train_user_number = 14999

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,day,hour
0,10000,2020-11-01 02:56:16,model_3,05.15.2138,15,1,1,2
1,10000,2020-11-01 03:03:09,model_3,05.15.2138,12,1,1,3
2,10000,2020-11-01 03:03:09,model_3,05.15.2138,11,1,1,3
3,10000,2020-11-01 05:05:14,model_3,05.15.2138,16,1,1,5
4,10000,2020-11-01 05:05:15,model_3,05.15.2138,4,0,1,5


In [119]:
error = np.zeros((train_user_number,42))

In [121]:
for person_idx, err in tqdm(id_error):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

100%|██████████| 15527221/15527221 [00:26<00:00, 578982.64it/s]


(14999, 42)

In [122]:
submission = pd.DataFrame(error)
submission

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0.0,0.0,0.0,0.0,88.0,2.0,2.0,0.0,0.0,0.0,...,32.0,0.0,2.0,6.0,6.0,0.0,4.0,184.0,0.0,4.0
1,0.0,0.0,6.0,196.0,6.0,8.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,82.0,4.0,2.0,0.0,0.0,0.0,...,32.0,0.0,0.0,2.0,2.0,0.0,28.0,226.0,2.0,4.0
3,0.0,0.0,0.0,0.0,146.0,4.0,4.0,0.0,0.0,0.0,...,8.0,14.0,6.0,2.0,2.0,0.0,0.0,80.0,16.0,0.0
4,0.0,0.0,0.0,4.0,12.0,2.0,2.0,0.0,0.0,0.0,...,22.0,10.0,2.0,2.0,2.0,2.0,0.0,96.0,180.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.0,0.0,0.0,4.0,24.0,10.0,10.0,0.0,0.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,174.0,0.0
14995,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,28.0,2.0,0.0,2.0,2.0,0.0,0.0,24.0,34.0,0.0
14996,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,...,24.0,1604.0,0.0,2.0,2.0,2.0,0.0,658.0,4.0,6.0
14997,0.0,0.0,0.0,2.0,8.0,4.0,4.0,0.0,0.0,0.0,...,28.0,0.0,0.0,2.0,2.0,6.0,0.0,60.0,98.0,0.0


In [123]:
change_model =  test_err[['user_id', 'model_nm']]
change_model = change_model.drop_duplicates(subset=None, keep='first')
change_model

Unnamed: 0,user_id,model_nm
0,30000,model_1
2669,30000,model_2
2731,30001,model_3
3015,30002,model_0
3918,30003,model_0
...,...,...
15498135,44994,model_1
15499250,44995,model_0
15499765,44996,model_0
15501680,44997,model_0


In [124]:
user_model_num = change_model.user_id.value_counts()
user_has_2models = list(user_model_num.loc[user_model_num > 1].to_frame().index)

len(user_has_2models)

659

In [125]:
user_2models = np.zeros(train_user_number)
user_2models[np.array(user_has_2models)-train_user_id_min] = 1 
submission['user_2models'] = user_2models
submission

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,user_2models
0,0.0,0.0,0.0,0.0,88.0,2.0,2.0,0.0,0.0,0.0,...,0.0,2.0,6.0,6.0,0.0,4.0,184.0,0.0,4.0,1.0
1,0.0,0.0,6.0,196.0,6.0,8.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,82.0,4.0,2.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2.0,0.0,28.0,226.0,2.0,4.0,0.0
3,0.0,0.0,0.0,0.0,146.0,4.0,4.0,0.0,0.0,0.0,...,14.0,6.0,2.0,2.0,0.0,0.0,80.0,16.0,0.0,0.0
4,0.0,0.0,0.0,4.0,12.0,2.0,2.0,0.0,0.0,0.0,...,10.0,2.0,2.0,2.0,2.0,0.0,96.0,180.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.0,0.0,0.0,4.0,24.0,10.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,200.0,174.0,0.0,0.0
14995,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,2.0,0.0,2.0,2.0,0.0,0.0,24.0,34.0,0.0,0.0
14996,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,...,1604.0,0.0,2.0,2.0,2.0,0.0,658.0,4.0,6.0,0.0
14997,0.0,0.0,0.0,2.0,8.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2.0,6.0,0.0,60.0,98.0,0.0,0.0


In [126]:
submission.rename(columns = {'user_2models':42},inplace=True)

In [133]:
# period
test_err['day'] = test_err.time.dt.day
test_err['hour'] = test_err.time.dt.hour

user_day_df = test_err.groupby('user_id')['day'].unique().to_frame().reset_index()
user_day_df['period'] = 0 

for i in range(len(user_day_df.index)):
   user_day_df['period'][i] = len(user_day_df['day'][i])

user_day_df

Unnamed: 0,user_id,day,period
0,30000,"[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",29
1,30001,"[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 1...",28
2,30002,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
3,30003,"[1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 1...",28
4,30004,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
...,...,...,...
14993,44994,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
14994,44995,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
14995,44996,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",30
14996,44997,"[1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...",28


In [134]:
user_day_df.index = user_day_df['user_id'] - 30000
user_day_df.drop(columns=['user_id', 'day'], inplace=True)

user_day_df

Unnamed: 0_level_0,period
user_id,Unnamed: 1_level_1
0,29
1,28
2,30
3,28
4,30
...,...
14994,30
14995,30
14996,30
14997,28


In [136]:
# 43262번 채워주기
mid = pd.DataFrame(data=[0], columns=['period'])
mid

Unnamed: 0,period
0,0


In [137]:
user_day_df = pd.concat([user_day_df[:13262], mid, user_day_df[13262:]], ignore_index=True)
user_day_df.iloc[13262, :]

period    0
Name: 13262, dtype: int64

In [138]:
user_day = user_day_df.period.to_numpy()

In [139]:
submission['43'] = user_day
submission

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,0.0,0.0,0.0,0.0,88.0,2.0,2.0,0.0,0.0,0.0,...,2.0,6.0,6.0,0.0,4.0,184.0,0.0,4.0,1.0,29
1,0.0,0.0,6.0,196.0,6.0,8.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28
2,0.0,0.0,0.0,0.0,82.0,4.0,2.0,0.0,0.0,0.0,...,0.0,2.0,2.0,0.0,28.0,226.0,2.0,4.0,0.0,30
3,0.0,0.0,0.0,0.0,146.0,4.0,4.0,0.0,0.0,0.0,...,6.0,2.0,2.0,0.0,0.0,80.0,16.0,0.0,0.0,28
4,0.0,0.0,0.0,4.0,12.0,2.0,2.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,0.0,96.0,180.0,0.0,1.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.0,0.0,0.0,4.0,24.0,10.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,200.0,174.0,0.0,0.0,30
14995,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,0.0,2.0,2.0,0.0,0.0,24.0,34.0,0.0,0.0,30
14996,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,...,0.0,2.0,2.0,2.0,0.0,658.0,4.0,6.0,0.0,30
14997,0.0,0.0,0.0,2.0,8.0,4.0,4.0,0.0,0.0,0.0,...,0.0,2.0,2.0,6.0,0.0,60.0,98.0,0.0,0.0,28


In [140]:
prep_pipe = get_config('prep_pipe')
transformed_unseen_data = prep_pipe.transform(submission)

In [141]:
prections = final_model.predict_proba(transformed_unseen_data)[:,1]
prections

array([0.94374904, 0.40005905, 0.44403366, ..., 0.81455874, 0.87453076,
       0.60191553])

In [142]:
sub = pd.read_csv(r"C:\Users\gmlkd\data\시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회_data/sample_submission.csv")
sub['problem'] = prections
sub

Unnamed: 0,user_id,problem
0,30000,0.943749
1,30001,0.400059
2,30002,0.444034
3,30003,0.825597
4,30004,0.904217
...,...,...
14994,44994,0.587068
14995,44995,0.458282
14996,44996,0.814559
14997,44997,0.874531


In [143]:
# 결측값 그냥 둠
sub.iloc[43262-30000]

user_id    43262.00000
problem        0.03271
Name: 13262, dtype: float64

In [144]:
sub.to_csv('../submissions/submission_case4.csv', index=False)