In [4]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc #garbage collection 감시
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    min  = int(x[10:12])
    sec  = int(x[12:])
    return dt.datetime(year, month, day, hour,min,sec)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)
    
def string2num2(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    return int(x)
    
PATH = "./data/"



In [7]:
train_err  = pd.read_csv(PATH+'train_err_data.csv')
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))
for person_idx, err in tqdm(id_error):
    # person_idx - 10000 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - 10000,err - 1] += 1

train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1 

train = pd.DataFrame(data=error)
train['problem'] = problem
del error, problem

clf = setup(data = train, target = 'problem') 

Unnamed: 0,Description,Value
0,session_id,586
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 43)"
5,Missing Values,False
6,Numeric Features,42
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
best_5 = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7835,0.8083,0.492,0.7675,0.5992,0.4606,0.4822,3.975
lightgbm,Light Gradient Boosting Machine,0.7812,0.8043,0.4967,0.7558,0.5992,0.4575,0.4767,0.085
gbc,Gradient Boosting Classifier,0.7785,0.8024,0.4478,0.7886,0.5709,0.4366,0.4685,0.227
et,Extra Trees Classifier,0.7776,0.8023,0.4756,0.7596,0.5847,0.4437,0.4664,0.173
rf,Random Forest Classifier,0.7758,0.7991,0.4643,0.7623,0.5768,0.4364,0.4613,0.159
xgboost,Extreme Gradient Boosting,0.766,0.7948,0.5105,0.6984,0.5895,0.4314,0.442,0.283
ada,Ada Boost Classifier,0.7701,0.7903,0.4565,0.7474,0.5665,0.4223,0.4462,0.069
qda,Quadratic Discriminant Analysis,0.7496,0.7367,0.3469,0.7661,0.4759,0.3414,0.3882,0.017
lda,Linear Discriminant Analysis,0.7485,0.7361,0.3226,0.7899,0.4577,0.3298,0.386,0.021
lr,Logistic Regression,0.7517,0.7327,0.3631,0.756,0.49,0.352,0.3935,0.139


In [10]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7881,0.82,0.4928,0.7839,0.6051,0.4704,0.4941
1,0.7757,0.8058,0.4465,0.7783,0.5675,0.4307,0.461
2,0.7781,0.8034,0.4581,0.777,0.5764,0.4393,0.4674
3,0.7848,0.8132,0.4841,0.7791,0.5971,0.461,0.4853
4,0.7842,0.812,0.4602,0.799,0.584,0.4522,0.4836
Mean,0.7822,0.8109,0.4683,0.7835,0.586,0.4507,0.4783
SD,0.0046,0.0059,0.0173,0.0081,0.0136,0.0143,0.0122


In [11]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.778,0.8058,0.4659,0.8031,0.5897,0.452,0.4834


In [12]:
final_model = finalize_model(blended)

In [13]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((14999,42))
for person_idx, err in tqdm(id_error):
    test_x[person_idx - 30000,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
test = pd.DataFrame(data=test_x)

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:29<00:00, 566343.41it/s]


In [None]:
predictions = predict_model(final_model, data = test)

In [None]:
#pycaret에서는 score이 label을 맞출 확률이기때문에 output을 제출 양식에 맞게 바꿔줍니다
x = []
for i in range(len(predictions['Score'])):
    if predictions['Label'][i] =='1.0':
        x.append(predictions['Score'][i])
    else:
        x.append(1-predictions['Score'][i])

In [None]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
sample_submssion['problem'] = x
sample_submssion.to_csv("AutoML.csv", index = False)