In [1]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [36]:
train = pd.read_csv(r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\training set\train_case4.csv')
test = pd.read_csv(r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\training set\test_case4.csv')
train_problem = pd.read_csv(r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\preprocessed data\new_train_problem.csv')
submission = pd.read_csv("C:/Users/gmlkd/data/시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회_data/sample_submission.csv")

In [37]:
# drop fwver
train_fwver_c = train.iloc[:, 10:30].columns
test_fwver_c = test.iloc[:, 10:30].columns

train.drop(columns=train_fwver_c, axis=1, inplace=True)
test.drop(columns=test_fwver_c, axis=1, inplace=True)

In [38]:
# drop period
train.drop(columns=['period'], axis=1, inplace=True)
test.drop(columns=['period'], axis=1, inplace=True)

In [39]:
train = train.to_numpy()
problem = np.zeros(15000)
problem[train_problem.user_id.unique() - 10000] = 1

train_x = train
train_y = problem

In [40]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):
    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4254
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.793829	valid_0's pr_auc: 0.798678
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4394
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
Training until validation scores don't improve for 3 rounds
Early stopping

In [41]:
print(np.mean(auc_scores))

0.7959604409445947


In [42]:
train_x = pd.DataFrame(train_x)
train_y = pd.DataFrame(train_y)
train_y.columns = ['problem']
train = pd.concat([train_x,train_y], axis =1)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,problem
0,-0.268744,-0.340504,-0.239957,-0.414143,0.102027,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.602460,-0.336841,-0.644371,0.0,0.0
1,0.500848,-0.340504,-0.239957,4.011349,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,0.083164,-0.074159,0.628662,-0.114398,-0.080131,0.795861,2.510621,-0.362551,0.0,1.0
2,-0.272916,-0.340504,-0.239957,-0.414143,0.096218,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.602460,-0.336841,-0.644371,0.0,0.0
3,-0.275951,-0.340504,-0.239957,0.146356,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.061546,-0.074159,0.628662,0.641705,-0.080131,-0.392093,-0.285994,-0.644371,0.0,0.0
4,-0.093888,0.651704,-0.239957,-0.414143,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.061546,-0.074159,0.628662,-0.114398,-0.080131,-0.552962,-0.336841,-0.080732,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,-0.316156,-0.340504,-0.239957,-0.053688,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.491089,0.019092,0.482907,0.0,0.0
14996,-0.387084,-0.340504,-0.239957,-0.414143,-0.062749,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.602460,-0.336841,-0.644371,0.0,0.0
14997,-0.078337,0.704060,-0.239957,-0.414143,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,0.034927,-0.074159,0.628662,-0.114398,-0.080131,0.115262,0.019092,0.764726,0.0,1.0
14998,-0.333983,-0.156620,-0.239957,-0.414143,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,...,-0.045467,-0.074159,0.628662,-0.114398,-0.080131,-0.528213,-0.336841,-0.644371,0.0,1.0


In [43]:
clf = setup(data = train, target = "problem", silent=True) 

Unnamed: 0,Description,Value
0,session_id,828
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 43)"
5,Missing Values,False
6,Numeric Features,41
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [44]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7871,0.8067,0.4785,0.8124,0.6016,0.4694,0.5004,0.239
lightgbm,Light Gradient Boosting Machine,0.7865,0.805,0.5082,0.7811,0.6154,0.4767,0.498,0.045
rf,Random Forest Classifier,0.7855,0.8029,0.4929,0.7911,0.6069,0.4703,0.4954,0.167
et,Extra Trees Classifier,0.7841,0.7997,0.4991,0.78,0.6083,0.4691,0.4916,0.16
ada,Ada Boost Classifier,0.7843,0.797,0.4923,0.7878,0.6054,0.4677,0.4925,0.07
lr,Logistic Regression,0.7774,0.7715,0.432,0.8238,0.5661,0.4354,0.4768,0.305
lda,Linear Discriminant Analysis,0.7595,0.7564,0.3559,0.8365,0.4982,0.3725,0.4318,0.022
qda,Quadratic Discriminant Analysis,0.766,0.754,0.451,0.7557,0.5643,0.418,0.4444,0.019
nb,Naive Bayes,0.7356,0.7391,0.3494,0.7226,0.4695,0.3212,0.3595,0.016
knn,K Neighbors Classifier,0.7416,0.7275,0.4383,0.6796,0.5324,0.3654,0.3824,0.28


In [45]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7781,0.7892,0.4674,0.7857,0.5861,0.4476,0.4758
1,0.7938,0.8208,0.5014,0.8138,0.6205,0.4897,0.5168
2,0.7914,0.8132,0.5064,0.8009,0.6205,0.4865,0.5108
3,0.7943,0.8168,0.4965,0.822,0.619,0.4897,0.5189
4,0.7866,0.8125,0.4986,0.7892,0.6111,0.4742,0.4979
Mean,0.7888,0.8105,0.4941,0.8023,0.6115,0.4775,0.504
Std,0.006,0.011,0.0137,0.0139,0.0131,0.016,0.0159


In [46]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.796,0.8176,0.502,0.7976,0.6162,0.4869,0.5111


In [47]:
final_model = finalize_model(blended)

In [48]:
# pycaret의 predict_model을 사용하기 위해 test의 모든 컬럼을 정수로 변경
t_columns = list(np.arange(0,42))
test.columns = t_columns

In [27]:
# prep_pipe = get_config('prep_pipe')
# transformed_unseen_data = prep_pipe.transform(test)

In [49]:
prections = final_model.predict_proba(test)[:,1]
prections

array([0.92594711, 0.16045179, 0.3064495 , ..., 0.58080392, 0.86207154,
       0.38991368])

In [50]:
submission.drop(index=13262, inplace=True)

In [51]:
submission['problem'] = prections

In [52]:
prections.mean()

0.33461557555011984

In [53]:
user_43262 = pd.DataFrame({"user_id":[43262], "problem":[0.33]})
user_43262

Unnamed: 0,user_id,problem
0,43262,0.33


In [54]:
# 예측 결과값의 평균으로 넣어준다.
sub = pd.concat([submission[:13262], user_43262, submission[13262:]], ignore_index=True)
sub.iloc[13262, :]

user_id    43262.00
problem        0.33
Name: 13262, dtype: float64

In [55]:
sub.user_id = sub.user_id.astype(int)

In [56]:
sub

Unnamed: 0,user_id,problem
0,30000,0.925947
1,30001,0.160452
2,30002,0.306449
3,30003,0.776081
4,30004,0.903467
...,...,...
14994,44994,0.277326
14995,44995,0.333294
14996,44996,0.580804
14997,44997,0.862072


In [57]:
sub.to_csv('../submissions/submission_concat_drop_period_fwver.csv', index=False)