In [1]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Input files (absolute paths on the author's machine).
train_csv = r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\training set\train_case2.csv'
train_problem_csv = r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\preprocessed data\new_train_problem.csv'
test_csv = r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\training set\test_case2.csv'
submission_csv = "C:/Users/gmlkd/data/시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회_data/sample_submission.csv"

# Feature tables for train / test, the problem log (its `time` column is parsed
# as datetime) and the submission template.
train = pd.read_csv(train_csv)
train_problem = pd.read_csv(train_problem_csv, parse_dates=['time'])
test = pd.read_csv(test_csv)
submission = pd.read_csv(submission_csv)

In [3]:
# Convert the feature table to a plain NumPy matrix.  np.asarray is used
# instead of DataFrame.to_numpy so the cell stays re-runnable: on a second
# execution `train` is already an ndarray and is passed through unchanged
# (ndarray has no .to_numpy(), so the old form raised AttributeError).
train = np.asarray(train)

# Binary target with one slot per training row (previously a hard-coded
# 15000): 1 for every user_id that appears in the problem log, 0 otherwise.
# NOTE(review): assumes row i of `train` belongs to user_id 10000 + i --
# confirm against the preprocessing that produced train_case2.csv.
problem = np.zeros(len(train))
problem[train_problem.user_id.unique() - 10000] = 1

# Aliases used by the training cells below.
train_x = train
train_y = problem

In [4]:
# Train
#-------------------------------------------------------------------------------------
# Custom evaluation function: reports the validation PR-AUC during training
def f_pr_auc(probas_pred, y_true):
    """Area under the precision-recall curve, in the shape `feval` expects.

    Parameters
    ----------
    probas_pred : predicted scores for the evaluated rows.
    y_true : object exposing ``get_label()`` that returns the true labels
        (passed in by ``lgb.train`` through ``feval``).

    Returns
    -------
    tuple
        ``("pr_auc", score, True)`` -- metric name, metric value, and a flag
        that is always True (higher is better, per the LightGBM feval
        convention).
    """
    precision, recall, _thresholds = precision_recall_curve(y_true.get_label(), probas_pred)
    return "pr_auc", auc(recall, precision), True
#-------------------------------------------------------------------------------------
# Per-fold results collected while cross-validating
models, recalls, precisions, auc_scores = [], [], [], []
# Probability cut-off used to turn predicted probabilities into 0/1 labels
threshold = 0.5
# LightGBM training parameters
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    seed=1015,
)
#-------------------------------------------------------------------------------------
# 5-fold cross-validation (shuffled, fixed random_state)
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):
    # Slice this fold's training and validation rows
    X, valid_x = train_x[train_idx], train_x[val_idx]
    y, valid_y = train_y[train_idx], train_y[val_idx]

    d_train, d_val = lgb.Dataset(X, y), lgb.Dataset(valid_x, valid_y)

    # Train one booster for this fold, monitored on the validation fold with
    # both the built-in metric from `params` and the custom PR-AUC.
    # NOTE(review): patience is only 3 rounds; the recorded output shows the
    # first fold stopping at iteration 2 -- confirm this is intended.
    model = lgb.train(
        params,
        train_set=d_train,
        valid_sets=d_val,
        num_boost_round=1000,
        early_stopping_rounds=3,
        verbose_eval=20,
        feval=f_pr_auc,
    )
    models.append(model)

    # Validation predictions: probabilities, then labels at `threshold`
    valid_prob = model.predict(valid_x)
    valid_pred = (valid_prob > threshold).astype(int)

    # Score the fold and record each metric
    recall = recall_score(valid_y, valid_pred)
    recalls.append(recall)

    precision = precision_score(valid_y, valid_pred)
    precisions.append(precision)

    auc_score = roc_auc_score(valid_y, valid_prob)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5879
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.796928	valid_0's pr_auc: 0.803588
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6078
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
Training until validation scores don't improve for 3 rounds
Early stopping

In [5]:
# Average ROC-AUC over the cross-validation folds
print(np.asarray(auc_scores).mean())

0.7993551366349869


In [6]:
# Rebuild a single table (features + `problem` label column) to hand to pycaret
train_y = pd.DataFrame(train_y, columns=['problem'])
train_x = pd.DataFrame(train_x)
train = pd.concat((train_x, train_y), axis='columns')
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,problem
0,-0.268744,-0.340504,-0.239957,-0.414143,0.102027,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,-0.277099,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,0.085244,-0.296006,-0.075511,-0.052361,-0.037669,0.025586,0.015363,-0.026017,-0.652306,-0.279724,-0.225162,-0.114028,-0.252677,-0.225553,-0.071956,-0.075181,-0.149787,-0.154519,-0.382093,-0.198956,-1.052305,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.602460,-0.336841,-0.644371,-0.023049,0.120567,-0.026656,-0.059816,-0.058683,-0.044469,0.226409,-0.06688,0.442781,0.0,0.0,0.0
1,0.500848,-0.340504,-0.239957,4.011349,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,-0.277099,-0.01059,-0.019935,-0.012796,1.267385,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,0.042137,-0.075511,-0.052361,-0.037669,-0.048693,0.015363,-0.034766,0.560808,-0.279724,4.832761,2.461897,5.213885,1.147012,0.066194,0.877285,-0.590355,-0.154519,0.339732,-0.198956,-0.166879,0.083164,-0.074159,0.628662,-0.114398,-0.080131,0.795861,2.510621,-0.362551,-0.023049,0.120567,-0.026656,-0.059993,-0.058683,-0.044469,0.226409,-0.06688,0.442781,0.0,1.0,1.0
2,-0.272916,-0.340504,-0.239957,-0.414143,0.096218,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,-0.277099,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.000473,-0.288962,-0.043544,-0.052361,-0.037669,-0.038082,0.015363,-0.052263,-0.753398,-0.279724,-0.225162,-0.114028,-0.252677,-0.225553,-0.016696,-0.075181,-0.458185,-0.154519,-0.382093,-0.198956,-1.052305,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.602460,-0.336841,-0.644371,-0.022932,0.023923,-0.026656,-0.059551,-0.011320,-0.044469,0.091401,-0.06688,0.272376,0.0,0.0,0.0
3,-0.275951,-0.340504,-0.239957,0.146356,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,-0.277099,-0.01059,-0.019935,-0.012796,0.079299,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,-0.281917,-0.075511,-0.052361,-0.037669,-0.048693,-0.064734,-0.069761,-0.854491,-0.188357,-0.225162,-0.114028,-0.252677,-0.174717,-0.071956,-0.075181,0.951632,-0.154519,-0.194419,-0.198956,-0.343964,-0.061546,-0.074159,0.628662,0.641705,-0.080131,-0.392093,-0.285994,-0.644371,-0.023049,0.120567,-0.026656,-0.059993,-0.058683,-0.044469,0.226409,-0.06688,0.442781,0.0,1.0,0.0
4,-0.093888,0.651704,-0.239957,-0.414143,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,0.164038,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,-0.296006,-0.011578,-0.052361,-0.037669,-0.048693,0.175556,-0.047889,0.560808,-0.279724,-0.225162,-0.114028,-0.252677,0.028626,-0.071956,-0.075181,-0.105730,-0.154519,0.126072,-0.198956,0.364376,-0.061546,-0.074159,0.628662,-0.114398,-0.080131,-0.552962,-0.336841,-0.080732,-0.023167,0.023923,-0.026760,-0.059949,0.031832,-0.044469,0.091401,-0.06688,0.442781,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,-0.316156,-0.340504,-0.239957,-0.053688,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,-0.277099,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,-0.281917,0.052354,-0.052361,-0.037669,-0.048693,0.335749,-0.034766,-1.128886,-0.279724,-0.225162,-0.114028,-0.252677,-0.194688,-0.071956,-0.075181,-0.590355,-0.154519,-0.292587,-0.198956,-0.609592,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.491089,0.019092,0.482907,-0.023285,-0.072722,-0.026864,-0.059949,-0.060788,-0.044469,-0.043608,-0.06688,-2.965326,0.0,0.0,0.0
14996,-0.387084,-0.340504,-0.239957,-0.414143,-0.062749,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,-0.277099,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,-0.296006,-0.107477,-0.052361,-0.037669,-0.048693,-0.064734,-0.069761,-1.504373,-0.279724,-0.225162,-0.114028,-0.252677,-0.225553,-0.071956,-0.075181,-1.515547,5.611132,-0.382093,-0.198956,-1.052305,-0.061546,-0.074159,-1.030661,-0.114398,-0.080131,-0.602460,-0.336841,-0.644371,-0.023049,0.120567,-0.026656,-0.059993,-0.058683,-0.044469,0.226409,-0.06688,-4.498974,0.0,0.0,0.0
14997,-0.078337,0.704060,-0.239957,-0.414143,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,0.919317,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,-0.239649,-0.075511,-0.052361,-0.037669,-0.048693,0.175556,-0.065387,1.109597,-0.279724,-0.225162,-0.114028,-0.252677,-0.134775,-0.071956,-0.075181,0.555121,-0.154519,0.261775,-0.198956,0.364376,0.034927,-0.074159,0.628662,-0.114398,-0.080131,0.115262,0.019092,0.764726,-0.023167,0.023923,-0.026760,-0.059757,-0.059735,-0.044469,0.091401,-0.06688,0.442781,0.0,1.0,1.0
14998,-0.333983,-0.156620,-0.239957,-0.414143,-0.064861,-0.055185,-0.047764,-0.024537,-0.050453,-0.026486,-0.009173,-0.013527,-0.011326,-0.00921,-0.084309,-0.008165,-0.009902,-0.035379,0.177405,-0.01059,-0.019935,-0.012796,-0.250725,-0.026621,-0.008165,-0.041437,-0.008165,-0.008165,-0.016768,-0.010397,-0.026337,-0.014996,-0.196852,-0.030118,-0.029046,-0.296006,-0.107477,-0.052361,-0.037669,-0.048693,-0.064734,-0.065387,-0.767840,-0.279724,-0.225162,-0.114028,-0.252677,-0.212844,-0.071956,-0.075181,-1.427433,-0.154519,-0.341671,-0.198956,0.275834,-0.045467,-0.074159,0.628662,-0.114398,-0.080131,-0.528213,-0.336841,-0.644371,-0.023049,0.120567,-0.026656,-0.059993,-0.058683,-0.044469,0.226409,-0.06688,-1.090867,0.0,1.0,1.0


In [8]:
# Initialise the pycaret experiment on the combined table with `problem` as
# the target, a fixed session_id and 85% of the rows as the training split.
clf = setup(
    data=train,
    target="problem",
    train_size=0.85,
    session_id=42,
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 75)"
5,Missing Values,False
6,Numeric Features,67
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
# Compare pycaret's candidate models and keep the top three ranked by AUC
best_3 = compare_models(n_select=3, sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8147,0.8148,0.4593,0.8175,0.5878,0.4803,0.5137,0.379
lightgbm,Light Gradient Boosting Machine,0.8166,0.8143,0.4947,0.7903,0.6083,0.4967,0.5201,0.057
rf,Random Forest Classifier,0.8148,0.81,0.4724,0.8039,0.5948,0.4851,0.5141,0.217
ada,Ada Boost Classifier,0.8085,0.8057,0.4615,0.7852,0.5811,0.4677,0.4956,0.099
et,Extra Trees Classifier,0.813,0.8051,0.4691,0.7991,0.5907,0.4801,0.5089,0.217
lr,Logistic Regression,0.8048,0.7775,0.4119,0.8213,0.5483,0.4409,0.4837,0.366
lda,Linear Discriminant Analysis,0.7928,0.7643,0.3514,0.8324,0.4939,0.3896,0.447,0.047
nb,Naive Bayes,0.7026,0.7424,0.5464,0.5576,0.5197,0.3235,0.3369,0.017
knn,K Neighbors Classifier,0.7776,0.7249,0.4067,0.6943,0.5125,0.3811,0.4042,0.456
dt,Decision Tree Classifier,0.722,0.6682,0.5413,0.5168,0.5286,0.3317,0.332,0.042


In [10]:
# Blend the three selected models with soft voting, evaluated on 5 folds
blended = blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8137,0.8057,0.4632,0.8076,0.5887,0.4795,0.5105
1,0.8239,0.8249,0.5027,0.8146,0.6217,0.5152,0.5407
2,0.8173,0.8108,0.4578,0.8317,0.5905,0.4853,0.5212
3,0.8149,0.8144,0.4776,0.7995,0.598,0.4875,0.5148
4,0.814,0.8336,0.4564,0.8171,0.5857,0.4779,0.5116
Mean,0.8168,0.8179,0.4715,0.8141,0.5969,0.4891,0.5198
Std,0.0038,0.0101,0.0173,0.0107,0.013,0.0135,0.0111


In [11]:
# Score the blended model; no data is passed, so this is presumably evaluated
# on pycaret's own hold-out split -- confirm against the pycaret docs.
pred_holdout = predict_model(estimator=blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6726,0.8206,0.474,0.9431,0.6309,0.39,0.4659


In [12]:
# Finalize the blended model; `final_model` is what scores the test set below
final_model = finalize_model(estimator=blended)

In [15]:
# Rename every test column to an integer label (0..73) so the test frame can
# be used with pycaret's prediction tooling
test.columns = t_columns = list(np.arange(74))

In [16]:
# Fetch the preprocessing pipeline stored in the pycaret experiment and apply
# it to the unseen test rows
prep_pipe = get_config(variable='prep_pipe')
transformed_unseen_data = prep_pipe.transform(X=test)

In [63]:
# Keep only column 1 of the predicted class probabilities
# (presumably the positive "problem" class -- TODO confirm class order)
class_probabilities = final_model.predict_proba(transformed_unseen_data)
prections = class_probabilities[:, 1]
prections

array([0.91528047, 0.37724455, 0.45742957, ..., 0.66312932, 0.87929147,
       0.51487411])

In [67]:
# Cut the prediction vector in two at position 43262-30000, so that a
# placeholder value can be inserted between the halves.
# NOTE(review): presumably user_id 43262 has no row in the test features --
# confirm against the test set.
a, b = np.split(prections, [43262-30000])

In [66]:
# Mean predicted probability, used as the placeholder value
new = np.mean(prections)
new

0.4944148152602951

In [68]:
# Put the placeholder at the end of the first half, then show both halves
a = np.concatenate((a, [new]))
print(a, b, sep='\n')

[0.91528047 0.37724455 0.45742957 ... 0.44824058 0.35992413 0.49441482]
[0.34965652 0.3971697  0.35079405 ... 0.66312932 0.87929147 0.51487411]


In [70]:
# Stitch the two halves back into one vector (now one element longer)
prections = np.hstack([a, b])
prections

array([0.91528047, 0.37724455, 0.45742957, ..., 0.66312932, 0.87929147,
       0.51487411])

In [72]:
# Write the predictions into the `problem` column of the submission template
# (lengths must match, which is why the placeholder was inserted above the
# assignment), then display the result.
submission['problem'] = prections
submission

Unnamed: 0,user_id,problem
0,30000,0.915280
1,30001,0.377245
2,30002,0.457430
3,30003,0.841255
4,30004,0.823994
...,...,...
14994,44994,0.432040
14995,44995,0.495195
14996,44996,0.663129
14997,44997,0.879291


In [73]:
# Sanity check: show the row at the position where the placeholder was inserted
placeholder_row = 43262-30000
submission.iloc[placeholder_row]

user_id    43262.000000
problem        0.494415
Name: 13262, dtype: float64

In [75]:
# Save the submission without the DataFrame index
submission_out_csv = r'C:\Users\gmlkd\machine-learning-exercises\10.DAYCON대회_사용자불편예지\submissions\submission_use_pipeline.csv'
submission.to_csv(submission_out_csv, index=False)