# PyCaret 3개 열 사용 모델

In [None]:
!pip install pycaret==3.0.0



In [None]:
from pycaret.classification import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

# Read the dataset; the first CSV column is used as the row index
# instead of being kept as a separate data column.
diabates = pd.read_csv(
    'diabetes_generators.csv',
    index_col=0,
)
diabates.head()

Unnamed: 0,Glucose,BMI,Age,Outcome
0,148.0,33.6,50,1
1,85.0,26.6,31,0
2,183.0,23.3,32,1
3,89.0,28.1,21,0
4,137.0,43.1,33,1


In [None]:
# Cast every column to float, then turn the 'Outcome' label column back into int.
diabates = diabates.astype(float).astype({'Outcome': int})

In [None]:
# Inspect the frame after the dtype conversion above.
diabates

Unnamed: 0,Glucose,BMI,Age,Outcome
0,148.0,33.6,50.0,1
1,85.0,26.6,31.0,0
2,183.0,23.3,32.0,1
3,89.0,28.1,21.0,0
4,137.0,43.1,33.0,1
...,...,...,...,...
763,101.0,32.9,63.0,0
764,122.0,36.8,27.0,0
765,121.0,26.2,30.0,0
766,126.0,30.1,47.0,1


#### Standardization - Robust Scaler

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Names of the numeric feature columns. This order is also the column order of the
# transformed output produced by the ColumnTransformer below.
numeric_columns = ['Age', 'BMI', 'Glucose']

# Preprocessor that applies RobustScaler to the numeric columns only.
# Columns that are not listed (here 'Outcome') are not part of the transformed
# output and are re-attached afterwards.
preprocessor = ColumnTransformer(
    transformers=[('numeric', RobustScaler(), numeric_columns)]
)

# Wrap the preprocessor in a pipeline (preprocessing only, no estimator).
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# NOTE(review): the scaler is fitted on the full dataset, i.e. before setup() performs
# its train/test split, so hold-out rows influence the scaling statistics.
scaled_values = preprocessor_pipe.fit_transform(diabates)

# Rebuild a DataFrame from the scaled array. Reusing the source index keeps the
# index-aligned 'Outcome' assignment below correct even when the index is not 0..n-1,
# and reusing `numeric_columns` avoids a second hard-coded copy of the column names.
diabates_transformed = pd.DataFrame(
    scaled_values,
    index=diabates.index,
    columns=numeric_columns,
)
diabates_transformed['Outcome'] = diabates['Outcome']

diabates_transformed.head(3)

Unnamed: 0,Age,BMI,Glucose,Outcome
0,1.235294,0.142857,0.765432,1
1,0.117647,-0.626374,-0.790123,0
2,0.176471,-0.989011,1.62963,1


In [None]:
# Configure the PyCaret experiment: training data, target column, train-set size and session id.
# `normalize` stays disabled here; the features were already scaled with RobustScaler upstream.
model = setup(
    data=diabates_transformed,
    target='Outcome',
    train_size=0.7,
    session_id=9,
)


Unnamed: 0,Description,Value
0,Session id,9
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 4)"
4,Transformed data shape,"(768, 4)"
5,Transformed train set shape,"(537, 4)"
6,Transformed test set shape,"(231, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Compare the candidate models, rank them by accuracy and keep the three best.
top_3_models = compare_models(n_select=3, sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7748,0.8307,0.562,0.7402,0.6327,0.4758,0.4891,0.776
nb,Naive Bayes,0.7729,0.8313,0.5728,0.7248,0.6338,0.474,0.4847,0.056
ridge,Ridge Classifier,0.771,0.0,0.5564,0.7315,0.6258,0.4666,0.4794,0.047
lda,Linear Discriminant Analysis,0.771,0.83,0.562,0.7259,0.6275,0.4676,0.4792,0.062
qda,Quadratic Discriminant Analysis,0.7673,0.8209,0.5564,0.7177,0.6225,0.4593,0.4697,0.061
gbc,Gradient Boosting Classifier,0.7505,0.7988,0.6047,0.6554,0.6258,0.4398,0.4431,0.304
ada,Ada Boost Classifier,0.7488,0.8078,0.5731,0.6625,0.6116,0.4281,0.4326,0.448
rf,Random Forest Classifier,0.7469,0.7938,0.5839,0.6608,0.6146,0.4283,0.4344,0.456
knn,K Neighbors Classifier,0.745,0.7759,0.5994,0.6513,0.6154,0.4271,0.4347,0.074
svm,SVM - Linear Kernel,0.745,0.0,0.4608,0.634,0.5135,0.3746,0.392,0.049


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

In [None]:
# Combine the three selected models into one blended (voting) model.
# Hard voting is used; fold=10 is spelled out although the author marks it as the default.
blended = blend_models(
    estimator_list=top_3_models,
    method='hard',
    optimize='Accuracy',
    fold=10,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7407,0.0,0.4737,0.6923,0.5625,0.3874,0.4014
1,0.7593,0.0,0.5789,0.6875,0.6286,0.4524,0.4561
2,0.7407,0.0,0.4211,0.7273,0.5333,0.371,0.3976
3,0.7593,0.0,0.4737,0.75,0.5806,0.4236,0.4456
4,0.8889,0.0,0.7368,0.9333,0.8235,0.7441,0.7551
5,0.7037,0.0,0.6316,0.5714,0.6,0.3656,0.3668
6,0.7593,0.0,0.5263,0.7143,0.6061,0.4384,0.449
7,0.7925,0.0,0.6667,0.7059,0.6857,0.531,0.5315
8,0.7925,0.0,0.6111,0.7333,0.6667,0.5178,0.5223
9,0.7925,0.0,0.5,0.8182,0.6207,0.489,0.5171


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# Run predictions with the blended model.
# No data argument is passed — presumably this scores the hold-out split created by setup(); confirm.
predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7662,0.7149,0.5432,0.7213,0.6197,0.4558,0.4653


Unnamed: 0,Age,BMI,Glucose,Outcome,prediction_label
223,1.882353,-0.384615,0.617284,0,1
537,2.235294,-1.164835,-1.481481,0,0
109,-0.294118,0.560440,-0.543210,1,0
716,0.117647,0.164835,1.382716,1,1
49,-0.294118,0.000000,-0.296296,0,0
...,...,...,...,...,...
253,-0.235294,0.384615,-0.765432,0,0
176,0.764706,-0.120879,-0.790123,0,0
743,0.941176,0.043956,0.567901,1,1
173,-0.352941,1.230769,-0.938272,0,0


In [None]:
# Finalize the blended model and print the resulting pipeline for inspection.
final_blended = finalize_model(blended)
print(final_blended)

Pipeline(memory=FastMemory(location=/tmp/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['Age', 'BMI', 'Glucose'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              missing_values=nan,
                                                              strategy='mean',
                                                              verbose='deprecated'))),
                ('categorical_imputer',
                 TransformerWrapper(exclude=No...
                                                                  tol=0.0001,
                                                   

In [None]:
# Run predictions with the finalized model.
# NOTE(review): if finalize_model refits on all data including the hold-out split,
# these scores are not an out-of-sample estimate — confirm against the PyCaret docs.
predict_model(final_blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7662,0.7149,0.5432,0.7213,0.6197,0.4558,0.4653


Unnamed: 0,Age,BMI,Glucose,Outcome,prediction_label
223,1.882353,-0.384615,0.617284,0,1
537,2.235294,-1.164835,-1.481481,0,0
109,-0.294118,0.560440,-0.543210,1,0
716,0.117647,0.164835,1.382716,1,1
49,-0.294118,0.000000,-0.296296,0,0
...,...,...,...,...,...
253,-0.235294,0.384615,-0.765432,0,0
176,0.764706,-0.120879,-0.790123,0,0
743,0.941176,0.043956,0.567901,1,1
173,-0.352941,1.230769,-0.938272,0,0


##### 모델 예측 테스트

In [None]:
# Raw (unscaled) measurements for a single sample.
sample_data = pd.DataFrame([[171, 45.4, 54]], columns=['Glucose', 'BMI', 'Age'])

# The model was trained on RobustScaler-transformed features, so the same fitted
# preprocessing must be applied before predicting; feeding raw values would put the
# sample on a completely different scale than the training data.
# The transformer selects its input columns by name and emits them in
# `numeric_columns` order, so the column order of `sample_data` does not matter.
sample_scaled = pd.DataFrame(
    preprocessor_pipe.transform(sample_data),
    columns=numeric_columns,
)
final_blended.predict(sample_scaled)


array([1], dtype=int8)

#### 모델 저장

In [None]:
# Persist the finalized pipeline under the name 'final_blended_3'.
# NOTE(review): the RobustScaler step was applied outside this pipeline, so consumers of the
# saved model presumably have to supply already-scaled features — confirm.
save_model(final_blended, 'final_blended_3')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Age', 'BMI', 'Glucose'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecated'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=No...
                                                                   tol=0.0001,
                                     

### 성능 측정
파일 "2. ML.ipynb"에서 가져온 x_test_df.csv, y_test_df.csv 불러오기 (x_test_transformed, y_test)

In [None]:
# !pip install pycaret
import joblib
import numpy as np
import pandas as pd
import sklearn

from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Reload the pipeline that was saved earlier under the same name.
model_loaded = load_model('final_blended_3')

# Test features and labels exported by another notebook; the first CSV column is the index.
x_test_df = pd.read_csv('x_test_df.csv', index_col=0)
y_test = pd.read_csv('y_test_df.csv', index_col=0)

# The ensemble uses hard voting, so predict_proba is unavailable and ROC AUC
# cannot be computed; only label-based metrics are reported.
pred = model_loaded.predict(x_test_df)

confusion_without = confusion_matrix(y_test, pred)
accuracy_without = accuracy_score(y_test, pred)
precision_without = precision_score(y_test, pred)
recall_without = recall_score(y_test, pred)
f1_without = f1_score(y_test, pred)

print('Confusion matrix:')
print(confusion_without)
print()

# Report the scalar metrics one per line, in a fixed order.
for label, value in (
    ('Accuracy :', accuracy_without),
    ('Precision :', precision_without),
    ('Recall :', recall_without),
    ('F1 :', f1_without),
):
    print(label, value)

Transformation Pipeline and Model Successfully Loaded
Confusion matrix:
[[133  24]
 [ 42  32]]

Accuracy : 0.7142857142857143
Precision : 0.5714285714285714
Recall : 0.43243243243243246
F1 : 0.49230769230769234


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=645823d6-5f8a-49f6-b0db-26e2163dbcc0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>