In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme first. The theme sets its own font family, so
# the matplotlib overrides below have to come after this call.
sns.set()

# ---- Chart settings: Korean glyph rendering and minus-sign handling ----
# NOTE(review): 'Malgun Gothic' is assumed to be installed on the machine that
# runs this notebook — confirm on non-Windows environments.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# ---- Jupyter: stretch the notebook container to the full browser width ----
from IPython.display import display, HTML

full_width_css = "<style>.container{width:100% !important;}</style>"
display(HTML(full_width_css))

# ---- pandas display limits (rows, columns, unlimited cell text width) ----
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# from sklearn.datasets import load_iris, load_diabetes
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler

# Load the diabetes dataset from the notebook-relative datasets folder.
diabetes_data = pd.read_csv("./datasets/diabetes.csv")

# Class balance of the target column, printed as text so it shows up together
# with the rich table preview produced by the last expression.
print(diabetes_data.loc[:, "Outcome"].value_counts())

# Preview the first three rows.
diabetes_data.iloc[:3]

Outcome
0    500
1    268
Name: count, dtype: int64


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [3]:
# Summarize column dtypes and non-null counts for the loaded frame.
# NOTE(review): non-null counts will not flag placeholder zeros (the preview
# rows show 0 for Insulin and SkinThickness) — confirm whether zeros encode
# missing measurements.
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and headline metrics of a binary classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels. Required in practice; the ``None`` default is
        kept only so the signature stays compatible with existing callers.
    pred_proba : array-like, optional
        Positive-class scores handed to ``roc_auc_score``. When omitted, the
        ROC AUC figure is left out of the report instead of failing.

    Returns
    -------
    None
        The report is printed; nothing is returned, so calling this as the
        last expression of a cell produces no extra output.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    summary = f"정확도:{accuracy:.4f}, 정밀도:{precision:.4f}, 재현율:{recall:.4f}, F1:{f1:.4f}"

    # ROC AUC needs scores rather than hard labels, so it is only computed
    # (and reported) when the caller supplied them.
    if pred_proba is not None:
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += f", AUC:{roc_auc:.4f}"

    print("오차 행렬")
    print(confusion)
    print(summary)

In [10]:
# Features are every column except the last; the last column is the target.
X, y = diabetes_data.iloc[:, :-1], diabetes_data.iloc[:, -1]

# Hold out 20% for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=156,
    stratify=y,
)

# Fit logistic regression, then predict labels and positive-class
# probabilities on the test split and report the metrics.
lr_clf = LogisticRegression(solver='liblinear').fit(X_train, y_train)
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[87 13]
 [22 32]]
정확도:0.7727, 정밀도:0.7111, 재현율:0.5926, F1:0.6465


In [15]:
# Positive-class (column index 1) probabilities for the test split; this is the
# same expression already stored in `pred_proba` by the training cell.
pred_proba_c1 = lr_clf.predict_proba(X_test)[:,1]
# Bare last expression: displays the full return value of precision_recall_curve.
# NOTE(review): per the scikit-learn docs this is a (precision, recall,
# thresholds) tuple of arrays — consider unpacking it and showing only shapes
# or a short slice instead of dumping every element.
precision_recall_curve(y_test, pred_proba_c1)

(array([0.35064935, 0.34640523, 0.34210526, 0.34437086, 0.34666667,
        0.34899329, 0.35135135, 0.3537415 , 0.35616438, 0.35862069,
        0.36111111, 0.36363636, 0.36619718, 0.36170213, 0.36428571,
        0.36690647, 0.36956522, 0.37226277, 0.375     , 0.37777778,
        0.38059701, 0.38345865, 0.38636364, 0.38931298, 0.39230769,
        0.3875969 , 0.390625  , 0.39370079, 0.3968254 , 0.4       ,
        0.40322581, 0.40650407, 0.40983607, 0.41322314, 0.41666667,
        0.42016807, 0.42372881, 0.42735043, 0.43103448, 0.43478261,
        0.43859649, 0.44247788, 0.44642857, 0.45045045, 0.45454545,
        0.4587156 , 0.46296296, 0.46728972, 0.47169811, 0.47619048,
        0.47115385, 0.47572816, 0.48039216, 0.47524752, 0.48      ,
        0.48484848, 0.48979592, 0.48453608, 0.48958333, 0.49473684,
        0.5       , 0.50537634, 0.5       , 0.50549451, 0.5       ,
        0.50561798, 0.51136364, 0.51724138, 0.51162791, 0.51764706,
        0.52380952, 0.53012048, 0.53658537, 0.53