In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [18]:
lr = LogisticRegression(random_state=2023)
svc = SVC(random_state=2023,probability=True)
knn = KNeighborsClassifier()

In [13]:
df = pd.read_csv('pima-indians-diabetes.csv',skiprows=9,header=None)
df.columns = ['P','G','BP','S','I','BMI','D','Age','Target']
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
df.isna().sum()

P         0
G         0
BP        0
S         0
I         0
BMI       0
D         0
Age       0
Target    0
dtype: int64

In [15]:
df1 = df.iloc[:,:-1]
df2 = df.iloc[:,-1]
X = df1.values 
y = df2.values
X.shape, y.shape

((768, 8), (768,))

In [16]:
fis = MinMaxScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    fis, y, stratify=y, test_size =0.2, random_state=2023
)

In [19]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
voc = VotingClassifier(
    estimators=[('svc', svc), ('knn', knn), ('lr', lr)],voting='soft'
)
params = {
    'svc__C': [0.1, 1, 10],
    'lr__C': [0.1, 1, 10]
}
grid_voc = GridSearchCV(voc,params,scoring='accuracy',cv=5)
grid_voc.fit(X_train,y_train)
grid_voc.best_params_

{'lr__C': 1, 'svc__C': 1}

In [20]:
params = {
    'svc__C': [0.5, 1, 1.5],
    'lr__C': [0.5, 1, 1.5]
}
grid_voc = GridSearchCV(voc,params,scoring='accuracy',cv=5)
grid_voc.fit(X_train,y_train)
grid_voc.best_params_

{'lr__C': 0.5, 'svc__C': 1.5}

In [22]:
params = {
    'svc__C': [1.3, 1.5, 2],
    'lr__C': [0.2, 0.5, 0.7]
}
grid_voc = GridSearchCV(voc,params,scoring='accuracy',cv=5)
grid_voc.fit(X_train,y_train)
grid_voc.best_params_

{'lr__C': 0.7, 'svc__C': 1.3}

In [25]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=2023),
    'XGBoost': XGBClassifier(),
    'LGBM': LGBMClassifier(random_state=2023)
}
for name, clf in classifiers.items():
    evals = [(X_test,y_test)]
    clf.fit(X_train, y_train) if name != 'LGBM' else clf.fit(X_train, y_train,eval_set=evals,eval_metric='logloss',verbose=True)
    pred = clf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    prc = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    print(f'{name}: 정확도 = {acc:.4f}, 정밀도 = {prc:.4f}, 재현율 = {rec:.4f}')

Random Forest: 정확도 = 0.7597, 정밀도 = 0.6735, 재현율 = 0.6111
XGBoost: 정확도 = 0.7403, 정밀도 = 0.6458, 재현율 = 0.5741
[1]	valid_0's binary_logloss: 0.615506
[2]	valid_0's binary_logloss: 0.5971
[3]	valid_0's binary_logloss: 0.57897
[4]	valid_0's binary_logloss: 0.562392
[5]	valid_0's binary_logloss: 0.546361
[6]	valid_0's binary_logloss: 0.53331
[7]	valid_0's binary_logloss: 0.523093
[8]	valid_0's binary_logloss: 0.515014
[9]	valid_0's binary_logloss: 0.506786
[10]	valid_0's binary_logloss: 0.503446
[11]	valid_0's binary_logloss: 0.496683
[12]	valid_0's binary_logloss: 0.492718
[13]	valid_0's binary_logloss: 0.488116
[14]	valid_0's binary_logloss: 0.484265
[15]	valid_0's binary_logloss: 0.480535
[16]	valid_0's binary_logloss: 0.481302
[17]	valid_0's binary_logloss: 0.482878
[18]	valid_0's binary_logloss: 0.48072
[19]	valid_0's binary_logloss: 0.479572
[20]	valid_0's binary_logloss: 0.482852
[21]	valid_0's binary_logloss: 0.484995
[22]	valid_0's binary_logloss: 0.487411
[23]	valid_0's binary_loglos

