In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


# Load heart.csv from the course repository into a DataFrame.
file_path = "https://github.com/MyungKyuYi/AI-class/raw/main/heart.csv"
df = pd.read_csv(file_path)

# Show the number of missing values in every column.
print(df.isna().sum())
print("\n=====================================================\n")

# Show how many rows carry each label value, to spot class imbalance.
print(df['target'].value_counts())
print("\n=====================================================\n")


# Every column is already numeric, so no categorical encoding is required.

# Separate the label column from the feature columns.
Y = df['target']  # Label
X = df.drop(columns='target')  # Feature

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Print the shape of each resulting split.
print("\n=====================================================\n")
print("X_train.shape : ", X_train.shape)
print("X_test.shape : ", X_test.shape)
print("Y_train.shape : ", Y_train.shape)
print("Y_test.shape : ", Y_test.shape)


# Decision Tree (DT)
print("\n=====================================================\n")
# Seed the tree so the fitted model, and therefore every metric printed below,
# is reproducible. scikit-learn permutes the candidate features at each split,
# so an unseeded tree can differ from one run to the next even though the
# train/test split itself is seeded.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, Y_train)  # Train on the scaled training split.
# dt_predictions holds the label predicted for each row of X_test.
dt_predictions = dt_model.predict(X_test)
# Compare the predictions with the true test labels.
print("Decision Tree Accuracy:", accuracy_score(Y_test, dt_predictions))
print("\nDecision Tree Classification Report:\n", classification_report(Y_test, dt_predictions))
print("DT Confusion Matrix:\n", confusion_matrix(Y_test, dt_predictions))  # Confusion matrix.

# Random Forest (RF)
print("\n=====================================================\n")
# Seed the forest so the bootstrap samples and per-split feature sampling are
# the same on every run; without a seed the printed metrics change between
# executions even though the train/test split is fixed.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, Y_train)  # Train on the scaled training split.
# rf_predictions holds the label predicted for each row of X_test.
rf_predictions = rf_model.predict(X_test)
# Compare the predictions with the true test labels.
print("Random Forest Accuracy:", accuracy_score(Y_test, rf_predictions))
print("\nRandom Forest Classification Report:\n", classification_report(Y_test, rf_predictions))
print("RF Confusion Matrix:\n", confusion_matrix(Y_test, rf_predictions))  # Confusion matrix.

# Support Vector Machine (SVM)
print("\n=====================================================\n")
svm_model = SVC()  # SVM classifier with default hyperparameters.
svm_model.fit(X_train, Y_train)  # Train on the scaled training split.
# svm_predictions holds the label predicted for each row of X_test.
svm_predictions = svm_model.predict(X_test)
# Compare the predictions with the true test labels.
print("SVM Accuracy:", accuracy_score(Y_test, svm_predictions))
# Use the default zero_division handling, like the other model reports. With
# zero_division=1 a class that is never predicted would be shown with a perfect
# precision of 1.0, hiding a degenerate model; the default reports 0 and warns.
print("\nSVM Classification Report:\n", classification_report(Y_test, svm_predictions))
print("SVM Confusion Matrix:\n", confusion_matrix(Y_test, svm_predictions))  # Confusion matrix.


# Logistic Regression (LR)
print("\n=====================================================\n")
# Build the model and train it on the scaled training split in one step
# (fit returns the estimator itself).
lr_model = LogisticRegression().fit(X_train, Y_train)
# lr_predictions holds the label predicted for each row of X_test.
lr_predictions = lr_model.predict(X_test)
# Score the predictions against the true test labels.
print(
    "Logistic Regression Accuracy:",
    accuracy_score(Y_test, lr_predictions),
)
print(
    "\nLogistic Regression Classification Report:\n",
    classification_report(Y_test, lr_predictions),
)
# Confusion matrix.
print(
    "LR Confusion Matrix:\n",
    confusion_matrix(Y_test, lr_predictions),
)

# K-Nearest Neighbors (KNN)
print("\n=====================================================\n")
# K=5 is requested explicitly. Build the model and train it on the scaled
# training split in one step (fit returns the estimator itself).
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
# knn_predictions holds the label predicted for each row of X_test.
knn_predictions = knn_model.predict(X_test)
# Score the predictions against the true test labels.
print(
    "KNN Accuracy:",
    accuracy_score(Y_test, knn_predictions),
)
print(
    "\nKNN Algorithm Classification Report:\n",
    classification_report(Y_test, knn_predictions),
)
# Confusion matrix.
print(
    "KNN Confusion Matrix:\n",
    confusion_matrix(Y_test, knn_predictions),
)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


target
1    165
0    138
Name: count, dtype: int64




X_train.shape :  (242, 13)
X_test.shape :  (61, 13)
Y_train.shape :  (242,)
Y_test.shape :  (61,)


Decision Tree Accuracy: 0.819672131147541

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.90      0.83        29
           1       0.89      0.75      0.81        32

    accuracy                           0.82        61
   macro avg       0.83      0.82      0.82        61
weighted avg       0.83      0.82      0.82        61

DT Confusion Matrix:
 [[26  3]
 [ 8 24]]


Random Forest Accuracy: 0.8688524590163934

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86   