출처: 
[Machine Learning for Beginners with 7 Models](https://www.kaggle.com/mehmetzahitylmaz/machine-learning-for-beginners-with-7-models)

* Import Libraries and Datasets 🔌
* Exploring Dataset 🔍
* Spliting Numerical and Categorical Values ✂️
* Exploring Categorical Columns 🔦
* Spliting Columns for One Hot Encoding and Label Encoding ✍️
* Investigating Missing Values 🕵️‍♂️
* Bring Data Together 📖
* Split Data to Train and Test 👟
* Training our models 💪
* Compare Models Perfomance ⚡️⭐️
* Performance of Models and Last Words 🎯

In [12]:
# pandas & numpy 불러오기
import pandas as pd
import numpy as np

# 데이터 프로세싱을 위한 라이브러리
from sklearn.preprocessing import LabelEncoder

# 결측값 채우기 위한 라이브러리
from sklearn.impute import SimpleImputer

# train data를 분리하기 위한 라이브러리
from sklearn.model_selection import train_test_split

# 시각화 도구 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate and result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

# 데이터 불러오기
df=pd.read_csv("Kaggle/heart-2.csv")



In [39]:
# 경고 표시 숨기기
import warnings
warnings.filterwarnings("ignore")

# EXplorting Dataset

In [13]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,Male,asymptomatic,145,233,high,normal,150,no,2.3,0,0,fixed deffect,1
1,37,Male,non anginal pain,130,250,normal,S-T abnormanilty,187,no,3.5,0,0,reversible deffect,1
2,41,Female,atypical angina,130,204,normal,normal,172,no,1.4,2,0,reversible deffect,1
3,56,Male,atypical angina,120,236,normal,S-T abnormanilty,178,no,0.8,2,0,reversible deffect,1
4,57,Female,typical angina,120,354,normal,S-T abnormanilty,163,yes,0.6,2,0,reversible deffect,1


In [15]:
print("Rows:",len(df))

Rows: 303


In [19]:
# Numerical Data Summary
df.describe()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,131.623762,246.264026,149.646865,1.039604,1.39934,0.729373,0.544554
std,9.082101,17.538143,51.830751,22.905161,1.161075,0.616226,1.022606,0.498835
min,29.0,94.0,126.0,71.0,0.0,0.0,0.0,0.0
25%,47.5,120.0,211.0,133.5,0.0,1.0,0.0,0.0
50%,55.0,130.0,240.0,153.0,0.8,1.0,0.0,1.0
75%,61.0,140.0,274.5,166.0,1.6,2.0,1.0,1.0
max,77.0,200.0,564.0,202.0,6.2,2.0,4.0,1.0


In [18]:
# Categorical Data Summary
df.describe(include=[np.object])

Unnamed: 0,sex,cp,fbs,restecg,exang,thal
count,303,303,303,303,303,303
unique,2,4,2,3,2,4
top,Male,typical angina,normal,S-T abnormanilty,no,reversible deffect
freq,207,143,258,152,204,166


# Numerical and Categorical Values 분리

selec_dtypes 기능을 이용하여 원하는 유형의 데이터 타입 추출. 
* 개체 유형을 포함할 때 문자열 열을 선택
* 개체 유형을 제외할 때 숫자 열을 선택

In [20]:
numerical_column = df.select_dtypes(exclude="object").columns.tolist()
categorical_column= df.select_dtypes(include="object").columns.tolist()
print("Numerical Columns:", numerical_column)
print("Categorical_Columns:", categorical_column)

Numerical Columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca', 'target']
Categorical_Columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'thal']


# Categorical Columns 

열이 가지고 있는 unique 값을 확인할 수 있으며, 이런 정보는 추후 분리하여 encoding할 예정

In [21]:
df[categorical_column].describe()

Unnamed: 0,sex,cp,fbs,restecg,exang,thal
count,303,303,303,303,303,303
unique,2,4,2,3,2,4
top,Male,typical angina,normal,S-T abnormanilty,no,reversible deffect
freq,207,143,258,152,204,166


# Spliting Columns for One Hot Encoding and Label Encoding

Data Set 안에는 6개의 범주형 데이터가 있다. One Hot Encoding을 사용하여 고유값이 10개 미만과 2개이상인 열로 변환  

나머지 열은 Lable Encoding으로 변환(각 고유 레이브렝 대한 숫자 값을 제공하는 대신 새 열을 정의 하지 않음)  

성능에 나쁜 영향을 미치기 때문에 열에 10개이상의 고유값을 가지지 않도록 One Hot Encoding을 사용  

"Dummy Variable" 문제가 발생하므로 고유값 2개에 One hot Encoding을 사용하지 않음

In [23]:
# Get column names have less than 10 more than 2 unique values
to_one_hot_encoding = [col for col in categorical_column if df[col].nunique()<=10 and df[col].nunique()>2]

# Get Categorical Column names thoose are not in "to_one_hot_encoding"
to_label_encoding = [col for col in categorical_column if not col in to_one_hot_encoding]

print("TO One Hot Encoding:", to_one_hot_encoding)
print("TO Label Encoding:", to_label_encoding)

TO One Hot Encoding: ['cp', 'restecg', 'thal']
TO Label Encoding: ['sex', 'fbs', 'exang']


# Investingatign Missing Valeus

In [24]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

# One Hot Encoding and Label Encoding

In [25]:
# 내장된 판다 함수 "get_delfmies()"를 사용하여 One hot Encoding 
one_hot_encoding_colums = pd.get_dummies(df[to_one_hot_encoding])
one_hot_encoding_colums

Unnamed: 0,cp_asymptomatic,cp_atypical angina,cp_non anginal pain,cp_typical angina,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect
0,1,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,1
3,0,1,0,0,0,1,0,0,0,0,1
4,0,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
298,0,0,0,1,0,1,0,1,0,0,0
299,1,0,0,0,0,1,0,1,0,0,0
300,0,0,0,1,0,1,0,1,0,0,0
301,0,0,0,1,0,1,0,1,0,0,0


In [28]:
# Label Encoding 

label_encoded_columns =[]

for col in to_label_encoding:
    # 각 새로운 컬럼에 대한 새로운 label encoder로 정의
    le = LabelEncoder()
    # Encode 하고 이에 대한 새로운 데이터 프레임 생성
    # 컬럼명을 "contraction"으로 지정
    column_dataframe = pd.DataFrame(le.fit_transform(df[col]),
                                   columns=[col])
    # "label_encoded_columns"리스트에 새로운 DataFrame을 추가
    label_encoded_columns.append(column_dataframe)
    
# 데이터 프레임 합치기(Merge)
label_encoded_columns = pd.concat(label_encoded_columns, axis=1)
label_encoded_columns

Unnamed: 0,sex,fbs,exang
0,1,0,0
1,1,1,0
2,0,1,0
3,1,1,0
4,0,1,1
...,...,...,...
298,0,1,1
299,1,1,0
300,1,0,0
301,1,1,1


# Bring Data Together

변경된 데이터를 가져와 한개의 데이터 프레임으로 만들고 시험하기

In [32]:
#  데이터 프레임을 복사하여 X 변수로 정의 
X=df.copy()

# Categorical columns 버리기
X.drop(categorical_column, axis=1, inplace=True)

# 데이터 프레임 합치기
X = pd.concat([X, one_hot_encoding_colums, label_encoded_columns], axis=1)
print("All columns:", X.columns.tolist())
X

All columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca', 'target', 'cp_asymptomatic', 'cp_atypical angina', 'cp_non anginal pain', 'cp_typical angina', 'restecg_2', 'restecg_S-T abnormanilty', 'restecg_normal', 'thal_deffect-3', 'thal_fixed deffect', 'thal_normal', 'thal_reversible deffect', 'sex', 'fbs', 'exang']


Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,target,cp_asymptomatic,cp_atypical angina,...,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect,sex,fbs,exang
0,63,145,233,150,2.3,0,0,1,1,0,...,0,0,1,0,1,0,0,1,0,0
1,37,130,250,187,3.5,0,0,1,0,0,...,0,1,0,0,0,0,1,1,1,0
2,41,130,204,172,1.4,2,0,1,0,1,...,0,0,1,0,0,0,1,0,1,0
3,56,120,236,178,0.8,2,0,1,0,1,...,0,1,0,0,0,0,1,1,1,0
4,57,120,354,163,0.6,2,0,1,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
299,45,110,264,132,1.2,1,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
300,68,144,193,141,3.4,1,2,0,0,0,...,0,1,0,1,0,0,0,1,0,0
301,57,130,131,115,1.2,1,1,0,0,0,...,0,1,0,1,0,0,0,1,1,1


# Train & Test 데이터로 나누기

머신러닝 모델을 돌리기 위한 Train & Test 로 나눔  

먼저 X와 Y 두 그룹으로 나눔

X 는 Target Column이 없는 데이터  
Y 는 예측을 위한 Target 

In [33]:
# Y에 대한 정의 (예측을 위한 값)
y=df["target"]

# X의 "class" 버리기
X.drop(["target"], axis=1, inplace=True)
X

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,cp_asymptomatic,cp_atypical angina,cp_non anginal pain,...,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect,sex,fbs,exang
0,63,145,233,150,2.3,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,37,130,250,187,3.5,0,0,0,0,1,...,0,1,0,0,0,0,1,1,1,0
2,41,130,204,172,1.4,2,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
3,56,120,236,178,0.8,2,0,0,1,0,...,0,1,0,0,0,0,1,1,1,0
4,57,120,354,163,0.6,2,0,0,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
299,45,110,264,132,1.2,1,0,1,0,0,...,0,1,0,1,0,0,0,1,1,0
300,68,144,193,141,3.4,1,2,0,0,0,...,0,1,0,1,0,0,0,1,0,0
301,57,130,131,115,1.2,1,1,0,0,0,...,0,1,0,1,0,0,0,1,1,1


**train & test 분리**

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Train

가장 적합한 머신러닝 모델을 찾기 위해서 몇개의 모델을 사용할 것 

nominal values ("True or False" or "Category A, Category B, C .." 와 같은 타입) 예측을 위해서 Classifiers를 사용  

numerical 값(exp. 회사 급여)을 예측하기 위해서는 Reggressors를 사용 

사용해볼 알고리즘들 :   

* Random Forest 
* Desicion Tree
* Logistic Regression Classifier
* Bernouilli Naive Bias
* Gaussian Naive Bias
* KNN (K-Nearest Neighbors)
* XGBoost 

## Random Forest
Random forest 는 n_estimator인수가 필요하지 않은 알고리즘 중에 하나
n_estimator를 사용하여 다른 예측을 얻을 수 있음
하지만 너무 많이 늘리면 실제로 원하지 않은 과도한 train된 모델이 나올 수 있으므로 주의 필요

In [36]:
# Random Forest 모델 정의
rf = RandomForestClassifier(n_estimators=100)

# We fit our model with our train data
rf.fit(X_train, y_train)

# X_test 데이터의 예측결과
pred_rf=rf.predict(X_test)

# See First 10 Predictionis and They Actual Values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 1 0 1 1 1 1 1 1 1]
Actual: 256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## Decision Tree

In [37]:
# define Decision Tree Model
dt = DecisionTreeClassifier()

# We fit our model with our train data
dt.fit(X_train, y_train)

# Then predict results from X-test data
pred_dt = dt.predict(X_test)

# See First 10 Predicatoins and They Actual Values
print("Predicted: ", pred_dt[0:10])
print("Actual: ", y_test[0:10])

Predicted:  [0 0 0 0 1 1 1 1 0 1]
Actual:  256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## Logistic Regression

In [40]:
# define Decision Tree Model
log = LogisticRegression()

# We fit our model with our train data
log.fit(X_train, y_train)

# Then predict results from X-test data
pred_log=log.predict(X_test)

# See First 10 Predicatoins and They Actual Values
print("Predicted: ", pred_log[0:10])
print("Actual:", y_test[0:10])

Predicted:  [0 1 0 1 0 1 1 1 1 1]
Actual: 256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## Bernouilli Naive Bias

In [41]:
# define Decision Tree Model
bnb = BernoulliNB()

# We fit our model with our train data
bnb.fit(X_train, y_train)

# Then predict results from X-test data
pred_bnb = bnb.predict(X_test)

# See First 10 Predicatoins and They Actual Values
print("Predicted: ", pred_bnb[0:10])
print("Actual:", y_test[0:10])

Predicted:  [0 1 0 1 1 1 1 1 1 1]
Actual: 256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## Gaussian Naive BIas

In [44]:
# define Decision Tree Model
gnb=GaussianNB()

# We fit our model with our train data
gnb.fit(X_train, y_train)

# Then predict results from X-test data
pred_gnb= gnb.predict(X_test)

# See First 10 Predicatoins and They Actual Values
print("Predicted: ", pred_gnb[0:10])
print("Actual:", y_test[0:10])

Predicted:  [0 1 0 1 1 1 1 1 1 1]
Actual: 256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## KNN(K-Nearest Neighbors)

KNN은 데이터 가장 가까운 요소의 계산 기반   
KNN은 "n_neighbors"와 "filename"의 몇가지 인수를 사용  
"n_neighbors"은 결과를 예측하는 데 사용할 가장 가까운 neighbros의 수임  
"metric"은 neighbors와의 거리를 계산하는 방법

In [45]:
# define Decision Tree Model
knn = KNeighborsClassifier(n_neighbors=3, metric="minkowski")

# We fit our model with our train data
knn.fit(X_train, y_train)

# Then predict results from X-test data
pred_knn=knn.predict(X_test)

# See First 10 Predicatoins and They Actual Values
print("Predicted: ", pred_knn[0:10])
print("Actual:", y_test[0:10])

Predicted:  [0 0 0 1 1 1 1 1 0 1]
Actual: 256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## XGBoost
gradient boosting을 기반으로 하는 라이브러리  
매우 빠르고 정확하지만 기능을 잘 활용하려면 지식이 필요하다.


In [47]:
# define Decision Tree Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)

# We fit our model with our train data
xgb.fit(X_train, y_train,
       early_stopping_rounds=5,
        # 5번 이내에 향상시키지 않으면 멈춤
        # 시간 절약 할 수 있꼬 overtrian하지 않을 수 있음
       eval_set=[(X_test, y_test)],
        # 모델 성능을 평가하기 위해 test데이터를 제공 
       verbose=False)

# Then predict results from X-test data
pred_xgb = xgb.predict(X_test)

# See First 10 Predicatoins and They Actual Values
print("Predicted: ", pred_xgb[0:10])
print("Actual:", y_test[0:10])

Predicted:  [0 1 0 1 0 1 1 1 1 1]
Actual: 256    0
93     1
239    0
261    0
119    1
92     1
104    1
125    1
137    1
82     1
Name: target, dtype: int64


## 모델 성능 비교

**"Confusion Matrix"  와 "Accuracy_score"** 를 사용하여 모델성능을 비교하여 최고 성능의 모델 선택

### Confusion Matrixes

In [49]:
# Confusion Matrixes
# 첫번째 파라미터는 실제값
# 두번째 파라미터는 예상한 값

# Random Forest
cm_rf=confusion_matrix(y_test, pred_rf)
# Decisoin Tree
cm_dt=confusion_matrix(y_test, pred_dt)
# Logistic Regression
cm_log=confusion_matrix(y_test, pred_log)
# Bernouili Naive Bias
cm_bnb=confusion_matrix(y_test, pred_bnb)
# Gaussian Naive Bias
cm_gnb=confusion_matrix(y_test, pred_gnb)
# KNN(K-Nearest Neighbors)
cm_knn=confusion_matrix(y_test, pred_knn)
# XGBoost
cm_xgb=confusion_matrix(y_test, pred_xgb)

print("***********************")
print("Confusion Matrixes")
print("***********************")
print("Random Forest:\n", cm_rf)
print("Decision Tree:\n", cm_dt)
print("Logistic Regression:\n", cm_log)
print("Bernouili Naive Bias:\n", cm_bnb)
print("Gaussian Naive Bias:\n", cm_gnb)
print("KNN (K-Nearest Neighbors):\n", cm_knn)
print("XGBoost:\n", cm_xgb)

***********************
Confusion Matrixes
***********************
Random Forest:
 [[34  7]
 [13 46]]
Decision Tree:
 [[31 10]
 [24 35]]
Logistic Regression:
 [[32  9]
 [ 9 50]]
Bernouili Naive Bias:
 [[33  8]
 [10 49]]
Gaussian Naive Bias:
 [[29 12]
 [ 4 55]]
KNN (K-Nearest Neighbors):
 [[25 16]
 [22 37]]
XGBoost:
 [[35  6]
 [19 40]]


In [50]:
# Accuracy Scores
# 첫번째 파라미터는 실제값
# 두번째 파라미터는 예상한 값

# Random Forest
acc_rf=accuracy_score(y_test, pred_rf)
# Decisoin Tree
acc_dt=accuracy_score(y_test, pred_dt)
# Logistic Regression
acc_log=accuracy_score(y_test, pred_log)
# Bernouili Naive Bias
acc_bnb=accuracy_score(y_test, pred_bnb)
# Gaussian Naive Bias
acc_gnb=accuracy_score(y_test, pred_gnb)
# KNN(K-Nearest Neighbors)
acc_knn=accuracy_score(y_test, pred_knn)
# XGBoost
acc_xgb=accuracy_score(y_test, pred_xgb)

print("***********************")
print("Accuracy Score")
print("***********************")
print("Random Forest:\n", acc_rf)
print("Decision Tree:\n", acc_dt)
print("Logistic Regression:\n", acc_log)
print("Bernouili Naive Bias:\n", acc_bnb)
print("Gaussian Naive Bias:\n", acc_gnb)
print("KNN (K-Nearest Neighbors):\n", acc_knn)
print("XGBoost:\n", acc_xgb)

***********************
Accuracy Score
***********************
Random Forest:
 0.8
Decision Tree:
 0.66
Logistic Regression:
 0.82
Bernouili Naive Bias:
 0.82
Gaussian Naive Bias:
 0.84
KNN (K-Nearest Neighbors):
 0.62
XGBoost:
 0.75
