In [2]:
import numpy as np
import pandas as pd
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# 資料描述
---
#### Abstract: This dataset is a heart disease database similar to a database already present in the repository (Heart Disease databases) but in a slightly different form

### 判斷是否有心臟方面的疾病




### 欄位描述
---
-- 1. age 

-- 2. sex 

-- 3. chest pain type (4 values) 

-- 4. resting blood pressure 

-- 5. serum cholesterol in mg/dl 

-- 6. fasting blood sugar > 120 mg/dl 

-- 7. resting electrocardiographic results (values 0,1,2) 

-- 8. maximum heart rate achieved 

-- 9. exercise induced angina 

-- 10. oldpeak = ST depression induced by exercise relative to rest 

-- 11. the slope of the peak exercise ST segment 

-- 12. number of major vessels (0-3) colored by fluoroscopy 

-- 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect 


In [3]:
# Column labels for the 13 attributes followed by the target column,
# listed in the order the fields appear in the data file.
header = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'serum_cholestoral',
    'fasting_blood_sugar',
    'resting_electrocardiographic_results',
    'heart_rate',
    'exercise_induced_angina',
    'oldpeak',
    'ST',
    'vessels',
    'thal',
    'Absence',
]
# `names=` supplies the labels, so the first line of the space-separated
# file is read as data rather than as a header row.
df = pd.read_csv('heart.dat', names=header, sep=' ')

df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic_results,heart_rate,exercise_induced_angina,oldpeak,ST,vessels,thal,Absence
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1


In [4]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
age                                     270 non-null float64
sex                                     270 non-null float64
chest_pain_type                         270 non-null float64
resting_blood_pressure                  270 non-null float64
serum_cholestoral                       270 non-null float64
fasting_blood_sugar                     270 non-null float64
resting_electrocardiographic_results    270 non-null float64
heart_rate                              270 non-null float64
exercise_induced_angina                 270 non-null float64
oldpeak                                 270 non-null float64
ST                                      270 non-null float64
vessels                                 270 non-null float64
thal                                    270 non-null float64
Absence                                 270 non-null int64
dtypes: float64(13), int64(1)
memory usage:

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic_results,heart_rate,exercise_induced_angina,oldpeak,ST,vessels,thal,Absence
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296,1.444444
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659,0.497827
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,1.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0,1.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0,1.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,2.0


In [6]:
# Split the target column off from the feature frame.
# DataFrame.pop removes 'Absence' from df in place, so running this cell a
# second time used to raise KeyError. The guard makes the cell safe to
# re-run: if the column is already gone, the previously extracted y is kept.
# NOTE(review): the column holds the values 1 and 2; presumably 1 = absence
# and 2 = presence of heart disease -- confirm against the dataset docs.
if 'Absence' in df.columns:
    y = df.pop('Absence')

In [7]:
# Re-inspect the frame after the target column has been popped off.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 13 columns):
age                                     270 non-null float64
sex                                     270 non-null float64
chest_pain_type                         270 non-null float64
resting_blood_pressure                  270 non-null float64
serum_cholestoral                       270 non-null float64
fasting_blood_sugar                     270 non-null float64
resting_electrocardiographic_results    270 non-null float64
heart_rate                              270 non-null float64
exercise_induced_angina                 270 non-null float64
oldpeak                                 270 non-null float64
ST                                      270 non-null float64
vessels                                 270 non-null float64
thal                                    270 non-null float64
dtypes: float64(13)
memory usage: 27.5 KB


# 資料切分
---
## 總筆數 270
### 切分 70/30 做訓練與測試用
##### 訓練集 189 筆、測試集 81 筆

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    random_state=4,
)
# Report how many rows ended up in each partition.
print("Train DataSet:", len(X_train), ", Test DataSet:", len(X_test))

Train DataSet: 189 , Test DataSet: 81


# KNN
---
### 鄰居數 k : 3
### 距離度量 : Minkowski (p=2,即歐氏距離)

In [9]:
# Number of neighbours that vote on each prediction.
k = 3
# fit() returns the estimator itself, so construction and training chain.
# NOTE(review): the features are passed in unscaled, so columns with large
# ranges may dominate the distance -- consider standardising first.
KNN = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
KNN

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [10]:
# Mean accuracy of the fitted classifier on the training and held-out rows.
knn_train_acc = KNN.score(X_train, y_train)
knn_test_acc = KNN.score(X_test, y_test)
print("Train accuracy %.2f" % knn_train_acc)
print("Test accuracy  %.2f" % knn_test_acc)

Train accuracy 0.83
Test accuracy  0.58
