# Author: Sandesh Basnet

# Ensemble Learning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from sklearn.ensemble import VotingClassifier

In [5]:
# Load heart.csv into a DataFrame; the path is relative to the notebook's working directory.
heart_csv_path = '../dataset/dataset/heart.csv'
df = pd.read_csv(heart_csv_path)

In [6]:
# Dataset dimensions as (rows, columns).
df.shape

(303, 14)

In [7]:
# Preview the first five rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [9]:
# List the distinct values found in the 'oldpeak' column.
df['oldpeak'].unique()

array([2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0. , 0.5, 1.6, 1.2, 0.2, 1.8,
       1. , 2.6, 1.5, 3. , 2.4, 0.1, 1.9, 4.2, 1.1, 2. , 0.7, 0.3, 0.9,
       3.6, 3.1, 3.2, 2.5, 2.2, 2.8, 3.4, 6.2, 4. , 5.6, 2.9, 2.1, 3.8,
       4.4])

In [10]:
df.describe()  # summary statistics per column; helps tell continuous columns from categorical codes

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [11]:
# Columns treated as continuous measurements (used for the correlation matrix).
numeric_cols = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

In [12]:
# Pairwise Pearson correlation (the default method) between the numeric columns only.
df.loc[:, numeric_cols].corr(method='pearson')

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
age,1.0,0.279351,0.213678,-0.398522,0.210013
trestbps,0.279351,1.0,0.123174,-0.046698,0.193216
chol,0.213678,0.123174,1.0,-0.00994,0.053952
thalach,-0.398522,-0.046698,-0.00994,1.0,-0.344187
oldpeak,0.210013,0.193216,0.053952,-0.344187,1.0


In [13]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [14]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [15]:
# Re-check the dimensions after duplicate rows were dropped.
df.shape

(302, 14)

In [16]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns=['target'])

In [17]:
from collections import Counter

In [18]:
# Count how many samples fall into each target class (class balance check).
Counter(y)

Counter({1: 164, 0: 138})

In [19]:
from sklearn.naive_bayes import GaussianNB  #probability theorem

In [20]:
# Base learners that will be combined by the voting ensemble.
# NOTE(review): KNN and logistic regression are sensitive to feature scale and no
# scaling step is visible in this notebook; consider adding a StandardScaler.

# Seeded (same seed as the train/test split) so repeated runs build the same tree.
dtree = DecisionTreeClassifier(random_state=666)
nb = GaussianNB()
knn = KNeighborsClassifier()
# The default max_iter=100 is often too few for the solver on unscaled features;
# raise the cap so it can converge instead of stopping early with a ConvergenceWarning.
logistic = LogisticRegression(max_iter=1000)

In [21]:
# Build the (name, estimator) pairs passed to VotingClassifier's `estimators` argument.
classifier_names = ['KNN', 'NaiveBayes', 'DTree', 'Logistic']
classifier_models = [knn, nb, dtree, logistic]
classifiers = list(zip(classifier_names, classifier_models))

In [22]:
# Majority-vote ensemble over the base classifiers. 'hard' is the default voting
# mode, spelled out here for clarity; n_jobs=-1 fits the estimators in parallel.
# NOTE(review): with four voters on a binary target a 2-2 tie is possible;
# check scikit-learn's tie-breaking rule if that matters for this analysis.
ensamble_clf = VotingClassifier(
    estimators=classifiers,
    voting='hard',
    n_jobs=-1,
)

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [24]:
# Fit the voting ensemble (and therefore each base classifier) on the training split.
ensamble_clf.fit(X_train, y_train)

In [25]:
# Predict class labels for the held-out test split.
y_pred = ensamble_clf.predict(X_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
# Per-class precision/recall/F1 on the test split; print() renders the report's line breaks.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.77      0.75        30
           1       0.77      0.74      0.75        31

    accuracy                           0.75        61
   macro avg       0.75      0.75      0.75        61
weighted avg       0.75      0.75      0.75        61

