In [1]:
# Importing Necessary modules

import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from math import log
import numpy as np
import pandas as pd

In [2]:
# Loading the data

df=pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv')
df.columns=list('ABCDEFGHI')
df

Unnamed: 0,A,B,C,D,E,F,G,H,I
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
762,10,101,76,48,180,32.9,0.171,63,0
763,2,122,70,27,0,36.8,0.340,27,0
764,5,121,72,23,112,26.2,0.245,30,0
765,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       767 non-null    int64  
 1   B       767 non-null    int64  
 2   C       767 non-null    int64  
 3   D       767 non-null    int64  
 4   E       767 non-null    int64  
 5   F       767 non-null    float64
 6   G       767 non-null    float64
 7   H       767 non-null    int64  
 8   I       767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
df.isnull().sum()

A    0
B    0
C    0
D    0
E    0
F    0
G    0
H    0
I    0
dtype: int64

Data is already cleaned, so we can directly give it to the model. We can also do preprocessing, to churn out accuracy

In [6]:
x_train,x_test,y_train,y_test=train_test_split(df.drop('I',axis=1),df["I"],test_size=0.2)

In [7]:
x_train

Unnamed: 0,A,B,C,D,E,F,G,H
220,2,158,90,0,0,31.6,0.805,66
131,3,170,64,37,225,34.5,0.356,30
599,1,108,88,19,0,27.1,0.400,24
633,10,92,62,0,0,25.9,0.167,31
509,12,84,72,31,0,29.7,0.297,46
...,...,...,...,...,...,...,...,...
328,6,105,70,32,68,30.8,0.122,37
53,7,150,66,42,342,34.7,0.718,42
632,1,128,82,17,183,27.5,0.115,22
134,2,125,60,20,140,33.8,0.088,31


In [8]:
x_test

Unnamed: 0,A,B,C,D,E,F,G,H
485,1,139,62,41,480,40.7,0.536,21
63,7,114,66,0,0,32.8,0.258,42
456,5,86,68,28,71,30.2,0.364,24
486,0,173,78,32,265,46.5,1.159,58
601,1,124,74,36,0,27.8,0.100,30
...,...,...,...,...,...,...,...,...
743,13,153,88,37,140,40.6,1.174,39
617,9,112,82,24,0,28.2,1.282,50
567,4,154,72,29,126,31.3,0.338,37
718,5,97,76,27,0,35.6,0.378,52


In [9]:
# Extracting the values

x_train=x_train.values
x_test=x_test.values
y_train=y_train.values
y_test=y_test.values

In [10]:
# Creating our custom Adaboost class

class AdaBoost:
    def __init__(self,n_estimators=20):
        self.n_estimators=n_estimators
    
    # code to train the models
    def fit(self,x,y):
        self.models=[]
        self.model_weights=[]
        d=np.ones(x.shape[0])/x.shape[0]
        for i in range(self.n_estimators):
            m=DecisionTreeClassifier(max_depth=1) # used many weak learners
            m.fit(x,y,sample_weight=d) 
            res=m.predict(x)
            k=(res!=y)
            s=np.sum(d[k])
            a=0.5*(log((1-s)/s))
            for j in range(len(d)):
                if(res[j]==y[j]):
                    d[j]*=np.exp(-a)
                else:
                    d[j]*=np.exp(a)
            d=d/d.sum()
            self.models.append(m) # saved every model here
            self.model_weights.append(a) # added the model weights for each weak learner(used at prediction time)
            
    # code to generate prediction on the trained model
    def predict(self,x):
        l=[]
        for i in x:
            d={}
            for j in range(self.n_estimators):
                res=self.models[j].predict([i])
                if(res[0] in d):
                    d[res[0]]+=self.model_weights[j]
                else:
                    d[res[0]]=self.model_weights[j]
            l1=list(d.items())
            l1.sort(key=lambda x:x[1],reverse=True)
            l.append(l1[0][0])
        return np.array(l)

In [11]:
model=AdaBoost(n_estimators=20) # Creating the object
model.fit(x_train,y_train) # training it on our data

In [12]:
y_pred=model.predict(x_test)

In [13]:
# Accuracy of the custom coded model on the test-set

print('Accuracy is: ',accuracy_score(y_test,y_pred))

Accuracy is:  0.7207792207792207


In [14]:
# F1-Score of the custom coded model on the test-set

print('F1-Score is: ',f1_score(y_test,y_pred,average='weighted'))

F1-Score is:  0.7136413227783787
