# Model aggregation

1. This approach allows us to improve the model accuracy.
2. Lower error.
3. Higher consistency that means avoids over fitting.
4. Reduce bias and variance error.

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Loading Iris Dataset

In [16]:
data = load_iris()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['target'] = data['target']
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [17]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [19]:
df['target'].unique()

array([0, 1, 2])

In [0]:
X = df.drop(columns=['target'], axis=1)
y = df['target']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Decision Tree Classifier

In [27]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [28]:
model.score(X_test, y_test)

0.9666666666666667

# Overfitting Model

In [29]:
model.score(X_train, y_train)

1.0

# Random forest classifier 

In [32]:
modelRF = RandomForestClassifier(n_estimators=10)
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
modelRF.score(X_test, y_test)

0.9666666666666667

In [35]:
modelRF.score(X_train, y_train)

0.9833333333333333

# Now let's start with bagging classifier from sklearn

In [0]:
model_bagging = BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5, max_features=1, n_estimators=20)

In [40]:
model_bagging.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [41]:
model_bagging.score(X_test, y_test)

0.9666666666666667

# Let's try ada-boosting

In [0]:
model_bagging = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10, learning_rate=1)

In [44]:
model_bagging.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                          

In [45]:
model_bagging.score(X_test, y_test)

0.9666666666666667

# Let's check out voting ensemble classifier

In [0]:
df['target'] = df['target'].apply(pd.to_numeric) 

In [63]:
df.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object

In [0]:
y_train = y_train.astype(float)


In [75]:
y_train

84     1.0
47     0.0
108    2.0
1      0.0
93     1.0
144    2.0
125    2.0
92     1.0
86     1.0
46     0.0
135    2.0
7      0.0
65     1.0
10     0.0
132    2.0
13     0.0
15     0.0
61     1.0
91     1.0
116    2.0
27     0.0
69     1.0
118    2.0
136    2.0
97     1.0
96     1.0
43     0.0
64     1.0
119    2.0
90     1.0
      ... 
40     0.0
32     0.0
146    2.0
143    2.0
66     1.0
49     0.0
8      0.0
30     0.0
117    2.0
56     1.0
21     0.0
0      0.0
131    2.0
52     1.0
126    2.0
38     0.0
44     0.0
147    2.0
57     1.0
55     1.0
94     1.0
109    2.0
103    2.0
58     1.0
137    2.0
50     1.0
87     1.0
104    2.0
129    2.0
122    2.0
Name: target, Length: 120, dtype: float64

In [88]:
y_test

128    2.0
18     0.0
130    2.0
105    2.0
107    2.0
78     1.0
83     1.0
14     0.0
5      0.0
133    2.0
25     0.0
11     0.0
12     0.0
63     1.0
113    2.0
34     0.0
60     1.0
2      0.0
24     0.0
123    2.0
35     0.0
124    2.0
68     1.0
26     0.0
29     0.0
19     0.0
41     0.0
16     0.0
20     0.0
101    2.0
Name: target, dtype: float64

In [0]:
y_test = y_test.astype(float)

In [77]:
y_test.dtype

dtype('float64')

In [0]:
lr = LogisticRegression()
svm = SVC(kernel='poly', degree=2)
dt = DecisionTreeClassifier()

In [0]:
final_model = VotingClassifier(estimators=[('lr', lr),('dt', dt), ('svm', svm)], voting='hard')

In [125]:
final_model.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,...
                                        

In [126]:
final_model.score(X_test, y_test)

0.9763560500695411

# We can see that model aggregation is very effective and improve the accuracy level to 97%

# Trying different dataset


*   Digit Dataset



In [0]:
from sklearn.datasets import load_digits


In [0]:
data = load_digits()

In [93]:
data

{'DESCR': ".. _digits_dataset:\n\nOptical recognition of handwritten digits dataset\n--------------------------------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 5620\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttps://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number o

In [0]:
X = data.data
y = data.target


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [102]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [103]:
model.score(X_test, y_test)

0.8400556328233658

In [104]:
model.score(X_train, y_train)

1.0

# so this model is overfitting

In [0]:
model_bagging = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10, learning_rate=1)

In [107]:
model_bagging.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                          

In [108]:
model_bagging.score(X_test, y_test)

0.8456189151599444

# Voting classifier

In [0]:
lr = LogisticRegression()
svm = SVC(kernel='poly', degree=2)
dt = DecisionTreeClassifier()

In [0]:
model = VotingClassifier(estimators=[('lr', lr),('dt', dt),('svm', svm)], voting='hard')

In [116]:
model.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,...
                                        

In [117]:
model.score(X_test, y_test)

0.9694019471488178

# Improved results using model aggregation