# Loading, Exploring, and Cleaning the Data

In [1]:
# import packages
import pandas as pd
import numpy as np

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'

In [3]:
# load the data
credData = pd.read_csv(url_path, header=None, na_values='?')
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [4]:
# Change the classes to 1 and 0
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [5]:
# Find the number of null values in the dataset
credData.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [6]:
# print the shape and data types of each column
print(f'Shape of the dataset: {credData.shape}')
print(f'Types of data in dataset: {credData.dtypes}')

Shape of the dataset: (690, 16)
Types of data in dataset: 0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15     object
dtype: object


There are few NA values, so we will drop them.

In [7]:
# Remove the rows with na values
credData.dropna(inplace=True, axis=0)
credData.shape

(653, 16)

In [8]:
# Verify that no null values
credData.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [9]:
# make dummy values from the categorical variables
credCat = pd.get_dummies(credData[[0, 3, 4, 5, 6, 8, 9, 11, 12]])

In [10]:
# Separate the numerical variables
credNum = credData[[1, 2, 7, 10, 13, 14]]

In [11]:
# Create the X and y variables
X = pd.concat([credCat, credNum], axis=1)
y = pd.Series(credData[15], dtype='int')

In [12]:
# Normalize the dataset
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_trans = pd.DataFrame(scaler.fit_transform(X))

In [13]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.3, random_state=123)

# Ensemble Model Using the Averaging Technique

In [14]:
# import packages
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [15]:
# define the three base models
model1 = LogisticRegression(random_state=123)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = RandomForestClassifier(n_estimators=500)

In [16]:
# fit the models
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500)

In [17]:
# predict the probabilities of each model
pred1 = model1.predict_proba(X_test)
pred2 = model2.predict_proba(X_test)
pred3 = model3.predict_proba(X_test)

In [18]:
# Average the predictions generated from all of the three models
ensemblepred = (pred1 + pred2 + pred3) / 3

In [19]:
# Display the first four rows of the ensemble prediction
ensemblepred[0:4, :]

array([[0.90183583, 0.09816417],
       [0.95996145, 0.04003855],
       [0.18757473, 0.81242527],
       [0.05216821, 0.94783179]])

In [20]:
# Print the order of each class from the prediction output
print(model1.classes_)
print(model2.classes_)
print(model3.classes_)

[0 1]
[0 1]
[0 1]


In [21]:
# get the final predictions for each example
pred = np.argmax(ensemblepred, axis=1)
pred

array([0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1],
      dtype=int64)

In [22]:
# print the confusion matrix
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, pred))

[[96 11]
 [ 8 81]]


In [23]:
# print classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       107
           1       0.88      0.91      0.90        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



# Ensemble Model Using the Weighted Averaging Technique

In [24]:
# Take the weighted average of the predictions
ensemblepred = (0.60 * pred1 + 0.20 * pred2 + 0.20 * pred3)

In [25]:
# Display the first four rows of the ensemble prediction array
ensemblepred[0:4, :]

array([[0.9209045 , 0.0790955 ],
       [0.9471306 , 0.0528694 ],
       [0.14563452, 0.85436548],
       [0.08910278, 0.91089722]])

In [26]:
# Calculate the final predictions from the probabilities
pred = np.argmax(ensemblepred, axis=1)
pred

array([0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1],
      dtype=int64)

In [27]:
# print the confusion matrix
print(confusion_matrix(y_test, pred))

[[94 13]
 [ 8 81]]


In [28]:
# print classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90       107
           1       0.86      0.91      0.89        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



# Ensemble Model Using Max Voting

In [29]:
# construct the ensemble model using the VotingClassifier() function
from sklearn.ensemble import VotingClassifier

model1 = LogisticRegression(random_state=123)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = RandomForestClassifier(n_estimators=500)

model = VotingClassifier(estimators=[('lr', model1), ('knn', model2), ('rf', model3)], voting='hard')

In [30]:
# fit the model
model.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=123)),
                             ('knn', KNeighborsClassifier()),
                             ('rf', RandomForestClassifier(n_estimators=500))])

In [31]:
# calculate accuracy
print(f'Accuracy on test set: {model.score(X_test, y_test)}')

Accuracy on test set: 0.9030612244897959


In [32]:
# make predictions
pred = model.predict(X_test)

In [33]:
# print confusion matrix
print(confusion_matrix(y_test, pred))

[[95 12]
 [ 7 82]]


In [34]:
# print classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       107
           1       0.87      0.92      0.90        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



# Ensemble Learning Using Bagging

In [35]:
# Define the base learner
bl1 = RandomForestClassifier(random_state=123)

In [36]:
# construct the ensemble model
from sklearn.ensemble import BaggingClassifier

baggingLearner = BaggingClassifier(base_estimator=bl1,
                                  n_estimators=10,
                                  max_samples=0.8,
                                  max_features=0.7)

In [37]:
# fit the model
model = baggingLearner.fit(X_train, y_train)

In [38]:
# make predictions
pred = model.predict(X_test)

In [39]:
# confusion matrix
print(confusion_matrix(y_test, pred))

[[94 13]
 [ 9 80]]


In [40]:
# classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.91      0.88      0.90       107
           1       0.86      0.90      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



# Ensemble Learning Using Boosting

In [41]:
# Define the base learner
bl1 = LogisticRegression(random_state=123)

In [42]:
# construct the ensemble model
from sklearn.ensemble import AdaBoostClassifier

boosting = AdaBoostClassifier(base_estimator=bl1, n_estimators=200)

In [43]:
# fit the model
model = boosting.fit(X_train, y_train)

In [44]:
# make predictions
pred = model.predict(X_test)

In [45]:
# confusion matrix
print(confusion_matrix(y_test, pred))

[[96 11]
 [ 8 81]]


In [46]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       107
           1       0.88      0.91      0.90        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



# Ensemble Learning Using Stacking

In [47]:
# create base learners
bl1 = KNeighborsClassifier(n_neighbors=5)
bl2 = RandomForestClassifier(random_state=123)
ml = LogisticRegression(random_state=123)

In [48]:
# create stacking classifier
from mlxtend.classifier import StackingClassifier

stackcl = StackingClassifier(classifiers=[bl1, bl2], meta_classifier=ml)

In [49]:
# fit the model
model = stackcl.fit(X_train, y_train)

In [50]:
# make predictions
pred = model.predict(X_test)

In [51]:
# confusion matrix
print(confusion_matrix(y_test, pred))

[[97 10]
 [ 9 80]]


In [52]:
# classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       107
           1       0.89      0.90      0.89        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196

