
# This notebook was done as practice in manually implementing the Bootstrap Aggregation (bagging) method using Scikit-learn's decision trees.
## The dataset used came from an old data science class. The data is 1600 8x8 pixel JPG grayscale images of the handwritten digits 0-9. The labels and image names are contained in a CSV file that came with the data.


In [17]:

import numpy as np
import pandas as pd
import cv2 

# Locations of the digit images and of the label sheet that describes them.
# The images come from a data science class dataset and are 8x8 pixels each.
# NOTE: both paths are absolute and specific to one Windows machine; adjust
# them before running the notebook anywhere else.
folder_path = r"C:\Users\KennoHead\Desktop\Data Science and Machine Learning Refresher\Digit\\"
labels_csv_path = r'C:\Users\KennoHead\Desktop\Data Science and Machine Learning Refresher\Digit\label.csv'

# The label sheet lists the image file names (used below to locate each image).
image_labels_df = pd.read_csv(labels_csv_path)
file_names_list = image_labels_df['name of the file']



In [51]:

# Column names for the flattened pixels: "1" through "64" (8 x 8 = 64 pixels).
pixel_columns = [str(i) for i in range(1, 65)]


# Read every image named in the label sheet as a grayscale array with OpenCV
# and flatten it into a 64-element row.
# (matplotlib or pillow could be used to load the images as well.)
pixels_matrix = []
for file_name in file_names_list:
    image_path = folder_path + str(file_name) + '.jpg'
    current_image_pixels = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise for a missing or unreadable file; it returns
    # None, which would otherwise surface as a confusing AttributeError on
    # the reshape call below.
    if current_image_pixels is None:
        raise FileNotFoundError("Could not read image: " + image_path)
    # reshape (instead of flatten) also fails loudly if an image does not
    # contain exactly 64 pixels
    pixels_matrix.append(current_image_pixels.reshape(64,))



In [22]:

# Build one table: the 64 pixel columns first, followed by the columns of the
# label sheet (including the file names). The two frames are placed side by
# side and aligned on their row index.
pixel_values_df = pd.DataFrame(data=pixels_matrix, columns=pixel_columns)

pixels_df = pd.concat(objs=[pixel_values_df, image_labels_df], axis=1)


In [23]:

# preview the first rows to sanity-check the combined pixel/label table
pixels_df.head()


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,57,58,59,60,61,62,63,64,name of the file,digit label
0,0,2,88,217,158,21,0,0,16,0,...,4,3,87,228,182,0,9,2,0,0
1,0,0,11,187,210,78,5,0,13,5,...,0,0,0,172,251,157,0,12,1,1
2,0,0,5,59,240,199,2,0,5,0,...,0,11,0,54,173,255,138,0,2,2
3,6,0,108,255,213,13,10,0,2,131,...,0,0,118,217,226,159,0,3,3,3
4,0,6,0,34,160,7,7,0,5,0,...,0,0,2,42,250,51,0,4,4,4


In [24]:


# Train/test split: separate the pixel features from the digit labels, then
# hold out 30% of the rows for testing.

from sklearn.model_selection import train_test_split

# every column except the file name and the digit label is a pixel feature
pixels_features_list = list(pixels_df.columns)
for non_feature_column in ('name of the file', 'digit label'):
    pixels_features_list.remove(non_feature_column)

pixels_features = pixels_df[pixels_features_list]
pixels_labels = pixels_df['digit label']

# a fixed random_state keeps the split identical from run to run
X_training, X_testing, Y_training, Y_testing = train_test_split(
    pixels_features,
    pixels_labels,
    test_size = 0.3,
    random_state = 0,
)




In [34]:

# Bootstrap aggregation ("bagging"):
# draw several bootstrap samples (sampling with replacement) from the
# training set, fit one decision tree on each sample, and collect every
# tree's predictions for the test set.

from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier

n_trees = 20              # number of bootstrap samples / trees in the ensemble
bootstrap_fraction = 0.8  # size of each bootstrap sample relative to the training set

bootstrap_size = int(bootstrap_fraction * len(X_training))

dec_tree_predict_list = []

for i in range(n_trees):

    # Resample features and labels in ONE call so both are drawn with the
    # same indices. Two separate calls only stay aligned as long as their
    # seeds and sizes happen to match, which is easy to break by accident.
    # The loop index is the seed, so every tree gets a different sample.
    X_r_training, Y_r_training = resample(
        X_training,
        Y_training,
        n_samples = bootstrap_size,
        random_state = i,
        replace = True,
    )

    base_dec_tree = DecisionTreeClassifier(random_state = 0)

    base_dec_tree.fit(X_r_training, Y_r_training)

    current_tree_prediction = base_dec_tree.predict(X_testing)

    dec_tree_predict_list.append(current_tree_prediction)

# one row per tree, one column per test sample
result_df = pd.DataFrame(data = dec_tree_predict_list)

In [49]:

# Majority vote: each test sample gets the label that the most trees predicted.
# Iterating over result_df walks through its column labels, and each column
# holds the predictions of all trees for one test sample.

from collections import Counter

ensemble_predict = [
    Counter(result_df[sample_column]).most_common(1)[0][0]
    for sample_column in result_df
]


In [50]:

# Score the ensemble's majority-vote predictions against the held-out labels.
# The accuracy comes out fairly high, which suggests that bootstrap
# aggregation is an effective approach for this classification task.

from sklearn.metrics import accuracy_score

ensemble_acc = accuracy_score(y_true = Y_testing, y_pred = ensemble_predict)

print(ensemble_acc)


0.924074074074074
