

# This notebook was done as practice using ensemble learning methods to do image classification
## The dataset was given from an old advanced machine learning class. There are 1600 8x8 pixel grayscale images with a csv file containing the labels for each image.



In [2]:

# importing libraries
import numpy as np
import pandas as pd
import cv2 
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier



In [3]:

# setting up file paths for the labels and the images for the image data
# the dataset was from my data science class for image recognition
# they are 8x8 pixels in dimensions.
folder_path = r"C:\Users\KennoHead\Desktop\Data Science and Machine Learning Refresher\Digit\\"
image_labels_df = pd.read_csv(r'C:\Users\KennoHead\Desktop\Data Science and Machine Learning Refresher\Digit\label.csv')
file_names_list = image_labels_df['name of the file']



In [4]:

# creating the list of feature pixel column names.
pixel_columns = []

for i in range (1, 65, 1):
    pixel_columns.append(str(i))
    
# creating a list to append to.
# using cv2, read each image as a grayscale image
# i was taught to process image data using either
# matplotlib or pillow libraries
# then convert the pixel dimensions into a flat array.
# normally we can use flatten, but I use reshape here
pixels_matrix = []
for file_name in file_names_list:
    current_image_pixels = cv2.imread(folder_path + str(file_name) + '.jpg',0)
    pixels_matrix.append(current_image_pixels.reshape(64,))


In [5]:

# combining the pixel data and the labels (with the file names aswell)
pixels_df = pd.DataFrame(pixels_matrix, columns = pixel_columns)
pixels_df = pd.concat([pixels_df, image_labels_df], axis = 1)


In [6]:

# train test split
pixels_features_list = list(pixels_df.columns)
pixels_features_list.remove('name of the file')
pixels_features_list.remove('digit label')
pixels_features = pixels_df[pixels_features_list]
pixels_labels = pixels_df['digit label']
X_training, X_testing, Y_training, Y_testing = train_test_split(pixels_features, pixels_labels, test_size = .3, random_state = 0)



In [7]:

# here we are going to do the bootstrap aggregation method
# also known as "bagging"
# the idea is that we get a subset of the training set
# and do resampling with replacement
# with each new subset, we train a decision tree 

bootstrap_size = int(0.8 * len(X_training)) 
dec_tree_predict_list = []

for i in range(0, 30, 1):
    X_r_training = resample(X_training, n_samples = bootstrap_size, random_state = i, replace = True)
    Y_r_training = resample(Y_training, n_samples = bootstrap_size, random_state = i, replace = True)
    base_dec_tree = DecisionTreeClassifier(random_state = 0)
    base_dec_tree.fit(X_r_training, Y_r_training) 
    current_tree_prediction = base_dec_tree.predict(X_testing)
    dec_tree_predict_list.append(current_tree_prediction)
    
result_df = pd.DataFrame(data = dec_tree_predict_list)


In [8]:


# here we doing polling by majority
# to predict each sample's label

ensemble_predict = []

for current_row in result_df:
    current_row_count = Counter (result_df[current_row])
    ensemble_predict.append(current_row_count.most_common(1)[0][0])

# from this we can see the accuracy score is pretty high
# so bootstrap aggregation is an effective approach to classification

ensemble_acc = accuracy_score(Y_testing, ensemble_predict)


In [9]:

# using Adaptive Boost Classifier
adaboost_clssr = AdaBoostClassifier(n_estimators = 30, random_state = 0)
adaboost_clssr.fit(X_training, Y_training)
adaboost_predictions = adaboost_clssr.predict(X_testing)
adaboost_acc = accuracy_score(Y_testing, adaboost_predictions)



In [10]:

# using the Extreme Gradient Boost Classifier
xgb_clssr = XGBClassifier(n_estimators= 30, random_state = 0, use_label_encoder = False)
xgb_clssr.fit (X_training, Y_training, eval_metric = 'logloss')
xgb_predictions = xgb_clssr.predict (X_testing)
xgb_acc = accuracy_score(Y_testing, xgb_predictions)



In [11]:

# using the Random Forest Classifier
rfc = RandomForestClassifier(n_estimators = 30, bootstrap = True, random_state = 0)
rfc.fit(X_training, Y_training) 
rfc_predictions = rfc.predict(X_testing)
rfc_acc = accuracy_score(Y_testing, rfc_predictions)


In [12]:

print(ensemble_acc) 
print(adaboost_acc)
print(xgb_acc) 
print(rfc_acc) 


0.9277777777777778
0.2037037037037037
0.9333333333333333
0.9574074074074074
