# Importing Necessary Libraries

In [26]:
import keras
from keras.datasets import mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from matplotlib import pyplot as plt
import numpy as np

(X_train,y_train),(X_test,y_test)=mnist.load_data()
X_train = X_train.reshape(X_train.shape[0],784)
X_test = X_test.reshape(X_test.shape[0],784)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/6)

# Refining the Dataset(if pixel value <= 100, then pixel value = 0 else pixel value = 255)

In [27]:
def makebin(vec):
  for i in range(len(vec)):
    for j in range(len(vec[0])):
      if vec[i][j]<=100:
        vec[i][j]=0
      else: 
        vec[i][j]=255
makebin(X_train) #modifying training set
makebin(X_test) #modifying test set
makebin(X_val) #modifying validation set

# Mean accuracy of 50 Decision Trees

In [28]:
n=50 #Total number of instances of decision trees
d=10 #Total number of features for every tree
DT_list=[]  #To store generated decision trees
DT_features=[]  #To store generated feature-sets
ac_sc=np.zeros(n) #To store accuracy scores of randomly generated decision trees
for i in range(n):

    #Generating random subset of 10 specific pixels
    rand_px=np.random.randint(0,784, size=d)
    X_train_DT=X_train[:,rand_px]

    #Training the DT classifier
    DT=tree.DecisionTreeClassifier(max_depth=5,criterion='entropy')
    DT.fit(X_train_DT,y_train)
    DT_list.append(DT)
    DT_features.append(rand_px)
    #Predicting and calculating accuracy of the DT
    y_pred = DT.predict(X_val[:,rand_px])
    ac_sc[i]=accuracy_score(y_val, y_pred)
#fig = plt.figure(figsize=(40,25))
#_ = tree.plot_tree(DT, filled=True)

# Mean accuracy on the validation set of all 50 DTs
print('Mean accuracy score of the 50 Decision Trees = \033[1m{:.2f}%\033[0m,'.format(np.mean(ac_sc)*100))
print('with a minimum score of \033[1m{:.2f}%\033[0m% and a maximum score of \033[1m{:.2f}%\033[0m%'.format(np.min(ac_sc)*100,np.max(ac_sc)*100))

Mean accuracy score of the 50 Decision Trees = [1m33.44%[0m,
with a minimum score of [1m23.82%[0m% and a maximum score of [1m44.05%[0m%


# Random Forest/ Ensemble on the basis of weights of validation accuracies obtained(Obtaining accuracy on testing set)

In [29]:
#Working on the test set
#Creating a prediction matrix for all 10000 testing examples across the 50 decision trees 
pred_mat=[]
for i in range(n):
    y_pred_test=DT_list[i].predict(X_test[:,DT_features[i]])
    pred_mat.append(y_pred_test)
pred_mat=np.asarray(pred_mat)

In [30]:
#Initializing list of weights for each possible label value for all test set examples 
w_list=np.zeros((10,len(y_pred_test)))

#Enumerating the vote provided each of the DTs 
for i in range(n):
    for j in range(len(y_pred_test)):
        w_list[pred_mat[i,j],j]+=ac_sc[i]

#Weighted ensemble classifier predictions:
y_weighted_pred=np.argmax(w_list,axis=0)
print('Accuracy score of the weighted ensemble/Random Forest of the 50 Decision Trees on the \033[1mtesting\033[0m examples= \033[1m{:.2f}%\033[0m'.format(accuracy_score(y_weighted_pred, y_test)*100))

Accuracy score of the weighted ensemble/Random Forest of the 50 Decision Trees on the [1mtesting[0m examples= [1m61.71%[0m


# Random Forest/ Ensemble on the basis of weights of validation accuracies obtained(Obtaining accuracy on training set)

In [31]:
#Obtaining initial weak classifier for boosting
pred_train_mat=[]
for i in range(n):
    y_pred_train=DT_list[i].predict(X_train[:,DT_features[i]])
    pred_train_mat.append(y_pred_train)
pred_train_mat=np.asarray(pred_train_mat)

#Initializing list of weights for each possible label value for all test set examples 
w_train_list=np.zeros((d,len(y_pred_train)))

#Enumerating the vote provided each of the DTs 
for i in range(n):
    for j in range(len(y_pred_train)):
        w_train_list[pred_train_mat[i,j],j]+=ac_sc[i]

#Weighted ensemble classifier predictions:
y_weighted_pred_train=np.argmax(w_train_list,axis=0)
print('Accuracy score of the weighted ensemble/Random Forest of 50 Decision Trees on the \033[1mtraining\033[0m examples= \033[1m{:.2f}%\033[0m'.format(accuracy_score(y_weighted_pred_train, y_train)*100))

Accuracy score of the weighted ensemble/Random Forest of 50 Decision Trees on the [1mtraining[0m examples= [1m60.23%[0m


# Adaboost Implementation with weighted entropy (Use of class_weights argument)

In [32]:
#Adaboost implementation:

n_ada=20
#Initializing weights
w_trainex=np.ones(len(y_train))/len(y_train)
AB_list=[]
w_ABtree=np.zeros(n_ada)
w_err=np.zeros(n_ada)

#running Adaptive-Boost (AdaBoost) for 20 iterations
for i in range(n_ada):
    w_trainex[i]=w_trainex[i]/np.sum(w_trainex)
    AB=tree.DecisionTreeClassifier(max_depth=5,criterion='entropy',class_weight='balanced')   #class_weight = 'balanced', ensures that the 
    AB.fit(X_train,y_train,sample_weight=w_trainex)                                           #weights used in the entropy aren't equally weighted    
    for j in range(len(y_train)):                                                             #but instead they are adjusted automatically as the 
        if y_train[j] != y_weighted_pred_train[j]:                                            #weights proportional to class frequencies in the input data as 
            w_err[i]+=w_trainex[j]                                                            #(n_classes * np.bincount(y))/n_samples
    AB_list.append(AB)
    w_ABtree[i]=0.5*np.log((1-w_err[i])/w_err[i])
    w_trainex[i]=w_trainex[i]*np.exp(w_ABtree[i]) if y_train[j] != y_weighted_pred_train[j] else w_trainex[i]*np.exp(-1*w_ABtree[i])

In [33]:
AB_pred_mat=[]
for i in range(n_ada):
    y_pred_test_AB=AB_list[i].predict(X_test)
    AB_pred_mat.append(y_pred_test_AB)
AB_pred_mat=np.asarray(AB_pred_mat)

w_list_AB=np.zeros((10,len(y_pred_test_AB)))

#Enumerating the vote provided each of the DTs 
for i in range(n_ada):
    for j in range(len(y_pred_test_AB)):
        w_list_AB[AB_pred_mat[i,j],j]+=w_ABtree[i]

#Weighted ensemble classifier predictions:
y_AB_final=np.argmax(w_list_AB,axis=0)
print('Accuracy score of the \033[1m\'Adaboost\'\033[0m-ed ensemble of the 50 Decision Trees on the \033[1mtesting\033[0m examples= \033[1m{:.2f}%\033[0m'.format(accuracy_score(y_AB_final, y_test)*100))

Accuracy score of the [1m'Adaboost'[0m-ed ensemble of the 50 Decision Trees on the [1mtesting[0m examples= [1m64.05%[0m
