In [4]:
# load libraries and modules 
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import mnist
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [5]:
# import the data
(X_train, y_train), (X_test, y_test) = mnist.load_data()

The training set is used to train the model.

The development set is used to give an early indication of how well the model is performing, and to help tune the parameters of the model.

The test set is used to judge the performance of the model after tuning.

In [6]:
# check data shape
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [7]:
# reshape data for the model
X_train_reshaped = X_train.reshape((60000,28*28))
X_test_reshaped = X_test.reshape((10000,28*28))


In [8]:
# create model
model = RandomForestClassifier()

# fit model on training data
model.fit(X_train_reshaped, y_train)

# make predictions
preds = model.predict(X_test_reshaped)

# print accuracy 
print("Accuracy:", accuracy_score(y_test,preds))


Accuracy: 0.9689


I'm going to pick n_estimators to tune, to see how the number of trees created by the model changes the accuracy. It seems important.

In [9]:
#create empty accuracy variable to update in loop
optimal_model_accuracy = 0
optimal_estimators = 0

# loop from n_accuracy from 5 to 100 in crements of 5, training and running the model and printing the optimal results
for i in range(5, 105, 5):
    model = RandomForestClassifier(n_estimators = i)
    model.fit(X_train_reshaped, y_train)
    preds = model.predict(X_test_reshaped)
    model_accuracy = accuracy_score(y_test,preds)
    if model_accuracy > optimal_model_accuracy:
        optimal_model_accuracy = model_accuracy
        optimal_estimators = i
print("Accuracy:", optimal_model_accuracy)
print("n_estimators:", optimal_estimators)

Accuracy: 0.9698
n_estimators: 85


It appears that the model works best with 85 estimators.

In [10]:
# make model again
model = RandomForestClassifier(n_estimators = 85)
model.fit(X_train_reshaped, y_train)
preds = model.predict(X_test_reshaped)

# create confusion matrix
classes = ['0','1','2','3','4','5','6','7','8','9']
conf_mat = confusion_matrix(y_test, preds)
conf_matrix_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
conf_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,970,0,0,0,0,3,3,1,3,0
1,0,1123,2,3,0,2,2,0,2,1
2,6,0,998,4,3,0,4,9,8,0
3,1,0,9,971,0,9,0,10,8,2
4,1,0,0,0,957,0,6,0,3,15
5,4,0,1,14,2,857,6,1,5,2
6,7,3,0,0,3,5,936,0,3,1
7,2,3,19,1,2,0,0,988,4,9
8,4,0,4,11,3,4,3,3,935,7
9,6,6,3,12,13,3,1,5,5,955


In [11]:
# calculate accuracies of each class
for i in range(10):
    accuracy = conf_matrix_df.iloc[i][i] / conf_matrix_df.sum(axis='columns')[i]
    print(f"{i} accuracy = {accuracy}")

0 accuracy = 0.9897959183673469
1 accuracy = 0.9894273127753304
2 accuracy = 0.9670542635658915
3 accuracy = 0.9613861386138614
4 accuracy = 0.9745417515274949
5 accuracy = 0.9607623318385651
6 accuracy = 0.9770354906054279
7 accuracy = 0.9610894941634242
8 accuracy = 0.9599589322381931
9 accuracy = 0.9464816650148662


It appears that the number 9 is the toughest class for the model to recognise.

As printed before, the overall accuracy is 0.9698

In [12]:
from sklearn.metrics import precision_recall_fscore_support

precision_recall_fscore_support(preds, y_test)

(array([0.98979592, 0.98942731, 0.96705426, 0.96138614, 0.97454175,
        0.96076233, 0.97703549, 0.96108949, 0.95995893, 0.94648167]),
 array([0.96903097, 0.98942731, 0.96332046, 0.95570866, 0.97355036,
        0.97055493, 0.97398543, 0.97148476, 0.9579918 , 0.96270161]),
 array([0.97930338, 0.98942731, 0.96518375, 0.95853899, 0.9740458 ,
        0.9656338 , 0.97550808, 0.96625917, 0.95897436, 0.95452274]),
 array([1001, 1135, 1036, 1016,  983,  883,  961, 1017,  976,  992],
       dtype=int64))