In [27]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
import matplotlib.pyplot as plt

In [2]:
# Read both MNIST CSV exports from the working directory: the first frame is
# used for fitting and tuning, the second is held back for the final check.
mnist_train, mnist_test = (
    pd.read_csv(csv_name) for csv_name in ('mnist_train.csv', 'mnist_test.csv')
)

In [3]:
# (rows, columns) of the training frame, as a quick sanity check on the load.
mnist_train.shape

(60000, 785)

In [4]:
# (rows, columns) of the test frame; the column count should match the training frame.
mnist_test.shape

(10000, 785)

In [5]:
# Preview the first rows to see the column layout (a `label` column followed by pixel columns).
mnist_train.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Separate the target (the `label` column) from the predictors
# (every column after the first), both as plain NumPy arrays.
y = mnist_train['label'].to_numpy()
X = mnist_train.iloc[:, 1:].to_numpy()

- Split the training data into a training and development (test) set.

In [7]:
# Carve a development split out of the labelled training data for tuning.
# No test_size is passed, so train_test_split's default proportion applies;
# the fixed seed makes the shuffle repeatable.
X_train, X_development, y_train, y_development = train_test_split(
    X,
    y,
    random_state=38,
)

# The separately supplied test file serves as the final validation set and is
# sliced the same way: `label` column for targets, remaining columns as features.
y_validation = mnist_test['label'].to_numpy()
X_validation = mnist_test.iloc[:, 1:].to_numpy()

- Add a comment explaining the purpose of the train, development (test), and test (validation) sets.

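We use the training set for the initial fitting of the model. We then evaluate the model on the development set to see what needs to be tweaked and how. Once we've made those tweaks, we make a final assessment of the model on the validation set, which has played no part in fitting or tuning and therefore gives an unbiased estimate of performance on unseen data.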

- Use the RandomForestClassifier built into sklearn to create a classification model.

In [8]:
# Baseline forest: only the seed is set (for repeatable fits); every other
# hyperparameter is left at the library default.
rfc = RandomForestClassifier(random_state=14)
rfc.fit(X_train, y_train)

In [9]:
# Mean accuracy of the baseline forest on the data it was fitted on and on the
# held-out development split; a large gap between the two hints at overfitting.
rfc_train_accuracy, rfc_development_accuracy = (
    rfc.score(features, labels)
    for features, labels in ((X_train, y_train), (X_development, y_development))
)

print(
    "\n"
    f"The training accuracy is {rfc_train_accuracy}.\n"
    f"The development accuracy is {rfc_development_accuracy}.\n"
)


The training accuracy is 1.0.
The development accuracy is 0.9668.



Since the training accuracy is perfect, the model is almost certainly over-fitting (although the accuracy on the development set is also remarkably good).

- Pick one parameter to tune, and explain why you chose this parameter.

I will fine-tune the parameter max_depth, because the default is None and so the model may be producing extremely deep trees that could benefit from some tuning.

In [10]:
# Sweep max_depth over 3..19, refitting a forest at each depth on the training
# split and scoring it on the development split. The accuracies are stored as
# well as printed, so the winning depth can be read off programmatically rather
# than hand-copied from the log.
development_accuracy_by_depth = {}
for depth in range(3, 20):
    model = RandomForestClassifier(random_state=14, max_depth=depth)
    model.fit(X_train, y_train)
    development_accuracy_by_depth[depth] = model.score(X_development, y_development)
    print(f"With max_depth = {depth} the development accuracy is {development_accuracy_by_depth[depth]}.")

# The dict is filled in ascending depth order and max() keeps the first maximal
# key it meets, so a tie resolves to the shallowest (simplest) forest.
best_depth = max(development_accuracy_by_depth, key=development_accuracy_by_depth.get)
print(f"Best max_depth = {best_depth} (development accuracy {development_accuracy_by_depth[best_depth]}).")

With max_depth = 3 the development accuracy is 0.7381333333333333.
With max_depth = 4 the development accuracy is 0.8048.
With max_depth = 5 the development accuracy is 0.8568666666666667.
With max_depth = 6 the development accuracy is 0.8858.
With max_depth = 7 the development accuracy is 0.9034.
With max_depth = 8 the development accuracy is 0.9240666666666667.
With max_depth = 9 the development accuracy is 0.936.
With max_depth = 10 the development accuracy is 0.9442.
With max_depth = 11 the development accuracy is 0.9506.
With max_depth = 12 the development accuracy is 0.9558.
With max_depth = 13 the development accuracy is 0.9588.
With max_depth = 14 the development accuracy is 0.9614666666666667.
With max_depth = 15 the development accuracy is 0.9635333333333334.
With max_depth = 16 the development accuracy is 0.9651333333333333.
With max_depth = 17 the development accuracy is 0.9654.
With max_depth = 18 the development accuracy is 0.9668.
With max_depth = 19 the development accu

- Choose which value for the parameter to set for testing on the test data and explain why.

The best development-set performance (an accuracy of 0.9668) was achieved with max_depth set to 18, so that's what we'll go with.

- Print the confusion matrix for your Random Forest model on the test set.

In [25]:
# Refit with the depth chosen from the development sweep, then predict the
# validation images the model has not seen during fitting or tuning.
final_model = RandomForestClassifier(random_state=14, max_depth=18)
final_model.fit(X_train, y_train)
y_predications = final_model.predict(X_validation)

# Pin the label order to the classes the model was trained on, so the frame's
# index and columns always line up with the matrix cells (even if some class
# were missing from the validation labels or predictions).
class_labels = final_model.classes_
cm = confusion_matrix(y_validation, y_predications, labels=class_labels)

# confusion_matrix puts true labels on rows and predicted labels on columns;
# naming the axes makes that explicit in the rendered table.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels).rename_axis(
    index='actual', columns='predicted'
)

cm_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,972,0,0,0,0,3,1,1,3,0
1,0,1122,1,4,1,1,4,0,1,1
2,6,0,994,7,4,0,4,10,7,0
3,1,0,9,972,0,7,0,9,8,4
4,1,0,1,0,951,0,5,0,2,22
5,3,0,1,19,2,848,6,3,6,4
6,9,3,0,0,4,5,933,0,4,0
7,1,5,20,2,2,0,0,984,4,10
8,4,0,4,10,5,7,4,6,921,13
9,9,5,0,11,14,6,1,5,6,952


- Report which classes the model struggles with the most.

The most frequent errors are: thinking that a 4 is a 9 (22 cases), thinking that a 7 is a 2 (20 cases), and thinking that a 5 is a 3 (19 cases).

- Report the accuracy, precision, recall, and f1-score.

In [32]:
# Score the final model's validation predictions. Precision, recall and F1 are
# multi-class here, so each is averaged across classes weighted by class support.
validation_accuracy = accuracy_score(y_validation, y_predications)
validation_precision = precision_score(y_validation, y_predications, average='weighted')
validation_recall = recall_score(y_validation, y_predications, average='weighted')
validation_f1 = f1_score(y_validation, y_predications, average='weighted')

print(
    "\n"
    f"The accuracy is {validation_accuracy}.\n"
    f"The precision is {validation_precision}.\n"
    f"The recall is {validation_recall}.\n"
    f"The f1-score is {validation_f1}.\n"
)


The accuracy is 0.9649.
The precision is 0.9649114522242768.
The recall is 0.9649.
The f1-score is 0.9648716383039476.

