In [None]:
import matplotlib.pyplot as matplot
import seaborn as sb
import pandas as pd
import time
import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the 'mnist_784' dataset from OpenML as NumPy arrays.
# as_frame=False is required: recent scikit-learn releases default to
# as_frame='auto', which returns a pandas DataFrame here and makes the
# positional slicing below (X[:, mask]) raise.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
y = y.astype(int)  # cast labels to integer class ids

# Drop constant columns (a column is constant when every row equals the row
# before it), then divide by 255 to rescale the features.
# NOTE(review): assumes raw values are 8-bit pixel intensities in [0, 255] — confirm.
X = X[:, ~np.all(X[1:] == X[:-1], axis=0)]
X = X/255

# Split into train / validation / test = 60% / 20% / 20%:
# first hold out 20% for test, then 25% of the remaining 80% for validation.
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_valid , y_train, y_valid = train_test_split(X_train_80, y_train_80, test_size = 0.25, random_state=0)

In [None]:
# Exhaustive hyperparameter search for the random forest with 10-fold CV.
# random_state pins the forest's internal randomness so that repeated runs
# score and rank the candidates identically (the data splits also use seed 0).
rf = RandomForestClassifier(n_jobs = -1, random_state = 0)
params = {
    'n_estimators':[100, 200, 500, 1000],
    'criterion':['gini', 'entropy'],
    'max_features':['sqrt', 'log2', None]
}
# 4 * 2 * 3 = 24 candidates, each evaluated on 10 folds; train scores are
# recorded as well (return_train_score=True).
grid_RF = GridSearchCV(estimator = rf, param_grid = params, cv = 10, n_jobs = -1, return_train_score = True, verbose = 3)
# NOTE: the search is fitted on the validation split, not on X_train.
grid_RF.fit(X_valid, y_valid)

In [None]:
# Show the cross-validation results as a table, best-ranked candidate first,
# rather than echoing the raw dict of arrays.
pd.DataFrame(grid_RF.cv_results_).sort_values('rank_test_score')

In [None]:
# Display the parameter combination that the grid search selected as best.
grid_RF.best_params_

In [None]:
# Final model, trained on the training split and evaluated on the test split.
# NOTE(review): the hyperparameters are hard-coded; presumably copied from
# grid_RF.best_params_ — confirm they still match the search output.
# random_state makes the reported accuracy reproducible across runs.
bes_grid_RF = RandomForestClassifier(n_estimators = 1000, criterion = 'gini', max_features = 'sqrt', n_jobs = -1, random_state = 0)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_train = time.perf_counter()
bes_grid_RF.fit(X_train, y_train)
time_train = time.perf_counter() - start_train

start_test = time.perf_counter()
y_pred = bes_grid_RF.predict(X_test)
time_test = time.perf_counter() - start_test

score = accuracy_score(y_test, y_pred)

# Timings are in seconds.
print('Training time: ', time_train)
print('Test time: ', time_test)
print('Test accuracy = ', score)

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap
# on an explicitly created axes (y axis: actual labels, x axis: predictions).
cm = confusion_matrix(y_test, y_pred)
fig, ax = matplot.subplots(figsize=(10, 6))
sb.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
matplot.show()