# MNIST - Kuskapskontroll 2

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

np.random.seed(42)

# Data

In [None]:
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

X = mnist["data"]
y = mnist["target"].astype(np.uint8)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

# Modelling

In [None]:
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
logreg_clf = LogisticRegression(solver='saga', max_iter=1000, random_state=42)

In [10]:
random_forest_clf.fit(X_train, y_train)
random_forest_clf.score(X_val, y_val)

0.9692

In [11]:
extra_trees_clf.fit(X_train, y_train)
extra_trees_clf.score(X_val, y_val)

0.9715

In [None]:
logreg_clf.fit(X_train, y_train)
print(f"Logistic Regression Accuracy: {logreg_clf.score(X_val, y_val)}")




**Voting classifier**

In [13]:
# Sort of picks the concensus prediciton
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf)
]

voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)

In [14]:
# Sort of picks the concensus prediciton, removing svm_clf as it was completely wrong.
named_estimators_2 = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf)
]

voting_clf_2 = VotingClassifier(named_estimators_2)
voting_clf_2.fit(X_train, y_train)

In [18]:
print(f"All 3 models: {voting_clf.score(X_val, y_val)}")

print(f"Random forest + extra tress: {voting_clf_2.score(X_val, y_val)}")

All 3 models: 0.9642
Random forest + extra tress: 0.9713


# Evaluation

In [19]:
best_model = extra_trees_clf

# Retrain the best model on the combined training and validation sets
best_model.fit(X_train_val, y_train_val)

# Evaluate the retrained best model on the test set
test_accuracy = best_model.score(X_test, y_test)
print(f"\nTest set accuracy of the retrained best model: {test_accuracy}")


Test set accuracy of the retrained best model: 0.9682


# Summary and analysis

As usually less complicated models excel over complicated ones. 
Voting_clf was the worst if we exclude the incorrectly trained svm_clf, voting_clf_2 was 2nd best but still worse than only relying on extra_trees_clf.

The best model was extra_trees_clf with a test accuracy of 96.82%.




---- AI summary for learning purposes

This experiment compared the performance of a Random Forest, Extra Trees classifier, a Linear SVC, and two Voting Classifiers on the MNIST dataset. 
The Linear SVC performed poorly due to insufficient iterations, highlighting the importance of hyperparameter tuning. While generally simpler models are preferred to avoid overfitting, in this case, the Extra Trees classifier (with 100 estimators) outperformed both the Random Forest and the voting classifiers. 

The voting classifier using all three models (including the poorly-performing Linear SVC) yielded a lower validation accuracy than using only the Random Forest and Extra Trees. 
This likely indicates that the Linear SVC's errors negatively impacted the combined prediction. 
The best model was the Extra Trees classifier, achieving a validation accuracy of 97.15% and a final test accuracy of 96.82%. 

This relatively small difference between validation and test accuracies suggests good generalization. The Extra Trees classifier's performance is likely due to its ability to handle high-dimensional data and its robustness to overfitting, making it a strong choice for this dataset.