<a href="https://colab.research.google.com/github/MarcoMulas99/AI_OCR_Project/blob/main/AI_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Optical Character Recognition task

Classification is a kind of supervised learning that predicts the class (or category) of input “objects”
out of a predefined set of classes. Handwritten digit recognition is a ten-class problem (0, 1, …, 9).

**Dataset**
We use the MNIST dataset, a set of handwritten digits.

It is included in the TensorFlow library. TensorFlow provides built-in functions to directly download and load popular datasets like MNIST.
https://www.tensorflow.org/api_docs/python/tf/keras/datasets/mnist/load_data



Images in the MNIST dataset are stored as a three-dimensional array (samples × height × width). To use them with scikit-learn classifiers, each image has to be flattened into a one-dimensional array.



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import numpy as np
import time


In [None]:
#----preprocessing----
def load_mnist():
  """Load MNIST through Keras and rescale the images.

  Returns:
  - Two pairs, ``(x_train, y_train), (x_test, y_test)``, in the same order in
    which ``tf.keras.datasets.mnist.load_data()`` yields them. Both image
    arrays are divided by 255.0 so that pixel values fit into the interval
    [0, 1]; the label arrays are passed through untouched.
  """
  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

  # One shared scale factor for the training and the test images
  pixel_scale = 255.0
  return (train_images / pixel_scale, train_labels), (test_images / pixel_scale, test_labels)

# Fetch the normalized training and test splits
(x_train, y_train), (x_test, y_test) = load_mnist()

# Collapse every image into a single row: one row per sample, all remaining axes merged
x_train_flattened = x_train.reshape(len(x_train), -1)
x_test_flattened = x_test.reshape(len(x_test), -1)



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, x_test, y_test, model_name="", training_time=None, extra_info=None):
    """
    Evaluate a trained classifier on labelled data, print a report and return the metrics.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Args:
    - model: The trained machine learning model to evaluate; must expose ``predict(x_test)``.
    - x_test: Feature data to predict on.
    - y_test: True labels for ``x_test``.
    - model_name: Name of the model shown in the report header (optional).
    - training_time: Time taken to train the model, in seconds (optional).
      Printed only when it is not None.
    - extra_info: Any additional information to print (e.g., number of trees,
      model architecture). Skipped when empty or None.

    Returns:
    - A dictionary with the keys "accuracy", "precision", "recall", "f1" and
      "training_time" (the value passed in, possibly None).

    Note:
    - When the call is the last expression of a notebook cell, the returned
      dictionary is echoed as the cell result; end the line with ``;`` to
      suppress that.
    """

    # Predict the labels once and reuse them for every metric
    y_pred = model.predict(x_test)

    # zero_division=0 yields the same score as scikit-learn's default ("warn")
    # for classes that are never predicted, but without emitting an
    # UndefinedMetricWarning for every such model.
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
        "f1": f1_score(y_test, y_pred, average='weighted', zero_division=0),
        "training_time": training_time,
    }

    # Print evaluation results
    print(f"--- {model_name} Evaluation ---")
    if training_time is not None:
        print(f"Training time: {training_time:.4f} seconds")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")

    # Print additional information if provided
    if extra_info:
        print(extra_info)

    print("-----------------------------")

    # Return metrics as a dictionary (the original returned None despite its docstring)
    return metrics


## Decision trees

In [None]:
#----Decision trees----
# Train a decision tree with default hyperparameters and time the fit
start_time = time.time()
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_dt  = end_time - start_time

# Evaluate on the test set. extra_info_dt used to be built but never handed
# to evaluate_model; it is now passed like in the other decision-tree cells.
extra_info_dt = "Decision Tree:"
evaluate_model(decision_tree, x_test_flattened, y_test, "Decision Tree",
               training_time=training_time_dt, extra_info=extra_info_dt)

--- Decision Tree Evaluation ---
Training time: 24.9235 seconds
Accuracy: 0.8765
Precision: 0.8763
Recall: 0.8765
F1 Score: 0.8764
-----------------------------


In [None]:
# Decision tree that uses entropy instead of the default split criterion
start_time = time.time()
decision_tree_entropy = DecisionTreeClassifier(criterion="entropy")
decision_tree_entropy.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_dt_entropy = end_time - start_time

# Evaluate the decision tree model with entropy.
# extra_info_dt_entropy is defined here so the cell also runs on a fresh kernel
# (it was previously referenced without ever being assigned in the notebook).
extra_info_dt_entropy = "Decision Tree (Entropy):"
evaluate_model(decision_tree_entropy, x_test_flattened, y_test, "Decision Tree (Entropy)",
               training_time=training_time_dt_entropy, extra_info=extra_info_dt_entropy)


--- Decision Tree (Entropy) Evaluation ---
Training time: 19.3759 seconds
Accuracy: 0.8864
Precision: 0.8865
Recall: 0.8864
F1 Score: 0.8864
Decision Tree (Entropy):
-----------------------------


In [None]:
# Overfitting: a tree with no depth limit and the smallest split/leaf sizes
start_time = time.time()
decision_tree_overfit = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)
decision_tree_overfit.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_dt_overfit = end_time - start_time

extra_info_dt_overfit = "Decision Tree (Overfit): max_depth=None, min_samples_split=2, min_samples_leaf=1"

# Score on the held-out test data ...
evaluate_model(
    decision_tree_overfit,
    x_test_flattened,
    y_test,
    "Decision Tree (Overfit)",
    training_time=training_time_dt_overfit,
    extra_info=extra_info_dt_overfit,
)

# ... and on the very data the tree was fitted on, to expose the train/test gap
evaluate_model(
    decision_tree_overfit,
    x_train_flattened,
    y_train,
    "Decision Tree (Overfit on Training Data)",
    training_time=training_time_dt_overfit,
    extra_info=extra_info_dt_overfit + " (Training Data)",
)



--- Decision Tree (Overfit) Evaluation ---
Training time: 27.0926 seconds
Accuracy: 0.8772
Precision: 0.8774
Recall: 0.8772
F1 Score: 0.8772
Decision Tree (Overfit): max_depth=None, min_samples_split=2, min_samples_leaf=1
-----------------------------
--- Decision Tree (Overfit on Training Data) Evaluation ---
Training time: 27.0926 seconds
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Decision Tree (Overfit): max_depth=None, min_samples_split=2, min_samples_leaf=1 (Training Data)
-----------------------------


In [None]:
# Underfitting: a depth-2 tree that additionally requires 10 samples per split and per leaf
start_time = time.time()
decision_tree_underfit = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=10,
    min_samples_leaf=10,
)
decision_tree_underfit.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_underfit = end_time - start_time

# Report the test-set metrics of the constrained tree
extra_info_dt_underfit = "Decision Tree (Underfit): max_depth=2, min_samples_split=10, min_samples_leaf=10"
evaluate_model(
    decision_tree_underfit,
    x_test_flattened,
    y_test,
    "Decision Tree (Underfit)",
    training_time=training_time_underfit,
    extra_info=extra_info_dt_underfit,
)


--- Decision Tree (Underfit) Evaluation ---
Training time: 2.4622 seconds
Accuracy: 0.3447
Precision: 0.1614
Recall: 0.3447
F1 Score: 0.2144
Decision Tree (Underfit): max_depth=2, min_samples_split=10, min_samples_leaf=10
-----------------------------


  _warn_prf(average, modifier, msg_start, len(result))


## Random forest

In [None]:
#----Random forest-----
# Random Forest with different n_estimators values
def _fit_and_evaluate_random_forest(n_estimators):
    """Train a RandomForestClassifier with ``n_estimators`` trees and report its test metrics.

    All other hyperparameters are left at their defaults. The fit is timed
    with ``time.time()`` and the model is evaluated on the flattened test set
    through ``evaluate_model``.

    Args:
    - n_estimators: Number of trees in the forest.

    Returns:
    - A tuple ``(model, training_time, extra_info)``: the fitted forest, the
      fit duration in seconds and the info line that was printed with the report.
    """
    fit_start = time.time()
    model = RandomForestClassifier(n_estimators=n_estimators)
    model.fit(x_train_flattened, y_train)
    training_time = time.time() - fit_start

    extra_info = f"Number of trees (n_estimators): {n_estimators}"
    evaluate_model(model, x_test_flattened, y_test, f"Random Forest ({n_estimators} estimators)",
                   training_time=training_time, extra_info=extra_info)
    return model, training_time, extra_info


# One run per forest size; the per-size names are kept so they stay available afterwards
random_forest_10, training_time_rf_10, extra_info_rf_10 = _fit_and_evaluate_random_forest(10)
random_forest_100, training_time_rf_100, extra_info_rf_100 = _fit_and_evaluate_random_forest(100)
random_forest_1000, training_time_rf_1000, extra_info_rf_1000 = _fit_and_evaluate_random_forest(1000)

--- Random Forest (10 estimators) Evaluation ---
Training time: 4.9764 seconds
Accuracy: 0.9480
Precision: 0.9481
Recall: 0.9480
F1 Score: 0.9479
Number of trees (n_estimators): 10
-----------------------------
--- Random Forest (100 estimators) Evaluation ---
Training time: 53.3689 seconds
Accuracy: 0.9699
Precision: 0.9699
Recall: 0.9699
F1 Score: 0.9699
Number of trees (n_estimators): 100
-----------------------------
--- Random Forest (1000 estimators) Evaluation ---
Training time: 530.0451 seconds
Accuracy: 0.9721
Precision: 0.9721
Recall: 0.9721
F1 Score: 0.9721
Number of trees (n_estimators): 1000
-----------------------------


In [None]:
# 100 trees with bootstrap sampling switched off
N_ESTIMATORS = 100
start_time = time.time()
random_forest_100_no_bootstrap = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    bootstrap=False,
)
random_forest_100_no_bootstrap.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_rf_100_no_bootstrap = end_time - start_time

# Report the test-set metrics together with the configuration used
extra_info_rf_100_no_bootstrap = f"Number of trees (n_estimators): {N_ESTIMATORS}, Bootstrap: False"
evaluate_model(
    random_forest_100_no_bootstrap,
    x_test_flattened,
    y_test,
    "Random Forest (100 estimators, Bootstrap=False)",
    training_time=training_time_rf_100_no_bootstrap,
    extra_info=extra_info_rf_100_no_bootstrap,
)


--- Random Forest (100 estimators, Bootstrap=False) Evaluation ---
Training time: 82.0344 seconds
Accuracy: 0.9736
Precision: 0.9736
Recall: 0.9736
F1 Score: 0.9736
Number of trees (n_estimators): 100, Bootstrap: False
-----------------------------


In [None]:
N_ESTIMATORS = 100

# Overfitting: trees without a depth limit and with the smallest split/leaf sizes
start_time = time.time()
rf_overfit = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)
rf_overfit.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_rf_overfit = end_time - start_time
extra_info_rf_overfit = f"Overfitting: n_estimators={N_ESTIMATORS}, max_depth=None, min_samples_split=2, min_samples_leaf=1"
# Test-set report first, then the same forest scored on its own training data
evaluate_model(
    rf_overfit,
    x_test_flattened,
    y_test,
    "Random Forest (Overfit)",
    training_time=training_time_rf_overfit,
    extra_info=extra_info_rf_overfit,
)
evaluate_model(
    rf_overfit,
    x_train_flattened,
    y_train,
    "Random Forest (Overfit on Training Data)",
    training_time=training_time_rf_overfit,
    extra_info=extra_info_rf_overfit + " (Training Data)",
)

# Underfitting: depth-2 trees that need 10 samples per split and per leaf
start_time = time.time()
rf_underfit = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=2,
    min_samples_split=10,
    min_samples_leaf=10,
)
rf_underfit.fit(x_train_flattened, y_train)
end_time = time.time()
training_time_rf_underfit = end_time - start_time
extra_info_rf_underfit = f"Underfitting: n_estimators={N_ESTIMATORS}, max_depth=2, min_samples_split=10, min_samples_leaf=10"
evaluate_model(
    rf_underfit,
    x_test_flattened,
    y_test,
    "Random Forest (Underfit)",
    training_time=training_time_rf_underfit,
    extra_info=extra_info_rf_underfit,
)

--- Random Forest (Overfit) Evaluation ---
Training time: 53.3907 seconds
Accuracy: 0.9693
Precision: 0.9693
Recall: 0.9693
F1 Score: 0.9693
Overfitting: n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1
-----------------------------
--- Random Forest (Overfit on Training Data) Evaluation ---
Training time: 53.3907 seconds
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Overfitting: n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1 (Training Data)
-----------------------------
--- Random Forest (Underfit) Evaluation ---
Training time: 7.7032 seconds
Accuracy: 0.6471
Precision: 0.7065
Recall: 0.6471
F1 Score: 0.6161
Underfitting: n_estimators=100, max_depth=2, min_samples_split=10, min_samples_leaf=10
-----------------------------


## Neural Networks
We use the Multi-layer Perceptron classifier (`MLPClassifier`) from the scikit-learn library.

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

Here we evaluate the performance of different architectures. We first see how the performance of a single hidden layer perceptron network improves as the number of neurons increases. We finally test a considerably more complex network with multiple layers to check how much we can improve on the previous architecture style.

In [None]:
# Single-hidden-layer networks of increasing width (1 neuron, then 10 to 100 in
# steps of 10), all trained with mini-batches of 32 samples ...
MLPClassifiers = [
    MLPClassifier(hidden_layer_sizes=(width,), max_iter=500, batch_size=32)
    for width in (1, *range(10, 101, 10))
]
# ... followed by one deeper four-layer network trained with mini-batches of 512
MLPClassifiers.append(
    MLPClassifier(hidden_layer_sizes=(512, 512, 256, 128), max_iter=500, batch_size=512)
)

for clf in MLPClassifiers:
  # Train the classifier and time the fit
  start_time = time.time()
  clf.fit(x_train_flattened, y_train)
  end_time = time.time()
  training_time_nn = end_time - start_time

  # Evaluate the classifier. evaluate_model predicts on the data it is given,
  # so the extra train/test predict calls whose results were never used are gone.
  extra_info_nn = f"Network architecture: {clf.hidden_layer_sizes}"
  evaluate_model(clf, x_test_flattened, y_test, "Neural Network", training_time=training_time_nn, extra_info=extra_info_nn)

  _warn_prf(average, modifier, msg_start, len(result))


--- Neural Network Evaluation ---
Training time: 16.9151 seconds
Accuracy: 0.1135
Precision: 0.0129
Recall: 0.1135
F1 Score: 0.0231
Network architecture: (1,)
-----------------------------
--- Neural Network Evaluation ---
Training time: 184.1986 seconds
Accuracy: 0.9330
Precision: 0.9332
Recall: 0.9330
F1 Score: 0.9329
Network architecture: (10,)
-----------------------------
--- Neural Network Evaluation ---
Training time: 518.3890 seconds
Accuracy: 0.9521
Precision: 0.9522
Recall: 0.9521
F1 Score: 0.9521
Network architecture: (20,)
-----------------------------
--- Neural Network Evaluation ---
Training time: 337.9443 seconds
Accuracy: 0.9636
Precision: 0.9637
Recall: 0.9636
F1 Score: 0.9636
Network architecture: (30,)
-----------------------------
--- Neural Network Evaluation ---
Training time: 435.4614 seconds
Accuracy: 0.9680
Precision: 0.9681
Recall: 0.9680
F1 Score: 0.9680
Network architecture: (40,)
-----------------------------
--- Neural Network Evaluation ---
Training time