# MNIST
The objective of this project is to use different algorithms from Sklearn and Tensorflow to identify hand-written digits, training our models using the MNIST dataset.

## 1. Read Data

In [1]:
'''
First, we need to download the mnist dataset from sklearn db
'''
from sklearn.datasets import fetch_openml
X,y = fetch_openml('mnist_784', return_X_y=True)

## 2. Data Preprocessing

In [2]:
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [3]:
'''
For regression, it is necessary to normalize the data before. Knowing that every image is on scale of greys, we can just divide every cell into 255, the maximum number that a byte can reach.
'''
X_train_scaled = X_train / 255
X_test_scaled = X_test / 255

In [4]:
'''
Tensorflow recives only ndarray, so i'm gonna create a copy of the data and convert the original into a pandas df
'''
import numpy as np
X_train_reshape = X_train.values.reshape(X_train.shape[0], 28, 28, 1)
X_test_reshape = X_test.values.reshape(X_test.shape[0], 28, 28, 1)
input_shape = (28, 28, 1)

X_train_reshape = X_train_reshape.astype(np.float32)
X_test_reshape = X_test_reshape.astype(np.float32)


X_train_reshape /= 255
X_test_reshape /= 255

## 3. Model Creation
For this project, we will be comparing the next algorithms:

In [5]:
#Logistic Regression
#extrated from https://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_logistic_regression_mnist.html
from sklearn.linear_model import LogisticRegression
#Let's create a scaled model from our scaled dataset
log_reg = LogisticRegression(C=50.0 / 5000, penalty='l1', solver='saga', tol=0.1, max_iter=1000)

In [6]:
#SVM
#extracted from https://dmkothari.github.io/Machine-Learning-Projects/SVM_with_MNIST.html
from sklearn.svm import SVC
svm = SVC()

In [7]:
#Random Forest
#extracted from https://www.kaggle.com/ashwani07/mnist-classification-using-random-forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)

In [8]:
#Neural Network (MPL)
#exctracted from https://dmkothari.github.io/Machine-Learning-Projects/MLP_with_MNIST.html
from sklearn.neural_network import MLPClassifier
mpl = MLPClassifier()

In [9]:
#CNN
#extracted from https://towardsdatascience.com/image-classification-in-10-minutes-with-mnist-dataset-54c35b77a38d
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
import tensorflow as tf

cnn = Sequential()
cnn.add(Conv2D(28, kernel_size=(3,3), input_shape=input_shape)) #para interpretar matrices de 28x28
cnn.add(MaxPooling2D(pool_size=(2,2)))
cnn.add(Flatten())
cnn.add(Dense(128, activation=tf.nn.relu))
cnn.add(Dropout(0.2))
cnn.add(Dense(10, activation=tf.nn.softmax))

## 4. Adjust Model with Historic Data

In [10]:
#Logistic Regression
log_reg.fit(X_train_scaled, y_train)

LogisticRegression(C=0.01, max_iter=1000, penalty='l1', solver='saga', tol=0.1)

In [11]:
#SVM
svm.fit(X_train, y_train)

SVC()

In [12]:
#Random Forest
rf.fit(X_train, y_train)

RandomForestClassifier()

In [13]:
#MPL
mpl.fit(X_train, y_train)

MLPClassifier()

In [19]:
#CNN
cnn.compile(optimizer ='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])
cnn.fit(x=X_train_reshape, y=y_train, epochs=10)

Epoch 1/10


UnimplementedError:  Cast string to float is not supported
	 [[node sparse_categorical_crossentropy/Cast (defined at <ipython-input-19-9d7c4ef127ac>:5) ]] [Op:__inference_train_function_2138]

Function call stack:
train_function


## 5. Prediction for new Data

In [20]:
#things i will use on this section
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [21]:
#Logistic Regression
log_reg_score = log_reg.score(X_test_scaled, y_test)
print(f"Logistic Regression score: {log_reg_score}")

Logistic Regression score: 0.8948


In [22]:
#SVM
svm_pred = svm.predict(X_test)
svm_score = classification_report(y_test, svm_pred)
print(svm_score)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       980
           1       0.99      0.99      0.99      1135
           2       0.98      0.97      0.98      1032
           3       0.97      0.99      0.98      1010
           4       0.98      0.98      0.98       982
           5       0.99      0.98      0.98       892
           6       0.99      0.99      0.99       958
           7       0.98      0.97      0.97      1028
           8       0.97      0.98      0.97       974
           9       0.97      0.96      0.97      1009

    accuracy                           0.98     10000
   macro avg       0.98      0.98      0.98     10000
weighted avg       0.98      0.98      0.98     10000



In [None]:
#Random Forest
rd_pred = rf.predict(X_test)
rd_score = classification_report(y_test, rd_pred)
print(rd_score)

In [None]:
#MPL
mpl_pred = mpl.predict(X_test)
mpl_score = classification_report(y_test, rd_pred)
print(mpl_score)

In [None]:
#CNN
cnn.evaluate(X_test_reshape, y_test)


## 6. Visualization of Results