# Classification on a handwritten digit image dataset

### Step 1 (Install packages) 

In [None]:
!pip install opencv-python

In [None]:
!pip install matplotlib

In [None]:
!pip install numpy 

In [None]:
!pip install scipy

In [None]:
!pip install scikit-learn

### Step 2 (Import packages)

In [None]:
import cv2
import matplotlib.pyplot as plt 
import numpy as np
from scipy import io

# About dataset

Hoda's handwritten digits dataset, which is the first large dataset of Persian handwritten digits, consists of 102,353 black-and-white handwritten samples. This dataset was prepared during a master's thesis project on handwritten form recognition. The data in this collection was extracted from approximately 12,000 registration forms for the 2005 Master's entrance exam and the 2004 Continuous Associate Degree exam of the University of Applied Science and Technology [1].

In [None]:
# Load the MATLAB file; loadmat returns a dict keyed by variable name.
file = io.loadmat(file_name="Data_hoda_full.mat")

### Step 3 (EDA)

In [None]:
# Pull the image array out of the loaded dictionary and display its shape.
data = file["Data"]
np.shape(data)

In [None]:
# Pull the label array out of the loaded dictionary and display its shape.
labels = file["labels"]
np.shape(labels)

#### The arrays carry a redundant singleton dimension, so we squeeze them to shape (X,)

In [None]:
# Drop every length-1 axis from both arrays.
data, labels = data.squeeze(), labels.squeeze()

In [None]:
# Preview the first image of the dataset in grayscale.
first_image = data[0]
plt.imshow(first_image, cmap="gray")

### Step 4 (Split the data into train and test sets)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the samples for testing. A fixed random_state makes the
# split (and the index-based previews that follow) reproducible between runs,
# and stratifying on the labels keeps the class proportions the same in the
# train and test subsets.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)

In [None]:
# Preview the first training image in grayscale.
first_train_image = x_train[0]
plt.imshow(first_train_image, cmap="gray")

#### Note: the images in the dataset do not all have the same size, so we resize every one of them to (10, 10)

In [None]:
# Resize every image of both subsets to 10x10, train subset first.
new_x_train, new_x_test = (
    [cv2.resize(sample, dsize=(10, 10)) for sample in subset]
    for subset in (x_train, x_test)
)

In [None]:
# Preview one resized training image together with its label.
sample_idx = 45000
plt.imshow(new_x_train[sample_idx], cmap="gray")
print(f"label for this picture is  >> {y_train[sample_idx]}")

In [None]:
# Report the shapes of the resized train and test collections, one per line.
print(np.shape(new_x_train), np.shape(new_x_test), sep="\n")

In [None]:
# Flatten each 10x10 image into a 100-element feature vector.
new_x_train = np.asarray(new_x_train).reshape(-1, 100)
new_x_test = np.asarray(new_x_test).reshape(-1, 100)

In [None]:
# Report the shapes of the flattened train and test matrices, one per line.
print(new_x_train.shape, new_x_test.shape, sep="\n")

### Step 5 (Build the model, find its best parameter and fit the data)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Base k-nearest-neighbours estimator handed to the grid search below.
# n_neighbors=5 is the library default, spelled out here for clarity.
model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Candidate neighbour counts 1..10 for the grid search.
parameter = [{"n_neighbors": list(range(1, 11))}]

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the parameter grid, scored by accuracy.
grid_searcher = GridSearchCV(
    estimator=model,
    param_grid=parameter,
    scoring="accuracy",
    cv=5,
)

In [None]:
# Run the search on the flattened training images and their labels.
grid_searcher.fit(X=new_x_train, y=y_train)

In [None]:
# Show the parameter combination that the search selected.
best_params = grid_searcher.best_params_
print(best_params)

In [None]:
# Build the final classifier from the hyper-parameters that the grid search
# selected, rather than hard-coding a value that may not match its result.
new_model = KNeighborsClassifier(**grid_searcher.best_params_)

In [None]:
# Fit the final classifier on the full training set.
new_model.fit(X=new_x_train, y=y_train)

### Step 6 (Evaluation)

In [None]:
# Predict a label for every held-out test image.
yhat = new_model.predict(X=new_x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test images whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print(test_accuracy)

In [None]:
# Hand-written 10x10 grid of pixel intensities used as a custom test image.
test_number = np.asarray(
    [
        [0, 32, 0, 225, 225, 22, 25, 12, 3, 0],
        [0, 60, 0, 225, 225, 22, 235, 12, 3, 0],
        [55, 5, 0, 225, 225, 202, 205, 12, 3, 0],
        [5, 25, 0, 225, 225, 22, 25, 12, 0, 45],
        [80, 25, 0, 225, 225, 22, 25, 45, 0, 0],
        [0, 255, 120, 25, 225, 202, 190, 12, 3, 0],
        [0, 78, 90, 245, 225, 228, 205, 12, 3, 0],
        [0, 0, 0, 25, 225, 220, 25, 255, 3, 0],
        [0, 255, 0, 225, 250, 202, 250, 12, 8, 9],
        [8, 95, 73, 42, 225, 22, 250, 102, 43, 0],
    ]
)

In [None]:
# Flatten the custom image into a single 100-feature row, classify it,
# then show the image next to the predicted label.
flat_sample = test_number.reshape(1, -1)
prediction = new_model.predict(flat_sample)
plt.imshow(test_number, cmap="gray")
print(f"Model prediction for this image is >> {prediction}")

- Best accuracy score achieved by the model = 0.98

# THE END