# Prediction

This notebook only applies the final model. For the process by which the model was developed, and the documentation of that process, please refer to the other three notebooks.

# Imports and Configuration

In [1]:
import glob
import os
from joblib import dump, load
import pandas as pd
import numpy as np
import shutil
from skimage import io, color, filters
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# --- Input data ---
# Directory containing the dataset images
DATASET_IMAGES_PATH = "../dataset-images"

# Grid size that was used when the dataset features were built
GRID_SIZE = 8

# --- Persisted model artifacts ---
# Location of the saved scaler
SCALER_PATH = "../classifiers/scaler.joblib"

# Location of the saved KNN classifier
KNN_CLASSIFIER_PATH = "../classifiers/knn_classifier.joblib"

# --- Scratch space ---
# Temp directory
TEMP_DIR_PATH = "../temp"

# Helper Functions

In [3]:
# Crop an image (size 128x32) into four images (size 32x32)
# Returns a list of four cropped images
def crop_image(image, n_crops=4, crop_size=32):
  """Split a horizontal strip into equally sized square crops.

  Parameters:
    image: array-like indexable as image[y1:y2, x1:x2].
    n_crops: number of crops to take, left to right (default 4).
    crop_size: width and height of each crop in pixels (default 32).

  Returns:
    A list with exactly `n_crops` crops; crop i covers rows
    0..crop_size and columns i*crop_size..(i+1)*crop_size.
  """
  # The previous implementation looped over range(5) and therefore returned
  # a fifth, out-of-range crop in addition to the four documented ones.
  return [
    image[0:crop_size, i * crop_size:(i + 1) * crop_size]
    for i in range(n_crops)
  ]

In [4]:
# Binarise an image with an ISODATA threshold
def to_binary(image):
  """Convert `image` to greyscale and compare it against its ISODATA threshold.

  Returns the result of `greyscale < threshold`, i.e. positions darker than
  the threshold are the positive ones.
  """
  greyscale = color.rgb2gray(image)
  return greyscale < filters.threshold_isodata(greyscale)

# Split a binary image into GRID_SIZE x GRID_SIZE cells
def to_grid(binary):
  """Partition `binary` into a GRID_SIZE-by-GRID_SIZE layout of flattened cells.

  The cell side length is derived from the number of rows in `binary` and is
  used for both axes. Returns a nested list indexed as result[row][col], where
  each entry is a flat list of the values inside that cell (row-major order).
  """
  side = len(binary) // GRID_SIZE

  def flatten_cell(row, col):
    # Collect every value of one cell, scanning line by line
    return [
      binary[y][x]
      for y in range(side * row, side * row + side)
      for x in range(side * col, side * col + side)
    ]

  return [
    [flatten_cell(row, col) for col in range(GRID_SIZE)]
    for row in range(GRID_SIZE)
  ]

# Sum the values of every cell in the grid
def count_positive(grid):
  """Return a flat list with np.sum of each cell, visiting the grid row by row."""
  return [np.sum(cell) for cells_in_row in grid for cell in cells_in_row]

# Feature extraction: image -> binary -> grid -> per-cell sums
def image_to_count(image):
  """Return the list of per-cell positive counts for `image`.

  Chains to_binary, to_grid and count_positive in that order.
  """
  return count_positive(to_grid(to_binary(image)))

# Classification

In [5]:
# Load the preprocessor stored at SCALER_PATH (used below via preproc.transform)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts that come from a trusted source.
preproc = load(SCALER_PATH) 

# Load your final classifier stored at KNN_CLASSIFIER_PATH (used below via clf.predict)
clf = load(KNN_CLASSIFIER_PATH) 

In [6]:
  """
  Load an image from file and predict the four digits in the image.
  The result should be an array containing the 4 digits (as string)
  """
def classify_image(filename):
    result = list()
        
    # Load the image from file
    img_array = io.imread(filename)
    crops = crop_image(img_array)

    # For each digit, collect features, preprocess and predict
    for i in range(4):
      image_crop = crops[i]

      X = image_to_count(image_crop)
      X_scaled = preproc.transform([X])

      Y = clf.predict(X_scaled)
      result.append(str(Y[0]))
            
    return result

In [7]:
# Outcomes
correct_classified_digits = 0
incorrect_classified_digits = 0
correct_classified_zipcodes = 0
incorrect_classified_zipcodes = 0

# Score the classifier on every PNG image in the dataset directory
files = glob.glob(os.path.join(DATASET_IMAGES_PATH, '*.png'))
if not files:
    # Without this guard the accuracy computation below would fail with an
    # unexplained ZeroDivisionError when the dataset directory is missing or empty.
    raise FileNotFoundError("No .png images found in " + repr(DATASET_IMAGES_PATH))

for f in files:
    # The correct label is the last four characters of the file name
    # (extension stripped), independent of the directory part of the path
    correct_label = os.path.splitext(os.path.basename(f))[0][-4:]
    # Predict using the classifier
    predicted_label = classify_image(f)

    # Score digits; a zipcode only counts as correct when all digits match
    zipcode_correct = True
    for expected_digit, predicted_digit in zip(correct_label, predicted_label):
        if str(expected_digit) == str(predicted_digit):
            correct_classified_digits += 1
        else:
            incorrect_classified_digits += 1
            zipcode_correct = False

    # Score correct zipcodes
    if zipcode_correct:
        correct_classified_zipcodes += 1
    else:
        incorrect_classified_zipcodes += 1

# Each line reports: accuracy ( number correct / number incorrect )
total_digits = correct_classified_digits + incorrect_classified_digits
total_zipcodes = correct_classified_zipcodes + incorrect_classified_zipcodes
print("Digit accuracy: ", (correct_classified_digits / total_digits), "(", correct_classified_digits, "/", incorrect_classified_digits, ")")
print("Zipcode accuracy: ", (correct_classified_zipcodes / total_zipcodes), "(", correct_classified_zipcodes, "/", incorrect_classified_zipcodes, ")")

Digit accuracy:  0.9895833333333334 ( 1900 / 20 )
Zipcode accuracy:  0.9583333333333334 ( 460 / 20 )


# Cleanup

In [8]:
# # Remove the data downloaded from Github
# if os.path.isdir('/cloned-repo'):
#   shutil.rmtree('/cloned-repo')

# # Remove the dataset images, the saved classifier and scaler, and the temp data
# if os.path.isdir(DATASET_IMAGES_PATH):
#   shutil.rmtree(DATASET_IMAGES_PATH)
  
# if os.path.isfile(KNN_CLASSIFIER_PATH):
#   os.remove(KNN_CLASSIFIER_PATH)

# if os.path.isdir(TEMP_DIR_PATH):
#   shutil.rmtree(TEMP_DIR_PATH)
  
# if os.path.isfile(SCALER_PATH):
#   os.remove(SCALER_PATH)

[WinError 3] The system cannot find the path specified: "'/content'"
C:\Users\Dovydas\Documents\Introduction to Machine Learning\Introduction to Machine Learning\notebooks
