# Imports and Configuration

In [1]:
import os
import numpy as np
import pandas as pd

import shutil
from skimage import io, color, filters, util, measure

In [2]:
# Directory holding the source dataset images (read by the cropping loop)
DATASET_IMAGES_PATH = '../dataset-images'

# Directory that receives the grid-based dataset CSV files
DATASET_CSV_PATH = '../dataset-numpy'

# Directory that receives the cropped single-digit images, one sub-folder per digit (0-9)
CROPPED_IMAGES_PATH = '../cropped-images'

# Temp directory; the image-feature dataframe is pickled into a sub-folder of it
TEMP_DIR_PATH = '../temp'

# Splitting Images
While the end result of the assignment is to classify postal codes (four digits written next to one another), classifying postal codes as a whole isn't a good idea due to the (very) limited number of training samples we have for them. For this reason, the postal codes will be split up into four images, each containing a single digit. These digits can then be used for training purposes.

During classification, we will once again split the number into four digits, and classify each of them separately.

In [3]:
# Crop an image (size 128x32) into four images (size 32x32)
# Returns a list of four cropped images
def crop_image(image):
  """Split a postal-code image into its four single-digit tiles.

  image: array that supports 2-D slicing (image[y1:y2, x1:x2]), expected to be
         32 pixels high and 128 pixels wide.
  Returns a list with exactly four 32x32 crops, ordered left to right.
  """
  tile_size = 32    # width and height of one digit tile, in pixels
  digit_count = 4   # a postal code consists of four digits

  # Exactly four tiles: a fifth slice would start beyond the 128-pixel width
  # and produce an empty array.
  return [
    image[0:tile_size, i * tile_size:(i + 1) * tile_size]
    for i in range(digit_count)
  ]

In [4]:
# Create the directory structure.
# Every directory is checked on its own, so a partially created tree
# (e.g. the cropped-images root exists but a digit folder is missing)
# is repaired when this cell is re-run.
required_dirs = [DATASET_CSV_PATH, TEMP_DIR_PATH]
required_dirs += [CROPPED_IMAGES_PATH + '/' + str(i) for i in range(0, 10)]

for required_dir in required_dirs:
  if not os.path.isdir(required_dir):
    # makedirs also creates missing parents, such as CROPPED_IMAGES_PATH itself
    os.makedirs(required_dir)

In [5]:
# Per-label counters: entry d holds the file name (an integer, without extension)
# that the next image saved into folder d will receive
last_filename = [0] * 10

# Write the image into the folder of the given label, then advance that label's counter
def save_image(image, label):
  slot = int(label)
  target = "/".join([CROPPED_IMAGES_PATH, label, str(last_filename[slot]) + ".jpg"])
  io.imsave(target, image)
  last_filename[slot] += 1

In [6]:
# For each image in the dataset, crop it and write its four
# resulting images to files.
# os.listdir() returns entries in arbitrary order; sorting the names makes the
# running file numbers handed out by save_image() reproducible between runs.
for file in sorted(os.listdir(DATASET_IMAGES_PATH)):
  image = io.imread(DATASET_IMAGES_PATH + '/' + file)
  # The part of the file name before the first '.' supplies one label character per crop
  labels = list(file.split(".")[0])

  crops = crop_image(image)
  for i in range(4):
    save_image(crops[i], labels[i])

# Feature Extraction

## Grid Based Features
For this section, we'll be creating a dataset of grid-based features. Basically, this means we'll be dividing the image into several squares and counting how many pixels that are part of the written number fall in each square.

In order to create the dataset, a few helper functions are required:

Firstly, we'll transform the image into binary to easily differentiate between pixels that are and aren't part of the written number. For this, a threshold will be used.

Secondly, we'll use a to_grid() function that takes in the binary image and splits it up into several squares. This function takes the grid_size as input, to determine how many squares it should divide the image up into. A grid size of 8 would result in an 8x8 grid, meaning 64 squares.

Lastly, the count_positive() function takes the grid as input and counts all the "positive" pixels (pixels that are part of the written digit). This function outputs the total number of positive pixels per grid as an array. An image in an 8x8 grid would be turned into an array of 64 integers.

To make this easier to use, a function is defined that takes in a regular image (of a single handwritten digit), pipes it through all the functions described above, and returns the final array from count_positive().


In [7]:
# Binarise an image: the result is True wherever the greyscale value
# lies below the ISODATA threshold of that image
def to_binary(image):
  greyscale = color.rgb2gray(image)
  cutoff = filters.threshold_isodata(greyscale)
  below_cutoff = greyscale < cutoff
  return below_cutoff

# Split a binary image into grid_size x grid_size square cells.
# Returns a nested list indexed as [cell_row][cell_col]; every cell is a flat,
# row-major list of the pixel values it covers.
def to_grid(binary, grid_size):
  side = int(len(binary) / grid_size)  # edge length of one cell, in pixels

  def pixels_of(cell_row, cell_col):
    top, left = side * cell_row, side * cell_col
    return [binary[y][x]
            for y in range(top, top + side)
            for x in range(left, left + side)]

  return [[pixels_of(cell_row, cell_col) for cell_col in range(grid_size)]
          for cell_row in range(grid_size)]

# Sum the values of every cell, walking the grid row by row.
# Returns one flat list holding a sum per cell.
def count_positive(grid):
  return [np.sum(cell) for cells_in_row in grid for cell in cells_in_row]

# Full pipeline for one image: binarise it, cut it into a grid and
# return the per-cell sums of positive values as a flat list
def image_to_count(image, grid_size):
  return count_positive(to_grid(to_binary(image), grid_size))

With the required functionality taken care of, we need to actually take all the images and translate them into the arrays described above. Then we need to take the data we created, and write it to file to be used later.

We'll create a file writer at the path defined in the configuration section, write a header for clarification and then loop through all the cropped images we created earlier. We'll then turn the images into arrays using the image_to_count() function, and write it to the file in CSV format:

image_label,count1,count2,count3,count4,...

In [8]:
def create_dataset(grid_size):
  """Write the grid-based feature CSV for one grid size.

  The file is created at DATASET_CSV_PATH/grid_dataset_<grid_size>.csv. It starts
  with a header line ('label,sum_1,...,sum_<grid_size**2>') followed by one line
  per cropped image: the digit label and the per-cell sums from image_to_count().

  grid_size: number of cells along each side of the grid.
  """
  csv_path = DATASET_CSV_PATH + '/grid_dataset_' + str(grid_size) + '.csv'

  # 'with' closes the file even when reading or converting an image raises
  with open(csv_path, 'w') as f:
    # Write the file header
    header = ['label'] + ['sum_' + str(i) for i in range(1, grid_size**2 + 1)]
    f.write(','.join(header) + '\r\n')

    # Loop through all the cropped images, transform them into counts and
    # write one line per image
    for label in range(0, 10):
      digit_dir = CROPPED_IMAGES_PATH + '/' + str(label) + "/"
      # Sorted, because os.listdir() order is arbitrary and would make the
      # row order differ between runs
      for file in sorted(os.listdir(digit_dir)):
        image = io.imread(digit_dir + file)
        image_count = image_to_count(image, grid_size)

        # Label first, then every sum, separated by commas
        row = [str(label)] + [str(count) for count in image_count]
        f.write(','.join(row) + '\r\n')

For every possible grid type, run the create_dataset() function. This allows us to compare the accuracy of each dataset in a later stage.

In [9]:
# Build one dataset per grid resolution
grid_sizes = (1, 2, 4, 8, 16, 32)
for grid_size in grid_sizes:
  create_dataset(grid_size)

## Image Based Features

Author: Dovydas Valiulis 436254

To gather features from an image, we first have to preprocess it. The first step is to convert the RGB image into a grayscale image. After that, we apply a Gaussian filter to blur the image, which makes the digit more uniform and ensures that the digit area does not have small holes. Next, we calculate the threshold that separates the digit from the background. We then apply this threshold to the greyscale image to get a binary representation of the image. We invert the binary image so that the area of the digit, rather than the area of the background, is labelled. Finally, we label the regions of the inverted binary image, from which we can gather various features.

We have decided to select 15 features from the region of a digit.
1. Area - an area of the region
2. Perimeter - the perimeter of the region
3. Orientation - the angle between the x-axis (row) and the major axis, in radians. The value ranges from -pi/2 to pi/2 (-90 to 90 degrees)
4. Euler's number - number of regions minus the number of holes in those regions
5. Major axis length - length of the major axis of the region
6. Bounding box area - an area of the box that encloses a region 
7. Centroid - the center of mass of the region. The value is a tuple (row coordinate, column coordinate)
8. Eccentricity - the ratio of the distance between the foci and major axis length. values range from 0 to 1
9. Bounding box - coordinates of the bounding box. value is a list of four items
10. Convex area - number of pixels in ConvexImage.
11. Equivalent diameter - diameter of a circle with the same area as the region
12. Extent - the ratio of pixels in the region to pixels in the bounding box
13. Filled area - the area of the region with its all holes filled in
14. Minor axis length - length of the minor axis of the region
15. Solidity - area divided by convex area

In [10]:
# Method that extracts image properties from an image and returns dataframe of features of that image
def get_image_props(image):
    """Measure the region properties of an image and return them as a dataframe.

    Steps: convert to greyscale, compute an Otsu threshold on a Gaussian-blurred
    copy, binarise the greyscale image with that threshold, invert the result,
    label it and read the properties of every labelled region.

    image: image array accepted by color.rgb2gray.
    Returns a DataFrame with one row per labelled region. Columns are the scalar
    properties followed by 'centroid-0', 'centroid-1' and 'bbox-0' .. 'bbox-3'.
    An image without any region gives an empty dataframe with the same columns.
    """
    # Finding out region of an object
    grey = color.rgb2gray(image)
    blur = filters.gaussian(grey, 2)
    thresh = filters.threshold_otsu(blur)
    # NOTE(review): the threshold comes from the blurred image but is applied to
    # the unblurred one - confirm this is intended; kept as-is so results do not change.
    binary = grey > thresh
    inverted_binary = util.invert(binary)
    label_image = measure.label(inverted_binary)

    # scalar features to be selected from the region properties
    scalar_props = ['area', 'perimeter', 'orientation', 'euler_number', 'major_axis_length',
                    'bbox_area', 'eccentricity', 'convex_area', 'equivalent_diameter',
                    'extent', 'filled_area', 'minor_axis_length', 'solidity']
    # centroid (2 values) and bounding box (4 values) are stored one value per column
    columns = scalar_props + ['centroid-0', 'centroid-1',
                              'bbox-0', 'bbox-1', 'bbox-2', 'bbox-3']

    # extracting features from the region properties, one flat row per region
    rows = []
    for region in measure.regionprops(label_image):
        row = [region[name] for name in scalar_props]
        row.extend(region['centroid'])
        row.extend(region['bbox'])
        rows.append(row)

    # Passing the columns explicitly keeps the layout stable even when no
    # region was found and rows is empty
    return pd.DataFrame(rows, columns=columns)

After deciding which features to collect, we load all images and gather the features of each image. We store these features in a pandas dataframe to easily visualize the properties of the dataset and gather statistics about it.

In [11]:
# Load every cropped image of the given digit and collect the properties of all of them.
# Returns one dataframe with the properties of all those images and a 'label' column set to the digit.
def load_all_number_image_props(img_no):
    pattern = "{}/{}/*.jpg".format(CROPPED_IMAGES_PATH, img_no)
    per_image_frames = []
    for image in io.imread_collection(pattern):
        per_image_frames.append(get_image_props(image))
    combined = pd.concat(per_image_frames, ignore_index=True)
    combined['label'] = img_no
    return combined

In [12]:
# Build the dataframe of every digit (0-9) and stack them into one dataframe
digit_data = []
for digit in range(10):
    digit_data.append(load_all_number_image_props(digit))
df = pd.concat(digit_data, ignore_index=True)

See https://scikit-image.org/docs/0.14.x/release_notes_and_installation.html#deprecations for details on how to avoid this message.
  warn(XY_TO_RC_DEPRECATION_MESSAGE)
See https://scikit-image.org/docs/0.14.x/release_notes_and_installation.html#deprecations for details on how to avoid this message.
  warn(XY_TO_RC_DEPRECATION_MESSAGE)


In [13]:
# Preview the first rows of the combined dataset
df.head()

Unnamed: 0,area,perimeter,orientation,euler_number,major_axis_length,bbox_area,eccentricity,convex_area,equivalent_diameter,extent,filled_area,minor_axis_length,solidity,centroid-0,centroid-1,bbox-0,bbox-1,bbox-2,bbox-3,label
0,291,145.63961,-1.519531,0,36.387393,620,0.757596,501,19.248707,0.469355,483,23.750934,0.580838,16.491409,15.32646,1,6,32,26,0
1,1,0.0,0.785398,1,0.0,1,0.0,1,1.128379,1.0,1,0.0,1.0,3.0,23.0,3,23,4,24,0
2,299,149.053824,-1.451364,0,37.480758,672,0.778776,522,19.5115,0.44494,496,23.511758,0.572797,16.170569,15.381271,0,6,32,27,0
3,1,0.0,0.785398,1,0.0,1,0.0,1,1.128379,1.0,1,0.0,1.0,20.0,4.0,20,4,21,5,0
4,342,155.053824,-1.478323,0,38.171582,704,0.729963,579,20.867389,0.485795,562,26.089783,0.590674,15.479532,15.342105,0,5,32,27,0


Finally, we export our dataset into a temp folder to use it in the later stage of the assignment.

In [14]:
# Export the dataframe so it can be used in feature analysis.
# makedirs (rather than mkdir) also creates TEMP_DIR_PATH when it is missing,
# so this cell works even if the temp directory was removed in the meantime.
datasets_dir = TEMP_DIR_PATH + "/datasets"
if not os.path.isdir(datasets_dir):
  os.makedirs(datasets_dir)

df.to_pickle(datasets_dir + "/initial-image-feature-dataset.pkl")

# Cleanup
Running this code will remove all data (both retrieved from Github and created afterwards) from the runtime, ensuring we're working with all "new" data every time.

In [15]:
# # Remove all data from the /content directory

# if os.path.isdir(CROPPED_IMAGES_PATH):
#   shutil.rmtree(CROPPED_IMAGES_PATH)

# if os.path.isdir(TEMP_DIR_PATH):
#   shutil.rmtree(TEMP_DIR_PATH)  
  
# if os.path.isdir(DATASET_CSV_PATH):
#   shutil.rmtree(DATASET_CSV_PATH)