# Imports and Configuration

In [1]:
import glob
import os
from joblib import dump, load
import pandas as pd
import numpy as np
import shutil
import joblib
from skimage import io, color, filters
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif

In [2]:
# Directory holding the grid-based dataset csv files
# (one file per grid size, named grid_dataset_<grid size>.csv)
DATASET_CSV_PATH = '../dataset-numpy'

# Scaler location
SCALER_PATH = '../classifiers/scaler.joblib'

# KNN Classifier location
KNN_CLASSIFIER_PATH = '../classifiers/knn_classifier.joblib'

# Temp directory; the image-based datasets, preprocessors and feature
# selectors are stored in sub-directories of this path
TEMP_DIR_PATH = '../temp'

# Grid Based Model

It's always wise to check that we actually have the data we expect. After importing the datasets we created previously from Github, we should have the data for every grid size. Given the initial dataset, we should have 1920 rows (as it consists of 480 postal codes, which is 1920 single digits) for each grid size.

Let's load a dataset from the csv file into a DataFrame and describe it, to check that everything was indeed loaded properly.

In [3]:
# Read the 4x4 grid dataset from its csv file
grid_csv_file = DATASET_CSV_PATH + '/grid_dataset_4.csv'
df_grid = pd.read_csv(grid_csv_file)

# The summary statistics double as a quick check that the import worked
df_grid.describe()

Unnamed: 0,label,sum_1,sum_2,sum_3,sum_4,sum_5,sum_6,sum_7,sum_8,sum_9,sum_10,sum_11,sum_12,sum_13,sum_14,sum_15,sum_16
count,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0
mean,4.532813,2.235937,39.644792,36.097396,3.84375,4.896354,34.896875,33.200521,4.411458,3.408854,30.311979,36.148438,6.444792,1.163542,35.611979,37.214583,6.365625
std,2.868122,3.517256,12.735939,14.62658,7.979361,5.633055,15.674676,17.05907,6.696306,5.611908,18.605251,14.065148,6.858434,2.631299,13.570497,15.269706,9.307086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,34.0,27.0,0.0,0.0,23.0,21.0,0.0,0.0,15.0,27.0,0.0,0.0,29.0,31.0,0.0
50%,5.0,0.0,42.0,38.0,0.0,3.0,38.0,36.0,0.0,0.0,31.0,37.0,5.0,0.0,37.0,41.0,1.0
75%,7.0,3.0,49.0,47.0,4.0,8.0,47.0,46.0,7.0,5.0,46.25,46.0,11.0,1.0,45.0,47.0,10.0
max,9.0,21.0,63.0,64.0,53.0,30.0,64.0,64.0,36.0,39.0,64.0,64.0,31.0,28.0,64.0,64.0,50.0


## Dataset Scoring

To score a dataset, we need to create a model (such as a KNN (K Nearest Neighbors) classifier), train it, and then see how well it performs on the data we give it. A simple function should do:

In [4]:
def score_model(dataset, n_neighbors=5, test_size=.1, random_state=None):
  """Train a KNN classifier on `dataset` and return its test accuracy.

  Parameters:
    dataset: DataFrame with a 'label' column; every other column is used
      as a feature.
    n_neighbors: the K of the KNN classifier. Defaults to 5.
      NOTE: score_model_2() uses K = 3; pass n_neighbors=3 to compare
      both functions like for like.
    test_size: fraction of the rows held out for testing (default 10%).
    random_state: optional seed for the train/test split, to make a run
      reproducible. The default (None) gives a different split per call.

  Returns:
    The score of the classifier on the held-out test set, as returned by
    the classifier's built in score function.
  """
  # Get the dataset without the labels from the DataFrame
  X = dataset.drop('label', axis=1)

  # Get an array of labels that correspond with the dataset above
  Y = dataset['label']

  # Split BEFORE scaling, so the test rows cannot influence the scaler
  X_train, X_test, Y_train, Y_test = train_test_split(
      X, Y, test_size=test_size, random_state=random_state)

  # Fit the scaler on the training rows only, then apply that same scaling
  # to the test rows
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # Create the KNN Classifier and fit it on the training set
  knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
  knn_classifier.fit(X_train_scaled, Y_train)

  # Score the classifier using the built in score function and the test set
  # created earlier
  return knn_classifier.score(X_test_scaled, Y_test)

Every time we run the score_model() function, a slightly different score is generated. To even out the final score a bit, we'll run it twenty times and return the average score (without the five lowest and highest values). This should help somewhat with grading the datasets.

In [5]:
# Run score_model() twenty times, discard the five lowest and the five
# highest scores, and return the average of the remaining ten as a percentage
def score_20(dataset):
  ranked_scores = sorted(score_model(dataset) for _ in range(20))

  # Only the ten middle scores count towards the average
  middle_scores = ranked_scores[5:15]
  average = sum(middle_scores) / 10

  return round(average * 100, 2)

With the functions we need in place, let's get to scoring the actual datasets:

In [None]:
# Score the dataset of every grid size in turn
for grid_size in (1, 2, 4, 8, 16, 32):
  dataset_path = DATASET_CSV_PATH + '/grid_dataset_' + str(grid_size) + '.csv'
  dataframe = pd.read_csv(dataset_path)
  average_score = score_20(dataframe)

  print('grid size: {0}, average score: {1}%'.format(grid_size, average_score))

grid size: 1, average score: 11.35%
grid size: 2, average score: 59.95%
grid size: 4, average score: 94.58%
grid size: 8, average score: 97.4%
grid size: 16, average score: 97.71%


As the output above shows, and we could have predicted, simply counting the number of pixels in an image isn't going to be very accurate. Splitting it up into a 2x2 grid (4 squares) makes it a lot more accurate, but 60% still isn't good enough for our efforts. From a 4x4 grid and up, it becomes quite accurate: about 94% to 98%.

There isn't much difference in the scores from grid size 8 to 32, although a grid size of 32 gives us quite a few more squares than a grid size of 8. This could mean a lot more uninformative squares as well, which we can discard later on.

## Automatic Feature Selection
One way of selecting features is using the KBest function. We'll take the results we got earlier as a baseline, and see if we can improve it from there.

```
grid size: 8, average score: 97.45%
grid size: 16, average score: 97.06%
grid size: 32, average score: 97.08%
```

An 8x8 grid consists of 64 squares, meaning 64 values. A 16x16 grid consists of 256 values and a 32x32 grid of 1024 values.

We're also going to need a slightly different scoring function, as we'll be splitting up the dataset into actual data and labels before we can select the features we want to keep. This means our scoring function will need to take two inputs instead of one:

In [None]:
def score_model_2(X, Y):
  """Return the test score of a KNN classifier (K = 3) trained on X and Y.

  X holds the feature values and Y the labels that correspond with them.
  90% of the rows is used to train the model, the other 10% to test it.
  """
  # Hold out 10% of the rows (data and labels alike) for testing
  train_data, test_data, train_labels, test_labels = train_test_split(
      X, Y, test_size=.1)

  # fit() hands back the fitted classifier, so creation and fitting chain
  fitted_knn = KNeighborsClassifier(n_neighbors=3).fit(train_data, train_labels)

  # Score on the held-out rows with the classifier's built in score function
  return fitted_knn.score(test_data, test_labels)

# Run score_model_2() twenty times, discard the five lowest and the five
# highest scores, and return the average of the remaining ten as a percentage
def score_20_2(X, Y):
  ranked_scores = sorted(score_model_2(X, Y) for _ in range(20))

  # Only the ten middle scores count towards the average
  middle_scores = ranked_scores[5:15]
  average = sum(middle_scores) / 10

  return round(average * 100, 2)

Now let's get to running some experiments:

In [None]:
def get_K_best(grid_size, k_arr):
  """Print the average KNN score for each number of selected features.

  Loads the dataset of the given grid size, scales its features with a
  MinMaxScaler and, for every k in `k_arr`, keeps the k best features
  according to chi2 and prints the score_20_2() result for that selection.

  NOTE(review): the scaler and the selector are fitted on all rows, before
  score_model_2() makes its train/test split - confirm this is acceptable
  for the comparison being made.
  """
  csv_file = DATASET_CSV_PATH + '/grid_dataset_' + str(grid_size) + '.csv'
  grid_df = pd.read_csv(csv_file)

  # Separate the labels from the feature columns
  labels = grid_df['label']
  features = grid_df.drop('label', axis=1)

  features_scaled = MinMaxScaler().fit_transform(features)

  report = 'grid size: {0}, K best: {1}, average score: {2}%'
  for k in k_arr:
    selector = SelectKBest(chi2, k=k)
    best_features = selector.fit_transform(features_scaled, labels)
    print(report.format(grid_size, k, score_20_2(best_features, labels)))

# Feature counts to try per grid size; the largest count of each list equals
# the total number of squares of that grid
for grid_size, k_values in (
    (8, [16, 32, 48, 64]),
    (16, [64, 128, 192, 256]),
    (32, [128, 256, 512, 768, 1024]),
):
  get_K_best(grid_size, k_values)

The output above shows no big difference in accuracy by removing some columns from the equation. It does show that if you remove a lot of columns, this negatively impacts the performance of the model.

For a grid size of eight, a significant difference can be seen between 32 and 16 columns of data, with 16 having a 5% worse accuracy. For a grid size of 16, a small difference can be seen at 64 in comparison to 128 columns, and for a grid size of 32 this is true for 256 in comparison to 512.

That said, no significant difference is noticeable between a grid size of eight (with anywhere from 32 to 64 columns), a grid size of 16 (with anywhere from 128 to 256 columns) or a grid size of 32 (with anywhere from 512 to 1024 columns).

It would seem that our dataset is already quite optimized, and no further improvement can (easily) be obtained by selecting features.

# Image based model

Author: Dovydas Valiulis

In [None]:
# Load the dataset that was created by the feature extraction notebook
initial_dataset_file = TEMP_DIR_PATH + "/datasets/initial-image-feature-dataset.pkl"
df = pd.read_pickle(initial_dataset_file)

## Feature analysis

In this part of the assignment, we will determine the best features to select for our classifier. We will look at statistics of the dataset to get an overall feeling of the data then we will graph boxplot for each feature grouped by a label to see if it is possible to separate labels from features. Furthermore, we will plot the histograms of each feature to see what is the distribution of each feature in our dataset. Then we will remove outliers and look at the graphs one more time. After that, we will look at the correlation graph to see which features are correlated with each other and could be removed from the final feature list. Finally, we will remove highly correlated features from our dataset.

The first step is to look at the statistics of the dataset

In [None]:
# Show the summary statistics of the dataset, to get an overall feeling of
# the data and to spot obvious outliers
df.describe()

From dataset statistics, we observed that there are obvious outliers in the dataset. For instance, some entries have an area of 1 and a perimeter of 0. These values can appear during the feature extraction phase when two or more regions are found in the single-digit image. To continue with our classification we will remove these outliers. To know how many outliers this dataset contains we draw boxplot and histogram.

In [None]:
# Draw a boxplot for each feature column, grouped by label. The label column
# itself is skipped: a boxplot of the label grouped by the label carries no
# information
feature_columns = [column for column in df.columns if column != 'label']
for column in feature_columns:
    df.boxplot([column], by='label', figsize=(9, 6))

From the boxplot, we observed that there are some outliers in each feature. Then we looked at the histogram graph to see where outliers are concentrated. 

In [None]:
# Draw a histogram for each column, to see how its values are distributed
# and where the outliers are concentrated
df.hist(figsize=(30, 20))

In histograms, we can see that most features have outliers. Features that have obvious outliers: 
1. Area
2. Bbox area
3. Convex area
4. Eccentricity
5. Equivalent diameter
6. Extent
7. Filled area
8. Major axis length
9. Perimeter
10. Solidity

Ten of our features have outliers. Also, we can see that most of the features might have the same outliers because the number of outliers is very similar from feature to feature.

In this step, we will remove all entries where the area is 200 or less to remove outliers. We have chosen an area of 200 because a region that small is probably an extra region from the image feature extraction phase.

In [None]:
# Remove the outliers: keep only the entries with an area above 200
has_valid_area = df['area'] > 200
df = df[has_valid_area]

# Show the statistics again to see the effect of the removal
df.describe()

Removing outliers based on one feature also removed outliers from other features. We can already see that the minimum values make much more sense. There are no obvious outliers left that we can see from the dataset statistics. In order to confirm that, we plotted the boxplots and histograms again to see how these graphs have changed.

In [None]:
# Checking how the boxplot graphs changed after removal of the outliers. The
# label column itself is skipped: a boxplot of the label grouped by the label
# carries no information
feature_columns = [column for column in df.columns if column != 'label']
for column in feature_columns:
    df.boxplot([column], by='label', figsize=(9, 6))

In [None]:
# Checking how the histogram graphs changed after removal of the outliers
df.hist(figsize=(30, 20))

As we can see from the plots, we have a lot fewer outliers. Furthermore, we can see that it should be possible to differentiate amongst different labels using a combination of different features. Also, it is clear which features do not offer any meaningful information for classification. These features are equivalent diameter, bbox-0, and bbox-2. Other features should offer some useful insight for the classification algorithm.

Next, we looked at the correlation matrix in order to see which features are highly correlated and remove them. There is no point in having features that are highly correlated, because they will have no or very little impact on the classification result while they do have an impact on processing speed. We have decided to remove all features with a correlation above 80% or below -80%.

In [None]:
# Checking how the correlation changed after removal of the outliers
corr = df.corr()
# Display the correlation matrix with a colour gradient; axis=None applies
# one colour scale to the whole table instead of one per row or column
corr.style.background_gradient(cmap='coolwarm', axis=None)

## Export labels

Now that we have a complete dataset from which the outliers have been removed, we export the labels of the dataset to a separate .pkl file. This file will be loaded in the next phase and will be used for training, testing, and validation.

In [None]:
# Export the labels of the dataset as an array, to be used in training and
# testing later on
labels = df['label'].to_numpy()
label_dataset_file = TEMP_DIR_PATH + '/datasets/label-dataset.pkl'
joblib.dump(labels, label_dataset_file)

## Manual feature selection

Based on the correlation matrix and histograms we have decided to remove these features: 
  1. equivalent diameter - due to high correlation with area feature
  2. bbox-0 - due to the very small distribution observed in histogram
  3. bbox-2 - due to the very small distribution observed in histogram

In [None]:
# Drop the features that were rejected based on the correlation matrix and
# the histograms
rejected_features = ['equivalent_diameter', 'bbox-0', 'bbox-2']
df_mnl = df.drop(columns=rejected_features)

## Manual feature pre-processing 


To classify entries of the dataset features must be scaled. This is important to have all features that have the same weight in most classification algorithms. We have selected to use Normalization (MinMaxScaler)
instead of Standardization because we already removed obvious outliers and we wanted to maintain our scale between entries. Furthermore, most of our features already are normally distributed.

In [None]:
# Take every manually selected column except the label as a feature
is_feature = df_mnl.columns != 'label'
X_mnl_df = df_mnl.loc[:, is_feature].reindex()

# Fit the scaler on the features first, then scale them with it
scaler = MinMaxScaler()
scaler.fit(X_mnl_df)
X_mnl_scld = scaler.transform(X_mnl_df)

After we have scaled our features, we export them to a .pkl file alongside the scaler that was used. The scaler export is needed to scale new images before they are passed to the classifier once it has been trained.

In [None]:
# Make sure the output directories exist. makedirs() also creates
# TEMP_DIR_PATH itself when it is missing and, with exist_ok=True, does
# nothing when the directory is already there
os.makedirs(TEMP_DIR_PATH + "/datasets", exist_ok=True)
os.makedirs(TEMP_DIR_PATH + "/preprocessors", exist_ok=True)

# Export dataset and scaler. Dataset will be used in training and testing in
# the next step
joblib.dump(X_mnl_scld, TEMP_DIR_PATH + '/datasets/manualy-selected-feature-dataset.pkl')
joblib.dump(scaler, TEMP_DIR_PATH + '/preprocessors/manual-image-feature-preprocessor.pkl')

## Auto feature selection

We have also decided to use the automatic feature selection of sklearn, to see if automatically selected features would yield better results. The first step of automatic feature selection is scaling the dataset. Same as for manual feature selection, we are using MinMaxScaler. After that, we are using the SelectKBest module from sklearn. This module has two hyperparameters: score_func and k.
score_func is the method used for scoring how useful each feature is, and k is the number of features we want to remain in the final dataset. For our score_func we have selected "chi2" and "f_classif", because these functions are used to select the best features for a classification task. For our k we decided to use 16, to make the number of manually selected features and the number of automatically selected features the same. In the next phase, we will test both the "chi2" and the "f_classif" dataset and see which is better for this problem.

In [None]:
# Take every column of the dataset except the label as a feature
feature_mask = df.columns != 'label'
X_auto_df = df.loc[:, feature_mask]

# Fit a new scaler on all features and scale them with it
scaler = MinMaxScaler().fit(X_auto_df)
X_auto_scld = scaler.transform(X_auto_df)

In [None]:
# Select the 16 best features out of the dataset according to chi2
skb_chi2 = SelectKBest(score_func=chi2, k=16)
skb_chi2.fit(X_auto_scld, labels)
X_auto_chi2 = skb_chi2.transform(X_auto_scld)

In [None]:
# Select the 16 best features out of the dataset according to f_classif
skb_f_classif = SelectKBest(score_func=f_classif, k=16)
skb_f_classif.fit(X_auto_scld, labels)
X_auto_f_classif = skb_f_classif.transform(X_auto_scld)

Same as with the manual features, we need to export the selected features and the scaler. Besides them, we need to export the two SelectKBest modules that were already fitted. This is required in order to extract the best features from new images that will be uploaded.

In [None]:
# Export the scaler that was fitted on all features of the dataset
auto_scaler_file = TEMP_DIR_PATH + '/preprocessors/auto-image-feature-preprocessor.pkl'
joblib.dump(scaler, auto_scaler_file)

In [None]:
# Make sure the directory for the feature selectors exists. makedirs() also
# creates TEMP_DIR_PATH itself when it is missing and, with exist_ok=True,
# does nothing when the directory is already there
os.makedirs(TEMP_DIR_PATH + "/selection", exist_ok=True)

In [None]:
# Export the chi2 selected dataset and the fitted chi2 feature selector.
# Dataset will be used in training and testing in the next step
chi2_dataset_file = TEMP_DIR_PATH + '/datasets/auto-selected-features-chi2.pkl'
chi2_selector_file = TEMP_DIR_PATH + '/selection/auto-image-feature-selector-chi2.pkl'
joblib.dump(X_auto_chi2, chi2_dataset_file)
joblib.dump(skb_chi2, chi2_selector_file)

In [None]:
# Export the f_classif selected dataset and the fitted f_classif feature
# selector. Dataset will be used in training and testing in the next step
f_classif_dataset_file = TEMP_DIR_PATH + '/datasets/auto-selected-features-f-classif.pkl'
f_classif_selector_file = TEMP_DIR_PATH + '/selection/auto-image-feature-selector-f-classif.pkl'
joblib.dump(X_auto_f_classif, f_classif_dataset_file)
joblib.dump(skb_f_classif, f_classif_selector_file)

# Cleanup
Ensure no data is left on the runtime after execution of all code has completed. This ensures we won't re-use old data once something in the code has changed, eliminating the risk of hours of debugging functional code.

In [None]:
# # Remove all data from the /content directory
# if os.path.isdir(DATASET_CSV_PATH):
#   shutil.rmtree(DATASET_CSV_PATH)