In [1]:
import pandas as pd

%matplotlib inline

# Load the datasets

In [3]:
train = pd.read_csv("image_train.csv")
test = pd.read_csv("image_test.csv")

In [8]:
import re


def extract_features(dataframe: pd.DataFrame, column: str):
    """
    Convert both image_array and deep_features object columns to 
    separate pandas integer columns.
    """
    raw_data = [
        re.search(r"\[(.*)\]", col).group(1)
        for col in dataframe.loc[:, column]
    ]
    normalized_data = [
        list(map(float, row.split()))
        for row in raw_data
    ]
    return pd.DataFrame(normalized_data).add_prefix(column)

# re-build the train dataset
train = pd.concat(
    (
    train.drop(['deep_features', 'image_array'], axis=1),
    extract_features(train, 'image_array'),
    extract_features(train, 'deep_features'),
    )
    , axis='columns'
)

# re-build the test dataset
test = pd.concat(
    (
    test.drop(['deep_features', 'image_array'], axis=1),
    extract_features(test, 'image_array'),
    extract_features(test, 'deep_features')
    )
    , axis='columns'
)

# change label type to category
train['label'] = train.label.astype('category')
test['label'] = test.label.astype('category')

# Programming Assignment

## Task 1: Compute summary statistics of the data

* What is the **least common category in** the training data?

In [9]:
train.label.value_counts()

dog           509
cat           509
automobile    509
bird          478
Name: label, dtype: int64

## Task 2: Create category-specific image retrieval models

In [19]:
feature_cols = [
    col
    for col in dog_df.columns
    if col.startswith('deep_features')
]

### 2.1 Split the training data into 4 different Dataframe data structures

In [25]:
labels = ['bird', 'cat', 'dog', 'automobile']

In [26]:
dataframes = {}
for label in labels:
    dataframes[label] = train[train.label == label]

In [30]:
dataframes.keys()

dict_keys(['bird', 'cat', 'dog', 'automobile'])

### 2.2 Create a nearest neighbor model using deep features for each dataframe

In [16]:
from sklearn.neighbors import NearestNeighbors

In [32]:
models = {}

for label in labels:
    models[label] = NearestNeighbors()
    models[label].fit(dataframes[label][feature_cols])

In [33]:
models

{'bird': NearestNeighbors(),
 'cat': NearestNeighbors(),
 'dog': NearestNeighbors(),
 'automobile': NearestNeighbors()}

### 2.3 Using these models, answer the following questions.

* What is the nearest cat-labeled image in the training data to the cat image that is the first image in the test data ?

In [37]:
cat = test.iloc[[0]]

In [43]:
distance, index = models['cat'].kneighbors(cat[feature_cols], n_neighbors=1)

In [44]:
dataframes['cat'].iloc[index[0]]

Unnamed: 0,id,image,label,image_array0,image_array1,image_array2,image_array3,image_array4,image_array5,image_array6,...,deep_features4086,deep_features4087,deep_features4088,deep_features4089,deep_features4090,deep_features4091,deep_features4092,deep_features4093,deep_features4094,deep_features4095
669,16289,Height: 32 Width: 32,cat,215.0,219.0,231.0,215.0,219.0,232.0,216.0,...,0.0,0.0,0.0,0.479628,0.0,0.0,0.0,0.0,0.0,0.0


* What is the nearest dog-labeled image in the training data to the cat image that is the first image in the test data?

In [45]:
distance, index = models['dog'].kneighbors(cat[feature_cols], n_neighbors=1)

In [46]:
dataframes['dog'].iloc[index[0]]

Unnamed: 0,id,image,label,image_array0,image_array1,image_array2,image_array3,image_array4,image_array5,image_array6,...,deep_features4086,deep_features4087,deep_features4088,deep_features4089,deep_features4090,deep_features4091,deep_features4092,deep_features4093,deep_features4094,deep_features4095
703,16976,Height: 32 Width: 32,dog,16.0,17.0,11.0,18.0,19.0,13.0,20.0,...,0.727477,0.0,0.0,1.75611,0.0,0.0,0.0,0.318482,0.0,0.0


## Task 3: Try a simple example of nearest-neighbors classification

* For the first image in the test data, compute the mean distance between this image at its five nearest neighbors that are labeled ‘cat’ in the training data.

In [51]:
distances, indexes = models['cat'].kneighbors(cat[feature_cols], n_neighbors=5)

distances.mean()

36.15572932231885

* For the first image in the test data, compute the mean distance between this image at its five nearest neighbors that are labeled ‘dog’ in the training data.

In [50]:
distances, indexes = models['dog'].kneighbors(cat[feature_cols], n_neighbors=5)

distances.mean()

37.77071193352956

* On average, is the first image in the test data closer to its five nearest neighbors in the ‘cat’ data or in the ‘dog’ data?

The lower, the closer, so **cat**.

## Task 4: Compute nearest neighbors accuracy

In [56]:
dog_model = models['dog']
cat_model = models['cat']
bird_model = models['bird']
automobile_model = models['automobile']

### Split the test data

In [55]:
image_test_cat = test[test.label == 'cat']
image_test_dog  = test[test.label == 'dog']
image_test_bird = test[test.label == 'bird']
image_test_automobile = test[test.label == 'automobile']

### Find the closest neighbor to the dog test data using each of the trained models

In [63]:
dog_cat_neighbors = cat_model.kneighbors(image_test_dog[feature_cols], n_neighbors=1)

dog_dog_neighbors = dog_model.kneighbors(image_test_dog[feature_cols], n_neighbors=1)

dog_bird_neighbors = bird_model.kneighbors(image_test_dog[feature_cols], n_neighbors=1)

dog_automobile_neighbors = automobile_model.kneighbors(image_test_dog[feature_cols], n_neighbors=1)

### Create a Dataframe with the distances from the dog test examples to the respective nearest neighbors in each class in the training data.

In [67]:
dog_automobile_neighbors[0]

array([41.95798179])

In [70]:
dog_distances = pd.DataFrame(
    {
    'dog-automobile': dog_automobile_neighbors[0].flatten(),
    'dog-bird': dog_bird_neighbors[0].flatten(),
    'dog-cat': dog_bird_neighbors[0].flatten(),
    'dog-dog': dog_dog_neighbors[0].flatten(),
     }
)

In [71]:
dog_distances.head()

Unnamed: 0,dog-automobile,dog-bird,dog-cat,dog-dog
0,41.957982,41.753863,41.753863,33.477354
1,46.002135,41.338292,41.338292,32.845849
2,42.946231,38.615761,38.615761,35.039707
3,41.686604,37.089222,37.089222,33.901032
4,39.226973,38.272291,38.272291,37.484928


In [72]:
import numpy as np

In [108]:
np.max(np.array([1, 2, 3]))

3

In [176]:
import numpy as np
from functools import partial

def is_label_correct(row, col: str='dog-dog'):
    if row[col] <= np.min(row.values):
        return True
    else:
        return False

is_dog_correct = partial(is_label_correct, col='dog-dog')
is_cat_correct = partial(is_label_correct, col='cat-cat')

In [175]:
dog_distances.apply(is_dog_correct, axis=1).sum()

879

**Misclassified numbers**:

In [137]:
len(image_test_dog) - 879

121

**Note**

In [178]:
cat_distances = pd.DataFrame({
    'cat-automobile': automobile_model.kneighbors(image_test_cat[feature_cols], n_neighbors=1)[0].flatten(),
    'cat-bird': bird_model.kneighbors(image_test_cat[feature_cols], n_neighbors=1)[0].flatten(),
    'cat-cat': cat_model.kneighbors(image_test_cat[feature_cols], n_neighbors=1)[0].flatten(),
    'cat-dog': dog_model.kneighbors(image_test_cat[feature_cols], n_neighbors=1)[0].flatten(),
})

In [179]:
cat_distances.head()

Unnamed: 0,cat-automobile,cat-bird,cat-cat,cat-dog
0,39.671057,38.074267,34.623722,37.464261
1,43.008906,36.367401,33.86806,29.347234
2,38.600997,35.303936,32.461518,32.259961
3,39.356629,38.894403,35.77082,35.385205
4,38.357236,34.282033,31.157764,30.044301


In [180]:
cat_distances.apply(is_cat_correct, axis='columns').sum()

548