# Project REJECTTED
- This notebook shows how to train a simple classifier that detects whether an Instagram media item is a legitimate post about tourism, as opposed to a miscellaneous (non-tourism) image
- The data has been labelled manually; the labels are stored in `data/labels.csv`
- We will use `RandomForestClassifier` from `sklearn`.

In [None]:
import sys, pathlib

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../../backend')
from core.db.persistence import load_dataset

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# repository root, two levels above this notebook, and the data folder inside it
ROOT_DIR = pathlib.Path('..', '..')
DATA_DIR = ROOT_DIR.joinpath('data')

##### Load and build dataset

In [None]:
# load the dataset entries via the backend helper; the cells below iterate over
# them and read each entry's `id` and its thumbnail (through `load_thumbnail`)
dataset = load_dataset(ROOT_DIR)

In [None]:
# read the manual labels; the CSV has no header row, so name the columns here
labels_csv_path = DATA_DIR / 'labels.csv'
label_map = pd.read_csv(labels_csv_path, header=None)
label_map.columns = ['id', 'label']

In [None]:
# load every thumbnail as a pixel array and stack them into one array
images = np.array([
    np.array(data.load_thumbnail(ROOT_DIR))
    for data in dataset
])

In [None]:
# for each media id, find the correct label
# Index the label table by id once instead of re-scanning the whole table for
# every media item. drop_duplicates keeps the first row per id, which matches
# taking the first match of a filtered table.
label_by_id = label_map.drop_duplicates(subset='id').set_index('id')['label']
media_ids = [int(data.id) for data in dataset]
# On current pandas versions an id that is absent from labels.csv is reported
# as a KeyError naming the missing id(s).
labels = label_by_id.loc[media_ids].to_numpy()

In [None]:
# add manually collected tourism images to aid the classifier
# sorted() fixes the order of the files, which rglob otherwise yields in
# whatever order the filesystem returns them
tourism_image_paths = sorted((DATA_DIR / 'thumbnails').rglob('_manual*'))

tourism_images = []
for tourism_image_path in tourism_image_paths:
    # the context manager closes the image file once its pixels are copied
    with Image.open(tourism_image_path) as tourism_image:
        tourism_images.append(np.array(tourism_image))
tourism_images = np.array(tourism_images)
print(f'{len(tourism_images)} manually labelled image')

# the labels are 1 for all manually collected data
tourism_labels = np.repeat(1, len(tourism_images))

# add to the current data
if len(tourism_images) > 0:
    all_images = np.concatenate([images, tourism_images])
else:
    # an empty array is 1-D and cannot be concatenated with the image stack,
    # so keep the dataset thumbnails as they are when no manual image exists
    all_images = images
all_labels = np.concatenate([labels, tourism_labels])

In [None]:
# flatten each image into one feature row; make the labels a flat 1-D vector
n_images = len(all_images)
inputs = all_images.reshape(n_images, -1)
targets = np.ravel(all_labels)

##### Training, validation and testing
Split data into three subsets

In [None]:
# 70% train / 15% validation / remainder test
n_sample = len(all_images)
n_train = int(n_sample * 0.7)
n_valid = int(n_sample * 0.15)
n_test = n_sample - (n_train + n_valid)

# shuffle the dataset
# A fixed seed keeps the split identical from run to run, so the validation
# and test metrics reported below can be compared between runs.
SPLIT_SEED = 0
idx = np.random.RandomState(SPLIT_SEED).permutation(n_sample)
train_idx = idx[:n_train]
val_idx = idx[n_train:n_train+n_valid]
test_idx = idx[n_train+n_valid:]

# assign
train_data = inputs[train_idx]
train_label = targets[train_idx]
val_data = inputs[val_idx]
val_label = targets[val_idx]
test_data = inputs[test_idx]
test_label = targets[test_idx]

print(f'{len(train_idx)} train, {len(val_idx)} val and {len(test_idx)} test images')

In [None]:
# do some analysis on training and validation data

In [None]:
# report how many samples of each class (0 = non-tourism, 1 = tourism) ended up
# in the training and validation splits
for split_name, split_label in (('training', train_label), ('val', val_label)):
    n_non_tourism = np.sum(split_label == 0)
    n_tourism = np.sum(split_label == 1)
    print(f'In {split_name} data, there are:')
    print(f' - {n_non_tourism} non-tourism images')
    print(f' - {n_tourism} tourism images')

##### Model

In [None]:
# Random forests draw random bootstrap samples and feature subsets; fixing
# random_state makes the fitted model, and the metrics below, repeatable.
classifier = RandomForestClassifier(random_state=0)

##### Training

In [None]:
# train the forest on the flattened training thumbnails and their labels
classifier.fit(X=train_data, y=train_label)

##### Validation

In [None]:
# predict on the validation split
val_preds = classifier.predict(X=val_data)
# element-wise comparison of labels and predictions
# NOTE: despite its name, `errors` is True where the prediction is CORRECT
errors = np.equal(val_label, val_preds)

In [None]:
# per-class precision / recall / F1 on the validation split
print('Validation Results')
val_report = classification_report(
    y_true=val_label,
    y_pred=val_preds,
    target_names=['Ads', 'Tourism'],
)
print(val_report)

In [None]:
# visualise some cases to analyse

In [None]:
# get (up to) the first 100 images for visualization, restoring the
# 64x64x3 thumbnail layout from the flattened feature rows.
# -1 lets numpy infer the image count, so this also works when the validation
# split holds fewer than 100 samples (a hard-coded 100 would raise there).
val_images = val_data[:100].reshape((-1, 64, 64, 3))

In [None]:
# lets visualize the thumbnails!
# Each title shows the predicted class and whether that prediction was right.
label_title = ['MISC', 'TOUR']
fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(20,20))
axes = axes.ravel()
# hide the ticks/frames of every cell up front, so cells that receive no image
# (fewer than 100 validation samples) stay blank
for ax in axes:
    ax.axis('off')
# zipping over the axes bounds the loop by the number of grid cells;
# `err` comes from `errors`, which is True where the prediction is correct
for ax, val_img, pred, err in zip(axes, val_images, val_preds, errors):
    ax.imshow(val_img)
    ax.set_title(f'{label_title[pred]}:{"CORRECT" if err else "WRONG"}')
plt.subplots_adjust()
plt.show()

##### Test

In [None]:
# predict on the held-out test split
preds = classifier.predict(X=test_data)

In [None]:
# per-class precision / recall / F1 on the test split
print('Test Results')
test_report = classification_report(
    y_true=test_label,
    y_pred=preds,
    target_names=['Ads', 'Tourism'],
)
print(test_report)