# COMP309 Capstone Project
#### Marrick Lip, 2019

## [0] Setup

### 0.1 Get the dependencies

In [3]:
#%%capture
# ^ hide the output

#!conda update --all --yes
#!conda install cudnn --yes

!pip install --upgrade pip
!pip install tensorflow-gpu
!pip install imutils
!pip install tqdm
!pip install pyyaml h5py
!pip install tensorflow-hub

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages (19.3.1)
Collecting tensorflow-gpu
  Using cached https://files.pythonhosted.org/packages/25/44/47f0722aea081697143fbcf5d2aa60d1aee4aaacb5869aee2b568974777b/tensorflow_gpu-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl
Processing /home/ec2-user/.cache/pip/wheels/d7/de/2e/efa132238792efb6459a96e85916ef8597fcb3d2ae51590dfd/wrapt-1.11.2-cp36-cp36m-linux_x86_64.whl
Processing /home/ec2-user/.cache/pip/wheels/7c/06/54/bc84598ba1daf8f970247f550b175aaaee85f68b4b0c5ab2c6/termcolor-1.1.0-cp36-none-any.whl
[31mERROR: fastai 1.0.55 requires nvidia-ml-py3, which is not installed.[0m
[31mERROR: thinc 6.12.1 has requirement msgpack<0.6.0,>=0.5.6, but you'll have msgpack 0.6.1 which is incompatible.[0m
[31mERROR: thinc 6.12.1 has requirement wrapt<1.11.0,>=1.10.0, but you'll have wrapt 1.11.2 which is incompatible.[0m
Installing collected packages: wrapt, termcolor, tensorflow-gpu
  Found exi

### 0.2 Import the dependencies

In [2]:
%matplotlib inline

import glob
import math
import os
import random
import shutil
from collections import defaultdict

import imageio
import matplotlib.pyplot as plt
import numpy as np
import requests
import skimage
import skimage.color
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm_notebook as tqdm

def make_dir(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Calling this for a directory that already exists is a no-op.
    ``exist_ok=True`` removes the gap between "check" and "create" that a
    separate ``os.path.exists`` test would leave open.

    Args:
        path: directory to create.

    Returns:
        None, so a trailing call in a notebook cell displays no output.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

ModuleNotFoundError: No module named 'tensorflow'

### 0.3 Constants

In [None]:
# Size (in pixels) every image is resized to; 300 is the alternative the
# original comments mention.
IMG_HEIGHT, IMG_WIDTH = 224, 224

# Fraction of the provided images that goes into the training split.
train_pct = 0.8

# Number of images per batch produced by the data generators.
batch_size = 24

## [1] Get the Data

### 1.0 Create directories for the test and train data

In [None]:
!rm -rf data
make_dir('data')
make_dir('data/train')
make_dir('data/test')

### 1.1 Get the provided dataset

#### 1.1.1 Download and unzip the images

In [None]:
# Download the provided archive only when train.zip is not already present,
# so re-running the cell reuses the existing file.
if not os.path.exists('train.zip'):
    !curl https://ecs.victoria.ac.nz/foswiki/pub/Courses/COMP309_2019T2/Assignments/Train_data_2019.zip -o train.zip
# Unpack into data/ (-o: overwrite existing files without prompting, -q: quiet).
!unzip -oq train.zip -d data

#### 1.1.2 Split the images using 309 as a seed

In [None]:
random.seed(309)
for label_dir in glob.glob('data/Train_data/*'):
    label = label_dir.split('/')[-1]
    make_dir(f'data/train/{label}')
    make_dir(f'data/test/{label}')
    
    images = glob.glob(f'{label_dir}/*')
    random.shuffle(images)
    
    split_index = math.floor(len(images) * train_pct)
    for i, image in tqdm(enumerate(images), label):
        train_or_test = 'train' if i < split_index else 'test'
        new_path = image.replace('Train_data', train_or_test)
        shutil.move(image, new_path)

# clean-up the unzipped directory (will be empty)
!rm -rf data/Train_data
!rm -rf data/__MACOSX

### 1.2 Get images from ImageNet

#### 1.2.1 Fetch the ImageNet urls

In [None]:
# show what is currently inside the imagenet_urls/ directory
!ls imagenet_urls/

In [None]:
imagenet_urls_path = 'imagenet_urls/fall11_urls.txt'
if not os.path.exists(imagenet_urls_path):
    !rm -rf imagenet_urls 
    !mkdir imagenet_urls

    # n.b. the main site is down: use a mirror
    !wget https://v.im.cyut.edu.tw/ftp/18/imagenet_fall11_urls.tgz -O imagenet_urls/imagenet_fall11_urls.tgz
    !tar -xvf imagenet_urls/imagenet_fall11_urls.tgz -C imagenet_urls

raw_image_urls = list(open(imagenet_urls_path, encoding='ISO-8859-1'))

#### 1.2.2 Find images for each class

In [None]:
# WordNet ids whose images are collected for each of the three classes.
word_net_ids = {
    'strawberry': ['n07745940'],
    'cherry': ['n07757132', 'n07757312', 'n07757874', 'n07757990'],
    # NOTE(review): the same id is listed twice - harmless for the membership
    # test below, but possibly a second, different id was intended; confirm.
    'tomato': ['n07734292', 'n07734292']
}

# class name -> list of image URLs found for that class
image_urls_by_class = defaultdict(list)
for line in tqdm(raw_image_urls):
    # lines are parsed as "<id>_<suffix>\t<url>"; a line without a tab has no
    # URL field, so skip it instead of failing with an IndexError
    fields = line.split('\t')
    if len(fields) < 2:
        continue
    image_id = fields[0].split('_')[0]
    image_url = fields[1].strip()

    for class_name, ids in word_net_ids.items():
        if image_id in ids:
            image_urls_by_class[class_name].append(image_url)

#### 1.2.3 Download applicable images from ImageNet
Note: images from Flickr are filtered out as they may be in the evaluation data set.

In [None]:
# `dead_images` remembers URLs that could not be downloaded or decoded so they
# are not retried; it is created only if an earlier run of this cell has not
# already defined it.
try:
    dead_images
except NameError:
    dead_images = [] # don't retry these again this session

for class_name, urls in image_urls_by_class.items():
    # filter out images from flickr
    not_flickr = [url for url in urls if 'flickr' not in url]
    for i, url in tqdm(list(enumerate(not_flickr)), class_name):
        extension = url.split('.')[-1].lower()
        if extension not in ['jpg', 'jpeg', 'png']:
            continue

        # skip anything already downloaded or already known to be dead
        out_image_path = f'data/train/{class_name}/image_net_{i:03}.jpg'
        if os.path.exists(out_image_path) or url in dead_images:
            continue

        # RequestException is the base class of every requests error
        # (timeouts, connection failures, malformed URLs, ...)
        try:
            response = requests.get(url, timeout=(2, 5), allow_redirects=False)
        except requests.exceptions.RequestException: # can't download image
            dead_images.append(url)
            continue
        # explicit check rather than `assert`, which is stripped under `python -O`
        if response.status_code != 200:
            dead_images.append(url)
            continue

        temp_file_name = f'temp.{extension}'
        with open(temp_file_name, 'wb') as temp_file:
            temp_file.write(response.content)

        try:
            image = imageio.imread(temp_file_name)
            # drop the alpha channel; the ndim check keeps a 2-D greyscale
            # image that happens to be 4 pixels wide from being misread as RGBA
            if image.ndim == 3 and image.shape[-1] == 4:
                image = skimage.color.rgba2rgb(image)

            # img_as_ubyte rescales according to the source dtype, so 16-bit
            # images are handled as well as the floats produced by rgba2rgb
            # (a plain `* 255` overflows for integer dtypes)
            if image.dtype != np.uint8:
                image = skimage.img_as_ubyte(image)
            imageio.imwrite(out_image_path, image)
        except (ValueError, OSError): # isn't a valid image
            dead_images.append(url)
            continue
        finally:
            # the temporary download is removed whether or not decoding worked
            os.remove(temp_file_name)

### 1.3 Process images from Google Images

In [None]:
# this is manually uploaded
!unzip -oq google_images.zip

all_images = enumerate(glob.glob('google_images/*/*'))
for i, image_path in tqdm(list(all_images)):
    class_name = image_path.split('/')[-2].replace('cherry_tomato', 'tomato')
    
    image = imageio.imread(image_path)
    imageio.imwrite(f'data/train/{class_name}/google_images_{i:04}.jpg', image)
    
!rm -rf google_images

In [None]:
# report how many files each training class folder now contains
for class_name in ('tomato', 'cherry', 'strawberry'):
    image_count = len(glob.glob(f'data/train/{class_name}/*'))
    print(class_name, image_count)

## [2] Pre-processing

### 2.1 Specify the data augmentation to use

In [None]:
# Keyword arguments forwarded to ImageDataGenerator for the training images.
data_augmentation = {
    'horizontal_flip': True,
    'rotation_range': 35,
    'zoom_range': 0.25,
    'width_shift_range': 0.15,
    'height_shift_range': 0.1,
    'shear_range': 25,
    'brightness_range': [0.75, 1.25],
}

### 2.2 Build the ImageDataGenerators
Note: the test data isn't augmented here; test-time augmentation (TTA) is implemented later.

In [None]:
# Training images: pixel values scaled by 1/255 plus the random augmentation.
print('Train:', end=' ')
train_augmenter = ImageDataGenerator(rescale=1.0/255, **data_augmentation)
train_data_gen = train_augmenter.flow_from_directory(
    directory='data/train',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=batch_size,
)

# Test images: only the 1/255 scaling, no augmentation.
print('Test:', end=' ')
test_rescaler = ImageDataGenerator(rescale=1.0/255)
test_data_gen = test_rescaler.flow_from_directory(
    directory='data/test',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=batch_size,
)

### 2.3 Visualise the data augmentation

In [None]:
# Grid of previews: the row selects which image of the first training batch is
# shown, and every cell fetches that batch again.
# NOTE(review): presumably each fetch re-draws the random augmentation, so the
# columns show different variants of the same image - confirm.
_, axes = plt.subplots(4, 5)
n_rows, n_cols = axes.shape

for row in range(n_rows):
    for col in tqdm(range(n_cols)):
        batch_images = train_data_gen[0][0]
        axes[row, col].imshow(batch_images[row])

## [3] Training

### 3.1 Create a transfer learning model

In [None]:
# one history dict is appended per training stage
train_histories = []

# Create the base model from the pre-trained model MobileNet V2
base_model = tf.keras.applications.MobileNetV2(
    # Keras image tensors are (height, width, channels); this matches the
    # target_size=(IMG_HEIGHT, IMG_WIDTH) used by the data generators
    input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
    include_top=False,
    weights='imagenet'
)

base_model.trainable = False # freeze the pre-trained bit

# wrap the model in a sequential model: the frozen feature extractor followed
# by pooling, dropout and a 3-way softmax (one output per class)
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.AveragePooling2D(),
    Flatten(),
    Dropout(0.25),
    Dense(3, activation='softmax'),
])

### 3.2 Train the tail of the model (the newly added layers)

In [None]:
# Adam with categorical cross-entropy, tracking accuracy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# first stage: 15 epochs, with the test split used as validation data
r = model.fit_generator(train_data_gen, epochs=15, validation_data=test_data_gen)

# keep this stage's metrics alongside those of the other stages
train_histories.append(r.history)

### 3.3 Unfreeze some of the pre-trained model and continue training

In [None]:
# Make the base model trainable again, then re-freeze its first 100 layers so
# only the layers after them (plus the new head) are updated.
base_model.trainable = True
for layer in base_model.layers[:100]:
    layer.trainable = False

# Re-compile so the changed `trainable` flags take effect.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# continue from where the first stage stopped: epochs 16-30
r = model.fit_generator(
    train_data_gen,
    validation_data=test_data_gen,
    initial_epoch=15,
    epochs=30,
)

train_histories.append(r.history)

### 3.4 Unfreeze more of the pre-trained model and continue training

In [None]:
# Unfreeze further: now only the first 50 layers of the base model stay frozen.
base_model.trainable = True
for layer in base_model.layers[:50]:
    layer.trainable = False

# Re-compile so the changed `trainable` flags take effect.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=3e-6),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# The previous stage (3.3) ended at epoch 30, so this one runs epochs 31-45:
# still 15 epochs of training, but numbered as a continuation instead of
# repeating the 16-30 range.
r = model.fit_generator(
    train_data_gen,
    validation_data=test_data_gen,
    initial_epoch=30,
    epochs=45,
)

train_histories.append(r.history)