In [None]:
#Imports
import pandas as pd
import os
import glob
import numpy as np
from tqdm import tqdm
import shutil
import pickle
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import PIL
import sklearn
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from torchvision import models
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch import optim
from torchvision.utils import make_grid
from pprint import pprint
from imgaug import augmenters as iaa

# Specify path (root folder of the dataset; replace ADD_PATH with the local location)
PATH = 'ADD_PATH/AMLS_20-21_20164490/Datasets/celeba/'


#Read in the csv data; fields are separated by runs of whitespace.
#sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
data = pd.read_csv(PATH + 'labels.csv', sep=r'\s+')
#Index the rows by image file name
data = data.set_index('img_name')
#Instead of values 1,-1 use 0,1 (replace all -1 with 0)
data = data.replace(-1, 0)
#Cache the cleaned labels as a pickle file
data.to_pickle(PATH + 'data_pkl.pkl')


#Create the folders that receive the training / validation images.
#exist_ok=True keeps this cell safe to re-run when the folders already exist.
for split_name in ['train', 'valid']:
    os.makedirs(os.path.join(PATH, split_name), exist_ok=True)

In [None]:
#Find the image files/names
filenames = glob.glob(PATH + 'img/*jpg')
#Random shuffle so avoid biased training
shuffle = np.random.permutation(len(filenames))

#Number of shuffled images that go to the training folder; the rest go to validation
N_TRAIN = 4500

#Copy each split into its own folder (with a progress bar) and collect the
#matching label rows. Labels are gathered with a single .loc lookup per split
#instead of DataFrame.append, which was removed in pandas 2.0 and re-copied
#the whole frame on every iteration.
split_frames = {}
for folder, indices in (('train', shuffle[:N_TRAIN]), ('valid', shuffle[N_TRAIN:])):
    copied = []
    for j in tqdm(indices):
        #os.path.basename copes with both '/' and '\\' separators returned by glob
        file = os.path.basename(filenames[j])
        shutil.copy(PATH + 'img/' + file, PATH + folder + '/' + file)
        copied.append(file)
    #Images without a label row are still copied but contribute no label, as before
    labelled = [name for name in copied if name in data.index]
    split_frames[folder] = data.loc[labelled]

#Create dataframes
training_df = split_frames['train']
valid_df = split_frames['valid']

In [None]:
#Persist each split twice under PATH: once as CSV and once as pickle
for frame, csv_name, pkl_name in (
        (training_df, 'train.csv', 'train.pkl'),
        (valid_df, 'valid.csv', 'valid.pkl')):
    frame.to_csv(PATH + csv_name)
    frame.to_pickle(PATH + pkl_name)

In [None]:


#Run on the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

#Dataset that serves one image together with its 'smiling' label per index
class Smiling_loader(Dataset):
    """Map-style dataset pairing image files with their ``smiling`` label.

    Parameters
    ----------
    df : DataFrame whose index holds the image file names and which has a
        ``smiling`` column.
    img_dir : directory the file names are joined onto.
    transform : optional callable applied to the opened image.
    """

    def __init__(self, df, img_dir, transform = None):
        self.dataframe = df
        self.img_dir = img_dir
        self.transform = transform
        #Cache names and labels once so item access is a plain positional lookup
        self.filename = df.index
        self.label = df.smiling.values

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``{'image': ..., 'label': ...}`` for position ``idx``."""
        image_path = os.path.join(self.img_dir, self.filename[idx])
        picture = Image.open(image_path)
        target = self.label[idx]
        #The transform is optional; without it the opened PIL image is returned as-is
        if self.transform:
            picture = self.transform(picture)
        return {'image': picture, 'label': target}

In [None]:
#Start from a pre-constructed architecture shipped with torchvision

#Load VGG-16 with batch normalisation, including its pre-trained weights
model = models.vgg16_bn(pretrained = True)

#Mark every parameter of the network as trainable
for weight in model.parameters():
    weight.requires_grad_(True)

#Swap the last classifier layer for a deeper head that ends in 2 outputs
n_inputs = model.classifier[6].in_features
head_layers = [
    nn.Linear(n_inputs, 2048), nn.ReLU(), nn.Dropout(0.65),
    nn.Linear(2048, 1024), nn.ReLU(), nn.Dropout(0.6),
    nn.Linear(1024, 512), nn.ReLU(),
    nn.Linear(512, 2),
]
model.classifier[6] = nn.Sequential(*head_layers)

#Move the model to the selected device
model.to(device)

In [None]:
#Callable wrapper that applies an imgaug augmentation pipeline to one image
class ImAugtransforms:
    """Random augmentation built from imgaug augmenters.

    The pipeline applies, in order: an occasional Gaussian blur, a rotation
    between -30 and 30 degrees, an occasional pixel or coarse dropout, and a
    hue/saturation shift.
    """

    def __init__(self):
        blur = iaa.Sometimes(0.3, iaa.GaussianBlur(sigma=(0, 2.0)))
        rotation = iaa.Affine(rotate=(-30, 30), mode='symmetric')
        dropout_choice = iaa.OneOf([iaa.Dropout(p=(0, 0.1)),
                                    iaa.CoarseDropout(0.1, size_percent=0.5)])
        occasional_dropout = iaa.Sometimes(0.25, dropout_choice)
        colour_shift = iaa.AddToHueAndSaturation(value=(-10, 10), per_channel=True)
        self.aug = iaa.Sequential([blur, rotation, occasional_dropout, colour_shift])

    def __call__(self, img):
        """Convert ``img`` to a numpy array and return the augmented array."""
        pixels = np.array(img)
        return self.aug.augment_image(pixels)

#Pre-processing pipelines for the training and validation images

#Per-channel mean / std handed to Normalize in both pipelines
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

#Training: random augmentation first, then the same resize/crop/normalise steps
#as validation. ImAugtransforms returns a numpy array, so it is converted back
#to a PIL image before the torchvision transforms run.
#(disabled: transforms.RandomHorizontalFlip(p=0.5), transforms.RandomGrayscale(p=0.35))
train_steps = [
    ImAugtransforms(),
    PIL.Image.fromarray,
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
train_trns = transforms.Compose(train_steps)

#Validation: deterministic resize, centre crop, tensor conversion and normalisation
valid_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
valid_trns = transforms.Compose(valid_steps)

In [None]:
#Read the training dataframe
training_df = pd.read_pickle(PATH +'train.pkl')
training_directory = PATH + 'train/'

#Read the validation dataframe
valid_df = pd.read_pickle(PATH + 'valid.pkl')
validation_directory = PATH + 'valid/'

#Wrap both splits in the Smiling_loader dataset class
training_dataloader = Smiling_loader(training_df, training_directory, transform=train_trns)
validation_dataloader = Smiling_loader(valid_df, validation_directory, transform=valid_trns)

#Plot three images to check transformations.
#The training transform ends with Normalize, so the tensors are mapped back to
#the [0, 1] range before plotting; otherwise imshow clips them and the preview
#does not show the real augmented picture. make_grid is not needed for a
#single image (it returns a lone image unchanged).
_display_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
_display_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
preview_indices = (1, 7, 9)
for idx in preview_indices:
    preview = training_dataloader[idx]['image']
    preview = (preview * _display_std + _display_mean).clamp(0, 1)
    #imshow expects height x width x channels
    plt.imshow(preview.permute(1, 2, 0))
    plt.show()

#Print the matching labels; read them straight from the dataset's label array
#so the images are not loaded and augmented a second time
for idx in preview_indices:
    print(training_dataloader.label[idx])

In [None]:
#Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 5

#Batch the two datasets; only the training set is reshuffled
train_dl = DataLoader(dataset=training_dataloader, batch_size=BATCH_SIZE, shuffle=True)
valid_dl = DataLoader(dataset=validation_dataloader, batch_size=BATCH_SIZE, shuffle=False)

#Loss function, and an SGD optimizer that is handed only the classifier parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.classifier.parameters(), momentum=0.9, lr=0.01)

def get_num_correct(preds, labels):
    """Count how many rows of ``preds`` have their largest score at ``labels``.

    ``preds`` is indexed as (row, class) and ``labels`` holds one class index
    per row. Returns the number of matches as a Python number.
    """
    predicted_classes = preds.argmax(dim=1)
    matches = predicted_classes == labels
    return matches.sum().item()

In [None]:
#Run one epoch of training or evaluation
def fit_model(epochs, model, dataloader, phase='training', volatile=False):
    """Run one pass over ``dataloader`` and report its loss and accuracy.

    Relies on the notebook-level ``device``, ``criterion``, ``optimizer`` and
    ``get_num_correct``.

    Parameters
    ----------
    epochs : value printed as the epoch number (used for logging only).
    model : network to train or evaluate.
    dataloader : iterable of batches shaped ``{'image': tensor, 'label': tensor}``.
    phase : ``'training'`` puts the model in train mode and updates the
        weights; any other value (e.g. ``'validation'``) puts the model in
        eval mode and runs without gradient tracking.
    volatile : ignored; kept only so existing calls keep working.

    Returns
    -------
    (loss, accuracy) : mean of the per-batch losses, and the fraction of
        samples classified correctly over the whole pass. Both are NaN when
        the dataloader yields no batches.
    """
    pprint("Epoch: {}".format(epochs))

    #Anything that is not the training phase is treated as evaluation, so a
    #misspelt phase name can no longer leave dropout / batch-norm in train mode
    is_training = phase == 'training'
    if is_training:
        model.train()
    else:
        model.eval()

    running_loss = []
    n_correct = 0
    n_seen = 0

    #Gradients are tracked only while training; evaluation skips the autograd graph
    with torch.set_grad_enabled(is_training):
        for data in dataloader:
            #Use the configured device so the function also runs on CPU-only machines
            inputs = data['image'].to(device)
            target = data['label'].to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs.float())
            outputs = outputs.to(device=device, dtype=torch.float32)
            loss = criterion(outputs, target)

            #Count against the real batch length: the last batch may be shorter
            n_correct += get_num_correct(outputs, target)
            n_seen += target.size(0)
            running_loss.append(loss.item())

            #If training phase, send loss backwards so model can 'learn'
            if is_training:
                loss.backward()
                optimizer.step()

    total_batch_loss = np.asarray(running_loss).mean() if running_loss else float('nan')
    total_batch_acc = n_correct / n_seen if n_seen else float('nan')

    pprint("{} loss is {} ".format(phase, total_batch_loss))
    pprint("{} accuracy is {} ".format(phase, total_batch_acc))

    return total_batch_loss, total_batch_acc

In [None]:
#Per-epoch history: one loss and one accuracy list for each phase
trn_losses, trn_acc = [], []
val_losses, val_acc = [], []

In [None]:
#Run every epoch: one training pass, then one validation pass, with a progress bar
for epoch in tqdm(range(1, EPOCHS + 1)):
    trn_l, trn_a = fit_model(epoch, model, train_dl)
    val_l, val_a = fit_model(epoch, model, valid_dl, phase='validation')

    #Record this epoch's results in the history lists
    trn_losses.append(trn_l)
    trn_acc.append(trn_a)
    val_losses.append(val_l)
    val_acc.append(val_a)