# Load data and define pipeline

In [87]:
import pickle
import numpy as np
from skimage import io
import pandas as pd

from tqdm import tqdm, tqdm_notebook
from PIL import Image
from pathlib import Path

import torch
from torch import tensor
import torchvision
from torchvision import transforms
from multiprocessing.pool import ThreadPool
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

from matplotlib import colors, pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [19]:
# Dataset modes accepted by SimpsonsDataset.
DATA_MODES = ['train', 'val', 'test']
# Side length, in pixels, that every image is resized to before being fed to the model.
RESCALE_SIZE = 299
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA instead of failing on the first `.to(DEVICE)`.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
class SimpsonsDataset(Dataset):
    """Dataset of character images read lazily from disk.

    Every item is an image resized to ``RESCALE_SIZE x RESCALE_SIZE``, scaled
    to ``[0, 1]``, converted to a ``float32`` tensor and normalised. In
    ``'train'`` and ``'val'`` modes the label is the name of the file's parent
    directory and items are ``(image, label_id)`` pairs; in ``'test'`` mode
    only the image tensor is returned.

    Random augmentation (brightness jitter and flips) is applied in
    ``'train'`` mode only, so validation metrics and test predictions are
    deterministic.

    Args:
        files: iterable of ``pathlib.Path`` objects pointing at image files.
        mode: one of ``DATA_MODES``.

    Raises:
        NameError: if ``mode`` is not listed in ``DATA_MODES``.

    NOTE(review): every non-test dataset fits its own ``LabelEncoder`` and
    overwrites ``label_encoder.pkl``; label ids agree between datasets only
    when they contain the same set of classes.
    """

    def __init__(self, files, mode):
        super().__init__()
        self.files = sorted(files)
        self.mode = mode

        if self.mode not in DATA_MODES:
            # NameError is kept so existing callers that catch it still work;
            # the explanation now travels with the exception.
            raise NameError(
                f"{self.mode} is not correct; correct modes: {DATA_MODES}"
            )

        self.len_ = len(self.files)
        self.label_encoder = LabelEncoder()

        if self.mode != 'test':
            self.labels = [path.parent.name for path in self.files]
            self.label_encoder.fit(self.labels)
            # Encode all labels once instead of calling transform() per item.
            self.label_ids = self.label_encoder.transform(self.labels)

            # Persist the encoder so predictions can be decoded later.
            with open('label_encoder.pkl', 'wb') as le_dump_file:
                pickle.dump(self.label_encoder, le_dump_file)

        # Build the pipeline once rather than on every __getitem__ call.
        self.transform = self._build_transform()

    def __len__(self):
        return self.len_

    def _build_transform(self):
        """Return the tensor pipeline for the current mode."""
        steps = [transforms.ToTensor()]
        if self.mode == 'train':
            # Random augmentation is only meaningful while fitting.
            steps.extend([
                transforms.ColorJitter(brightness=0.5),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomVerticalFlip(p=0.3),
            ])
        # Mean/std commonly used with ImageNet-pretrained torchvision models.
        steps.append(
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        )
        return transforms.Compose(steps)

    def load_sample(self, file):
        """Open ``file`` and return it as a fully loaded RGB PIL image."""
        with Image.open(file) as image:
            # convert() reads the pixel data and returns a new image, so the
            # file can be closed; forcing RGB keeps grayscale/RGBA files
            # compatible with the three-channel normalisation.
            return image.convert('RGB')

    def __getitem__(self, index):
        x = self.load_sample(self.files[index])
        x = self._prepare_sample(x)
        # Scale to [0, 1]; ToTensor does not rescale float arrays itself.
        x = np.array(x / 255, dtype='float32')
        x = self.transform(x)
        if self.mode == 'test':
            return x
        return x, int(self.label_ids[index])

    def _prepare_sample(self, image):
        """Resize ``image`` to the square model input size; return an ndarray."""
        image = image.resize((RESCALE_SIZE, RESCALE_SIZE))
        return np.array(image)

In [22]:
# Mount Google Drive into the Colab filesystem; the dataset archive is read
# from it and the model checkpoints are written to it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Extract the dataset archive from the mounted Drive into the current directory.
! unzip drive/MyDrive/datasets/journey-springfield.zip

In [24]:
# Locations of the extracted images, relative to the working directory.
TRAIN_DIR = Path('train/simpsons_dataset')
TEST_DIR = Path('testset/testset')

# Recursively collect every .jpg under each directory, in sorted order.
train_val_files = sorted(TRAIN_DIR.rglob('*.jpg'))
test_files = sorted(TEST_DIR.rglob('*.jpg'))

In [25]:
from sklearn.model_selection import train_test_split

# The class of an image is the name of the directory that contains it.
train_val_labels = [path.parent.name for path in train_val_files]

# Stratified 75/25 split. The seed is fixed so that re-running the notebook
# (for example after reloading a saved checkpoint) yields the same validation
# set; otherwise images the model was trained on could end up in validation.
train_files, val_files = train_test_split(
    train_val_files,
    test_size=0.25,
    stratify=train_val_labels,
    random_state=42,
)

In [26]:
def fit_epoch(model, train_loader, criterion, optimizer):
    """Run one training pass over ``train_loader``.

    The model is switched to training mode and, for every batch, a forward
    pass, backward pass and optimiser step are performed on ``DEVICE``.

    Args:
        model: network to optimise.
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimiser bound to the model parameters.

    Returns:
        ``(train_loss, train_acc)``: the sample-weighted mean loss and the
        fraction of samples whose argmax prediction matched the label.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(DEVICE), batch_y.to(DEVICE)
        batch_len = batch_x.size(0)

        optimizer.zero_grad()
        logits = model(batch_x)
        batch_loss = criterion(logits, batch_y)
        batch_loss.backward()
        optimizer.step()

        # The criterion output is multiplied by the batch length so the
        # epoch figure is weighted by the number of samples in each batch.
        loss_sum += batch_loss.item() * batch_len
        n_correct += (logits.argmax(1) == batch_y.data).sum()
        n_seen += batch_len

    return loss_sum / n_seen, n_correct.cpu().numpy() / n_seen

In [27]:
def eval_epoch(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        model: network to evaluate; it is switched to eval mode.
        val_loader: iterable yielding ``(inputs, labels)`` batches.
        criterion: loss callable taking ``(outputs, labels)``.

    Returns:
        ``(val_loss, val_acc)``: the sample-weighted mean loss (float) and the
        accuracy as a double-precision tensor.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_x, batch_y in val_loader:
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)
        batch_len = batch_x.size(0)

        # Gradients are not needed for evaluation.
        with torch.no_grad():
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)
            predicted = logits.argmax(1)

        loss_sum += batch_loss.item() * batch_len
        n_correct += (predicted == batch_y.data).sum()
        n_seen += batch_len

    return loss_sum / n_seen, n_correct.double() / n_seen

In [28]:
def train(train_dataset, val_dataset, model, epochs, batch_size):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    A new Adam optimiser (default hyper-parameters) and a cross-entropy loss
    are created on every call, so optimiser state is not carried over between
    calls.

    Args:
        train_dataset: dataset used for fitting (shuffled every epoch).
        val_dataset: dataset used for validation (not shuffled).
        model: network to train.
        epochs: number of epochs to run.
        batch_size: batch size for both loaders.

    Returns:
        A list with one ``(train_loss, train_acc, val_loss, val_acc)`` tuple
        per epoch.
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    log_template = (
        "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} "
        "    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"
    )
    history = []

    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        opt = torch.optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss()

        for epoch_no in range(1, epochs + 1):
            train_loss, train_acc = fit_epoch(model, train_loader, criterion, opt)
            print("loss", train_loss)

            val_loss, val_acc = eval_epoch(model, val_loader, criterion)
            history.append((train_loss, train_acc, val_loss, val_acc))

            pbar_outer.update(1)
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(log_template.format(
                ep=epoch_no,
                t_loss=train_loss,
                v_loss=val_loss,
                t_acc=train_acc,
                v_acc=val_acc,
            ))

    return history

In [29]:
def predict(model, test_loader):
    """Return class probabilities for every sample in ``test_loader``.

    Args:
        model: trained network; it is switched to eval mode.
        test_loader: iterable yielding batches of input tensors (no labels).

    Returns:
        A NumPy array with one row per sample, holding the softmax of the
        model outputs over the last dimension.
    """
    # Switching to eval mode is loop-invariant, so do it once up front rather
    # than on every batch.
    model.eval()
    logits = []

    with torch.no_grad():
        for inputs in test_loader:
            outputs = model(inputs.to(DEVICE)).cpu()
            logits.append(outputs)

    probs = nn.functional.softmax(torch.cat(logits), dim=-1).numpy()
    return probs

# Make Oversampling

In [30]:
# Group the training files by class, i.e. by the name of their parent directory.
data = {}
for item in train_files:
    data.setdefault(item.parent.name, []).append(item)

# Show how many images each class has before oversampling.
for key, class_files in data.items():
    print(f"{key}: {len(class_files)}")

kent_brockman: 373
homer_simpson: 1684
abraham_grampa_simpson: 685
ned_flanders: 1090
apu_nahasapeemapetilon: 467
principal_skinner: 895
marge_simpson: 968
lisa_simpson: 1015
chief_wiggum: 739
milhouse_van_houten: 809
cletus_spuckler: 35
bart_simpson: 1006
krusty_the_clown: 904
moe_szyslak: 1089
charles_montgomery_burns: 895
barney_gumble: 80
comic_book_guy: 352
edna_krabappel: 343
sideshow_bob: 658
nelson_muntz: 269
mayor_quimby: 185
lenny_leonard: 233
groundskeeper_willie: 91
selma_bouvier: 77
maggie_simpson: 96
waylon_smithers: 136
ralph_wiggum: 67
carl_carlson: 74
agnes_skinner: 32
martin_prince: 53
professor_john_frink: 49
miss_hoover: 13
snake_jailbird: 41
otto_mann: 24
patty_bouvier: 54
fat_tony: 20
rainier_wolfcastle: 34
gil: 20
sideshow_mel: 30
disco_stu: 6
troy_mcclure: 6
lionel_hutz: 2


In [31]:
# Oversample rare classes: a class with n < 100 files has its list repeated
# (100 // n + 1) times in place, which always leaves it with more than 100 entries.
for key, class_files in data.items():
    n_files = len(class_files)
    if n_files < 100:
        class_files *= 100 // n_files + 1

In [32]:
# Show the per-class counts again, now after oversampling.
for key, class_files in data.items():
    print(f"{key}: {len(class_files)}")

kent_brockman: 373
homer_simpson: 1684
abraham_grampa_simpson: 685
ned_flanders: 1090
apu_nahasapeemapetilon: 467
principal_skinner: 895
marge_simpson: 968
lisa_simpson: 1015
chief_wiggum: 739
milhouse_van_houten: 809
cletus_spuckler: 105
bart_simpson: 1006
krusty_the_clown: 904
moe_szyslak: 1089
charles_montgomery_burns: 895
barney_gumble: 160
comic_book_guy: 352
edna_krabappel: 343
sideshow_bob: 658
nelson_muntz: 269
mayor_quimby: 185
lenny_leonard: 233
groundskeeper_willie: 182
selma_bouvier: 154
maggie_simpson: 192
waylon_smithers: 136
ralph_wiggum: 134
carl_carlson: 148
agnes_skinner: 128
martin_prince: 106
professor_john_frink: 147
miss_hoover: 104
snake_jailbird: 123
otto_mann: 120
patty_bouvier: 108
fat_tony: 120
rainier_wolfcastle: 102
gil: 120
sideshow_mel: 120
disco_stu: 102
troy_mcclure: 102
lionel_hutz: 102


In [33]:
# Flatten the per-class lists back into one list of training files.
train_files = [path for class_files in data.values() for path in class_files]

In [76]:
# Each non-test dataset writes its fitted encoder to label_encoder.pkl when it
# is constructed; the training dataset is built after the validation one, so
# the file left on disk is the one fitted on the training labels.
val_dataset = SimpsonsDataset(val_files, mode='val')
train_dataset = SimpsonsDataset(train_files, mode='train')
test_dataset = SimpsonsDataset(test_files, mode='test')

# Training model

In [35]:
# Inception v3 from torchvision, requested with pretrained weights and with
# aux_logits=False.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed version.
model = torchvision.models.inception_v3(pretrained=True, aux_logits=False)

In [36]:
# Replace the final classifier with one sized for this dataset. The input
# width is read from the existing layer and the number of outputs from the
# distinct labels, instead of hard-coding 2048 and 42.
model.fc = nn.Linear(model.fc.in_features, len(set(train_val_labels)))

In [None]:
# Move the model parameters to the training device.
model.to(DEVICE)

In [None]:
# Train in five rounds of ten epochs with batch size 64, saving a checkpoint
# after each round; the file name carries the cumulative epoch count
# (10, 20, ..., 50). Every call to train() creates a new optimiser, so
# optimiser state restarts at the beginning of each round.
# The per-epoch metrics returned by train() are collected instead of being
# discarded, so learning curves can be inspected afterwards.
history = []
for i in range(5):
    history.extend(train(train_dataset, val_dataset, model, 10, 64))
    path = f'/content/drive/MyDrive/Colab Notebooks/weigths_sympsons{i+1}0.pt'
    torch.save(model.state_dict(), path)

# Make submission

In [None]:
# Restore the weights saved after the last training round. The path matches
# the file the training loop writes (Drive folder, `.pt` extension); the
# previous '/content/...50.pth' path is not produced anywhere in this notebook.
# map_location lets the checkpoint load on whichever device is in use.
model.load_state_dict(torch.load(
    '/content/drive/MyDrive/Colab Notebooks/weigths_sympsons50.pt',
    map_location=DEVICE,
))

In [62]:
# Run inference in batches; DataLoader's default batch_size of 1 would push
# the test images through the network one at a time. Order is preserved
# (shuffle=False) so predictions line up with test_files.
load_test = DataLoader(test_dataset, batch_size=64, shuffle=False)
probs = predict(model, load_test)
# Collapse the per-class probabilities to the index of the most likely class.
probs = np.argmax(probs, axis=1)

In [77]:
# Load the label encoder that SimpsonsDataset pickled during construction.
# NOTE(review): the dataset writes to the relative path 'label_encoder.pkl';
# this assumes the working directory is /content — confirm.
# pickle is only safe here because the file was produced by this notebook.
with open('/content/label_encoder.pkl', 'rb') as f:
    enc = pickle.load(f)

In [None]:
# Map the predicted class indices back to class names with the encoder.
probs = probs.tolist()
subs = enc.inverse_transform(probs)

In [None]:
# Build the submission table: one row per test image, pairing the file name
# ('Id') with the predicted class name ('Expected').
names = pd.Series([path.name for path in test_files], name='Id')
subs = pd.Series(subs, name='Expected')
submission = pd.DataFrame({names.name: names, subs.name: subs})

In [96]:
# Write the submission to sub.csv in the working directory, without the index column.
submission.to_csv('sub.csv', index=False)