In [19]:
import torch
torch.cuda.current_device()
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from PIL import Image
import os, csv
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import re
from nltk.tokenize import wordpunct_tokenize

# Data

In [3]:

data_folder = '../dataset/'
with open(data_folder + 'genres.txt', 'r') as f:
    genres_list = [g.replace('\n','') for g in f.readlines()]
movies_train = pd.read_csv(data_folder + 'movies_train.csv')
movies_test = pd.read_csv(data_folder + 'movies_test.csv')
movies_val = pd.read_csv(data_folder + 'movies_val.csv')


## Title

In [4]:
TITLE_MAX_LEN = 15
pad_token = '<PAD>'
unk_token = '<UNK>'

def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = wordpunct_tokenize(text)
    return tokens

def create_vocab(dataset, column='title'):
    df = dataset.copy()
    titles = df[column].tolist()
    vocab = set()
    for title in titles:
        tokens = tokenize(title)
        vocab.update(tokens)
    vocab = list(vocab)
    vocab.append(pad_token)
    vocab.append(unk_token)
    return vocab

def onehot_vectorize(title, title2int):
    tokens = tokenize(title)
    tokens = tokens[:TITLE_MAX_LEN]
    while len(tokens) < TITLE_MAX_LEN:
        tokens.append(pad_token)
    title_vec = np.zeros((TITLE_MAX_LEN,len(title2int)), dtype=np.float32)
    for i, token in enumerate(tokens):
        if token in title2int:
            title_vec[i][title2int[token]] = 1
        else:
            title_vec[i][title2int[unk_token]] = 1
    return title_vec
    
def multihot_genres(genres,  genres_dict):
    genres = genres.strip('][').replace("'", "").split(', ')
    multi_hot = np.zeros(len(genres_dict))
    for genre in genres:
        if genre in genres_dict:
                multi_hot[genres_dict[genre]] = 1
    return multi_hot

## Poster

In [5]:
# Train the model with different image sizes
IMAGE_SIZE_TEST_MAP = {"16": (16, 16),
                       "24": (24, 24),
                       "32": (32, 32),
                       "40": (40, 40),
                       "64": (64, 64),
                       "72": (72, 72),
                       "128": (128, 128),
                       "224": (224, 224),
                       "256": (256, 256)}
IMAGE_SIZE = IMAGE_SIZE_TEST_MAP['224']
transformer = transforms.Compose([
    transforms.Resize(size=IMAGE_SIZE),
    transforms.ToTensor()
])

## Dataset

In [6]:
class titleDataset(Dataset):
    def __init__(self, df):
        self.df = df

        # title process
        vocab = create_vocab(df, column='title')
        self.vocab_size = len(vocab)
        self.title2int = {word: i for i, word in enumerate(vocab)}

        # image process
        self.transformer = transformer

        # genres process
        genres_list = ['Crime', 'Thriller', 'Fantasy', 'Horror', 'Sci-Fi', 'Comedy', 'Documentary', 'Adventure', 'Film-Noir', 'Animation', 'Romance', 'Drama', 'Western', 'Musical', 'Action', 'Mystery', 'War', "Children's"]
        self.genre2int = {genre: i for i, genre in enumerate(genres_list)} 

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        title = self.df.iloc[idx]['title']
        title_vec = onehot_vectorize(title, self.title2int)
        genres = self.df.iloc[idx]['genre']
        genres_vec = multihot_genres(genres, self.genre2int)
        img_path = self.df.iloc[idx]['img_path']
        if not os.path.exists(img_path):
            img_path = '../dataset/images/0.jpg'
        img = Image.open(img_path)
        if len(img.getbands()) == 1: # check if the image have only one channel
            trans = transforms.Grayscale(num_output_channels=3)
            img = trans(img) # convert image to a three-channel image
        img = self.transformer(img)
        return title_vec, img, genres_vec
    
    def merge_vocab(self, other):
        self.title2int.update(other.title2int)
        self.genre2int.update(other.genre2int)
        self.vocab_size = len(self.title2int)

In [7]:
NUM_WORKERS = os.cpu_count()
train = titleDataset(movies_train)
val = titleDataset(movies_val)
test = titleDataset(movies_test)
train.merge_vocab(val)
train.merge_vocab(test)
val.merge_vocab(train)
test.merge_vocab(train)
train_dataloader = DataLoader(train, batch_size=32, shuffle=True, num_workers=NUM_WORKERS)
val_dataloader = DataLoader(val, batch_size=32, shuffle=False, num_workers=NUM_WORKERS)
test_dataloader = DataLoader(test, batch_size=32, shuffle=False, num_workers=NUM_WORKERS)

# Model

In [24]:
class theModel(pl.LightningModule):
    def __init__(self, titleParam, posterParam, num_labels=18, device='cpu'):
        """
        The main model using LSTM for title and TinyVGG for poster
        :param tuple titleParam: (input_size : int, hidden_size : int, num_layers : int, bidirectional : bool)
        :param tuple posterParam: (input_shape, hidden_units)
        """
        super(theModel, self).__init__()
        self.dev = device  # device variable was taken, so using dev instead :(
        self.input_size, self.hidden_size, self.num_layers, self.bidirectional = titleParam
        self.input_shape, self.hidden_units = posterParam
        self.num_labesls = num_labels

        # Title
        self.core = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            bidirectional=self.bidirectional
        )
        linear_size = self.hidden_size
        if self.bidirectional:
            linear_size *= 2

        self.linear = nn.Sequential(
            nn.Linear(linear_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, self.num_labesls),
            # nn.ReLU()
        )

        # Poster
        self.conv_block_1 = nn.Sequential(
            nn.Conv2d(in_channels=self.input_shape, out_channels=self.hidden_units,
                      kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=self.hidden_units,
                      out_channels=self.hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv_block_2 = nn.Sequential(
            nn.Conv2d(in_channels=self.hidden_units,
                      out_channels=self.hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=self.hidden_units,
                      out_channels=self.hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=self.hidden_units *
                      int(IMAGE_SIZE[0]/4)*int(IMAGE_SIZE[0]/4), out_features=1024),
            nn.ReLU(),
                nn.Linear(in_features=1024, out_features=512),
                nn.ReLU(),
                nn.Linear(in_features=512, out_features=256),
                nn.ReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=self.num_labesls)
        )

        # Assembling
        self.fc = nn.Linear(2*self.num_labesls, self.num_labesls)

    def forward(self, title, poster):
        # Title
        Tout, _ = self.core(title)  # , h0)
        # out = [batch_size, seq_len, hidden_size*bidirectional (vector size)]
        # => only take the last element (many to one RNN)
        Tout = Tout[:, -1, :]
        Tout = self.linear(Tout)

        # Poster
        Pout = self.conv_block_1(poster)
        Pout = self.conv_block_2(Pout)
        Pout = self.classifier(Pout)
        
        # Assembling
        out = self.fc(torch.cat((Tout, Pout), dim=1))
        return out

    def training_step(self, train_batch, batch_idx):
        title, img, genres = train_batch
        title_tensor = title.clone().detach().to(self.dev)
        img_tensor = img.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)

        output = self.forward(title_tensor, img_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        return loss

    def validation_step(self, val_batch, batch_idx):
        title, img, genres = val_batch
        title_tensor = title.clone().detach().to(self.dev)
        img_tensor = img.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)

        output = self.forward(title_tensor, img_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        self.log('val_loss', loss)
        # print('val_loss', loss)

    # def test_step(self, test_batch, batch_idx):
    #     title, genres = test_batch
    #     title_tensor = title.clone().detach().to(self.dev)
    #     genre_tensor = genres.clone().detach().to(self.dev)

    #     output = self.forward(title_tensor)

    #     MAP = MAP_at_K(3, output, genre_tensor)
    #     print('MAP@3', MAP)
    #     self.log('MAP@3', MAP)
    #     return {'MAP@3': MAP}

    # def on_test_epoch_end(self, outputs):
    #     avg_MAP = torch.stack([x['MAP@3'] for x in outputs]).mean()
    #     print('avg_MAP@3', avg_MAP)
    #     self.log('avg_MAP@3', avg_MAP)
    #     return {'avg_MAP@3': avg_MAP}

    def predict_step(self, test_batch, batch_idx):
        title, img, genres = test_batch
        title_tensor = title.clone().detach().to(self.dev)
        img_tensor = img.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)

        output = self.forward(title_tensor, img_tensor)
        return output, genre_tensor

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.001)

    def cross_entropy_loss(self, logits, labels):
        return F.cross_entropy(logits, labels)

# Train

In [25]:
NUM_EPOCHS = 20
BATCH_SIZE = 32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(torch.cuda.is_available())
print(train_dataloader.dataset.vocab_size)
titleParam = (train_dataloader.dataset.vocab_size, 128, 2, True)
posterParam = (3, 32)
model = theModel(titleParam, posterParam, num_labels=18, device=device)

trainer = pl.Trainer(max_epochs=NUM_EPOCHS, num_sanity_val_steps=0)
trainer.fit(model, train_dataloader, val_dataloader)

True
4071


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type       | Params
--------------------------------------------
0 | core         | LSTM       | 4.7 M 
1 | linear       | Sequential | 19.1 K
2 | conv_block_1 | Sequential | 10.1 K
3 | conv_block_2 | Sequential | 18.5 K
4 | classifier   | Sequential | 103 M 
5 | fc           | Linear     | 666   
--------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
432.793   Total estimated model params size (MB)


Epoch 19: 100%|██████████| 78/78 [00:19<00:00,  4.05it/s, v_num=81]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 78/78 [00:22<00:00,  3.43it/s, v_num=81]


# Infer and Metric

In [26]:
def P_at_K(k, pred, truth):
    # print(pred)
    _, indices = torch.topk(pred, k=k)
    correct = 0
    for id in indices:
        if truth[id] > 0:
            correct += 1
    return correct / k

def AP_at_K(k, pred, truth):
    AP = 0
    for i in range(1, k+1):
        AP += P_at_K(i, pred, truth) 
    return AP / k

def MAP_at_K(k, pred_list, truth_list):
    MAP = 0
    for i in range(len(pred_list)):
        MAP += AP_at_K(k, pred_list[i], truth_list[i])
    return MAP / len(pred_list)


def normalize(pred, topk=False):
    pred_1 = torch.zeros(pred.shape)
    for i in range(pred.shape[0]):
        if topk:
            ids = torch.topk(pred[i], k=3).indices
        else:
            ids = [j*(pred[i][j] > 0.0) for j in range(len(pred[i]))]
        for id in ids:
            pred_1[i][id] = 1
    return pred_1

In [28]:
res = trainer.predict(model, dataloaders=test_dataloader, ckpt_path="last")
pred = torch.cat([ep[0] for ep in res])
truth = torch.cat([ep[1] for ep in res])

pred_1 = normalize(pred, topk=True)
print(pred.shape)
print(pred_1.shape)
print(truth.shape)

/home/huy/miniconda3/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:186: .predict(ckpt_path="last") is set, but there is no last checkpoint available. No checkpoint will be loaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 25/25 [00:01<00:00, 17.66it/s]
torch.Size([777, 18])
torch.Size([777, 18])
torch.Size([777, 18])


In [31]:
print("--------------------------------------")
print('MAP@1 ', MAP_at_K(1, pred, truth))
print('MAP@2 ', MAP_at_K(2, pred, truth))
print('MAP@3 ', MAP_at_K(3, pred, truth))
print('MAP@4 ', MAP_at_K(4, pred, truth))

print("--------------------------------------")
# from torchmetrics.functional.classification import multilabel_f1_score
from torchmetrics.classification import MultilabelF1Score

thres=0.0
# print('mF1 - micro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='micro').tolist())
# print('mF1 - macro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='macro').tolist())
# print('mF1 - weighted: ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='weighted').tolist())

f1 = MultilabelF1Score(num_labels=18, threshold=thres, average='macro')
print('F1 :', f1(pred, truth).tolist())

print("--------------------------------------")
# from torchmetrics.functional.classification import multilabel_accuracy
from torchmetrics.classification import MultilabelAccuracy
acc = MultilabelAccuracy(num_labels=18, threshold=thres)
print('Accuracy :', acc(pred, truth).tolist())

print("--------------------------------------")
# from torchmetrics.functional.classification import multilabel_precision
from torchmetrics.classification import MultilabelPrecision
prec = MultilabelPrecision(num_labels=18, threshold=thres, average='macro')
print('Precision :', prec(pred, truth).tolist())

print("--------------------------------------")
# from torchmetrics.functional.classification import multilabel_recall
from torchmetrics.classification import MultilabelRecall
rec = MultilabelRecall(num_labels=18, threshold=thres, average='macro')
print('Recall :', rec(pred, truth).tolist())


--------------------------------------
MAP@1  0.39768339768339767
MAP@2  0.37773487773487774
MAP@3  0.3442013442013438
MAP@4  0.3176748176748195
--------------------------------------
F1 : 0.14092479646205902
--------------------------------------
Accuracy : 0.08451308310031891
--------------------------------------
Precision : 0.08451308310031891
--------------------------------------
Recall : 0.9444444179534912
