In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import os, csv
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import re
from nltk.tokenize import wordpunct_tokenize

In [2]:

data_folder = '../dataset/'
with open(data_folder + 'genres.txt', 'r') as f:
    genres_list = [g.replace('\n','') for g in f.readlines()]
movies_train = pd.read_csv(data_folder + 'movies_train.csv')
movies_test = pd.read_csv(data_folder + 'movies_test.csv')
movies_val = pd.read_csv(data_folder + 'movies_val.csv')
# movies_train, movies_val = train_test_split(movies_train, test_size=0.1, random_state=42)
# movies_train.reset_index(drop=True, inplace=True)
# movies_test.reset_index(drop=True, inplace=True)
# movies_val.reset_index(drop=True, inplace=True)


In [3]:
# get the maximum number of genres
movies_train['genre'].str.len().idxmax()

1788

In [4]:
TITLE_MAX_LEN = 15
pad_token = '<PAD>'
unk_token = '<UNK>'

def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = wordpunct_tokenize(text)
    return tokens

def create_vocab(dataset, column='title'):
    df = dataset.copy()
    titles = df[column].tolist()
    vocab = set()
    for title in titles:
        tokens = tokenize(title)
        vocab.update(tokens)
    vocab = list(vocab)
    vocab.append(pad_token)
    vocab.append(unk_token)
    return vocab

def onehot_vectorize(title, title2int):
    tokens = tokenize(title)
    tokens = tokens[:TITLE_MAX_LEN]
    while len(tokens) < TITLE_MAX_LEN:
        tokens.append(pad_token)
    title_vec = np.zeros((TITLE_MAX_LEN,len(title2int)), dtype=np.float32)
    for i, token in enumerate(tokens):
        if token in title2int:
            title_vec[i][title2int[token]] = 1
        else:
            title_vec[i][title2int[unk_token]] = 1
    return title_vec
    
def multihot_genres(genres,  genres_dict):
    genres = genres.strip('][').replace("'", "").split(', ')
    multi_hot = np.zeros(len(genres_dict))
    for genre in genres:
        if genre in genres_dict:
                multi_hot[genres_dict[genre]] = 1
    return multi_hot

In [5]:
class titleDataset(Dataset):
    def __init__(self, df):
        self.df = df

        # title process
        vocab = create_vocab(df, column='title')
        self.vocab_size = len(vocab)
        self.title2int = {word: i for i, word in enumerate(vocab)}

        # genres process
        genres_list = ['Crime', 'Thriller', 'Fantasy', 'Horror', 'Sci-Fi', 'Comedy', 'Documentary', 'Adventure', 'Film-Noir', 'Animation', 'Romance', 'Drama', 'Western', 'Musical', 'Action', 'Mystery', 'War', "Children's"]
        self.genre2int = {genre: i for i, genre in enumerate(genres_list)} 

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        title = self.df.iloc[idx]['title']
        title_vec = onehot_vectorize(title, self.title2int)
        genres = self.df.iloc[idx]['genre']
        genres_vec = multihot_genres(genres, self.genre2int)
        # print('*****************************')
        # print(title)
        # print(title_vec)
        # print(genres)
        # print(genres_vec)
        # print('*****************************')
        return title_vec, genres_vec
    
    def merge_vocab(self, other):
        self.title2int.update(other.title2int)
        self.genre2int.update(other.genre2int)
        self.vocab_size = len(self.title2int)

In [6]:
def P_at_K(k, pred, truth):
    # print(pred)
    _, indices = torch.topk(pred, k=k)
    correct = 0
    for id in indices:
        if truth[id] > 0:
            correct += 1
    return correct / k

def AP_at_K(k, pred, truth):
    AP = 0
    for i in range(1, k+1):
        AP += P_at_K(i, pred, truth) 
    return AP / k

def MAP_at_K(k, pred_list, truth_list):
    MAP = 0
    for i in range(len(pred_list)):
        MAP += AP_at_K(k, pred_list[i], truth_list[i])
    return MAP / len(pred_list)


In [7]:
train = titleDataset(movies_train)
val = titleDataset(movies_val)
test = titleDataset(movies_test)
train.merge_vocab(val)
train.merge_vocab(test)
val.merge_vocab(train)
test.merge_vocab(train)
train_dataloader = DataLoader(train, batch_size=32, shuffle=True, num_workers=6)
val_dataloader = DataLoader(val, batch_size=32, shuffle=False, num_workers=6)
test_dataloader = DataLoader(test, batch_size=32, shuffle=False, num_workers=6)

In [89]:
class titleModel(pl.LightningModule):
    def __init__(self, core, input_size, hidden_size, num_layers=1, batch_first=True, bidirectional = False, device = 'cpu'):
        super(titleModel,self).__init__()
        self.dev = device # device variable was taken, so using dev instead :(
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        if core == 'RNN':
            self.core = nn.RNN(
                input_size = input_size, 
                hidden_size = hidden_size, 
                num_layers = num_layers, 
                batch_first = batch_first, 
                nonlinearity = 'relu', 
                bidirectional = bidirectional
            )
        else:
            self.core = nn.LSTM(
                input_size = input_size, 
                hidden_size = hidden_size, 
                num_layers = num_layers, 
                batch_first = batch_first, 
                bidirectional = bidirectional
            )
        linear_size = hidden_size
        if bidirectional:
            linear_size *= 2

        # self.linear64 = nn.Linear(linear_size, 64)
        # self.linear32 = nn.Linear(64, 32)
        # self.linear18 = nn.Linear(32, 18)
        self.linear = nn.Sequential(
            nn.Linear(linear_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 18),
            nn.ReLU()
        )

        # self.sigmoid = nn.Sigmoid()
        # self.softmax = nn.Softmax(dim=1)
        
    
    def forward(self,x):
        out, _ = self.core(x)#, h0)  
        # out = [batch_size, seq_len, hidden_size*bidirectional (vector size)]
        # => only take the last element (many to one RNN)
        out = out[:, -1, :]  
        out = self.linear(out)
        # out = self.sigmoid(out)
        # out = self.softmax(out)
        return out
    
    def training_step(self, train_batch, batch_idx):
        title, genres = train_batch
        # print("-----------------------------------", title.shape, genres.shape)
        # title_tensor = torch.tensor(title).to(self.dev)
        # genre_tensor = torch.tensor(genres).to(self.dev)
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)

        # print("***********************************", title_tensor)
        output = self.forward(title_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        title, genres = val_batch
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)
        
        output = self.forward(title_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        self.log('val_loss', loss)
        # print('val_loss', loss)
    
    def test_step(self, test_batch, batch_idx):
        title, genres = test_batch
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)
        
        output = self.forward(title_tensor)
        
        MAP = MAP_at_K(3, output, genre_tensor)
        print('MAP@3', MAP)
        self.log('MAP@3', MAP)
        return {'MAP@3': MAP}
    
    # def on_test_epoch_end(self, outputs):
    #     avg_MAP = torch.stack([x['MAP@3'] for x in outputs]).mean()
    #     print('avg_MAP@3', avg_MAP)
    #     self.log('avg_MAP@3', avg_MAP)
    #     return {'avg_MAP@3': avg_MAP}

    def predict_step(self, test_batch, batch_idx):
        title, genres = test_batch
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)
        
        output = self.forward(title_tensor)
        return output, genre_tensor


    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr=0.001)
    
    def cross_entropy_loss(self,logits,labels):
        return F.cross_entropy(logits,labels)

In [90]:
NUM_EPOCHS = 20
BATCH_SIZE = 32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(train_dataloader.dataset.vocab_size)
titleModel = titleModel(core = 'LSTM',input_size = train_dataloader.dataset.vocab_size, hidden_size = 128, num_layers=2, bidirectional= True,device = device)

trainer = pl.Trainer(max_epochs=NUM_EPOCHS, num_sanity_val_steps=0)
trainer.fit(titleModel, train_dataloader, val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | core   | LSTM       | 4.7 M 
1 | linear | Sequential | 19.1 K
--------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.865    Total estimated model params size (MB)


4071
Epoch 19: 100%|██████████| 78/78 [00:02<00:00, 30.75it/s, v_num=73]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 78/78 [00:02<00:00, 28.72it/s, v_num=73]


In [91]:
res = trainer.predict(titleModel, dataloaders=test_dataloader, ckpt_path="last")
pred = torch.cat([ep[0] for ep in res])
truth = torch.cat([ep[1] for ep in res])
print('MAP@1 ', MAP_at_K(1, pred, truth))
print('MAP@2 ', MAP_at_K(2, pred, truth))
print('MAP@3 ', MAP_at_K(3, pred, truth))
print('MAP@4 ', MAP_at_K(4, pred, truth))

/home/huy/miniconda3/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:186: .predict(ckpt_path="last") is set, but there is no last checkpoint available. No checkpoint will be loaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 25/25 [00:00<00:00, 89.24it/s]
MAP@1  0.33719433719433717
MAP@2  0.32496782496782495
MAP@3  0.3025883025883025
MAP@4  0.28582153582153647


In [96]:
for p in pred:
    print(p)

tensor([0.00000, 1.47501, 0.00000, 0.36525, 0.00000, 1.01148, 0.00000, 0.00000,
        0.00000, 0.00000, 0.13485, 1.31689, 0.00000, 0.00000, 1.42163, 0.00000,
        0.00000, 0.00000])
tensor([0.00000, 1.39997, 0.00000, 0.00000, 0.00000, 1.65418, 0.00000, 0.00000,
        0.00000, 0.00000, 1.00899, 2.22062, 0.00000, 0.00000, 1.29291, 0.00000,
        0.00000, 0.00000])
tensor([0.00000, 1.23837, 0.00000, 1.21736, 0.00000, 0.12938, 0.00000, 0.00000,
        0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 1.49696, 0.00000,
        0.01562, 0.00000])
tensor([0.00000, 1.47576, 0.00000, 0.65318, 0.00000, 0.62037, 0.00000, 0.00000,
        0.00000, 0.00000, 0.00000, 0.74178, 0.00000, 0.00000, 1.46586, 0.00000,
        0.00000, 0.00000])
tensor([0.00000, 1.24361, 0.00000, 0.00000, 0.00000, 3.29511, 0.00000, 0.00000,
        0.00000, 0.00000, 2.92094, 4.09458, 0.00000, 0.00000, 0.98222, 0.00000,
        0.00000, 0.00000])
tensor([0.00000, 1.24806, 0.00000, 1.19965, 0.00000, 0.13238, 0.0

In [100]:
torch.set_printoptions(precision=5, sci_mode=False)
# print(pred[:3])
pred_1 = torch.zeros(pred.shape)
for i in range(pred.shape[0]):
    # ids = torch.topk(pred[i], k=3).indices
    ids = [j*(pred[i][j] > 0.0) for j in range(len(pred[i]))]
    for id in ids:
        pred_1[i][id] = 1
# accuracy
acc = 0
for i in range(pred_1.shape[0]):
    for j in range(pred_1.shape[1]):
        acc += int(pred_1[i][j] == truth[i][j])
print('Accuracy: ', acc / (pred.shape[0] * pred.shape[1]))
print('F1 - micro:    ',f1_score(truth, pred_1, average='micro'))
print('F1 - macro:    ',f1_score(truth, pred_1, average='macro'))
print('F1 - weighted: ',f1_score(truth, pred_1, average='weighted'))
print('F1 - samples:  ',f1_score(truth, pred_1, average='samples'))

Accuracy:  0.7030602030602031
F1 - micro:     0.2914178467838253
F1 - macro:     0.11382277737271286
F1 - weighted:  0.31737571394033803
F1 - samples:   0.2829050757622186


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [102]:
from torchmetrics.functional.classification import multilabel_f1_score
from torchmetrics.classification import MultilabelF1Score

thres=0.0
print('mF1 - micro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='micro').tolist())
print('mF1 - macro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='macro').tolist())
print('mF1 - weighted: ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='weighted').tolist())

f1 = MultilabelF1Score(num_labels=18, threshold=thres, average='macro')
print('F1 :', f1(pred, truth).tolist())

mF1 - micro:     0.15585443377494812
mF1 - macro:     0.14092479646205902
mF1 - weighted:  0.3325090706348419
F1 : 0.14092479646205902


: 

In [95]:
pred_1[0]

tensor([1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.])