In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import os, csv
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import re
from nltk.tokenize import wordpunct_tokenize

In [2]:

data_folder = '../dataset/'
with open(data_folder + 'genres.txt', 'r') as f:
    genres_list = [g.replace('\n','') for g in f.readlines()]
movies_train = pd.read_csv(data_folder + 'movies_train.csv')
movies_test = pd.read_csv(data_folder + 'movies_test.csv')
movies_val = pd.read_csv(data_folder + 'movies_val.csv')
# movies_train, movies_val = train_test_split(movies_train, test_size=0.1, random_state=42)
# movies_train.reset_index(drop=True, inplace=True)
# movies_test.reset_index(drop=True, inplace=True)
# movies_val.reset_index(drop=True, inplace=True)


In [3]:
TITLE_MAX_LEN = 15
pad_token = '<PAD>'
unk_token = '<UNK>'

def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = wordpunct_tokenize(text)
    return tokens

def create_vocab(dataset, column='title'):
    df = dataset.copy()
    titles = df[column].tolist()
    vocab = set()
    for title in titles:
        tokens = tokenize(title)
        vocab.update(tokens)
    vocab = list(vocab)
    vocab.append(pad_token)
    vocab.append(unk_token)
    return vocab

def onehot_vectorize(title, title2int):
    tokens = tokenize(title)
    tokens = tokens[:TITLE_MAX_LEN]
    while len(tokens) < TITLE_MAX_LEN:
        tokens.append(pad_token)
    title_vec = np.zeros((TITLE_MAX_LEN,len(title2int)), dtype=np.float32)
    for i, token in enumerate(tokens):
        if token in title2int:
            title_vec[i][title2int[token]] = 1
        else:
            title_vec[i][title2int[unk_token]] = 1
    return title_vec
    
def multihot_genres(genres,  genres_dict):
    genres = genres.strip('][').replace("'", "").split(', ')
    multi_hot = np.zeros(len(genres_dict) + 1)
    for genre in genres:
        if genre in genres_dict:
                multi_hot[genres_dict[genre]] = 1
    multi_hot[-1] = len(genres)
    return multi_hot

In [4]:
class titleDataset(Dataset):
    def __init__(self, df):
        self.df = df

        # title process
        vocab = create_vocab(df, column='title')
        self.vocab_size = len(vocab)
        self.title2int = {word: i for i, word in enumerate(vocab)}

        # genres process
        genres_list = ['Crime', 'Thriller', 'Fantasy', 'Horror', 'Sci-Fi', 'Comedy', 'Documentary', 'Adventure', 'Film-Noir', 'Animation', 'Romance', 'Drama', 'Western', 'Musical', 'Action', 'Mystery', 'War', "Children's"]
        self.genre2int = {genre: i for i, genre in enumerate(genres_list)} 

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        title = self.df.iloc[idx]['title']
        title_vec = onehot_vectorize(title, self.title2int)
        genres = self.df.iloc[idx]['genre']
        genres_vec = multihot_genres(genres, self.genre2int)
        # print('*****************************')
        # print(title)
        # print(title_vec)
        # print(genres)
        # print(genres_vec)
        # print('*****************************')
        return title_vec, genres_vec
    
    def merge_vocab(self, other):
        self.title2int.update(other.title2int)
        self.genre2int.update(other.genre2int)
        self.vocab_size = len(self.title2int)

In [5]:
def P_at_K(k, pred, truth):
    # print(pred)
    _, indices = torch.topk(pred, k=k)
    correct = 0
    for id in indices:
        if truth[id] > 0:
            correct += 1
    return correct / k

def AP_at_K(k, pred, truth):
    AP = 0
    for i in range(1, k+1):
        AP += P_at_K(i, pred, truth) 
    return AP / k

def MAP_at_K(k, pred_list, truth_list):
    MAP = 0
    for i in range(len(pred_list)):
        MAP += AP_at_K(k, pred_list[i], truth_list[i])
    return MAP / len(pred_list)


In [6]:
train = titleDataset(movies_train)
val = titleDataset(movies_val)
test = titleDataset(movies_test)
train.merge_vocab(val)
train.merge_vocab(test)
val.merge_vocab(train)
test.merge_vocab(train)
train_dataloader = DataLoader(train, batch_size=32, shuffle=True, num_workers=6)
val_dataloader = DataLoader(val, batch_size=32, shuffle=False, num_workers=6)
test_dataloader = DataLoader(test, batch_size=32, shuffle=False, num_workers=6)

In [12]:
class titleModel(pl.LightningModule):
    def __init__(self, core, input_size, hidden_size, num_layers=1, batch_first=True, bidirectional = False, device = 'cpu'):
        super(titleModel,self).__init__()
        self.dev = device # device variable was taken, so using dev instead :(
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        if core == 'RNN':
            self.core = nn.RNN(
                input_size = input_size, 
                hidden_size = hidden_size, 
                num_layers = num_layers, 
                batch_first = batch_first, 
                nonlinearity = 'relu', 
                bidirectional = bidirectional
            )
        else:
            self.core = nn.LSTM(
                input_size = input_size, 
                hidden_size = hidden_size, 
                num_layers = num_layers, 
                batch_first = batch_first, 
                bidirectional = bidirectional
            )
        linear_size = hidden_size
        if bidirectional:
            linear_size *= 2
        self.linear = nn.Linear(linear_size, 18)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
        
    
    def forward(self,x):
        out, _ = self.core(x)#, h0)  
        # out = [batch_size, seq_len, hidden_size*bidirectional (vector size)]
        # => only take the last element (many to one RNN)
        out = out[:, -1, :]  
        out = self.linear(out)
        # out = self.sigmoid(out)
        # out = self.softmax(out)
        return out
    
    def training_step(self, train_batch, batch_idx):
        title, genres = train_batch
        # print("-----------------------------------", title.shape, genres.shape)
        # title_tensor = torch.tensor(title).to(self.dev)
        # genre_tensor = torch.tensor(genres).to(self.dev)
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)

        # print("***********************************", title_tensor)
        output = self.forward(title_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        title, genres = val_batch
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)
        
        output = self.forward(title_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        self.log('val_loss', loss)
        # print('val_loss', loss)
    
    def test_step(self, test_batch, batch_idx):
        title, genres = test_batch
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)
        
        output = self.forward(title_tensor)
        
        MAP = MAP_at_K(3, output, genre_tensor)
        print('MAP@3', MAP)
        self.log('MAP@3', MAP)
        return {'MAP@3': MAP}
    
    # def on_test_epoch_end(self, outputs):
    #     avg_MAP = torch.stack([x['MAP@3'] for x in outputs]).mean()
    #     print('avg_MAP@3', avg_MAP)
    #     self.log('avg_MAP@3', avg_MAP)
    #     return {'avg_MAP@3': avg_MAP}

    def predict_step(self, test_batch, batch_idx):
        title, genres = test_batch
        title_tensor = title.clone().detach().to(self.dev)
        genre_tensor = genres.clone().detach().to(self.dev)
        
        output = self.forward(title_tensor)
        return output, genre_tensor


    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr=0.001)
    
    def cross_entropy_loss(self,logits,labels):
        return F.cross_entropy(logits,labels)

In [13]:
NUM_EPOCHS = 20
BATCH_SIZE = 32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(train_dataloader.dataset.vocab_size)
titleModel = titleModel(core = 'LSTM',input_size = train_dataloader.dataset.vocab_size, hidden_size = 128, num_layers=2, bidirectional= True,device = device)

trainer = pl.Trainer(max_epochs=NUM_EPOCHS, num_sanity_val_steps=0)
trainer.fit(titleModel, train_dataloader, val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | core    | LSTM    | 4.7 M 
1 | linear  | Linear  | 4.6 K 
2 | sigmoid | Sigmoid | 0     
3 | softmax | Softmax | 0     
------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.807    Total estimated model params size (MB)


4071
Epoch 19: 100%|██████████| 78/78 [00:02<00:00, 33.89it/s, v_num=58]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 78/78 [00:02<00:00, 31.82it/s, v_num=58]


In [10]:
# checkpoint_path = "../model/lightning_logs/version_39/checkpoints/epoch=0-step=78.ckpt"
checkpoint_path = "last"
trainer.test(titleModel, dataloaders=test_dataloader, ckpt_path=checkpoint_path)


/home/huy/miniconda3/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:186: .test(ckpt_path="last") is set, but there is no last checkpoint available. No checkpoint will be loaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0:   0%|          | 0/25 [00:00<?, ?it/s]MAP@3 0.2934027777777777
Testing DataLoader 0:   4%|▍         | 1/25 [00:00<00:01, 13.75it/s]MAP@3 0.2864583333333332
Testing DataLoader 0:   8%|▊         | 2/25 [00:00<00:01, 20.93it/s]MAP@3 0.3541666666666665
Testing DataLoader 0:  12%|█▏        | 3/25 [00:00<00:01, 19.48it/s]MAP@3 0.32812499999999994
Testing DataLoader 0:  16%|█▌        | 4/25 [00:00<00:00, 22.76it/s]MAP@3 0.2465277777777777
Testing DataLoader 0:  20%|██        | 5/25 [00:00<00:00, 25.15it/s]MAP@3 0.2135416666666666
Testing DataLoader 0:  24%|██▍       | 6/25 [00:00<00:00, 27.07it/s]MAP@3 0.23263888888888884
Testing DataLoader 0:  28%|██▊       | 7/25 [00:00<00:00, 28.48it/s]MAP@3 0.29861111111111094
Testing DataLoader 0:  32%|███▏      | 8/25 [00:00<00:00, 29.73it/s]MAP@3 0.29513888888888884
Testing DataLoader 0:  36%|███▌      | 9/25 [00:00<00:00, 30.66it/s]MAP@3 0.1961805555555555
Testing DataLoader 0:  40%|████      | 10/25 [00:00<00:00, 31.59it/s]MAP@3

[{'MAP@3': 0.2926497757434845}]

In [14]:
res = trainer.predict(titleModel, dataloaders=test_dataloader, ckpt_path="last")
pred = torch.cat([ep[0] for ep in res])
truth = torch.cat([ep[1] for ep in res])
print('MAP@1 ', MAP_at_K(1, pred, truth))
print('MAP@2 ', MAP_at_K(2, pred, truth))
print('MAP@3 ', MAP_at_K(3, pred, truth))
print('MAP@4 ', MAP_at_K(4, pred, truth))

/home/huy/miniconda3/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:186: .predict(ckpt_path="last") is set, but there is no last checkpoint available. No checkpoint will be loaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 25/25 [00:00<00:00, 71.01it/s]
MAP@1  0.33462033462033464
MAP@2  0.2985842985842986
MAP@3  0.27999427999428045
MAP@4  0.2616366366366373


In [None]:
for p in pred:
    print(p)

In [15]:
torch.set_printoptions(precision=5, sci_mode=False)
print(pred[:3])
pred_1 = torch.zeros(pred.shape)
for i in range(pred.shape[0]):
    ids = torch.topk(pred[i], k=3).indices
    # ids = [j*(pred[i][j] > 0.5) for j in range(len(pred[i]))]
    for id in ids:
        pred_1[i][id] = 1
print(pred_1[:3])
print(truth[:3])
print('F1 - micro:    ',f1_score(truth, pred_1, average='micro'))
print('F1 - macro:    ',f1_score(truth, pred_1, average='macro'))
print('F1 - weighted: ',f1_score(truth, pred_1, average='weighted'))
print('F1 - samples:  ',f1_score(truth, pred_1, average='samples'))

tensor([[-3.13211,  1.15558, -0.63910,  1.28267,  1.19266,  1.32292,  1.39921,
          0.84996, -1.10016,  1.36085, -1.07900, -2.36797,  0.04489,  0.66294,
          0.58657, -2.41772, -1.56134, -6.19012],
        [-3.75969,  1.24626, -0.32128,  1.62122,  1.53668,  0.86352,  1.16034,
          1.28893, -1.95686,  1.00306, -1.17818, -2.74277, -0.85065, -0.03989,
          1.07012, -3.07357, -0.97900, -6.96506],
        [-4.79051,  0.38106,  1.56849,  2.92475,  1.93658, -1.07158, -1.52262,
          2.62998, -3.97206, -0.67645, -0.89711, -3.19688, -3.67912, -2.83179,
          2.61953, -3.98802,  1.36768, -7.33003]])
tensor([[0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]])
tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [20]:
from torchmetrics.functional.classification import multilabel_f1_score 
thres=0.5
print('mF1 - micro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='micro'))
print('mF1 - macro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='macro'))
print('mF1 - weighted: ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='weighted'))

mF1 - micro:     tensor(0.25838)
mF1 - macro:     tensor(0.15315)
mF1 - weighted:  tensor(0.34076)


In [22]:
for k in range(1, 4):
    pred_1 = torch.zeros(pred.shape)
    for i in range(pred.shape[0]):
        ids = torch.topk(pred[i], k=k).indices
        for id in ids:
            pred_1[i][id] = 1

    print('mF1@{} - macro:    '.format(k), multilabel_f1_score(pred_1, truth, num_labels=18, threshold=thres, average='macro'))

mF1@1 - macro:     tensor(0.07092)
mF1@2 - macro:     tensor(0.11685)
mF1@3 - macro:     tensor(0.12911)
