In [None]:
!pip install -q transformers datasets

In [None]:
import os
import copy
import tqdm
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sns
from torch.optim import Adam
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import huggingface_hub
huggingface_hub.notebook_login()

In [None]:
# Login to the Hub from the notebook to access the dataset
from datasets import load_dataset
dataset = load_dataset("datadrivenscience/movie-genre-prediction")

In [None]:
example = dataset['train'][6]
example

{'id': 53777,
 'movie_name': 'Candid',
 'synopsis': 'A video voyeur stalks women in the city with a digital camera until he crosses paths with beautiful model who harbors a dark secret; she is a serial killer.',
 'genre': 'horror'}

In [None]:
import pandas as pd
test_df = pd.DataFrame( dataset['test'] )
train_df = pd.DataFrame( dataset['train'] )


train_df.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [None]:
possible_labels = train_df.genre.unique()

label_dict = {
    possible_label: index
    for index, possible_label in enumerate(possible_labels)
}
label_dict

{'fantasy': 0,
 'horror': 1,
 'family': 2,
 'scifi': 3,
 'action': 4,
 'crime': 5,
 'adventure': 6,
 'mystery': 7,
 'romance': 8,
 'thriller': 9}

In [None]:
train_df['label'] = train_df.genre.replace(label_dict)
test_df['label'] = test_df.genre.replace(label_dict)

## Dataset preparation

In [None]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
from torch.utils.data import Dataset, DataLoader

class MGPDataset(Dataset):

    def __init__(self, synopses,movie_names,labels, seq_len, bert_variant = "roberta-base"):
        self.synopses = synopses
        self.movie_names = movie_names
        self.labels = labels
        self.seq_len = seq_len
        self.tokenizer = tokenizer.from_pretrained(bert_variant)

    def __len__(self):
        """
        Returns the length of the dataset i.e. the number of movies present in the dataset
        """
        return len(self.synopses)

    def __getitem__(self, idx):
        token_out = self.tokenizer(self.movie_names[idx],self.synopses[idx], max_length=self.seq_len, padding="max_length", truncation = False, return_tensors="pt")
        input_ids = token_out["input_ids"]
        mask = token_out["attention_mask"]
        label = self.labels[idx]
        return input_ids.squeeze(0), mask.squeeze(0), label

In [None]:

train_df, val_df= train_test_split(train_df,test_size=0.15,random_state=42,stratify=train_df.label.values)

In [None]:
seq_len = 128
batch_size = 32

train_synopses,train_movie_names,train_labels = train_df["synopsis"].values, train_df["movie_name"].values, train_df["label"].values
val_synopses,val_movie_names,val_labels = val_df["synopsis"].values, val_df["movie_name"].values, val_df["label"].values
test_synopses,test_movie_names,test_labels = test_df["synopsis"].values, test_df["movie_name"].values, test_df["label"].values

train_dataset = MGPDataset(train_synopses, train_movie_names, train_labels, seq_len=seq_len)
val_dataset = MGPDataset(val_synopses, val_movie_names, val_labels, seq_len=seq_len)
test_dataset = MGPDataset(test_synopses, test_movie_names, test_labels, seq_len=seq_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Training

In [None]:
from transformers import RobertaModel
model = RobertaModel.from_pretrained('roberta-base')
model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [None]:
movie_name = 'sound of music'
synopsis = "a high-spirited musical that exquisitely blends music , and high drama ."
tokenizer_output = tokenizer(synopsis,movie_name,return_tensors="pt")
input_ids, attn_mask = tokenizer_output["input_ids"], tokenizer_output["attention_mask"]

output = model(input_ids, attention_mask = attn_mask)
last_hidden_state = output.last_hidden_state
print(f"input_ids shape: {input_ids.shape}")
print(f"last_hidden_state shape: {last_hidden_state.shape}")

input_ids shape: torch.Size([1, 24])
last_hidden_state shape: torch.Size([1, 24, 768])


In [None]:
class RobertaClassifierModel(nn.Module):

    def __init__(self, d_hidden = 768, bert_variant = "roberta-base"):
        super(RobertaClassifierModel, self).__init__()
        self.bert_layer = RobertaModel.from_pretrained(bert_variant)
        self.output_layer = nn.Linear(d_hidden, 10)
        self.log_softmax_layer = nn.LogSoftmax(dim=-1)

    def forward(self, input_ids, attn_mask):
        output = self.bert_layer(input_ids,attn_mask)
        output = self.output_layer(output.pooler_output)
        output = self.log_softmax_layer(output)
        return output

In [None]:
def evaluate(model, test_dataloader, threshold = 0.5, device = "cpu"):
    model.eval()
    model = model.to(device)
    accuracy = 0

    with torch.no_grad():
        for test_batch in test_dataloader:
            features,masks,labels = test_batch
            features = features.long().to(device)
            masks = masks.to(device)
            labels = labels.long().to(device)

            pred_logprobs = model(features,masks)
            pred_logprobs = pred_logprobs.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            pred_labels = np.argmax(pred_logprobs,axis=1)
            batch_accuracy = (pred_labels == labels).mean()

            accuracy += batch_accuracy

        accuracy = accuracy / len(test_dataloader)

    return accuracy

In [None]:
def train(model, train_dataloader, val_dataloader,
          lr = 1e-5, num_epochs = 3,
          device = "cuda"):
    epoch_loss = 0
    model = model.to(device)

    best_val_accuracy = float("-inf")
    best_model = None

    loss_fn = nn.NLLLoss()
    optimizer = Adam(model.parameters(), lr = lr)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0

        for train_batch in tqdm.tqdm(train_dataloader):
          # Zero out any gradients stored in the previous steps
            optimizer.zero_grad()

          # Unwrap the batch to get features and labels
            features,masks,labels = train_batch
            features = features.long()
            masks = masks
            labels = labels.long()
            features = features.to(device)
            masks = masks.to(device)
            labels = labels.to(device)
            preds = model(features,masks)
            loss = loss_fn(preds,labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss = epoch_loss / len(train_dataloader)
        val_accuracy = evaluate(model, val_dataloader, threshold = 0.5, device = device)
        print(f'Validation Accuracy: {val_accuracy}')
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model = copy.deepcopy(model)

        print(f"Epoch {epoch} completed | Average Training Loss: {epoch_loss} | Validation Accuracy: {val_accuracy}")

    return best_model, best_val_accuracy

In [None]:
torch.manual_seed(42)
model = RobertaClassifierModel()
best_model, best_val_acc = train(model, train_loader, val_loader, num_epochs = 3, device = "cuda")
print(f"Best Validation Accuracy: {best_val_acc}")

Training on 100 data points for sanity check


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1435/1435 [15:1

Validation Accuracy: 0.40994094488188976
Epoch 0 completed | Average Training Loss: 1.7597119994279815 | Validation Accuracy: 0.40994094488188976


100%|██████████| 1435/1435 [15:16<00:00,  1.57it/s]


Validation Accuracy: 0.41486220472440943
Epoch 1 completed | Average Training Loss: 1.569253020253331 | Validation Accuracy: 0.41486220472440943


100%|██████████| 1435/1435 [15:15<00:00,  1.57it/s]


Validation Accuracy: 0.41227854330708663
Epoch 2 completed | Average Training Loss: 1.4685330064869924 | Validation Accuracy: 0.41227854330708663
Best Validation Accuracy: 0.41486220472440943


In [None]:
best_model

RobertaClassifierModel(
  (bert_layer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [None]:
def predict_text(model, test_dataloader, device = "cuda"):

    model = model.to(device)
    model.eval()
    results  = []
    for test_batch in tqdm.tqdm(test_dataloader):
        features,masks,labels = test_batch
        features = features.long().to(device)
        masks = masks.to(device)
        pred_logprobs = model(features,masks)
        pred_logprobs = pred_logprobs.detach().cpu().numpy()
        pred_labels = np.argmax(pred_logprobs,axis=1)
        results.extend(pred_labels)
    return results