In [1]:
!pip install -U bitsandbytes transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [2]:
from transformers import DebertaTokenizer
from torch.utils.data import Dataset
from transformers import DebertaModel
import torch.nn as nn
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch

In [3]:
class MovieGenreDataset(Dataset):
    """Map-style dataset yielding tokenized texts with multi-hot label vectors.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Per-sample label collections, handed to the binarizer's
            ``fit_transform`` (training) or ``transform`` (otherwise).
        tokenizer: Callable invoked once per sample in ``__getitem__``; it
            must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        mlb: Already-fitted binarizer. Required when ``is_train`` is False;
            not used when ``is_train`` is True.
        is_train: When True a new ``MultiLabelBinarizer`` is created and
            fitted on ``labels``.
        max_length: Length forwarded to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``is_train`` is False and ``mlb`` is None.
    """

    def __init__(self, texts, labels, tokenizer, mlb=None, is_train=False, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Non-training splits must reuse a binarizer fitted elsewhere.
        if not is_train and mlb is None:
            raise ValueError("Must provide MLBinarizer for val/test sets")

        if is_train:
            self.mlb = MultiLabelBinarizer()
            encoded_labels = self.mlb.fit_transform(labels)
        else:
            self.mlb = mlb
            encoded_labels = self.mlb.transform(labels)

        # Stored as float so the labels can feed BCE-style losses directly.
        self.labels = torch.FloatTensor(encoded_labels)

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label vector.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'labels'`` (row ``idx``
            of the label tensor).
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        tokens = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        # The tokenizer is asked for batched 'pt' tensors; flatten drops the
        # leading batch dimension so the DataLoader can stack samples.
        sample = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = self.labels[idx]
        return sample

    def get_mlb(self):
        """Return the binarizer used to encode this dataset's labels."""
        return self.mlb

In [4]:
class DebertaForMultiLabel(nn.Module):
    """DeBERTa encoder followed by dropout and a linear multi-label head.

    Args:
        num_labels: Output width of the linear head (one logit per label).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.deberta = DebertaModel.from_pretrained('microsoft/deberta-base')
        self.dropout = nn.Dropout(0.1)
        hidden_dim = self.deberta.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits for each input sequence.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Tensor of logits produced by the linear head from the vector at
            sequence position 0 (the [CLS] token).
        """
        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output is the per-token hidden states;
        # position 0 of each sequence serves as the pooled representation.
        cls_vector = encoder_out[0][:, 0, :]
        return self.classifier(self.dropout(cls_vector))

In [7]:
# Load the pre-computed features and genre labels.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('features/data.pkl', 'rb') as data_file:
    movies_clean = pickle.load(data_file)

with open('features/genres.pkl', 'rb') as genres_file:
    genres = pickle.load(genres_file)

overviews = movies_clean["cleaned_overview"].tolist()

# Split 1: hold out 20% of all samples as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    overviews,
    genres,
    test_size=0.2,
    random_state=42,
)

# Split 2: carve validation out of the remaining 80%
# (0.8 * 0.8 = 64% train, 0.8 * 0.2 = 16% validation overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    random_state=42,
)

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# The training dataset fits the label binarizer; validation and test reuse it
# so all three splits share one label-to-column mapping.
train_dataset = MovieGenreDataset(X_train, y_train, tokenizer, is_train=True)
mlb = train_dataset.get_mlb()
val_dataset = MovieGenreDataset(X_val, y_val, tokenizer, mlb=mlb)
test_dataset = MovieGenreDataset(X_test, y_test, tokenizer, mlb=mlb)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Testing samples: {len(test_dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Training samples: 6380
Validation samples: 1596
Testing samples: 1995


In [8]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import DebertaTokenizer
from sklearn.metrics import f1_score
import numpy as np

# Seed torch's RNGs so the classifier-head initialisation, the shuffle order
# of the training loader and dropout are repeatable between runs (same value
# as the random_state used for the data splits). Some GPU kernels may still be
# non-deterministic.
SEED = 42
torch.manual_seed(SEED)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)
test_loader = DataLoader(test_dataset, batch_size=1)
mlb = train_dataset.mlb

# Initialize model: one output logit per class known to the binarizer
model = DebertaForMultiLabel(num_labels=len(mlb.classes_))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training parameters
num_epochs = 5
optimizer = AdamW(model.parameters(), lr=2e-5)
# BCEWithLogitsLoss applies the sigmoid internally, so the model emits raw logits.
criterion = nn.BCEWithLogitsLoss()


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and collect loss, predictions and labels.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with 'input_ids', 'attention_mask' and
            'labels' tensors.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        optimizer: If given, the model is put in train mode and updated after
            every batch; if None, the model is put in eval mode and gradients
            are disabled.

    Returns:
        Tuple ``(summed_loss, preds, labels)`` where ``summed_loss`` is the
        sum of the per-batch losses and ``preds`` / ``labels`` are lists of
        per-sample NumPy arrays (predictions thresholded at 0.5).
    """
    is_training = optimizer is not None
    model.train(is_training)

    summed_loss = 0.0
    epoch_preds = []
    epoch_labels = []

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)  # kept on device for the loss

            if is_training:
                optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            summed_loss += loss.item()

            # Threshold the per-label probabilities and move everything to
            # CPU so scikit-learn can compute F1 afterwards.
            preds = torch.sigmoid(logits) > 0.5
            epoch_preds.extend(preds.cpu().numpy())
            epoch_labels.extend(labels.cpu().numpy())

    return summed_loss, epoch_preds, epoch_labels


# Training loop
for epoch in range(num_epochs):
    train_loss, train_preds, train_labels = _run_epoch(
        model, train_loader, criterion, device, optimizer=optimizer
    )
    train_f1 = f1_score(train_labels, train_preds, average='micro')

    # Validation: same pass without an optimizer, so no weights are updated
    val_loss, val_preds, val_labels = _run_epoch(model, val_loader, criterion, device)
    val_f1 = f1_score(val_labels, val_preds, average='micro')

    print(f'Epoch {epoch+1}:')
    print(f'Average Train Loss: {train_loss/len(train_loader):.4f}')
    print(f'Train F1: {train_f1:.4f}')
    print(f'Average Validation Loss: {val_loss/len(val_loader):.4f}')
    print(f'Validation F1: {val_f1:.4f}')

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

Epoch 1:
Average Train Loss: 0.2938
Train F1: 0.4680
Average Validation Loss: 0.2656
Validation F1: 0.5382
Epoch 2:
Average Train Loss: 0.2370
Train F1: 0.6170
Average Validation Loss: 0.2482
Validation F1: 0.5989
Epoch 3:
Average Train Loss: 0.1944
Train F1: 0.7048
Average Validation Loss: 0.2606
Validation F1: 0.6015
Epoch 4:
Average Train Loss: 0.1576
Train F1: 0.7729
Average Validation Loss: 0.2749
Validation F1: 0.6141
Epoch 5:
Average Train Loss: 0.1252
Train F1: 0.8292
Average Validation Loss: 0.2961
Validation F1: 0.6185
