In [12]:
#IT SHOULD BE RUN IN KAGGLE

!pip install transformers datasets
!pip install transformers torch scikit-learn
# Import libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import numpy as np

from transformers import RobertaTokenizer
from sklearn.utils.class_weight import compute_class_weight


import torch.nn as nn
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW, get_scheduler
from sklearn.metrics import precision_recall_fscore_support
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm




In [13]:
# Load dataset
file_path = '../input/dataset/CW2-training-dataset.csv'  # Adjust the path as needed
data = pd.read_csv(file_path)

print("Dataset Loaded:")
print(data.head())

Dataset Loaded:
                                     ID                        title  \
0  8f5203de-b2f8-4c0c-b0c1-835ba92422e9                   Si wang ta   
1  6416fe15-6f8a-41d4-8a78-3e8f120781c7          Shattered Vengeance   
2  4979fe9a-0518-41cc-b85f-f364c91053ca                 L'esorciccio   
3  b672850b-a1d9-44ed-9cff-025ee8b61e6f  Serendipity Through Seasons   
4  b4d8e8cc-a53e-48f8-be6a-6432b928a56d                The Liability   

                                       plot_synopsis  comedy  cult  flashback  \
0  After a recent amount of challenges, Billy Lo ...       0     0          0   
1  In the crime-ridden city of Tremont, renowned ...       0     0          0   
2  Lankester Merrin is a veteran Catholic priest ...       0     1          0   
3  "Serendipity Through Seasons" is a heartwarmin...       0     0          0   
4  Young and naive 19-year-old slacker, Adam (Jac...       0     0          1   

   historical  murder  revenge  romantic  scifi  violence  
0   

In [14]:
from torch.utils.data import TensorDataset

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize plot_synopsis
def tokenize_synopses(data, tokenizer, max_length=512):
    """
    Tokenizes the plot_synopsis column or the last column dynamically 
    if plot_synopsis does not exist.
    """
    # Dynamically detect the text column
    text_column = 'plot_synopsis' if 'plot_synopsis' in data.columns else data.columns[-1]
    
    tokenized = tokenizer(
        data[text_column].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return tokenized

# Apply tokenization
max_seq_length = 256  # Adjust as needed
tokenized_data = tokenize_synopses(data, tokenizer, max_length=max_seq_length)

# Check tokenized output
print("Tokenized Data Shapes:", tokenized_data['input_ids'].shape, tokenized_data['attention_mask'].shape)

# Check if the dataset is training/validation or test
if all(col in data.columns for col in ["comedy", "cult", "flashback", "historical", "murder", "revenge", "romantic", "scifi", "violence"]):
    # Training or Validation dataset
    labels = torch.tensor(data[["comedy", "cult", "flashback", "historical", "murder", "revenge", "romantic", "scifi", "violence"]].values, dtype=torch.float32)
    print("Labels shape:", labels.shape)

    # Compute class weights for handling imbalance
    def compute_class_weights(data):
        """
        Computes class weights for multi-label data.
        """
        labels = data[["comedy", "cult", "flashback", "historical", "murder", "revenge", "romantic", "scifi", "violence"]].values
        weights = []
        for i in range(labels.shape[1]):
            class_weight = compute_class_weight(
                class_weight='balanced',
                classes=[0, 1],
                y=labels[:, i]
            )
            weights.append(class_weight[1])  # Positive class weight
        return torch.tensor(weights, dtype=torch.float32)

    # Compute weights
    class_weights = compute_class_weights(data)
    print("Class Weights:", class_weights)

    # Create TensorDataset for PyTorch DataLoader
    dataset = TensorDataset(
        tokenized_data['input_ids'], 
        tokenized_data['attention_mask'], 
        labels
    )

    # Define DataLoader
    batch_size = 16  # Adjust based on memory availability
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Check DataLoader
    for batch in dataloader:
        input_ids, attention_mask, batch_labels = batch
        print("Input IDs shape:", input_ids.shape)
        print("Attention Mask shape:", attention_mask.shape)
        print("Labels shape:", batch_labels.shape)
        break

else:
    # Test dataset (no labels)
    labels = None
    print("No labels found. Skipping DataLoader creation for test dataset.")




Tokenized Data Shapes: torch.Size([8257, 256]) torch.Size([8257, 256])
Labels shape: torch.Size([8257, 9])
Class Weights: tensor([ 3.2714,  2.2923,  2.0705, 22.1962,  1.0272,  2.4916,  2.0581, 20.2377,
         1.3474])
Input IDs shape: torch.Size([16, 256])
Attention Mask shape: torch.Size([16, 256])
Labels shape: torch.Size([16, 9])


In [15]:
# Determine the number of labels dynamically
num_labels = batch_labels.shape[1]  # This will ensure num_labels matches the dataset

# Define advanced model with dropout
class AdvancedRobertaClassifier(nn.Module):
    def __init__(self, model_name="roberta-base", num_labels=num_labels, dropout_rate=0.3):
        super(AdvancedRobertaClassifier, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels, problem_type="multi_label_classification"
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.dropout(outputs.logits)
        return logits

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model with the correct number of labels
model = AdvancedRobertaClassifier(model_name="roberta-base", num_labels=num_labels, dropout_rate=0.3)
model.to(device)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)



# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
epochs = 5
num_training_steps = len(dataloader) * epochs
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Define loss function
criterion = nn.BCEWithLogitsLoss()

# Training loop with mixed precision and gradient accumulation
def train_model(model, dataloader, optimizer, scheduler, criterion, epochs=5, accumulation_steps=4):
    """
    Trains the model using mixed precision and gradient accumulation.
    """
    scaler = GradScaler()
    model.train()

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0
        optimizer.zero_grad()

        for i, batch in enumerate(tqdm(dataloader)):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            # Mixed precision forward pass
            with autocast():
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                total_loss += loss.item()

            # Backward pass
            scaler.scale(loss).backward()

            # Gradient accumulation
            if (i + 1) % accumulation_steps == 0 or (i + 1) == len(dataloader):
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1} Average Loss: {avg_loss}")

# Train the model
train_model(model, dataloader, optimizer, scheduler, criterion, epochs=5, accumulation_steps=4)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1/5


  with autocast():
100%|██████████| 517/517 [01:56<00:00,  4.43it/s]


Epoch 1 Average Loss: 0.5302232068673324
Epoch 2/5


100%|██████████| 517/517 [01:54<00:00,  4.51it/s]


Epoch 2 Average Loss: 0.48521665995310076
Epoch 3/5


100%|██████████| 517/517 [01:55<00:00,  4.49it/s]


Epoch 3 Average Loss: 0.4704490538607252
Epoch 4/5


100%|██████████| 517/517 [01:55<00:00,  4.48it/s]


Epoch 4 Average Loss: 0.4546010616438993
Epoch 5/5


100%|██████████| 517/517 [01:55<00:00,  4.49it/s]

Epoch 5 Average Loss: 0.44046224287676855





In [16]:
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params

# Count and print parameters
total_params, trainable_params = count_parameters(model)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")


Total parameters: 124652553
Trainable parameters: 124652553


In [17]:
validation_file_path = "../input/dataset/CW2-test-dataset.csv"  # Update path if needed
data = pd.read_csv(validation_file_path)

# Check if it's validation or test dataset
is_validation = all(col in data.columns for col in ["comedy", "cult", "flashback", "historical", "murder", "revenge", "romantic", "scifi", "violence"])

if is_validation:
    print("Validation dataset detected.")
else:
    print("Test dataset detected.")

# Tokenize the plot_synopsis column
text_column = 'plot_synopsis' if 'plot_synopsis' in data.columns else data.columns[-1]
tokenized_data = tokenize_synopses(data, tokenizer, max_length=max_seq_length)

# Move tokenized inputs to GPU
input_ids_val = tokenized_data['input_ids'].to(device)
attention_mask_val = tokenized_data['attention_mask'].to(device)

# Predict on the dataset and save results
model.eval()
results = []
true_labels = []

with torch.no_grad():
    for i in tqdm(range(len(input_ids_val))):
        input_ids = input_ids_val[i].unsqueeze(0)
        attention_mask = attention_mask_val[i].unsqueeze(0)

        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = torch.sigmoid(logits).cpu().numpy()

        # Convert logits to binary predictions (threshold = 0.5)
        predictions = (logits > 0.5).astype(int).flatten().tolist()

        # Append results with movie ID and predictions
        movie_id = data.iloc[i, 0]  # Assuming the first column in the dataset is "ID"
        results.append([movie_id] + predictions)

        # If validation dataset, collect true labels for evaluation
        if is_validation:
            true_labels.append(data.iloc[i, 1:].values.tolist())  # Assuming genre columns are after the ID column

# Save predictions to CSV
columns = ["ID", "comedy", "cult", "flashback", "historical", "murder", "revenge", "romantic", "scifi", "violence"]
result_df = pd.DataFrame(results, columns=columns)
result_df.to_csv("../working/11028972_task2_results.csv", index=False)
print("Predictions saved to 11028972_task2_results.csv")

# If validation dataset, compute metrics
if is_validation:
    from sklearn.metrics import precision_recall_fscore_support

    # Ensure true labels are numeric
    true_labels = data[["comedy", "cult", "flashback", "historical", "murder", "revenge", "romantic", "scifi", "violence"]].values
    true_labels = true_labels.astype(float)  # Convert to float if necessary

    # Convert predictions to numpy format, excluding the ID column
    y_pred = np.array([row[1:] for row in results], dtype=float)

    # Compute metrics
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, y_pred, average='weighted')
    print(f"Validation Metrics - Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


Test dataset detected.


100%|██████████| 1204/1204 [00:23<00:00, 50.49it/s]

Predictions saved to 11028972_task2_results.csv



