In [None]:
!pip install transformers datasets




In [None]:
#Iva Jorgusheska, 26.11.2024

# === Imports ===
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization, LayerNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.metrics import Precision, Recall
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_scheduler
from sklearn.metrics import precision_recall_fscore_support
import numpy as np


# Download the NLTK packages punkt, stopwords and wordnet.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Mount Google Drive; the CSV splits are read from it below.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): test_data is read from validation.csv, the same file as val_data.
# A later cell re-reads test_data from the real test file before the final predictions.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train.csv',
        '/content/drive/MyDrive/validation.csv',
        '/content/drive/MyDrive/validation.csv',
    )
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Mounted at /content/drive


In [None]:
# === Preprocessing ===
class MovieDataset(Dataset):
    """Torch dataset pairing each plot synopsis with its multi-label genre vector.

    Args:
        dataframe: Frame holding a ``plot_synopsis`` column and one column per
            entry of ``genre_columns``.
        tokenizer: Callable invoked once per item with the text and the keyword
            arguments shown in ``__getitem__``; it must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors that carry a leading
            batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
        genre_columns: Names of the label columns, in output order.
    """

    def __init__(self, dataframe, tokenizer, max_length, genre_columns):
        self.texts = dataframe['plot_synopsis'].tolist()
        self.labels = dataframe[genre_columns].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of synopses in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis removed)
            and ``labels`` as a float32 tensor built from the genre columns.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the batch axis added by return_tensors="pt";
            # a bare squeeze() would also collapse the sequence axis when
            # max_length == 1 and break batching.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float32),
        }

In [None]:
# === Tokenization ===
# Tokenizer, sequence length and label columns shared by every split.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_length = 512
genre_columns = [
    'comedy', 'cult', 'flashback', 'historical', 'murder',
    'revenge', 'romantic', 'scifi', 'violence',
]

# One MovieDataset per split, all built with the same tokenizer settings.
train_dataset, val_dataset, test_dataset = (
    MovieDataset(split_frame, tokenizer, max_length, genre_columns)
    for split_frame in (train_data, val_data, test_data)
)

# Only the training loader shuffles; the evaluation loaders keep row order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=16, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from torch.nn import BCEWithLogitsLoss

# === Dynamic Weighting Function ===
# === Dynamic Weighting with Smoothing ===
def update_class_weights(val_loader, predictions, true_labels, current_weights, alpha=0.7):
    """
    Blend the current class weights with new error-driven weights.

    For each class the summed absolute difference between ``predictions`` and
    ``true_labels`` is turned into a raw weight
    ``num_samples / (num_classes * max(error, 1))``, clamped to ``[0.5, 5.0]``,
    and then mixed with the previous weight:
    ``alpha * previous + (1 - alpha) * raw``.

    Args:
        val_loader: Unused; kept so existing callers keep working.
        predictions: Array of shape (num_samples, num_classes) with per-class
            scores (the training loop passes sigmoid probabilities).
        true_labels: Array of the same shape with the 0/1 targets.
        current_weights: Previous weights, indexable by class position
            (tensor, sequence or dict keyed by class index).
        alpha: Share of the previous weight kept in the blend.

    Returns:
        float32 tensor with one weight per class. It lives on the device of
        ``current_weights`` when that is a tensor; otherwise on CUDA when
        available, else on the CPU.
    """
    predictions = np.asarray(predictions, dtype=np.float64)
    true_labels = np.asarray(true_labels, dtype=np.float64)

    # Summed absolute error per class (not a hard misclassification count,
    # because predictions may be probabilities).
    class_errors = np.abs(predictions - true_labels).sum(axis=0)
    num_samples, num_classes = len(true_labels), len(class_errors)

    # max(err, 1) avoids division by zero; the clip keeps weights in a sane range.
    raw_weights = np.clip(
        num_samples / (num_classes * np.maximum(class_errors, 1.0)), 0.5, 5.0
    )

    # Follow the device of the incoming weights instead of hard-coding "cuda",
    # so the function also works on CPU-only machines.
    if torch.is_tensor(current_weights):
        device = current_weights.device
        previous = current_weights.detach().to(dtype=torch.float32)[:num_classes]
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        previous = torch.tensor(
            [float(current_weights[i]) for i in range(num_classes)],
            dtype=torch.float32,
            device=device,
        )

    raw_weights = torch.as_tensor(raw_weights, dtype=torch.float32, device=device)

    # Moving-average smoothing between the old and the new weights.
    return alpha * previous + (1 - alpha) * raw_weights


In [None]:
# === Load Pretrained Model ===
# Load bert-base-uncased with a classification head sized to the number of genres.
# The problem_type argument stays disabled, as before:
#   problem_type="multi_label_classification"
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(genre_columns)
)

# Put the model on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    model = model.to("cuda")
else:
    model = model.to("cpu")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# === Optimizer and Scheduler ===
from transformers import get_linear_schedule_with_warmup

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are passed explicitly to keep the values transformers.AdamW
# used by default, since torch.optim.AdamW's own defaults differ
# (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# The linear schedule decays over the whole run, so the step count must cover
# every optimizer step: batches per epoch times the number of epochs.
num_epochs = 3  # keep equal to the `epochs` value passed to the training call
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
# === Train Model with Dynamic Weighting ===
# === Train Model with Smooth Weight Updates ===
def train_model_with_smooth_weights(model, train_loader, val_loader, optimizer, scheduler, device, epochs=3, alpha=0.7):
    """
    Train a BERT model with smoothed, dynamically updated positive-class weights.

    Each epoch trains with ``BCEWithLogitsLoss(pos_weight=current_weights)``,
    then scores the validation set and feeds those scores to
    ``update_class_weights`` to obtain the weights for the next epoch.

    Args:
        model: Model whose forward pass returns an object with ``.logits`` and
            which exposes ``config.num_labels``.
        train_loader: Iterable of dict batches containing a ``labels`` entry;
            all other entries are forwarded to the model as keyword arguments.
        val_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device the model, batches and class weights are placed on.
        epochs: Number of passes over ``train_loader``.
        alpha: Smoothing factor handed to ``update_class_weights``.

    Returns:
        Tuple ``(model, val_predictions, val_true_labels)`` where the two arrays
        hold the sigmoid scores and targets from the last validation pass
        (empty arrays when ``epochs`` is 0).
    """
    model.to(device)

    # Every class starts with a neutral weight of 1.0, created on the requested
    # device (previously hard-coded to CUDA, which broke CPU runs).
    current_weights = torch.ones(model.config.num_labels, dtype=torch.float32, device=device)

    # Defined up front so the return statement is valid even when epochs == 0.
    val_predictions, val_true_labels = np.array([]), np.array([])

    for epoch in range(epochs):
        print(f"\nStarting epoch {epoch + 1}/{epochs}")

        # === Training ===
        model.train()
        # The weights only change between epochs, so build the loss once per epoch.
        loss_fn = BCEWithLogitsLoss(pos_weight=current_weights)
        total_loss = 0.0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Keep the labels out of the forward call: the weighted loss is
            # computed here, so the model's built-in loss would be wasted work.
            labels = batch.pop("labels")
            optimizer.zero_grad()

            # Forward pass
            outputs = model(**batch)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()
            scheduler.step()

        # max(..., 1) avoids a ZeroDivisionError for an empty training loader.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Training Loss: {avg_loss:.4f}")

        # === Validation ===
        print("Evaluating on validation set...")
        model.eval()
        val_predictions, val_true_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                probs = torch.sigmoid(outputs.logits)

                val_predictions.extend(probs.cpu().numpy())
                val_true_labels.extend(labels.cpu().numpy())

        val_predictions = np.array(val_predictions)
        val_true_labels = np.array(val_true_labels)

        # Update class weights dynamically; .to(device) guarantees the next
        # epoch's loss gets weights on the same device as the logits.
        current_weights = update_class_weights(
            val_loader, val_predictions, val_true_labels, current_weights=current_weights, alpha=alpha
        ).to(device)

    return model, val_predictions, val_true_labels

# Train the model
# Use the same device rule as the cell that loaded the model (CUDA when
# available, otherwise CPU) instead of hard-coding "cuda".
trained_model, val_preds, val_labels = train_model_with_smooth_weights(
    model, train_loader, val_loader, optimizer, lr_scheduler,
    device="cuda" if torch.cuda.is_available() else "cpu",
    epochs=3, alpha=0.7
)



Starting epoch 1/3
Training Loss: 0.4202
Evaluating on validation set...

Starting epoch 2/3
Training Loss: 0.3346
Evaluating on validation set...

Starting epoch 3/3
Training Loss: 0.2616
Evaluating on validation set...


In [None]:
from sklearn.metrics import precision_recall_curve

# # === Threshold Tuning ===
# def tune_thresholds(val_preds, val_labels):
#     """
#     Tune thresholds for each class to optimize F1 scores.
#     """
#     optimal_thresholds = []
#     for i in range(val_preds.shape[1]):
#         precision, recall, thresholds = precision_recall_curve(val_labels[:, i], val_preds[:, i])
#         f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
#         optimal_thresholds.append(thresholds[np.argmax(f1_scores)])
#     return optimal_thresholds

# optimal_thresholds = tune_thresholds(val_preds, val_labels)
# print(f"Optimal Thresholds: {optimal_thresholds}")



# === Threshold Tuning with Cap ===
def tune_thresholds(val_preds, val_labels, max_threshold=0.4):
    """
    Pick, per class, the decision threshold with the best F1 on validation data.

    Args:
        val_preds: Array of shape (num_samples, num_classes) with predicted scores.
        val_labels: Array of the same shape with the 0/1 targets.
        max_threshold: Upper cap applied to every chosen threshold (0.4 by
            default). Pass ``None`` to disable the cap.

    Returns:
        List with one Python float per class.
    """
    optimal_thresholds = []
    for class_idx in range(val_preds.shape[1]):
        precision, recall, thresholds = precision_recall_curve(
            val_labels[:, class_idx], val_preds[:, class_idx]
        )
        # precision/recall carry one extra trailing point that has no matching
        # threshold; drop it so the argmax index always maps into `thresholds`.
        precision, recall = precision[:-1], recall[:-1]
        # The 1e-9 guards against 0/0 when precision and recall are both zero.
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
        # float() gives a uniform element type regardless of the score dtype.
        best_threshold = float(thresholds[np.argmax(f1_scores)])
        if max_threshold is not None:
            best_threshold = min(best_threshold, max_threshold)
        optimal_thresholds.append(best_threshold)
    return optimal_thresholds

# Tune the per-class thresholds on the validation scores from training and show them.
optimal_thresholds = tune_thresholds(val_preds=val_preds, val_labels=val_labels)
print(f"Optimal Thresholds: {optimal_thresholds}")

Optimal Thresholds: [0.13311605, 0.25577983, 0.2306842, 0.26423338, 0.14677562, 0.22634448, 0.2848653, 0.25675604, 0.32961148]


In [None]:
# Release cached, currently unused GPU memory before saving.
torch.cuda.empty_cache()

# === Save Model ===
# The model and the tokenizer are written to the same Drive folder.
# NOTE(review): the tuned `optimal_thresholds` are not saved here, so they are
# lost when the session ends — confirm whether they should be persisted too.
model.save_pretrained("/content/drive/MyDrive/bert_movie_genre")
tokenizer.save_pretrained("/content/drive/MyDrive/bert_movie_genre")

('/content/drive/MyDrive/bert_movie_genre/tokenizer_config.json',
 '/content/drive/MyDrive/bert_movie_genre/special_tokens_map.json',
 '/content/drive/MyDrive/bert_movie_genre/vocab.txt',
 '/content/drive/MyDrive/bert_movie_genre/added_tokens.json')

In [None]:
import torch
import pandas as pd


model.eval()

# Run inference on whichever device the model lives on instead of assuming CUDA.
inference_device = next(model.parameters()).device
# The thresholds are the same for every batch, so build the tensor once.
threshold_tensor = torch.tensor(optimal_thresholds, dtype=torch.float32, device=inference_device)

predictions = []

#====Calculate the predictions for the validation set===========
# (at this point test_data / test_loader were built from validation.csv)
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(inference_device)
        attention_mask = batch['attention_mask'].to(inference_device)

        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs.logits)

        # A label is predicted when its probability exceeds that class's threshold.
        batch_preds = (probs > threshold_tensor).int()
        predictions.extend(batch_preds.cpu().numpy())

# Column order follows genre_columns, i.e. the order of the model's outputs.
predictions_df = pd.DataFrame(predictions, columns=list(genre_columns))

# ID goes first, followed by the genre columns.
predictions_df.insert(0, 'ID', test_data['ID'].values)

# Written without index and without a header row.
predictions_df.to_csv('/content/11114620_task2_results.csv', index=False,header=None)

In [None]:
class TestMovieDataset(Dataset):
    """Label-free torch dataset that tokenizes one text column of a frame.

    Args:
        dataframe: Frame holding the texts to classify.
        tokenizer: Callable invoked once per item with the text and the keyword
            arguments shown in ``__getitem__``; it must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors that carry a leading
            batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
        text_column: Positional index of the column that holds the text.
            Defaults to 2, the position used for the header-less test CSV.
    """

    def __init__(self, dataframe, tokenizer, max_length, text_column=2):
        self.texts = dataframe.iloc[:, text_column].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return ``input_ids`` and ``attention_mask``."""
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would also
            # drop the sequence axis when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


In [None]:
# Re-point test_data / test_dataset / test_loader at the header-less test file;
# from here on they no longer refer to the validation split.
test_data = pd.read_csv('/content/CW2-test-dataset.csv', header=None)

test_dataset = TestMovieDataset(dataframe=test_data, tokenizer=tokenizer, max_length=max_length)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)


In [None]:

model.eval()

# Run inference on whichever device the model lives on instead of assuming CUDA.
inference_device = next(model.parameters()).device
# The thresholds are the same for every batch, so build the tensor once.
threshold_tensor = torch.tensor(optimal_thresholds, dtype=torch.float32, device=inference_device)

predictions = []
#========calculate predictions for the test dataset
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(inference_device)
        attention_mask = batch['attention_mask'].to(inference_device)

        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs.logits)

        # Apply optimal thresholds to determine the final predictions
        batch_preds = (probs > threshold_tensor).int()
        predictions.extend(batch_preds.cpu().numpy())

# Convert predictions to DataFrame; column order follows genre_columns,
# i.e. the order of the model's outputs.
predictions_test = pd.DataFrame(predictions, columns=list(genre_columns))

# Add IDs from the test data as the first column
# (column 0 of the header-less test file is taken to be the ID).
predictions_test.insert(0, 'ID', test_data[0].values)

# Save the predictions without index and without a header row
predictions_test.to_csv('/content/test_task2_results.csv', index=False, header=None)
