In [1]:
#Iva Jorgusheska, 26/11/2024
#UID: 11114620

# === Import Libraries ===
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_recall_fscore_support
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM,  LayerNormalization, Bidirectional,BatchNormalization, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from keras.models import Sequential, load_model
import pickle
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential


# === Load Data ===
from google.colab import drive
drive.mount('/content/drive')
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
val_data = pd.read_csv('/content/drive/MyDrive/validation.csv')
test_data = pd.read_csv('/content/CW2-test-dataset.csv', header = None)
# Genre columns for multi-label classification
genre_columns = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# === Preprocess Text ===
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token.isalnum() and (token not in stop_words or token in ['not', 'no']))

train_data['processed_plot'] = train_data['plot_synopsis'].apply(preprocess_text)
val_data['processed_plot'] = val_data.iloc[:, 2].apply(preprocess_text)
#test_data['processed_plot'] = test_data['plot_synopsis'].apply(preprocess_text)
test_data['processed_plot'] = test_data.iloc[:, 2].apply(preprocess_text)
print(test_data['processed_plot'][:5])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


0    young girl bird aunt boyfriend waiting car vis...
1    oliver pease burgess meredith deceived bride m...
2    ann sarah polley mother two small daughters un...
3    ben jamie brett gabel arthur sam mraovich gay ...
4    new york city 16th precinct police detective d...
Name: processed_plot, dtype: object


In [3]:
print(val_data['processed_plot'][:5])

0    enchanting city verona italy renowned painter ...
1    walker works friend mal reese steal large amou...
2    film consists several thematically linked scen...
3    gentleman dignity careers love lives urban pro...
4    carmen brown beyoncé seductive aspiring actres...
Name: processed_plot, dtype: object


In [4]:
# === Tokenize and Pad Text ===
vocab_size = 30000
max_length = 600
embedding_dim = 100

tokenizer = Tokenizer(num_words=vocab_size + 1, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['processed_plot'])
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_data['processed_plot'])
val_sequences = tokenizer.texts_to_sequences(val_data['processed_plot'])
test_sequences = tokenizer.texts_to_sequences(test_data['processed_plot'])

padded_train_texts = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
padded_val_texts = pad_sequences(val_sequences, maxlen=max_length, padding='post', truncating='post')
padded_test_texts = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')


In [5]:
# Extract labels
train_labels = train_data[genre_columns].values
val_labels = val_data[genre_columns].values


In [6]:
# === Dynamic Class Weights ===
label_counts = train_labels.sum(axis=0)
total_samples = len(train_labels)
#calculate weights to make up for undersamled/oversampled classes
class_weights = {
    i: min(max(total_samples / (len(label_counts) * count), 0.5), 5.0)
    for i, count in enumerate(label_counts)
}
print(f"Class Weights: {class_weights}")


Class Weights: {0: 0.7269765803838704, 1: 0.5094083533839225, 2: 0.5, 3: 4.932497013142174, 4: 0.5, 5: 0.5536779990612217, 6: 0.5, 7: 4.497276688453159, 8: 0.5}


In [7]:
# === Learning Rate Scheduler ===
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-4, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)


In [8]:
# === Load Pre-trained Word Embeddings ===
embedding_dim = 100
embedding_index = {}

with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [9]:
# === Focal Loss Function ===
def focal_loss(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        pt = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
        return -tf.reduce_mean(alpha * (1 - pt) ** gamma * tf.math.log(pt + tf.keras.backend.epsilon()))
    return loss

In [10]:
# === Step 6: Build the Enhanced Model ===
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import f1_score, precision_score, recall_score
from tensorflow.keras.models import Model

from tensorflow.keras.layers import Attention, Input, GlobalAveragePooling1D, Concatenate

# === Build Model with Attention ===
from tensorflow.keras.layers import (
    Embedding, Bidirectional, LSTM, Attention, Dense, Dropout, BatchNormalization,
    LayerNormalization, GlobalAveragePooling1D, Input
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

def build_hybrid_model(vocab_size, embedding_dim, embedding_matrix, max_length, num_classes,
                       lstm_units=128, dropout_rate=0.2, learning_rate=1e-4, use_focal_loss=False):
    """
    Hybrid model combining stacked LSTMs with attention mechanism.

    Parameters:
    - vocab_size: Vocabulary size
    - embedding_dim: Dimensionality of embedding space
    - embedding_matrix: Pre-trained embedding weights
    - max_length: Max sequence length
    - num_classes: Number of output classes (multi-label)
    - lstm_units: Number of units in each LSTM layer
    - dropout_rate: Dropout rate for regularization
    - learning_rate: Learning rate for optimizer
    - use_focal_loss: If True, use focal loss; else use binary cross-entropy

    Returns:
    - Compiled Keras model
    """
    # === Input Layer ===
    input_layer = Input(shape=(max_length,))

    # === Embedding Layer ===
    embedding_layer = Embedding(
        input_dim=vocab_size + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=True  # Allow fine-tuning
    )(input_layer)

    # === Stacked Bidirectional LSTMs ===
    lstm_1 = Bidirectional(LSTM(lstm_units//2, return_sequences=True))(embedding_layer)
    lstm_1 = BatchNormalization()(lstm_1)
    lstm_1 = Dropout(dropout_rate)(lstm_1)

    lstm_2 = Bidirectional(LSTM(lstm_units//4, return_sequences=True))(lstm_1)
    lstm_2 = LayerNormalization()(lstm_2)
    lstm_2 = Dropout(dropout_rate)(lstm_2)

    # === Attention Mechanism ===
    attention_output = Attention()([lstm_2, lstm_2])
    pooled_output = GlobalAveragePooling1D()(attention_output)

    # === Fully Connected Layers ===
    dense = Dense(lstm_units // 2, activation='relu', kernel_regularizer='l2')(pooled_output)
    dropout = Dropout(dropout_rate)(dense)
    output_layer = Dense(num_classes, activation='sigmoid')(dropout)

    # === Compile Model ===
    model = Model(inputs=input_layer, outputs=output_layer)

    # Select loss function dynamically
    loss_function = (
        focal_loss(gamma=2.0) if use_focal_loss else "binary_crossentropy"
    )

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=loss_function,
        metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )

    return model


model = build_hybrid_model(vocab_size, embedding_dim, embedding_matrix, max_length, 9)




In [11]:
# === Train Model with Callbacks ===
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-4, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

history = model.fit(
    padded_train_texts,
    train_labels,
    validation_data=(padded_val_texts, val_labels),
    epochs=10,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[reduce_lr, early_stopping]
)


Epoch 1/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 96ms/step - accuracy: 0.0948 - loss: 0.9444 - precision: 0.2434 - recall: 0.1281 - val_accuracy: 0.2045 - val_loss: 0.9424 - val_precision: 0.7211 - val_recall: 0.0787 - learning_rate: 1.0000e-04
Epoch 2/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 90ms/step - accuracy: 0.2582 - loss: 0.7237 - precision: 0.6126 - recall: 0.1248 - val_accuracy: 0.3148 - val_loss: 0.7761 - val_precision: 0.7195 - val_recall: 0.1718 - learning_rate: 1.0000e-04
Epoch 3/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 90ms/step - accuracy: 0.2766 - loss: 0.6045 - precision: 0.6441 - recall: 0.1861 - val_accuracy: 0.3224 - val_loss: 0.6732 - val_precision: 0.6938 - val_recall: 0.2336 - learning_rate: 1.0000e-04
Epoch 4/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 90ms/step - accuracy: 0.2932 - loss: 0.5077 - precision: 0.6579 - recall: 0.2327 - val_accuracy: 0.3

In [12]:
from sklearn.metrics import precision_recall_curve
#Tune threshold for each class
def tune_individual_thresholds(y_true, y_pred_prob):
    optimal_thresholds = []
    for i in range(y_pred_prob.shape[1]):
        precision, recall, thresholds = precision_recall_curve(y_true[:, i], y_pred_prob[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
        best_threshold = thresholds[np.argmax(f1_scores)]
        optimal_thresholds.append(best_threshold)
    return np.clip(optimal_thresholds, 0.1, 0.9)  # Cap thresholds for stability

val_predictions_prob = model.predict(padded_val_texts)
optimal_thresholds = tune_individual_thresholds(val_labels, val_predictions_prob)

# Use these thresholds in prediction


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step


In [13]:
# === Predict and Save Results ===
def predict_and_save_results(model, val_padded, val_data, genre_columns, thresholds, output_file='validation_predictions.csv'):
    predictions_prob = model.predict(val_padded)
    predictions_binary = (predictions_prob > thresholds).astype(int)
    result = pd.DataFrame({'ID': val_data.iloc[:, 0]})
    result[genre_columns] = predictions_binary
    result.to_csv(output_file, index=False, header=False)
    print(f"Predictions saved to {output_file}")


# def predict_and_save_results_test(model, test_padded, test_data, thresholds, output_file):
#     predictions_prob = model.predict(test_padded)
#     predictions_binary = (predictions_prob > thresholds).astype(int)

#     # Create a DataFrame for the results
#     result = pd.DataFrame(test_data.iloc[:, 0])  # Keep only the ID column from input

#     # Add columns for predictions
#     for i in range(predictions_binary.shape[1]):
#         result[f'genre_{i+1}'] = predictions_binary[:, i]


#     # Save the DataFrame
#     result.to_csv(output_file, index=False, header=False)
#     print(f"Predictions saved to {output_file}")

predict_and_save_results(
    model,
    padded_val_texts,
    val_data,
    genre_columns,
    optimal_thresholds,
    output_file='/content/val_predictions.csv'
)

predict_and_save_results(
    model,
    padded_test_texts,
    test_data,
    genre_columns,
    optimal_thresholds,
    output_file='/content/drive/MyDrive/test_predictions.csv'
)
predict_and_save_results(
    model,
    padded_test_texts,
    test_data,
    genre_columns,
    optimal_thresholds,
    output_file='/content/test_predictions.csv'
)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step
Predictions saved to /content/val_predictions.csv
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step
Predictions saved to /content/drive/MyDrive/test_predictions.csv
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
Predictions saved to /content/test_predictions.csv
