In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist

# Download necessary NLTK models and data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

In [3]:
# NLTK, gensim,Hugging Face, PyTorch, Tensorflow/Keras and scikit-learn

import numpy as np
import pandas as pd # for reading csv files
import string
from gensim.models import Word2Vec
# from sklearn.metrics.pairwise import cosine_similarity as cos_similarity, cosine_distances

In [4]:
from google.colab import drive
drive.mount('/content/drive')
# connecting Google Drive for reading file of the training data

Mounted at /content/drive


### Text preprocessing

In [5]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Mapping between Treebank and WordNet part of speech tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN



In [6]:
# To remove punctuations and other none useful tokens
punc_to_empty_table = str.maketrans(
    "",
    "",
    # characters to be removed
    '''
    !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~,
    '''
)

In [7]:
# Preprocess the synopsis
def preprocess_synopsis(synopsis):
    # Tokenize the synopsis
    tokens = word_tokenize(synopsis.lower())

    # POS tagging
    pos_tags = nltk.pos_tag(tokens)

    # Lemmatize tokens using POS tags
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]

    # Remove punctuation and contractions
    tokens_no_punct = [word.translate(punc_to_empty_table) for word in tokens if word != "" and word not in ["'m", "'re", "'ve", "n't"]]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens_no_stop = [word for word in tokens_no_punct if word.lower() not in stop_words]

    # Calculate frequency distribution
    freq_dist = FreqDist(tokens_no_stop)

    # Remove least occurred tokens with length more than 17
    tokens_filtered = [word for word in tokens_no_stop if not (len(word) > 17 and freq_dist[word] == 1)]

    return tokens_filtered



In [8]:
df = pd.read_csv('./data/Training-dataset.csv')
df.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0


In [9]:
# Apply preprocessing to each plot synopsis
df['processed_synopsis'] = df['plot_synopsis'].apply(preprocess_synopsis)

In [10]:
# Show the processed synopsis for the first row as an example
df['processed_synopsis'].head()

0    [recent, amount, challenges, , billy, lo, , br...
1    [crimeridden, city, tremont, , renowned, inves...
2    [lankester, merrin, veteran, catholic, priest,...
3    [, serendipity, seasons, , heartwarming, roman...
4    [young, naive, 19yearold, slacker, , adam, , j...
Name: processed_synopsis, dtype: object

### Method-b Bi-LSTM

In [11]:
# Parameters
vocab_size = 125433  # This should be adjusted based on the vocabulary of your dataset
# 125433is the number of the
max_length = 200    # This should be set to the length that covers most of your input data
embedding_dim = 100  # Size of the embedding vector

In [12]:
# Tokenize the synopsis
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(df['processed_synopsis'])
sequences = tokenizer.texts_to_sequences(df['processed_synopsis'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [13]:
# Prepare the labels
labels = df[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].values

In [14]:
# Build the BI-LSTM model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(9, activation='sigmoid')  # Use sigmoid for multi-label classification
])

In [15]:
# Build word2vec model
word2vec_model = Word2Vec(sentences=df['processed_synopsis'], vector_size=embedding_dim, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec.model")

In [16]:
# Create a word index
word_index = tokenizer.word_index

In [17]:
# Create a weight matrix for words in training docs
# Initialize the matrix with zeros
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Populate the matrix with word vectors from Word2Vec model
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_vector = word2vec_model.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [18]:
# Build the model
model = Sequential([
    Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(9, activation='sigmoid')  # Use sigmoid for multi-label classification
])


In [19]:
# Compile the model
optimizer_aw = tf.keras.optimizers.AdamW()
model.compile(loss='binary_crossentropy', optimizer=optimizer_aw, metrics=['accuracy'])

In [20]:
# Train the model
model.fit(padded_sequences, labels, epochs=15, validation_split=0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7d99e833a470>

In [21]:
# Load the validation dataset
# validation: Task-2-validation-dataset.csv
# test: Task-2-test-dataset1.csv
validation_df = pd.read_csv('./data/Task-2-validation-dataset.csv')

# Preprocess the text data
validation_sequences = tokenizer.texts_to_sequences(validation_df['plot_synopsis'])
validation_padded_sequences = pad_sequences(validation_sequences, maxlen=max_length, padding='post')

In [22]:
# Predict the genres
predictions = model.predict(validation_padded_sequences)

# Thershold value testing:
# 0.3--> P:0.3469 R:0.3632
# 0.5--> P:0.2865 R:0.2596
# 0.2--> P: 0.3382 R: 0.4150
thresholds = [0] * 9  # Set threshold for all labels
# Define different thresholds for each label
thresholds[0] = 0.15  # Label1: 0.5->
thresholds[1] = 0.2  # Label2:
thresholds[2] = 0.25  # Label3:
thresholds[3] = 0.009  # Label4: Lower threshold for label 4 (index 3 because indices start at 0)
# about 90% of the data has lower than 0.006 value for label 4
# Only 31 out of 1189 data has label 4 as 1
thresholds[4] = 0.25  # Label5:
thresholds[5] = 0.3  # Label6:
thresholds[6] = 0.15  # Label7:
thresholds[7] = 0.3  # Label8:
thresholds[8] = 0.2  # Label9:

# Apply different thresholds to each label
binary_predictions = np.array([[1 if prediction[i] > thresholds[i] else 0 for i in range(len(prediction))] for prediction in predictions])

# Rest of the code remains the same

# If a row has only 0s, assign the most confident prediction as 1
for i, row in enumerate(binary_predictions):
    if row.sum() == 0:  # if all predictions are 0
        most_confident_index = np.argmax(predictions[i])
        binary_predictions[i, most_confident_index] = 1  # set the most confident prediction to 1




In [23]:
# Create a new DataFrame with the doc_id and the predicted labels
results_df = pd.DataFrame(binary_predictions, columns=['label1', 'label2', 'label3', 'label4', 'label5', 'label6', 'label7', 'label8', 'label9'])
results_df.insert(0, 'doc_id', validation_df['ID'])

# Save this DataFrame to a new CSV file
# Assuming 'results_df' is your DataFrame with the classification results
# Drop the header and save the file without an index
# validation:10638746-Task2-method-b-validation.csv
# Test:10638746-Task2-method-b.csv
results_df.to_csv('./data/10638746-Task2-method-b-validation.csv', header=False, index=False)


### Method-c BERT

In [25]:
!pip install torchvision



In [26]:
!pip install transformers



In [27]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
from sklearn.metrics import f1_score
import torch
import numpy as np

In [28]:
# Preparing the dataset and labels
labels = df[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].values
texts = df['plot_synopsis'].tolist()

In [29]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

In [30]:
# Before Text-preprocessing for BERT

# Lowercasing: Use bert-base-uncased
# the text may need to be lowercased as the model might be case-insensitive

# Padding and Truncation:
# Sequences must be padded to a uniform length or truncated to a maximum sequence length that the BERT model can handle.

# Handling Out-of-Vocabulary Words:
# BERT's tokenizer can handle OOV words by breaking them down into subword units that exist in its vocabulary.

# Lemmatization: These techniques reduce words to their base or root form.
# However, they are generally not used with BERT, as the model benefits from the full context of words.

# Stopword Removal:
# For some tasks, removing stopwords (common words that do not contribute much meaning) can be beneficial,
# but for BERT models, this is typically not necessary as the model can learn the importance of all words in context.

In [31]:
# Tokenize and encode sequences in the dataset
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Text to encode
                        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
                        return_attention_mask=True,   # Construct attn. masks
                        return_tensors='pt',      # Return pytorch tensors
                        padding='max_length',     # Pad to max_length
                        max_length=105,           # Set a maximum sequence length
                        # max_length is 105 because 99.99% of the sentences in data is under 105 words
                        truncation=True           # Ensure truncation to the max_length
                   )
    # Add the encoded sentence to the list
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding)
    attention_masks.append(encoded_dict['attention_mask'])

In [32]:
# Convert the lists into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels, dtype=torch.float32)

In [33]:
# Use train_test_split to split our data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

In [34]:
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=4)

In [35]:
# Create the DataLoader for our validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=4)

In [36]:
# Load the BERT model for sequence classification with the number of labels
model = BertForSequenceClassification.from_pretrained(
    "bert-large-uncased-whole-word-masking", # Use the 24-layer BERT model, with an uncased vocab.
    num_labels = 9,     # The number of output labels.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Tell pytorch to run this model on the GPU.
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [38]:
epochs = 8

In [39]:
# Define the optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Define the loss function
loss_fn = BCEWithLogitsLoss()

In [40]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


In [41]:
# Store the average loss after each epoch so we can plot them.
loss_values = []

In [42]:
# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader.
        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_labels = batch[2].cuda()

        # Clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print(f"Average training loss: {avg_train_loss}")

print("Training complete!")

Average training loss: 0.4298724924225212
Average training loss: 0.36194904776544695
Average training loss: 0.2554870513503403
Average training loss: 0.1605689763266942
Average training loss: 0.09656339153107076
Average training loss: 0.06039509989433512
Average training loss: 0.03939985301254359
Average training loss: 0.027134175191681525
Training complete!


In [43]:
# Save the fine-tuned BERT model
# # model.bin saved
# model.save_pretrained('/content/drive/MyDrive/COMP34711_NLP/NLP_CW/BERTmodel')

In [44]:
# Load the test dataset
# validation: Task-2-validation-dataset.csv
# test: Task-2-test-dataset1.csv
val_df = pd.read_csv('./data/Task-2-validation-dataset.csv')

In [45]:
# Tokenize all of the sentences and map the tokens to their word IDs
val_input_ids = []
val_attention_masks = []

for text in val_df['plot_synopsis']:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 105,           # Pad & truncate all sentences
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation = True
                   )

    val_input_ids.append(encoded_dict['input_ids'])
    val_attention_masks.append(encoded_dict['attention_mask'])

In [46]:
# Convert lists to tensors
val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_masks = torch.cat(val_attention_masks, dim=0)

In [47]:
# Create the DataLoader
prediction_data = TensorDataset(val_input_ids, val_attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=4)

In [48]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [49]:
# Prediction on test set
model.eval()  # Evaluation mode
predictions = []

for batch in prediction_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.sigmoid()  # Use sigmoid to get probabilities
    predictions.append(logits.cpu().numpy())

In [50]:
# Concatenate logits from each batch
predictions = np.concatenate(predictions, axis=0)

# Apply threshold to get binary values
# predictions[predictions >= threshold] = 1
# predictions[predictions < threshold] = 0
# predictions = predictions.astype(int)


# Define different thresholds for each label
thresholds = [0.15, 0.2, 0.25, 0.02, 0.3, 0.3, 0.15, 0.3, 0.2]

# Apply different thresholds to each label to get binary values
binary_predictions = np.zeros_like(predictions, dtype=int)
for i, threshold in enumerate(thresholds):
    binary_predictions[:, i] = (predictions[:, i] >= threshold).astype(int)

# Combine the binary predictions with the ID column from the original test data
submission = pd.DataFrame(binary_predictions, columns=['label1', 'label2', 'label3', 'label4', 'label5', 'label6', 'label7', 'label8', 'label9'])

# Insert the 'doc_id' column from the validation DataFrame
submission.insert(0, 'doc_id', val_df['ID'].values)

# Save the DataFrame to a CSV file with integer values only
# validation: 10638746-Task2-method-c-validation.csv
# test: 10638746-Task2-method-c.csv
submission.to_csv('./data/10638746-Task2-method-c-validation.csv', index=False, header=False)