In [1]:
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# from nltk.cluster import GAAClusterer

In [2]:
from nltk import download

download("stopwords")
download("wordnet")
download("punkt")
download("omw-1.4")
download("averaged_perceptron_tagger")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')
# connecting Google Drive for reading file of the training data

Mounted at /content/drive


In [4]:
# NLTK, gensim,Hugging Face, PyTorch, Tensorflow/Keras and scikit-learn

import numpy as np
import pandas as pd # for reading csv files
import string
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity as cos_similarity, cosine_distances

In [5]:
df = pd.read_csv('/content/drive/MyDrive/COMP34711_NLP/NLP_CW/Training-dataset.csv')
df.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0


### TEXT PREPROCESSING for method-b


In [6]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Mapping between Treebank and WordNet part of speech tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN



In [7]:
# To remove punctuations and other none useful tokens
punc_to_empty_table = str.maketrans(
    "",
    "",
    # characters to be removed
    '''
    !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~,
    '''
)

In [8]:
# Preprocess the synopsis
def preprocess_synopsis(synopsis):
    # Tokenize the synopsis
    tokens = word_tokenize(synopsis.lower())

    # POS tagging
    pos_tags = nltk.pos_tag(tokens)

    # Lemmatize tokens using POS tags
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]

    # Remove punctuation and contractions
    tokens_no_punct = [word.translate(punc_to_empty_table) for word in tokens if word != "" and word not in ["'m", "'re", "'ve", "n't"]]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens_no_stop = [word for word in tokens_no_punct if word.lower() not in stop_words]

    # Calculate frequency distribution
    freq_dist = FreqDist(tokens_no_stop)

    # Remove least occurred tokens with length more than 17
    tokens_filtered = [word for word in tokens_no_stop if not (len(word) > 17 and freq_dist[word] == 1)]

    return tokens_filtered



In [9]:
# Apply preprocessing to each plot synopsis
df['processed_synopsis'] = df['plot_synopsis'].apply(preprocess_synopsis)

In [10]:
# Show the processed synopsis for the first row as an example
df['processed_synopsis'].head()

0    [recent, amount, challenges, , billy, lo, , br...
1    [crimeridden, city, tremont, , renowned, inves...
2    [lankester, merrin, veteran, catholic, priest,...
3    [, serendipity, seasons, , heartwarming, roman...
4    [young, naive, 19yearold, slacker, , adam, , j...
Name: processed_synopsis, dtype: object

In [11]:
# Acquiring words and each respective counts
word_counts = {}
for plot in df['processed_synopsis']:
  for word in plot:
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1

# Sorting
sorted_words_counts = sorted(
    [[key, value] for key, value in word_counts.items()], key=lambda val: val[1], reverse=True
)

### method-b Word2VEc

In [12]:
def get_word_vector(w, model):
  '''
  From the given word and model, return the word vector of the model.
  If the word is not in the model return zero vetor
  '''
  try:
      return model.wv[w]
  except KeyError:
      return [0.0] * model.vector_size

In [13]:
def vectors_cos_sim(vector1, vector2):
  '''
  Return cosine similarity of the two vectors
  output range : [-1,1]
  close to -1: the words have close to opposite meanings
  close to 0: the words have low correlation in terms of their meanings
  close to 1: the words have similar meanings
  '''
  return cos_similarity([vector1], [vector2])[0][0]

In [14]:
# Getting term pair ids from the validation data
# validation: Task-1-validation-dataset.csv
# test: Task-1-test-dataset1.csv
FILE_PATH = "./data/"
task1_val = pd.read_csv(FILE_PATH + '/Task-1-validation-dataset.csv', header=None)

In [15]:
### About the data inside the plot synopsis ###

# @
# @ is found only in two cases: an email, a name of something

# IMPORTANT!!
# ()
# There are a lot of brackets, mostly containing Actors' name or a discription of the story
# Since the Name of the actor is not related to the synopsis itself,
# I could be better to remove it or store them elsewhere

#;
# many use of ; for further explaination

#'
# uses of 've, 'm, 's, n't
# they are all stopwords and since sentiment is not the key point of the genre of the movies it is better remove them
# may try to remove 's

#numbers
# not related to task so removing it
# TOTAL_NUMBER_OF_PLOTS = 8257

In [16]:
# # processed text data list into pandas DataFrame
# df = pd.DataFrame(columns=['processed_plot_synopsis'])
# for i in range(len(processed_plots)):
#   df.at[i,'processed_plot_synopsis'] = processed_plots[i]

In [17]:
# # Saving preprocessed text data as file
# df.to_csv(FILE_PATH+'/processed_plot_synopsis_mk1.csv')

In [18]:
vector_size = 1000

In [19]:
# Word2Vec defining modeling
'''
exp1
model = Word2Vec(sentences=df['processed_plot_synopsis'], vector_size=100, window=5, min_count=1, workers=4)
'''
model = Word2Vec(sentences=df['processed_synopsis'], vector_size=vector_size, window=5, min_count=1, workers=4, alpha=0.025)

In [20]:
model.epochs

5

In [21]:
model.corpus_count

8257

In [22]:
# Training model

model.train(df['processed_synopsis'],total_examples=model.corpus_count, epochs=20)



(77581376, 94791840)

In [23]:
def get_multiword_vector(term, model):
    '''
    From the given term (which can be a single or multi-word term) and model,
    return the average word vector of the term using the model.
    If a word in the term is not in the model, ignore that word.
    '''
    words = term.split()  # Split the term into words
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [24]:
# model.save(FILE_PATH+"word2vec_model.bin")

# get_vector_sum to manually compute the average
def get_vector_sum(tokens, model, vector_size):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    vector_sum = np.zeros(vector_size)
    for vec in vectors:
        vector_sum = vector_sum + vec
    return vector_sum / len(vectors) if vectors else vector_sum

df['word2vec_vector'] = df['processed_synopsis'].apply(lambda tokens: get_vector_sum(tokens, model, vector_size=vector_size))

print(df['word2vec_vector'])


0       [0.05696497970410073, -0.11086633720307265, -0...
1       [-0.02653990158684213, -0.10165011599257757, 0...
2       [-0.019787736329418944, -0.12486549186542564, ...
3       [-0.10647954577744798, 0.03466306967133734, 0....
4       [-0.035159835149472934, 0.011362532316279301, ...
                              ...                        
8252    [-0.02246979431993168, -0.13736332534858775, -...
8253    [-0.006612206346151159, -0.03982820091778965, ...
8254    [-0.007803041700963312, -0.11040922094692054, ...
8255    [-0.07270904485039133, -0.026766059537683445, ...
8256    [0.01755592050961196, -0.08709028103946112, 0....
Name: word2vec_vector, Length: 8257, dtype: object


In [25]:
cos_similarity(model.wv['friend'].reshape(1, -1), model.wv['dad'].reshape(1, -1))

array([[0.08866222]], dtype=float32)

In [26]:
results = []
for idx, row in task1_val.iterrows():
    term_pair_id = int(row[0])
    term1 = row[1]
    term2 = row[2]

    term1_v = get_multiword_vector(term1, model)
    term2_v = get_multiword_vector(term2, model)
    similarity = vectors_cos_sim(term1_v, term2_v)

    results.append([term_pair_id, similarity])

results_df = pd.DataFrame(results, columns=['term_pair_id', 'similarity'])

# validation: 10638746-Task1-method-b-validation.csv
# test: 10638746-Task1-method-b.csv
results_df.to_csv(FILE_PATH+'/10638746-Task1-method-b-validation.csv', header=False, index=False)

print(results_df)

     term_pair_id  similarity
0               1    0.108482
1               2    0.157832
2               3    0.584777
3               4   -0.000080
4               6    0.160660
..            ...         ...
145           177    0.435217
146           178    0.138796
147           179    0.123883
148           181    0.206445
149           182    0.319224

[150 rows x 2 columns]


### METHOD-c BERT

In [27]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, BertModel
from transformers import Trainer, TrainingArguments
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import BCEWithLogitsLoss
from torch import nn
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize
import torch

In [28]:
df.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence,processed_synopsis,word2vec_vector
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1,"[recent, amount, challenges, , billy, lo, , br...","[0.05696497970410073, -0.11086633720307265, -0..."
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1,"[crimeridden, city, tremont, , renowned, inves...","[-0.02653990158684213, -0.10165011599257757, 0..."
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0,"[lankester, merrin, veteran, catholic, priest,...","[-0.019787736329418944, -0.12486549186542564, ..."
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0,"[, serendipity, seasons, , heartwarming, roman...","[-0.10647954577744798, 0.03466306967133734, 0...."
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0,"[young, naive, 19yearold, slacker, , adam, , j...","[-0.035159835149472934, 0.011362532316279301, ..."


### Text Preprocessing for method-c

In [29]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [30]:
def preprocess_text(text):
    # Case folding
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Lemmatization and removal of stop words and punctuation
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token not in string.punctuation]

    # Join tokens back into a single string
    text = ' '.join(tokens)

    return text

In [31]:
def remove_infrequent_words(text):
    return " ".join(word for word in text.split() if word not in rare_words)

In [32]:
df['processed_plot_synopsis'] = df['plot_synopsis'].apply(preprocess_text)

In [33]:
# For removal of infrequent words, need to count word occurrences across the corpus
word_freq = pd.Series(' '.join(df['processed_plot_synopsis']).split()).value_counts()
rare_words = word_freq[word_freq < 2]  # for example, words that appear less than twice


In [34]:
df['processed_plot_synopsis'] = df['processed_plot_synopsis'].apply(remove_infrequent_words)

# Now you can use 'processed_plot_synopsis' for your BERT embeddings
plot_synopses = df['processed_plot_synopsis'].tolist()

In [35]:
#  I am using uncased based bert because there are many proper noun
#  ex) acctors name, location, organisation, new concept

## Building BERT **model**

In [36]:
# Initialize the BERT tokenizer and model
# by using uncased the tokens are automatically case folded
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
model = BertModel.from_pretrained('bert-large-uncased-whole-word-masking')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [37]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [38]:
# Move the model to the selected device
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [39]:
def get_bert_embeddings_batch(texts, batch_size=4):
    # Encode the texts
    #  99.99% of the sentences have lower or equeal to 105 tokens/words
    encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=105)
    # Move encoded inputs to the correct device
    encoded_inputs = {key: val.to(device) for key, val in encoded_inputs.items()}

    # Create a DataLoader to batch texts efficiently
    dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Collect all embeddings in this list
    all_embeddings = []

    # Process in batches
    for batch in dataloader:
        input_ids, attention_mask = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        # Apply mean pooling to get a single vector representation of the input text
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
        sum_embeddings = torch.sum(embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        # Move the embeddings to CPU for further processing/storage
        all_embeddings.extend(mean_embeddings.cpu().numpy())

    return all_embeddings

In [40]:
# Utilize the function
bert_embeddings = get_bert_embeddings_batch(plot_synopses, batch_size=4)

In [41]:
# Load the validation dataset
#  For validation dataset: Task-1-validation-dataset.csv
#  For test dataset:       Task-1-test-dataset1.csv
val_data_path = FILE_PATH+"/Task-1-validation-dataset.csv"
val_columns = ['term_pair_id','term1','term2' ,'golden_similarity']
val_df = pd.read_csv(val_data_path, header=None, names=val_columns)

In [42]:
val_df.head()

Unnamed: 0,term_pair_id,term1,term2,golden_similarity
0,1,absorb,learn,5.48
1,2,absorb,withdraw,2.97
2,3,achieve,accomplish,8.57
3,4,achieve,try,4.42
4,6,acquire,get,8.82


In [43]:
# Function to encode text using BERT tokenizer and get the BERT model output
def get_bert_embeddings(text):
    # Encode the text, adding the special tokens for BERT and truncating to the max length
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Get the output from BERT model
    with torch.no_grad():
        output = model(**encoded_input)
    # Get the embeddings from the last hidden state
    embeddings = output.last_hidden_state
    # Mean pooling to get a single vector representation of the input text
    input_mask_expanded = encoded_input['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()
    sum_embeddings = torch.sum(embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    mean_embeddings = sum_embeddings / sum_mask
    return mean_embeddings

In [44]:
# Function to get BERT embedding for a given term or a phrase
def get_term_embedding(term):
    # BERT can handle multi-word terms out of the box
    return get_bert_embeddings(term)

In [45]:
# Initialize an empty dictionary to store embeddings
embeddings_dict = {}

# If the embeddings for the terms are not precomputed, compute them now
# This is a placeholder; you will need to replace this with your actual method of obtaining embeddings
for term in set(val_df['term1'].tolist() + val_df['term2'].tolist()):
    if term not in embeddings_dict:
        embeddings_dict[term] = get_bert_embeddings_batch([term])[0]  # Assuming the function returns a batch of embeddings


In [46]:
# Function to calculate cosine similarity between two normalized vectors
def calculate_cosine_similarity(norm_vector1, norm_vector2):
    # Cosine similarity is the dot product of the vectors if they are normalized
    # We use np.squeeze to convert the result from an array to a single number
    return np.dot(norm_vector1, norm_vector2.T).squeeze()

In [47]:
# Normalize the embeddings before calculating similarity
for term, embedding in embeddings_dict.items():
    embeddings_dict[term] = normalize(embedding.reshape(1, -1))

In [48]:
# Now calculate the cosine similarity with normalized vectors
results = []

In [49]:

for idx, row in val_df.iterrows():
    term_pair_id = row['term_pair_id']
    term1 = row['term1']
    term2 = row['term2']

    # Retrieve normalized embeddings for the terms
    norm_embedding1 = embeddings_dict[term1]
    norm_embedding2 = embeddings_dict[term2]

    # Calculate cosine similarity
    cos_sim = calculate_cosine_similarity(norm_embedding1, norm_embedding2)

    # Store the results
    results.append({'term_pair_id': term_pair_id, 'similarity': cos_sim})

In [50]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save the results to CSV in the required format
# validation: 10638746-Task1-method-c-validation.csv
# test: 10638746-Task1-method-c.csv
results_df.to_csv(FILE_PATH+'/10638746-Task1-method-c-validation.csv',
                  index=False, columns=['term_pair_id', 'similarity'], header=False)