# Task 1

First we import the libraries used throughout this notebook, then load the training, validation and test CSV files.

In [79]:
import numpy as np
import pandas as pd
from pathlib import Path
import time
from tqdm import tqdm

# text preprocessing modules
import re
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

# Load the training corpus plus the validation and test word-pair files.
# The two word-pair files are read with header=None, so pandas gives their
# columns integer labels instead of taking names from the first row.
train_df = pd.read_csv('./Training-dataset.csv')
validation_df, test_df = (
    pd.read_csv(pairs_path, header=None)
    for pairs_path in ('./Task-1-validation-dataset.csv', './Task-1-test-dataset1.csv')
)


# Method (a) - BoW with TF-IDF
## Preprocessing
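The function below returns a list of cleaned synopses: each synopsis is lower-cased, reduced to alphabetic characters, tokenised, lemmatized and joined back into a single string.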

In [74]:
def preprocess_synopses(synopses):
    """Turn raw synopses into cleaned, lemmatized strings.

    Each synopsis has every non-alphabetic character replaced by a space, is
    lower-cased, tokenised with ``word_tokenize``, lemmatized token by token
    and finally re-joined with single spaces. Stop words are *not* removed
    here.

    Parameters
    ----------
    synopses : iterable of str
        Raw synopsis texts.

    Returns
    -------
    list of str
        One cleaned string per input synopsis, in the same order.
    """
    wordnet = WordNetLemmatizer()

    def _clean(raw_text):
        # Keep letters only, then lower-case before tokenising
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()
        return ' '.join(wordnet.lemmatize(token) for token in word_tokenize(letters_only))

    return [_clean(raw_text) for raw_text in synopses]


# Clean every training synopsis and report how many were processed
synopses = preprocess_synopses(train_df['plot_synopsis'])
print(len(synopses))

8257


Define the tf_idf function

In [84]:
def tf_idf(corpus):
    """Build a dense TF-IDF table for ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to fit the vectorizer on.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per feature word reported by the
        fitted vectorizer. The shape of the matrix is printed as a side effect.
    """
    tfidf_vectorizer = TfidfVectorizer()
    # Fit on the corpus and densify the sparse result in one go
    dense_matrix = tfidf_vectorizer.fit_transform(corpus).toarray()
    print(dense_matrix.shape)
    # Label the columns with the vectorizer's output feature words
    return pd.DataFrame(dense_matrix, columns=tfidf_vectorizer.get_feature_names_out())


tf_idf_df = tf_idf(synopses)

(8257, 79903)


Make the cosine similarity function

In [76]:
def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity between two equal-length vectors.

    Parameters
    ----------
    vector_a, vector_b : array-like
        Numeric vectors of the same length.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``. If either vector has zero norm the
        similarity is undefined, so ``0.0`` is returned instead of dividing
        by zero (which would produce NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if norm_product == 0:
        # An all-zero vector has no direction; report "no similarity"
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product

In [82]:
def get_results(df, term_vectors=None, oov_score=0.005, scale=10):
    """Score every word pair in ``df`` using TF-IDF term vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Word-pair table read by integer column label: ``0`` is copied to the
        output as the pair identifier, ``1`` and ``2`` hold the two words.
    term_vectors : pandas.DataFrame, optional
        Table with one column per vocabulary word. Defaults to the
        notebook-level ``tf_idf_df``.
    oov_score : float, optional
        Score assigned when either word is missing from ``term_vectors``.
        It is returned as-is, i.e. it is not multiplied by ``scale``.
    scale : float, optional
        Multiplier applied to the cosine similarity of in-vocabulary pairs.

    Returns
    -------
    pandas.DataFrame
        Two unnamed columns: the pair identifier and the predicted similarity.
    """
    if term_vectors is None:
        term_vectors = tf_idf_df
    vocabulary = term_vectors.columns
    lemmatizer = WordNetLemmatizer()

    def _normalise(raw_word):
        # Non-letters become spaces; strip the ones left at the edges, since a
        # padded string such as "dog " can never equal a vocabulary column and
        # would make the pair fall back to the OOV score for no good reason.
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_word).lower().strip()
        return lemmatizer.lemmatize(letters_only)

    data = []
    for _, row in df.iterrows():
        word1 = _normalise(row[1])
        word2 = _normalise(row[2])

        if word1 in vocabulary and word2 in vocabulary:
            # Compare the two words through their per-document TF-IDF columns
            predicted_similarity = cosine_similarity(term_vectors[word1], term_vectors[word2]) * scale
        else:
            predicted_similarity = oov_score
        data.append([row[0], predicted_similarity])

    return pd.DataFrame(data)

In [85]:
# Score the validation word pairs with the TF-IDF method and save them.
filepath = Path('./10861383-Task1-method-a-validation.csv')
output_df = get_results(validation_df)

# Ensure the destination folder exists, show the scores, then write them
# without an index column or header row.
filepath.parent.mkdir(parents=True, exist_ok=True)
print(output_df)
output_df.to_csv(filepath, index=False, header=False)

       0         1
0      1  0.214886
1      2  0.000000
2      3  0.012292
3      4  0.353786
4      6  0.282594
..   ...       ...
145  177  0.745142
146  178  0.309537
147  179  0.617254
148  181  0.299473
149  182  0.771084

[150 rows x 2 columns]


In [172]:
# Time only the scoring of the test word pairs (TF-IDF method).
start = time.time()
output_df = get_results(test_df)
end = time.time()
print(end - start)

# Save the scores: ensure the destination folder exists, show the frame,
# then write it without an index column or header row.
filepath = Path('./10861383-Task1-method-a.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
print(output_df)
output_df.to_csv(filepath, index=False, header=False)

18.827519178390503
        0         1
0     816  0.420122
1     957  0.429372
2     809  0.605738
3     911  0.277784
4     242  0.300453
..    ...       ...
97    160  4.158119
98     14  1.148824
99     16  0.611909
100  4012  0.005000
101  4013  0.005000

[102 rows x 2 columns]


# Method (b) - Word2Vec
## Preprocessing
Now let's clean up the training dataset's synopses so that they contain no punctuation, tags, numbers or special characters.
This gives us a list of tokenised sentences, each one a list of words.

In [136]:
# Build these once: they do not change between sentences, and recreating the
# stop-word set and the lemmatizer for every sentence fragment is wasted work.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

cleaned_sentences = []
for synopsis in tqdm(list(train_df['plot_synopsis'])):
    # Split the synopsis into sentence fragments on '.', '?' and '!'
    sentences = re.split(r'\.|\?|\!', synopsis)

    for sentence in sentences:
        # Replace non-alphabetic characters with spaces and lower-case
        sentence = re.sub('[^a-zA-Z]', ' ', sentence).lower()
        # Tokenise, drop stop words, then lemmatize what is left
        sentence = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(sentence)
            if word not in stop_words
        ]
        # Appended unconditionally, so fragments that end up empty are
        # included in the count printed below
        cleaned_sentences.append(sentence)

print("Number of sentences to get context from: ", len(cleaned_sentences))

100%|██████████| 8257/8257 [02:32<00:00, 54.16it/s]

Number of sentences to get context from:  394993





Build the vocabulary and train the model. This is where all the hyperparameters are set.

The `sg` parameter selects between skip-gram and CBOW (`sg=1` is skip-gram; any other value is CBOW).


In [170]:
# Collect every distinct token directly into a set instead of first growing a
# large list of per-sentence uniques; sorting makes the list order
# reproducible between runs.
vocab = sorted({word for sentence in cleaned_sentences for word in sentence})
# Show only a small preview: printing the full vocabulary floods the output
print(vocab[:20])
print(len(vocab))
vocab_length = len(vocab)

79816


In [157]:
# Train the Word2Vec model on the cleaned, tokenised sentences.
w2v = Word2Vec(
    sentences=cleaned_sentences,
    sg=0,             # 0 selects CBOW; 1 would select skip-gram
    vector_size=200,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep words even if they occur only once
    workers=64,       # number of worker threads used for training
)

Define a cosine similarity function.

This function takes a model and two words, and returns the cosine similarity between their word vectors.

We return 0.005 if the word is OOV (out of vocabulary).

In [171]:
def w2v_cosine_similarity(model, word_a, word_b, oov_score=0.005):
    """Return the cosine similarity between two words' vectors in ``model``.

    Parameters
    ----------
    model : object
        Anything exposing word vectors through ``model.wv[word]``.
    word_a, word_b : str
        Words to compare.
    oov_score : float, optional
        Value returned when either word is out of vocabulary.

    Returns
    -------
    float
        Cosine similarity of the two vectors, or ``oov_score`` if a lookup
        raises ``KeyError``. Any other exception is allowed to propagate so
        that real failures are not silently reported as an OOV score.
    """
    try:
        vector_a = model.wv[word_a]
        vector_b = model.wv[word_b]
    except KeyError:
        # A missing key is the out-of-vocabulary case
        return oov_score

    return np.dot(vector_a, vector_b) / (np.linalg.norm(vector_a) * np.linalg.norm(vector_b))

# Sanity check: a related pair, then a word compared with itself
for probe_a, probe_b in (('bone', 'teeth'), ('bone', 'bone')):
    print(w2v_cosine_similarity(w2v, probe_a, probe_b))

0.8108403
0.99999994


In [159]:
def get_w2v_results(df, model=None):
    """Score every word pair in ``df`` with Word2Vec cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Word-pair table read by integer column label: ``0`` is copied to the
        output as the pair identifier, ``1`` and ``2`` hold the two words.
    model : object, optional
        Model passed on to ``w2v_cosine_similarity``. Defaults to the
        notebook-level ``w2v``.

    Returns
    -------
    pandas.DataFrame
        Two unnamed columns: the pair identifier and the predicted similarity.
    """
    if model is None:
        model = w2v
    lemmatizer = WordNetLemmatizer()

    def _normalise(raw_word):
        # Non-letters become spaces; strip the ones left at the edges so a
        # word like "dog." is looked up as "dog" rather than "dog ", which
        # would always be treated as out of vocabulary.
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_word).lower().strip()
        return lemmatizer.lemmatize(letters_only)

    data = []
    for _, row in df.iterrows():
        word1 = _normalise(row[1])
        word2 = _normalise(row[2])

        predicted_similarity = w2v_cosine_similarity(model, word1, word2)
        data.append([row[0], predicted_similarity])

    return pd.DataFrame(data)

In [160]:
# Score the validation word pairs with the Word2Vec method and save them.
filepath = Path('./10861383-Task1-method-b-validation.csv')
output_df = get_w2v_results(validation_df)

# Ensure the destination folder exists, show the scores, then write them
# without an index column or header row.
filepath.parent.mkdir(parents=True, exist_ok=True)
print(output_df)
output_df.to_csv(filepath, index=False, header=False)

       0         1
0      1  0.268912
1      2  0.608785
2      3  0.862858
3      4  0.069127
4      6  0.242382
..   ...       ...
145  177  0.707292
146  178  0.479634
147  179  0.429537
148  181  0.476661
149  182  0.495983

[150 rows x 2 columns]


In [173]:
# Time only the scoring of the test word pairs (Word2Vec method).
start = time.time()
output_df = get_w2v_results(test_df)
end = time.time()
print(end - start)

# Save the scores: ensure the destination folder exists, show the frame,
# then write it without an index column or header row.
filepath = Path('./10861383-Task1-method-b.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
print(output_df)
output_df.to_csv(filepath, index=False, header=False)

0.07879900932312012
        0         1
0     816  0.681798
1     957  0.512058
2     809  0.541038
3     911  0.497556
4     242  0.527716
..    ...       ...
97    160  0.321230
98     14  0.695870
99     16  0.391330
100  4012  0.005000
101  4013  0.005000

[102 rows x 2 columns]
