In [1]:
import numpy as np
import pandas as pd

# text preprocessing modules
import re
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the training and validation dataframes (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv('./Training-dataset.csv')
# header=None: the validation file is read without a header row, so its
# columns get integer labels (0, 1, 2, ...) that later cells index by position.
test_df = pd.read_csv('./Task-1-validation-dataset.csv', header=None)

In [2]:
def preprocess_synopses(synopses, remove_stopwords=False):
    """Normalise plot synopses into lowercase, lemmatized, space-joined text.

    For each synopsis every non-alphabetic character is replaced by a space,
    the text is lowercased, tokenised with ``word_tokenize``, each token is
    lemmatized with ``WordNetLemmatizer`` and the tokens are re-joined with
    single spaces.

    Parameters
    ----------
    synopses : iterable of str
        Raw synopsis texts.
    remove_stopwords : bool, default False
        When True, drop NLTK English stopwords before lemmatizing. The
        default keeps every token, matching the previous behaviour (the
        stopword filter used to be commented out).

    Returns
    -------
    list of str
        One processed string per input synopsis, in input order.
    """
    # Loop-invariant helpers are built once instead of once per synopsis.
    non_alpha = re.compile('[^a-zA-Z]')
    lemmatizer = WordNetLemmatizer()
    # Only load the stopword corpus when it is actually going to be used.
    stop_words = set(stopwords.words('english')) if remove_stopwords else frozenset()

    processed_synopses = []
    for synopsis in synopses:
        # Strip everything that is not a letter, then lowercase and tokenise.
        tokens = word_tokenize(non_alpha.sub(' ', synopsis).lower())
        if remove_stopwords:
            tokens = [token for token in tokens if token not in stop_words]
        processed_synopses.append(
            ' '.join(lemmatizer.lemmatize(token) for token in tokens)
        )

    return processed_synopses

# Normalise every training synopsis once; the processed texts are the corpus
# handed to the vectoriser further down.
synopses = preprocess_synopses(train_df['plot_synopsis'])
# Sanity check: one processed string per training row.
print(len(synopses))

8257


In [3]:
# def tf_idf(corpus):
#     tfidf = TfidfVectorizer()
#     vector = tfidf.fit_transform(corpus)
#     array = vector.toarray()
#     words = tfidf.get_feature_names_out()
#     df = pd.DataFrame(array, columns = words)
#     return df

def tf_idf(corpus, representation, sparse=False):
    """Vectorise a corpus and return the document-term matrix as a DataFrame.

    Despite the name, this builds either raw counts or TF-IDF weights,
    depending on ``representation``. The matrix shape is printed as a side
    effect.

    Parameters
    ----------
    corpus : iterable of str
        One string per document.
    representation : {'bow', 'tfidf'}
        'bow' uses ``CountVectorizer``; 'tfidf' uses ``TfidfVectorizer``.
    sparse : bool, default False
        When False (the default, and the previous behaviour) the matrix is
        fully densified before the DataFrame is built. That is expensive for
        a large vocabulary: the run recorded in this notebook produced an
        (8257, 79903) matrix, roughly 5 GB as float64. When True the
        DataFrame is built with ``pd.DataFrame.sparse.from_spmatrix`` so only
        the non-zero entries are stored; labels, shape and values are the
        same, but the column dtype is sparse.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary word.

    Raises
    ------
    ValueError
        If ``representation`` is neither 'bow' nor 'tfidf'.
    """
    if representation == 'bow':
        vectorizer = CountVectorizer()
    elif representation == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid representation. Choose 'bow' or 'tfidf'.")

    matrix = vectorizer.fit_transform(corpus)
    # The shape is available on the sparse matrix; no need to densify first.
    print(matrix.shape)
    words = vectorizer.get_feature_names_out()

    if sparse:
        return pd.DataFrame.sparse.from_spmatrix(matrix, columns=words)
    return pd.DataFrame(matrix.toarray(), columns=words)

# Build the TF-IDF table for the processed synopses: one row per synopsis and
# one column per vocabulary word. Each column is later used as that word's
# vector when comparing word pairs.
tf_idf_df = tf_idf(synopses, 'tfidf')


(8257, 79903)


In [4]:
def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two equal-length vectors.

    Computed as dot(a, b) / (||a|| * ||b||).

    Parameters
    ----------
    vector_a, vector_b : array-like
        Numeric vectors of the same length.

    Returns
    -------
    float
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero norm. In that case the ratio is 0/0, which previously
        produced NaN together with a NumPy RuntimeWarning.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    # An all-zero vector has no direction; treat it as "no similarity" rather
    # than dividing by zero.
    if norm_product == 0:
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product

In [5]:
# Cosine similarity is multiplied by this factor before being written out.
SIMILARITY_SCALE = 10
# Score assigned when either word has no column in tf_idf_df.
OOV_SIMILARITY = 0.05

_term_lemmatizer = WordNetLemmatizer()


def _normalise_term(term):
    """Lowercase and lemmatize one test word so it can match the vocabulary.

    The tf_idf_df columns are built from synopses that were lowercased and
    lemmatized, so a raw test word (capitalised, inflected, padded with
    whitespace) would otherwise miss its column and get the fallback score.
    Non-string cells (e.g. NaN) return None, which never matches a column.
    """
    if not isinstance(term, str):
        return None
    return _term_lemmatizer.lemmatize(term.strip().lower())


data = []
# Plain tuples keep the positional layout used below: row[0] is the identifier
# written to the output, row[1] / row[2] are the two words, and row[3] is
# printed next to the prediction (presumably the reference score - confirm
# against the dataset).
for row in test_df.itertuples(index=False, name=None):
    word_a = _normalise_term(row[1])
    word_b = _normalise_term(row[2])
    if word_a in tf_idf_df.columns and word_b in tf_idf_df.columns:
        # Each column holds one word's weights across all synopses.
        predicted_similarity = (
            cosine_similarity(tf_idf_df[word_a], tf_idf_df[word_b]) * SIMILARITY_SCALE
        )
    else:
        predicted_similarity = OOV_SIMILARITY
    data.append([row[0], predicted_similarity])
    print(row[3], predicted_similarity)
output_df = pd.DataFrame(data)

5.48 0.21488579234405136
2.97 0.0
8.57 0.01229217692963647
4.42 0.3537862009263995
8.82 0.28259446621760265
8.57 0.8545597214347915
6.38 0.5652805550382304
1.43 0.0
4.05 0.0015884908469520407
0.58 0.0
9.47 1.1488236951765567
2.07 0.611909209161084
4.85 0.9279440748470842
4.05 1.7292639096878906
3.65 0.06601524685506252
2.75 0.4141492899982926
3.83 0.5008522455157683
1.58 1.2836578891324855
2.58 0.09316589763656362
1.67 0.13613248965429695
0.58 0.271276345144046
0.48 0.46146705188885084
0.4 0.10277710645647363
3.4 1.9986487673344742
3.02 0.4548238653505756
7.3 0.11507211438631987
0.92 1.1323823733474658
3.5 0.5267197587416685
7.7 0.7752144272545198
5.08 0.5606528840770185
5.95 0.170531241197763
0.4 0.5832008457163697
7.85 0.09059991485229978
6.58 0.11351691686751707
7.03 0.023132956960538596
7.1 0.04892322617335416
4.17 0.04321033616216205
3.82 0.02335194094200042
4.17 0.1381859065189074
2.53 0.14663267104551297
4.17 0.18795878963488696
3.78 0.01690581182633486
7.53 1.3620999196900958
5

In [6]:
from pathlib import Path
# Destination of the prediction file, relative to the working directory.
filepath = Path('./10861383-Task1-method-a.csv')  
# Create the target directory if needed; for a bare filename the parent is
# '.', so this is only a safeguard.
filepath.parent.mkdir(parents=True, exist_ok=True)  
# Show the table that is about to be written.
print(output_df)
# index=False / header=False: write only the data rows, with no row labels
# and no column header line.
output_df.to_csv(filepath, index=False, header=False)  

       0         1
0      1  0.214886
1      2  0.000000
2      3  0.012292
3      4  0.353786
4      6  0.282594
..   ...       ...
145  177  0.745142
146  178  0.309537
147  179  0.617254
148  181  0.299473
149  182  0.771084

[150 rows x 2 columns]
