In [1]:
import gensim
import gensim.downloader as api
import numpy as np
import pandas as pd
import random

In [None]:
# Load the dataset
synonym_test_dataset = pd.read_csv('A2-DataSet/synonym.csv')

print(synonym_test_dataset.head())
print(synonym_test_dataset.info())

# Task 1

In [None]:
# takes up to about 5 minuntes to load
w2v_model = api.load('word2vec-google-news-300')

In [None]:
def assign_label(question_word, correct_answer, closest_synonym, guess_words, model):
    # Check if question word and at least one guess word are in the vocabulary
    if question_word in model.key_to_index and any(word in model.key_to_index for word in guess_words):
        # Check if the closest guess is correct
        if closest_synonym is not None and closest_synonym == correct_answer:
            return "correct"
        else:
            return "wrong"
    else:
        return "guess"

def closest_synonym(query, list_of_guess_words, model):
    closest_synonym = random.choice(list_of_guess_words) if list_of_guess_words else None
    max_similarity = -1

    # Check if the query word is in the model's vocabulary
    if query not in model.key_to_index:
        print(f"'{query}' is not in the vocabulary.")
        # Return a random guess word if the query is not in the vocabulary
        return closest_synonym

    for guess_word in list_of_guess_words:
        # Check if the guess word is in the model's vocabulary
        if guess_word in model.key_to_index:
            try:
                sim_score = model.similarity(query, guess_word)
                if sim_score > max_similarity:
                    closest_synonym = guess_word
                    max_similarity = sim_score
            except KeyError:
                # Handle the error if the word is not in the model's vocabulary
                continue
        else:
            print(f"'{guess_word}' is not in the vocabulary.")
            # Return a random guess word if the query is not in the vocabulary
            return closest_synonym

    return closest_synonym


# Function to proSV file and apply the closest_synonym function
def process_csv(file_path, model_name, model):
    
    # Size of the Vocabulary
    vocab_size = len(model.key_to_index)

    question_words = []
    answer_words = []
    guess_words = []
    labels = []
    C = 0
    V = 0
    
    # Read the CSV file into a Pandas DataFrame, skipping the first row
    synonym_test_dataset = pd.read_csv(file_path)

    # Process each row in the DataFrame
    for index, row in synonym_test_dataset.iterrows():
        # Split the row into words based on comma
        words = row.to_list()
        #print(words)

        # Store the first word in 'query' and the rest in 'list_of_guess_words'
        query = words[0]
        #print(query)
        answer = words[1]
        #print(answer)
        list_of_guess_words = words[2:]
        #print(list_of_guess_words)

        # Call the 'closest_synonym' function and store the result
        result = closest_synonym(query, list_of_guess_words, model)
        
        question_words.append(query)
        answer_words.append(answer)
        guess_words.append(result)
        label = assign_label(query, answer, result, list_of_guess_words, model)
        labels.append(label)
        
        if label == 'correct':
            C += 1
        if label != 'guess':
            V += 1
    
    if V == 0:
        accuracy = 0
    else: 
        accuracy = C/V
    
    results_df = pd.DataFrame({'question_word': question_words, 'answer_word': answer_words, 'guess_word': guess_words, 'label': labels})    
    results_df.to_csv(f"{model_name}-details.csv", index=False)
    
    analysis_df = pd.DataFrame({'model_name': [model_name], 'vocab_size': [vocab_size], 'C': [C], 'V': [V], 'accuracy': accuracy})    
    analysis_df.to_csv('analysis.csv', mode='a', index=False, header=not pd.io.common.file_exists('analysis.csv'))
    
    print(results_df)
    


In [None]:
file_path = 'A2-DataSet/synonym.csv' 
processed_results = process_csv(file_path, 'word2vec-google-news-300', w2v_model)

# Task 2 

Note: we got the corpus names from the offical gensim GitHub page (https://github.com/piskvorky/gensim-data).

In [None]:
# takes about 3 minuntes to load
gigaword_model = api.load('glove-wiki-gigaword-300')

In [None]:
# takes about 8 minuntes to load
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [None]:
# takes about 5 minuntes to load
glove_twitter_200_model = api.load('glove-twitter-200')

In [None]:
# takes about 2 minuntes to load
glove_twitter_25_model = api.load('glove-twitter-25')

In [None]:
file_path = 'A2-DataSet/synonym.csv' 

processed_results = process_csv(file_path, 'glove-wiki-gigaword-300', gigaword_model)
processed_results = process_csv(file_path, 'fasttext-wiki-news-subwords-300', fasttext_model)
processed_results = process_csv(file_path, 'glove-twitter-200', glove_twitter_200_model)
processed_results = process_csv(file_path, 'glove-twitter-25', glove_twitter_25_model)

# Task 3

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [3]:
def read_and_process_docs(file_paths):
    
    documents = []
    
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            file_contents = file.read()
            documents.append(process_text(file_contents))
    
    print(documents)
    print(f"Processed {len(documents)} documents.")
    return documents


def process_text(text):
        # tokenize the text into sentences
        sentences = sent_tokenize(text)
        sentences_list = []
        
        for sentence in sentences:
            # tokenize the text
            tokens = word_tokenize(sentence)
            
            # remove punctuation and numbers and convert to lowercase
            tokens = [word.lower() for word in tokens if word.isalpha()]
            
            # join the tokens into a single string with each word separated by a space
            # tokens_string = ' '.join(tokens)
            
            # return the processed sentence
            sentences_list.extend(tokens)
            
        return sentences_list

def create_word2vec_embeddings(list_of_list_of_sentences, window_sizes, embedding_sizes):
    for window_size in window_sizes:
        for embedding_size in embedding_sizes:
            model_name = f"Word2Vec_w_{window_size}_e_{embedding_size}"
            print(f"Training model: {model_name}")

            # Concatenate all documents into a single list of sentences
            all_sentences = [sentence for doc in list_of_list_of_sentences for sentence in doc]

            # Train the model using all sentences
            model = gensim.models.Word2Vec(sentences=all_sentences, vector_size=embedding_size, window=window_size)

            # Save the model
            model.save(f"{model_name}.model")


In [4]:
file_paths = ['Online-Books/Moby_Dick_Or_The_Whale.txt', 'Online-Books/Peter_Pan.txt', 'Online-Books/Pride_and_Prejudice.txt', 'Online-Books/The_Complete_Works_of_William_Shakespeare.txt', 'Online-Books/The_Importance_of_Being_Earnest_A_Trivial_Comedy_for_Serious_People.txt', 'Online-Books/Winnie_the_Pooh.txt']
list_of_list_of_sentences = read_and_process_docs(file_paths)
window_size = [3, 7]
embedding_size = [25, 500]    
create_word2vec_embeddings(list_of_list_of_sentences, window_size, embedding_size)

Processed 6 documents.
Training model: Word2Vec_w_3_e_25
Training model: Word2Vec_w_3_e_500
Training model: Word2Vec_w_7_e_25
Training model: Word2Vec_w_7_e_500


In [5]:
def assign_label_word_vectors(question_word, correct_answer, closest_synonym, guess_words, model):
    word_vectors = model.wv
    # Check if question word and at least one guess word are in the vocabulary
    if question_word in word_vectors.key_to_index and any(word in word_vectors.key_to_index for word in guess_words):
        # Check if the closest guess is correct
        if closest_synonym is not None and closest_synonym == correct_answer:
            return "correct"
        else:
            return "wrong"
    else:
        return "guess"

def closest_synonym_word_vectors(query, list_of_guess_words, model):
    word_vectors = model.wv
    closest_synonym = random.choice(list_of_guess_words) if list_of_guess_words else None
    max_similarity = -1

    # Check if the query word is in the model's vocabulary
    if query not in word_vectors.key_to_index:
        print(f"'{query}' is not in the vocabulary.")
        # Return a random guess word if the query is not in the vocabulary
        return closest_synonym

    for guess_word in list_of_guess_words:
        # Check if the guess word is in the model's vocabulary
        if guess_word in word_vectors.key_to_index:
            try:
                sim_score = model.similarity(query, guess_word)
                if sim_score > max_similarity:
                    closest_synonym = guess_word
                    max_similarity = sim_score
            except KeyError:
                # Handle the error if the word is not in the model's vocabulary
                continue
        else:
            print(f"'{guess_word}' is not in the vocabulary.")
            # Return a random guess word if the query is not in the vocabulary
            return closest_synonym

    return closest_synonym

# Function to proSV file and apply the closest_synonym function
def process_csv_word_vectors(file_path, model_name, model):
    
    word_vectors = model.wv
    # Size of the Vocabulary
    vocab_size = len(word_vectors.key_to_index)

    question_words = []
    answer_words = []
    guess_words = []
    labels = []
    C = 0
    V = 0
    
    # Read the CSV file into a Pandas DataFrame, skipping the first row
    synonym_test_dataset = pd.read_csv(file_path)

    # Process each row in the DataFrame
    for index, row in synonym_test_dataset.iterrows():
        # Split the row into words based on comma
        words = row.to_list()
        #print(words)

        # Store the first word in 'query' and the rest in 'list_of_guess_words'
        query = words[0]
        #print(query)
        answer = words[1]
        #print(answer)
        list_of_guess_words = words[2:]
        #print(list_of_guess_words)

        # Call the 'closest_synonym' function and store the result
        result = closest_synonym_word_vectors(query, list_of_guess_words, model)
        
        question_words.append(query)
        answer_words.append(answer)
        guess_words.append(result)
        label = assign_label_word_vectors(query, answer, result, list_of_guess_words, model)
        labels.append(label)
        
        if label == 'correct':
            C += 1
        if label != 'guess':
            V += 1
    
    if V == 0:
        accuracy = 0
    else: 
        accuracy = C/V
    
    results_df = pd.DataFrame({'question_word': question_words, 'answer_word': answer_words, 'guess_word': guess_words, 'label': labels})    
    results_df.to_csv(f"{model_name}-details.csv", index=False)
    
    analysis_df = pd.DataFrame({'model_name': [model_name], 'vocab_size': [vocab_size], 'C': [C], 'V': [V], 'accuracy': accuracy})    
    analysis_df.to_csv('analysis.csv', mode='a', index=False, header=not pd.io.common.file_exists('analysis.csv'))
    
    print(results_df)
    


In [6]:
file_path = 'A2-DataSet/synonym.csv' 

w_3_e_25_model_name = "Word2Vec_w_3_e_25"
w_3_e_500_model_name = "Word2Vec_w_3_e_500"
w_7_e_25_model_name = "Word2Vec_w_7_e_25"
w_7_e_500_model_name = "Word2Vec_w_7_e_500"

w_3_e_25_model = gensim.models.Word2Vec.load(f"{w_3_e_25_model_name}.model")
w_3_e_500_model = gensim.models.Word2Vec.load(f"{w_3_e_500_model_name}.model")
w_7_e_25_model = gensim.models.Word2Vec.load(f"{w_7_e_25_model_name}.model")
w_7_e_500_model = gensim.models.Word2Vec.load(f"{w_7_e_500_model_name}.model")

for word in w_7_e_500_model.wv.key_to_index:
    print(word)


process_csv_word_vectors(file_path, w_3_e_25_model_name, w_3_e_25_model)
process_csv_word_vectors(file_path, w_3_e_500_model_name, w_3_e_500_model)
process_csv_word_vectors(file_path, w_7_e_25_model_name, w_7_e_25_model)
process_csv_word_vectors(file_path, w_7_e_500_model_name, w_7_e_500_model)

 
e
t
o
a
i
n
s
h
r
l
d
u
m
w
y
c
f
g
b
p
v
k
j
q
x
z
æ
ç
é
è
à
œ
'enormously' is not in the vocabulary.
'provisions' is not in the vocabulary.
'haphazardly' is not in the vocabulary.
'prominent' is not in the vocabulary.
'zenith' is not in the vocabulary.
'flawed' is not in the vocabulary.
'urgently' is not in the vocabulary.
'consumed' is not in the vocabulary.
'advent' is not in the vocabulary.
'concisely' is not in the vocabulary.
'salutes' is not in the vocabulary.
'solitary' is not in the vocabulary.
'hasten' is not in the vocabulary.
'perseverance' is not in the vocabulary.
'fanciful' is not in the vocabulary.
'showed' is not in the vocabulary.
'constantly' is not in the vocabulary.
'issues' is not in the vocabulary.
'furnish' is not in the vocabulary.
'costly' is not in the vocabulary.
'recognized' is not in the vocabulary.
'spot' is not in the vocabulary.
'make' is not in the vocabulary.
'often' is not in the vocabulary.
'easygoing' is not in the vocabulary.
'debate' is not in