In [None]:
import gensim.downloader as api
# Fetch the 'word2vec-google-news-300' model through gensim's downloader API.
# The helper functions in this notebook read `w2v_model` as a module-level global.
# NOTE(review): presumably the first call downloads the model and can be slow — confirm.
w2v_model = api.load('word2vec-google-news-300')

In [None]:
from gensim.similarities import Similarity

In [None]:
import numpy as np
import pandas as pd
import random

In [None]:
# Load the synonym test dataset and take a quick look at its contents.
synonym_test_dataset = pd.read_csv('A2-DataSet/synonym.csv')

print(synonym_test_dataset.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
synonym_test_dataset.info()

# Task 1

In [None]:
def assign_label(question_word, correct_answer, closest_synonym, guess_words):
    """Classify the outcome of one synonym question.

    Args:
        question_word: The word whose synonym is being looked for.
        correct_answer: The expected synonym.
        closest_synonym: The word chosen by the model (may be None).
        guess_words: The candidate words offered for this question.

    Returns:
        "guess" when the question word, or every candidate, is missing from
        the model vocabulary; otherwise "correct" if the chosen word equals
        the expected answer, else "wrong".
    """
    vocabulary = w2v_model.key_to_index

    # Without the question word no similarity could have been computed.
    if question_word not in vocabulary:
        return "guess"

    # Likewise when none of the candidates is known to the model.
    if not any(candidate in vocabulary for candidate in guess_words):
        return "guess"

    if closest_synonym is None:
        return "wrong"

    return "correct" if closest_synonym == correct_answer else "wrong"

In [None]:
def closest_synonym(query, list_of_guess_words):
    """Return the guess word most similar to ``query`` according to ``w2v_model``.

    Guess words that are missing from the model vocabulary are skipped, so the
    remaining candidates are still compared. A random guess word is returned
    only when no similarity can be computed at all, i.e. when the query is
    out of vocabulary or none of the guess words is in the vocabulary.

    Args:
        query: The question word.
        list_of_guess_words: Candidate synonyms to rank against ``query``.

    Returns:
        The candidate with the highest similarity to ``query``, a randomly
        chosen candidate when nothing could be scored, or None when
        ``list_of_guess_words`` is empty.
    """
    # Drawn up front (exactly once per call) so seeded runs stay reproducible.
    fallback = random.choice(list_of_guess_words) if list_of_guess_words else None

    # Without a vector for the query there is nothing to compare against.
    if query not in w2v_model.key_to_index:
        print(f"'{query}' is not in the vocabulary.")
        return fallback

    best_word = None
    # -inf rather than -1 so that any computable similarity beats the sentinel.
    best_score = float("-inf")

    for guess_word in list_of_guess_words:
        if guess_word not in w2v_model.key_to_index:
            print(f"'{guess_word}' is not in the vocabulary.")
            # Skip only this candidate; later ones may still be scorable.
            continue
        try:
            sim_score = w2v_model.similarity(query, guess_word)
        except KeyError:
            # Defensive: treat a lookup failure like an out-of-vocabulary word.
            continue
        if sim_score > best_score:
            best_word = guess_word
            best_score = sim_score

    return best_word if best_word is not None else fallback








# Function to process the CSV file and apply the closest_synonym function to each row
def process_csv(file_path, output_path='word2vec-google-news-300-details.csv'):
    """Run the synonym test over a CSV file and save the per-question details.

    Each data row is read as: question word, correct answer, then the guess
    words. For every row the closest synonym is selected with
    ``closest_synonym`` and labelled with ``assign_label``.

    Args:
        file_path: Path of the CSV file holding the synonym questions.
        output_path: Where the details CSV is written. Defaults to the
            file name originally hard-coded in this function.

    Returns:
        A DataFrame with the columns 'question_word', 'answer_word',
        'guess_word' and 'label', one row per question.
    """
    # read_csv consumes the first line as the header, so only data rows follow.
    synonym_test_dataset = pd.read_csv(file_path)

    records = []
    for row in synonym_test_dataset.itertuples(index=False, name=None):
        words = list(row)

        # First column is the question, second the answer, the rest are guesses.
        query = words[0]
        answer = words[1]
        list_of_guess_words = words[2:]

        result = closest_synonym(query, list_of_guess_words)

        records.append({
            'question_word': query,
            'answer_word': answer,
            'guess_word': result,
            'label': assign_label(query, answer, result, list_of_guess_words),
        })

    # Explicit columns keep the schema stable even when the input has no rows.
    results_df = pd.DataFrame(
        records,
        columns=['question_word', 'answer_word', 'guess_word', 'label'],
    )
    results_df.to_csv(output_path, index=False)

    print(results_df)

    # Hand the details back so callers can work with them directly.
    return results_df

# Run the synonym test on the dataset; process_csv writes the per-question
# details to a CSV file and prints them.
file_path = 'A2-DataSet/synonym.csv' 
processed_results = process_csv(file_path)

    