### COMP - 472
Assignment Two

AI GURUS: James Partsafas, Ghaith Chrit, Samuel Collette

---------

#### Task 1

1) Add imports

In [None]:
import os
import gc
import csv
import nltk 
import random
import pandas as pd
from urllib import request
from itertools import product
import matplotlib.pyplot as plt
import gensim.downloader as api
from gensim.models import KeyedVectors, Word2Vec

# Fetch the 'punkt' resource through nltk's downloader.
# NOTE(review): presumably the tokenizer data needed by nltk.sent_tokenize /
# nltk.word_tokenize in preprocess() -- confirm against the installed nltk version.
nltk.download('punkt')

2) Load model

In [None]:
# Identifier of the pretrained embedding; besides selecting what to load, it is
# reused in the details file name and in the 'Model Name' column of the analysis CSV.
model_name = 'word2vec-google-news-300'
# Load the vectors through gensim's downloader module (imported above as `api`).
model = api.load(model_name)

3) Load test dataset

In [None]:
# Synonym test set. create_details_file reads its 'question' and 'answer'
# columns and treats every column from the third onward as a candidate guess word.
dataset_path = 'A2-DataSet/synonym.csv'
df = pd.read_csv(dataset_path)

4) Setup variables

In [None]:
# Per-model CSV holding one row per question of the synonym test.
details_output_file = os.path.join('output', f'{model_name}-details.csv')
# CSV shared by every model: one summary row is written per evaluated model.
analysis_output_file = os.path.join('output', 'analysis.csv')

5) Report result

In [None]:
def create_details_file(details_output_file_name, df, model):
    """Run the synonym test for one model and write a per-question details CSV.

    For every row of ``df`` the model picks, among the candidate words it
    knows, the one most similar to the question word. When the question word
    or all of the candidates are missing from the model's vocabulary, a
    candidate is picked at random and the row is labelled ``'guess'``.

    Args:
        details_output_file_name: Path of the CSV file to (over)write. Its
            parent directory is created if it does not exist yet.
        df: DataFrame with 'question' and 'answer' columns; every column from
            the third one onward is treated as a candidate guess word.
        model: Object exposing ``key_to_index`` (vocabulary mapping) and
            ``most_similar_to_given(key, candidates)``, e.g. gensim KeyedVectors.

    Returns:
        Tuple ``(correct_labels, answered_questions)``. Rows labelled
        ``'guess'`` are counted in neither number.
    """
    # key_to_index is a dict, so each membership test is a hash lookup rather
    # than a linear scan over the whole vocabulary list (index_to_key).
    vocab = model.key_to_index

    # Make sure the target directory exists so opening the file cannot fail
    # just because this is the first run.
    output_dir = os.path.dirname(details_output_file_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    correct_labels = 0
    answered_questions = 0
    with open(details_output_file_name, mode='w', newline='') as details_file:
        details_writer = csv.writer(details_file)
        details_writer.writerow(['question-word', 'correct-answer', 'guess-word', 'label'])

        for _, row in df.iterrows():
            question_word, correct_answer = row['question'], row['answer']
            # Positional slice: everything after the question and answer columns.
            guess_words = row.iloc[2:].to_list()
            guess_words_in_vocab = [word for word in guess_words if word in vocab]

            if question_word in vocab and guess_words_in_vocab:
                most_similar_word = model.most_similar_to_given(question_word, guess_words_in_vocab)
                if most_similar_word == correct_answer:
                    label = 'correct'
                    correct_labels += 1
                else:
                    label = 'wrong'
                answered_questions += 1
            else:
                # The model cannot rank the candidates; fall back to a random pick.
                label = 'guess'
                most_similar_word = random.choice(guess_words)

            details_writer.writerow([question_word, correct_answer, most_similar_word, label])

    return (correct_labels, answered_questions)

def write_to_analysis_file(analysis_output_file, correct_labels, answered_questions, write_mode):
    """Write one summary row for the current model to the analysis CSV.

    The model's name and vocabulary size are taken from the module-level
    ``model_name`` and ``model`` variables, so both must be set before calling.

    Args:
        analysis_output_file: Path of the analysis CSV.
        correct_labels: Number of questions the model answered correctly.
        answered_questions: Number of questions the model answered without guessing.
        write_mode: Mode passed to ``open``; with ``'w'`` the header row is
            written before the summary row.
    """
    header = (
        'Model Name',
        'Vocab Length',
        'Number of Correct Labels',
        'Number of Answered Questions',
        'Accuracy of Answered Questions',
    )

    # Report 0 instead of dividing by zero when nothing was answered.
    if answered_questions > 0:
        accuracy = correct_labels / answered_questions
    else:
        accuracy = 0

    with open(analysis_output_file, mode=write_mode, newline='') as out_file:
        writer = csv.writer(out_file)
        if write_mode == 'w':
            writer.writerow(header)
        writer.writerow(
            (model_name, len(model.index_to_key), correct_labels, answered_questions, accuracy)
        )

6) Run Model

In [None]:
# Task 1: evaluate the loaded model on the synonym set; this writes the
# per-question details CSV and returns the two counters.
correct_labels, answered_questions = create_details_file(details_output_file, df, model)
# Mode 'w' starts a fresh analysis file and makes the function emit the header row first.
write_to_analysis_file(analysis_output_file, correct_labels, answered_questions, 'w')

In [None]:
# Drop the reference to the loaded model and trigger a garbage-collection pass,
# presumably to free its memory before further models are loaded.
model = None
gc.collect()

------------
#### Task 2

1) Run two models (Glove-Twitter and Glove-Wiki-Gigaword) with two embedding sizes (50 and 100) each

In [None]:
models = ['glove-twitter-50', 'glove-wiki-gigaword-50', 'glove-twitter-100', 'glove-wiki-gigaword-100']

# Evaluate each pretrained model in turn and append its summary row to the
# analysis CSV. The loop binds the module-level `model_name` directly and
# `model` is assigned right below, because write_to_analysis_file reads both
# of those module-level names.
for model_name in models:
    model = api.load(model_name)

    details_output_file = os.path.join('output', f'{model_name}-details.csv')
    correct_labels, answered_questions = create_details_file(details_output_file, df, model)
    # Mode 'a' appends to the analysis file instead of overwriting it.
    write_to_analysis_file(analysis_output_file, correct_labels, answered_questions, 'a')

    # Drop the reference to the finished model and run a garbage-collection
    # pass before loading the next one.
    model = None
    gc.collect()


2) Graph Result For Analysis

In [None]:
def create_accuracy_bar_chart(models, outputFile):
    """Save a bar chart comparing the accuracy of the selected models.

    Two reference bars come first: 'Baseline', fixed at 25, and 'Students',
    read from the 'Accuracy' column of the first row of
    'A2-DataSet/COMP-472-per-question.csv'. One bar per matching row of
    'output/analysis.csv' follows, showing its answered-question accuracy
    multiplied by 100.

    Args:
        models: Model names to include; matched against the 'Model Name'
            column of the analysis CSV.
        outputFile: Path the figure is saved to (300 dpi).
    """
    analysis = pd.read_csv('output/analysis.csv')
    selected = analysis.loc[analysis['Model Name'].isin(models)]

    students = pd.read_csv('A2-DataSet/COMP-472-per-question.csv', encoding='UTF-16 LE')
    # NOTE(review): assumes the first row holds the overall student average,
    # already expressed as a percentage -- confirm against the data file.
    students_first_row = students.loc[0]

    accuracy_data = {'Baseline': 25, 'Students': students_first_row['Accuracy']}
    model_accuracies = zip(selected['Model Name'], selected['Accuracy of Answered Questions'])
    for name, fraction in model_accuracies:
        # Stored as a fraction in the CSV; the chart shows percentages.
        accuracy_data[name] = fraction * 100

    fig, ax = plt.subplots()
    ax.bar(list(accuracy_data), list(accuracy_data.values()))
    ax.tick_params(axis='x', labelrotation=90, labelsize=6)
    fig.patch.set_facecolor('white')

    ax.set_xlabel('Models')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Accuracy of different models')

    fig.tight_layout()
    fig.savefig(outputFile, dpi=300)
    plt.close(fig)

In [None]:
create_accuracy_bar_chart(['word2vec-google-news-300', 'glove-twitter-50', 'glove-wiki-gigaword-50', 'glove-twitter-100', 'glove-wiki-gigaword-100'], 'output/task2_accuracy.png')

------------
#### Task 3

1) Preprocessing all the books

In [None]:
def preprocess(urls):
    """Download each text and split it into tokenized sentences.

    Args:
        urls: Iterable of URLs pointing at UTF-8 encoded plain-text files.

    Returns:
        One flat list covering all texts in the order of ``urls``; each entry
        is the list of tokens ``nltk.word_tokenize`` produced for one sentence
        found by ``nltk.sent_tokenize``.
    """
    all_tokens = []

    for url in urls:
        # The context manager closes the HTTP response even if reading fails.
        with request.urlopen(url) as response:
            # 'utf-8-sig' decodes like UTF-8 but also drops a leading
            # byte-order mark, which would otherwise stick to the first token.
            raw = response.read().decode('utf-8-sig')

        all_tokens.extend(
            nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(raw)
        )

    return all_tokens

2) Train a Word2Vec model

In [None]:
def train_model(tokens, window_size, embedding_size):
    """Train a Word2Vec model and save its word vectors to disk.

    Args:
        tokens: Tokenized sentences, passed to Word2Vec as ``sentences``.
        window_size: Value for Word2Vec's ``window`` parameter.
        embedding_size: Value for Word2Vec's ``vector_size`` parameter.

    Returns:
        The model name ``'custom-corpus-{embedding_size}-{window_size}'``; the
        vectors are saved as ``output/custom-models/<name>.w2v``.
    """
    name = f'custom-corpus-{embedding_size}-{window_size}'

    # Create the target directory up front so the save below cannot fail on a
    # fresh checkout where it does not exist yet.
    os.makedirs('output/custom-models', exist_ok=True)

    model = Word2Vec(sentences=tokens, window=window_size, vector_size=embedding_size)
    # Only the word vectors (model.wv) are saved, not the full training state.
    model.wv.save(f'output/custom-models/{name}.w2v')
    return name

3) Train models with different parameters

In [None]:
# Plain-text books that make up the custom training corpus.
urls = [
    "https://www.gutenberg.org/cache/epub/2600/pg2600.txt",   # war-and-peace
    "https://www.gutenberg.org/cache/epub/28054/pg28054.txt", # brothers-karamazov
    "https://www.gutenberg.org/cache/epub/2554/pg2554.txt",   # crime-and-punishment
    "https://www.gutenberg.org/cache/epub/7178/pg7178.txt",   # Swann's-Way
    "https://www.gutenberg.org/cache/epub/1399/pg1399.txt",   # Anna-Karenina
]

names = []
# Download and tokenize once; every configuration trains on the same corpus.
tokens = preprocess(urls)

window_sizes = [3, 5]
embedding_sizes = [50, 100]

# Train and evaluate one model per (window size, embedding size) combination.
for window_size, embedding_size in product(window_sizes, embedding_sizes):
    # `model_name` and `model` are assigned at module level because
    # write_to_analysis_file reads both of those module-level names.
    model_name = train_model(tokens, window_size, embedding_size)
    # Reload the word vectors that train_model just saved.
    model = KeyedVectors.load(f'output/custom-models/{model_name}.w2v')

    names.append(model_name)
    details_output_file = os.path.join('output', f'{model_name}-details.csv')
    correct_labels, answered_questions = create_details_file(details_output_file, df, model)
    # Mode 'a' appends to the analysis file instead of overwriting it.
    write_to_analysis_file(analysis_output_file, correct_labels, answered_questions, 'a')

    # Drop the reference to the finished model and run a garbage-collection
    # pass before training the next one.
    model = None
    gc.collect()

create_accuracy_bar_chart(names, 'output/task3_accuracy.png')
    