In [None]:
# Every URL is quoted: an unquoted `&` makes the shell run wget in the background
# and cut the URL short (the trailing `dl=...` parameter is never sent), and an
# unquoted `?` is open to glob expansion.
!wget -O wikitext-filtered-full.zip "https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1"
!wget -O wikitext-filtered-10k.zip "https://www.dropbox.com/scl/fi/ek174r3sg7qjx0aa9atop/wikitext-filtered-10k.zip?rlkey=zy6jqxv6qsc16lr9qm3ki9uhf&dl=1"
# dl=1 (instead of the original dl=0) so Dropbox returns the CSV itself rather
# than the HTML preview page now that the parameter is actually transmitted.
!wget -O visa-outlier-clusters.csv "https://www.dropbox.com/scl/fi/duwtwt64uzv504if590sf/visa_outliers_US.csv?rlkey=6nictvov5pjaeue3yatz7m83k&dl=1"
!wget -O wordsim353.zip "https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip"

In [None]:
# -o overwrites already-extracted files without asking, so re-running this cell
# does not stall on unzip's interactive "replace?" prompt; -q keeps the per-file
# listing out of the notebook output.
!unzip -o -q wikitext-filtered-full.zip
!unzip -o -q wikitext-filtered-10k.zip
!unzip -o -q wordsim353.zip

In [None]:
!pip install datasets
import ast
import os
import pandas as pd
import numpy as np
import nltk
import string
import gensim.downloader as api
from nltk.corpus import stopwords
from collections.abc import Mapping
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
from scipy.stats import spearmanr
from scipy.spatial import distance
from nltk.corpus import stopwords as nltk_stopwords
from datasets import load_dataset, Dataset


In [None]:
def load_dataset(small_path="wikitext-filtered-10k", large_path="wikitext-filtered-full"):
  """Load the small and the large WikiText datasets from disk.

  NOTE(review): this definition rebinds the name `load_dataset` that the import
  cell also brings in from `datasets`; after this cell runs, the library
  function is no longer reachable under that name. The name is kept because
  the notebook calls it below.

  Args:
    small_path: directory of the saved small dataset (default: the folder
      extracted from wikitext-filtered-10k.zip).
    large_path: directory of the saved full dataset (default: the folder
      extracted from wikitext-filtered-full.zip).

  Returns:
    Tuple `(dataset_small, dataset_large)` as returned by
    `Dataset.load_from_disk` for each path.
  """
  dataset_small = Dataset.load_from_disk(small_path)
  dataset_large = Dataset.load_from_disk(large_path)
  return dataset_small, dataset_large

wikitext_small, wikitext_large = load_dataset()


wikitext_small: 10000 docs, wikitext_large: 859955 docs


In [None]:
# Fetch the NLTK resources used by this notebook: the stopword lists and the
# 'punkt' tokenizer data (presumably what nltk.word_tokenize relies on).
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

# Kept as a set because the preprocessing functions test membership per token.
stopwords_set = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def Preprocess(text):
  """Tokenize `text`, lowercase every token and drop English stopwords.

  Tokens come from `nltk.word_tokenize`; a token is discarded when its
  lowercase form is in `stopwords_set`. Nothing else is filtered, so
  punctuation tokens produced by the tokenizer are kept.

  Returns:
    List of lowercase token strings in their original order.
  """
  lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
  return [token for token in lowered_tokens if token not in stopwords_set]


In [None]:
# Run Preprocess over the 'text' column of each corpus; every document becomes
# one list of tokens, which is the sentence format handed to Word2Vec below.
preprocessed_wikitext_small = list(map(Preprocess, wikitext_small['text']))
preprocessed_wikitext_large = list(map(Preprocess, wikitext_large['text']))

In [None]:
# Word2Vec trained on the preprocessed small corpus. Only the five settings
# below are specified; everything else is left at gensim's defaults.
modelS = Word2Vec(
    sentences=preprocessed_wikitext_small,
    vector_size=50,
    window=5,
    min_count=5,
    epochs=5,
)

In [None]:
# Word2Vec trained on the preprocessed full corpus, with the same settings as
# the small model so that the two differ only in their training data.
modelL = Word2Vec(
    sentences=preprocessed_wikitext_large,
    vector_size=50,
    window=5,
    min_count=5,
    epochs=5,
)

In [None]:
# Persist both trained models, then read them straight back from disk so the
# exact same weights can be reused for later measurements.
# NOTE(review): the cells that follow keep using modelS / modelL; the loaded_*
# copies are not referenced in the visible part of this notebook.
modelS.save("/content/modelS")
modelL.save("/content/modelL")

loaded_modelS = Word2Vec.load("/content/modelS")
loaded_modelL = Word2Vec.load("/content/modelL")


Step 3 Method

In [None]:
# Load the word-pair table. The cells below read its 'Word 1', 'Word 2' and
# 'Human (mean)' columns. (Presumably this is the combined.csv extracted from
# wordsim353.zip — confirm.)
combined = pd.read_csv("combined.csv")

# One (word1, word2) tuple per row. The earlier `combined.iloc[:, 0:2]`
# assignment was dropped: it was overwritten immediately and never read.
pairs_wiki = list(zip(combined['Word 1'], combined['Word 2']))

In [None]:
def compute_cosine(model, dataframe):
  """Return one cosine similarity per row of `dataframe`.

  Args:
    model: either a trained Word2Vec model (its `.wv` attribute is used) or an
      object that exposes `key_to_index` and `similarity(w1, w2)` directly.
    dataframe: DataFrame with 'Word 1' and 'Word 2' columns. No other column
      is required.

  Returns:
    List of similarity scores in row order. A pair gets 0.0 when either word
    cannot be found in the vocabulary.

  A word is looked up with its original spelling first and, failing that, in
  lowercase. The wiki models in this notebook are trained on lowercased
  tokens, so capitalised benchmark words would otherwise always be scored 0.0
  as out-of-vocabulary.
  """
  # Word2Vec keeps its vectors under .wv; keyed-vector objects are used as-is.
  word_vectors = model.wv if hasattr(model, 'wv') else model
  vocab = word_vectors.key_to_index

  def vocab_form(word):
    # Spelling under which `word` exists in the vocabulary, or None if absent.
    if word in vocab:
      return word
    lowered = word.lower() if isinstance(word, str) else word
    return lowered if lowered in vocab else None

  cosine_scores = []
  for word1, word2 in zip(dataframe['Word 1'], dataframe['Word 2']):
    key1, key2 = vocab_form(word1), vocab_form(word2)
    if key1 is None or key2 is None:
      # Out-of-vocabulary pairs are scored 0.0 so every row still gets a value.
      cosine_scores.append(0.0)
    else:
      cosine_scores.append(word_vectors.similarity(key1, key2))
  return cosine_scores
# Score every word pair in `combined` with the small and the large model.
small_similarity_scores = compute_cosine(modelS, combined)
large_similarity_scores = compute_cosine(modelL, combined)

# Attach the scores to `combined` as two extra columns (modifies it in place).
for column_name, column_scores in (
    ('Similarity (Small)', small_similarity_scores),
    ('Similarity (Large)', large_similarity_scores),
):
    combined[column_name] = column_scores

# Write the augmented table to a new CSV file; index=False leaves the row
# index out of the file.
combined.to_csv('combined_no_index.csv', index = False)


Step 3 Answers

In [None]:
example_pairs = [('plane', 'car'), ('planet', 'sun'), ('cup', 'article'), ('sugar', 'approach')]

# For each example pair, select its row in `combined` and report the similarity
# each model assigned to it. The first matching row is used; a pair that is
# not present in `combined` raises IndexError.
for word1, word2 in example_pairs:
  is_pair = (combined['Word 1'] == word1) & (combined['Word 2'] == word2)
  small_score = combined.loc[is_pair, 'Similarity (Small)'].values[0]
  large_score = combined.loc[is_pair, 'Similarity (Large)'].values[0]

  print(f"Cosine Similarity using the Small Model for {word1} and {word2} is {small_score:.4f}")
  print(f"Cosine Similarity using the Large Model for {word1} and {word2} is {large_score:.4f}")


Cosine Similarity using the Small Model for plane and car is 0.9207
Cosine Similarity using the Large Model for plane and car is 0.6393
Cosine Similarity using the Small Model for planet and sun is 0.9564
Cosine Similarity using the Large Model for planet and sun is 0.6501
Cosine Similarity using the Small Model for cup and article is 0.4876
Cosine Similarity using the Large Model for cup and article is 0.0783
Cosine Similarity using the Small Model for sugar and approach is 0.9369
Cosine Similarity using the Large Model for sugar and approach is -0.2205


Step 4 Method

In [None]:
# Human judgements and the two model-score columns, pulled from `combined`.
human_scores = combined["Human (mean)"]
small_cosine_scores = combined["Similarity (Small)"]
large_cosine_scores = combined["Similarity (Large)"]

# spearmanr returns (correlation, p-value); the p-values are stored but only
# the correlations are printed in the answers cell.
spearman_small, p_value_s = stats.spearmanr(small_cosine_scores, human_scores)
spearman_large, p_value_l = stats.spearmanr(large_cosine_scores, human_scores)

Step 4 Answers

In [None]:
# Report both rank correlations, one per line.
print(
    f"Spearman (small model) is {spearman_small:.4f}",
    f"Spearman (large model) is {spearman_large:.4f}",
    sep="\n",
)

Spearman (small model) is -0.0704
Spearman (large model) is 0.5315


Step 5 Method

In [None]:
def Preprocess2(concept_list):
  """Lowercase the entries of an already-split list and drop stopwords.

  Unlike `Preprocess`, no tokenizer is run: `concept_list` is iterated as-is
  and each entry is treated as a single item.

  Returns:
    List of lowercase entries whose lowercase form is not in `stopwords_set`,
    in their original order.
  """
  kept = []
  for concept in concept_list:
    lowered = concept.lower()
    if lowered not in stopwords_set:
      kept.append(lowered)
  return kept


In [None]:

outlier_clusters_df = pd.read_csv('/content/visa-outlier-clusters.csv')

# ast.literal_eval parses the stored text of each cell back into a Python
# object (the code below treats both columns as lists). Concepts are then
# lowercased and stopword-filtered by Preprocess2.
# NOTE(review): Preprocess2 can remove entries from 'concepts' while 'labels'
# keeps its full length, which would shift concept positions relative to the
# labels — confirm that no concept is a stopword.
parsed_concepts = outlier_clusters_df['concepts'].apply(ast.literal_eval)
outlier_clusters_df['concepts'] = parsed_concepts.apply(Preprocess2)
outlier_clusters_df['labels'] = outlier_clusters_df['labels'].apply(ast.literal_eval)

def compactness_score_without_word(cluster, word_to_exclude, model):
    """Average pairwise similarity of `cluster` after removing one word.

    Args:
        cluster: list of words.
        word_to_exclude: word to leave out; every occurrence equal to it is
            removed before scoring.
        model: either a trained Word2Vec model (its `.wv` attribute is used)
            or an object exposing `key_to_index` and `similarity(w1, w2)`.

    Returns:
        Sum of similarities over all ordered pairs of distinct positions whose
        two words are both in the vocabulary, divided by n * (n - 1), where n
        is the number of remaining words. Pairs with an out-of-vocabulary word
        add nothing to the sum but are still counted in the divisor. Returns
        0.0 when fewer than two words remain, since there is no pair to score
        (previously this case raised ZeroDivisionError).
    """
    # Word2Vec keeps its vectors under .wv; keyed-vector objects are used as-is.
    word_vectors = model.wv if hasattr(model, 'wv') else model
    vocab = word_vectors.key_to_index

    remaining = [word for word in cluster if word != word_to_exclude]
    n = len(remaining)
    if n < 2:
        return 0.0

    total_sim = 0
    for i, word1 in enumerate(remaining):
        if word1 not in vocab:
            continue
        for j, word2 in enumerate(remaining):
            # i != j skips comparing a position with itself.
            if i != j and word2 in vocab:
                total_sim += word_vectors.similarity(word1, word2)

    return total_sim / (n * (n - 1))
# Gold outlier positions and each model's predicted positions, one entry per
# cluster. The two true-label lists receive identical values; both names are
# kept because the accuracy lines at the end read them separately.
true_labels_small = []
true_labels_large = []
predicted_outlier_indices_small = []
predicted_outlier_indices_large = []

for _, cluster_row in outlier_clusters_df.iterrows():
    concept_list = cluster_row['concepts']
    binary_labels = cluster_row['labels']

    # The position of the first 1 in the labels is taken as the true outlier.
    gold_index = binary_labels.index(1)
    true_labels_small.append(gold_index)
    true_labels_large.append(gold_index)

    # Running best (score, position) per model; -1 means nothing chosen yet.
    highest_score_small, outlier_index_small = -1, -1
    highest_score_large, outlier_index_large = -1, -1

    # Leave each concept out in turn: the concept whose removal gives the
    # highest compactness score is that model's predicted outlier.
    for position, candidate in enumerate(concept_list):
        score_small = compactness_score_without_word(concept_list, candidate, modelS)
        score_large = compactness_score_without_word(concept_list, candidate, modelL)

        # Strict '>' means the earliest position wins when scores tie.
        if score_small > highest_score_small:
            highest_score_small, outlier_index_small = score_small, position
        if score_large > highest_score_large:
            highest_score_large, outlier_index_large = score_large, position

    predicted_outlier_indices_small.append(outlier_index_small)
    predicted_outlier_indices_large.append(outlier_index_large)

# Accuracy: fraction of clusters whose predicted position equals the gold one.
accuracy_small = np.mean(np.array(true_labels_small) == np.array(predicted_outlier_indices_small))
accuracy_large = np.mean(np.array(true_labels_large) == np.array(predicted_outlier_indices_large))


Step 5 Answers

In [None]:
# Report the outlier-detection accuracy of both wiki-trained models.
print(
    f"Accuracy Score (Small Model) for Outlier Detection Task: {accuracy_small:.4f}",
    f"Accuracy Score (Large Model) for Outlier Detection Task: {accuracy_large:.4f}",
    sep="\n",
)

Accuracy Score (Small Model) for Outlier Detection Task: 0.1000
Accuracy Score (Large Model) for Outlier Detection Task: 0.4250


Step 6 Method

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors by name through
# gensim's downloader module (imported as `api`).
# NOTE(review): the Step 6 code calls key_to_index / similarity / most_similar
# directly on this object (no `.wv`), unlike the Word2Vec models trained above.
modelG = api.load('word2vec-google-news-300')

In [None]:
# Score the same word pairs with the pretrained model and store the scores as
# an additional column of `combined`.
modelG_similarity_scores = compute_cosine(modelG, combined)
combined['Similarity (ModelG)'] = modelG_similarity_scores

# Rank correlation between the pretrained model's scores and the human means;
# spearmanr returns (correlation, p-value).
human_scores = combined["Human (mean)"]
spearman_modelG, p_value_G = stats.spearmanr(modelG_similarity_scores, human_scores)


In [None]:
# This cell re-declares helpers that already exist earlier in the notebook so
# that it can be run on its own; this Preprocess2 behaves exactly like the
# earlier definition and replaces it once the cell has run.
def Preprocess2(concept_list):
    """Lowercase the entries of `concept_list` and drop those that are stopwords.

    Returns:
        List of lowercase entries whose lowercase form is not in
        `stopwords_set`, in their original order.
    """
    kept = []
    for concept in concept_list:
        lowered = concept.lower()
        if lowered not in stopwords_set:
            kept.append(lowered)
    return kept

# Read the CSV again so the columns hold the raw text from the file: the
# earlier cell replaced them with parsed lists, and ast.literal_eval below
# expects the unparsed strings.
outlier_clusters_df = pd.read_csv('/content/visa-outlier-clusters.csv')

# Parse both columns back into Python objects; concepts are additionally
# lowercased and stopword-filtered by Preprocess2.
parsed_concepts = outlier_clusters_df['concepts'].apply(ast.literal_eval)
outlier_clusters_df['concepts'] = parsed_concepts.apply(Preprocess2)
outlier_clusters_df['labels'] = outlier_clusters_df['labels'].apply(ast.literal_eval)

# Redefinition of the compactness helper for the Step 6 cell. It replaces the
# earlier definition once this cell has run, so it accepts the same kinds of
# model: a Word2Vec model (via `.wv`) as well as a keyed-vector object.
def compactness_score_without_word(cluster, word_to_exclude, model):
    """Average pairwise similarity of `cluster` after removing one word.

    Args:
        cluster: list of words.
        word_to_exclude: word to leave out; every occurrence equal to it is
            removed before scoring.
        model: either a trained Word2Vec model (its `.wv` attribute is used)
            or an object exposing `key_to_index` and `similarity(w1, w2)`.

    Returns:
        Sum of similarities over all ordered pairs of distinct positions whose
        two words are both in the vocabulary, divided by n * (n - 1), where n
        is the number of remaining words. Returns 0.0 when fewer than two
        words remain.
    """
    # Previously `model.key_to_index` was read directly, which fails for a
    # Word2Vec model; resolve the keyed vectors first.
    word_vectors = model.wv if hasattr(model, 'wv') else model
    vocab = word_vectors.key_to_index

    cluster_without_word = [word for word in cluster if word != word_to_exclude]
    n = len(cluster_without_word)
    if n <= 1:
        return 0.0

    total_sim = 0
    for i, word1 in enumerate(cluster_without_word):
        if word1 not in vocab:
            continue
        for j, word2 in enumerate(cluster_without_word):
            # i != j skips comparing a position with itself.
            if i != j and word2 in vocab:
                total_sim += word_vectors.similarity(word1, word2)

    return total_sim / (n * (n - 1))

# Gold outlier positions and the pretrained model's predictions, one entry per
# cluster.
true_labels_extra_large = []
predicted_outlier_indices_extra_large = []

for _, cluster_row in outlier_clusters_df.iterrows():
    concept_list = cluster_row['concepts']
    binary_labels = cluster_row['labels']

    # The position of the first 1 in the labels is taken as the true outlier.
    true_labels_extra_large.append(binary_labels.index(1))

    # Running best (score, position); -1 means nothing chosen yet.
    highest_score_extra_large, outlier_index_extra_large = -1, -1

    # Leave each concept out in turn: the concept whose removal gives the
    # highest compactness score is the predicted outlier.
    for position, candidate in enumerate(concept_list):
        candidate_score = compactness_score_without_word(concept_list, candidate, modelG)

        # Strict '>' means the earliest position wins when scores tie.
        if candidate_score > highest_score_extra_large:
            highest_score_extra_large, outlier_index_extra_large = candidate_score, position

    predicted_outlier_indices_extra_large.append(outlier_index_extra_large)

# Accuracy: fraction of clusters whose predicted position equals the gold one.
accuracy_extra_large = np.mean(np.array(true_labels_extra_large) == np.array(predicted_outlier_indices_extra_large))



Step 6 Answers

In [None]:
# Report the pretrained model's rank correlation and outlier-detection accuracy.
print(
    f"Spearman (modelG) is {spearman_modelG:.4f}",
    f"Accuracy for the Extra Large Model: {accuracy_extra_large:.4f}",
    sep="\n",
)

Spearman (modelG) is 0.7000
Accuracy for the Extra Large Model: 0.4500


Step 8

In [None]:
def find_analogy(word1, word2, word3, wv, topn=5):
    """Complete the analogy "word1 is to word2 as word3 is to ?".

    The candidates are the words closest to `word2 - word1 + word3`, which
    `most_similar` expresses as positive=[word2, word3], negative=[word1].
    (The previous version passed positive=[word3, word1], negative=[word2],
    i.e. `word3 + word1 - word2`, which answers the reversed relation.)

    Args:
        word1, word2: the pair that defines the relation.
        word3: the word to apply the relation to.
        wv: object exposing `most_similar(positive=..., negative=..., topn=...)`
            that returns (word, score) pairs.
        topn: number of candidates requested from `most_similar`.

    Returns:
        Candidate words in the order returned by `most_similar`, with the
        three query words removed; the list can therefore be shorter than
        `topn`.
    """
    candidates = wv.most_similar(positive=[word2, word3], negative=[word1], topn=topn)
    query_words = {word1, word2, word3}
    return [word for word, _score in candidates if word not in query_words]

In [None]:
# (word1, word2, word3) triples, each read as "word1 is to word2 as word3 is to ?".
analogies_examples = [
    ("man", "woman", "king"),
    ("Athens", "Greece", "Rome"),
    ("reading", "read", "playing"),
    ("Greece", "souvlaki", "Italy"),
    ("airplane", "propeller", "car"),
    ("man", "woman", "computer_programmer"),
    ("man", "woman", "superstar"),
    ("man", "woman", "guitarist"),
    ("man", "woman", "boss"),
]

# Map each analogy prompt to the candidate answers found with the pretrained model.
results = {
    f"{w1} is to {w2} as {w3} is to": find_analogy(w1,w2,w3,modelG)
    for w1, w2, w3 in analogies_examples
}

In [None]:
# One line per analogy: the prompt followed by its list of candidate answers.
for analogy in results:
    answer = results[analogy]
    print(f"{analogy} {answer}")