# CSC_583
# Hithesh Shanmugam
# HW 5

## Example usage

In [1]:
import numpy as np
from scipy.spatial.distance import cosine

def load_glove_embeddings(file_path, dimension):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is split on whitespace; the first token is taken as
    the word and the remaining tokens as its vector components.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        dimension: Number of leading vector components to keep for each word.

    Returns:
        dict mapping each word to a float32 NumPy array holding the kept
        components. If a word occurs more than once, the last line wins.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank or whitespace-only line: there is no word token, so
                # indexing values[0] would raise IndexError.
                continue
            # Slice the tokens *before* converting so only the requested
            # components are parsed and the stored array is not a view that
            # keeps a full-width array alive.
            components = values[1:][:dimension]
            embeddings[values[0]] = np.asarray(components, dtype='float32')
    return embeddings

def load_glove_embeddings_by_dimension(directory_path, dimensions):
    """Load one GloVe table per requested vector size.

    For each entry ``d`` of ``dimensions`` the file
    ``<directory_path>/glove.6B.<d>d.txt`` is read with
    ``load_glove_embeddings``.

    Args:
        directory_path: Folder that holds the ``glove.6B.*d.txt`` files.
        dimensions: Iterable of vector sizes to load.

    Returns:
        dict mapping each vector size to the word -> vector dict loaded for it.
    """
    return {
        size: load_glove_embeddings(f"{directory_path}/glove.6B.{size}d.txt", size)
        for size in dimensions
    }

# Location of the unpacked glove.6B files and the vector sizes to load.
glove_directory = 'C:/Users/sures/OneDrive - DePaul University/Desktop/glove.6B'
desired_dimensions = [50, 100, 200, 300]

# Build the {dimension: {word: vector}} lookup for every requested size.
glove_embeddings = load_glove_embeddings_by_dimension(glove_directory, desired_dimensions)

# Example usage: look up a single word in the 100-dimensional table.
word = 'example'
desired_dimension = 100
embedding = glove_embeddings[desired_dimension].get(word)
if embedding is None:
    # .get() yields None when the word is absent from the table.
    print(f"No embedding found for '{word}' in {desired_dimension}d.")
else:
    print(f"Embedding for '{word}' in {desired_dimension}d: {embedding}")

Embedding for 'example' in 100d: [-0.12617    0.61724    0.22581    0.39868    0.16111    0.1523
 -0.14715   -0.29447   -0.27348   -0.13753   -0.20898   -0.73436
  0.14144    0.15048    0.09179    0.018613   0.22539    0.15979
 -0.16935    0.42716    0.042284  -0.3477    -0.11413    0.12222
 -0.025027  -0.20805   -0.067264  -0.2956    -0.30807   -0.32903
  0.19059    0.77141   -0.19332   -0.31069    0.26745    0.32231
  0.2065     0.10497    0.49425   -0.38322   -0.12802   -0.069906
 -0.14828    0.085369  -0.18141    0.14688    0.60968   -0.21131
 -0.29148   -0.52773    0.59508    0.017369   0.15342    0.81925
 -0.20643   -2.0378    -0.11884   -0.16826    1.5288     0.15756
 -0.4994     0.39305    0.12672   -0.10968    1.3671    -0.21006
  0.15684    0.0063801  0.43836   -0.18765   -0.29088    0.18619
  0.085402   0.13985    0.40794   -0.14811    0.26702   -0.19142
 -0.6189     0.0091217  0.34971   -0.24079   -0.52476   -0.25071
 -1.5681     0.22101    0.046796  -0.62616   -0.043358  -

## Part I: Vector Semantics

In [4]:
# Load GloVe embeddings
# NOTE(review): `embeddings_directory` is not read anywhere in the visible part
# of this cell; the loader call further down uses `glove_embeddings_directory`,
# which is assigned the same path.
embeddings_directory = 'C:/Users/sures/OneDrive - DePaul University/Desktop/glove.6B'

def load_glove_embeddings(file_path, dimension):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is split on whitespace; the first token is taken as
    the word and the next ``dimension`` tokens as its vector components.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        dimension: Number of vector components to read for each word.

    Returns:
        dict mapping each word to a float32 NumPy array. If a word occurs
        more than once, the last line wins.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank or whitespace-only line: there is no word token, so
                # indexing values[0] would raise IndexError.
                continue
            embeddings[values[0]] = np.asarray(values[1:dimension + 1], dtype='float32')
    return embeddings

def load_glove_embeddings_by_dimension(directory_path, dimensions):
    """Load a separate GloVe table for each requested vector size.

    Args:
        directory_path: Folder that holds the ``glove.6B.<d>d.txt`` files.
        dimensions: Iterable of vector sizes; each one selects the file
            ``glove.6B.<size>d.txt`` inside ``directory_path``.

    Returns:
        dict keyed by vector size whose values are the word -> vector dicts
        returned by ``load_glove_embeddings``.
    """
    tables = {}
    for size in dimensions:
        tables[size] = load_glove_embeddings(
            f"{directory_path}/glove.6B.{size}d.txt", size
        )
    return tables

# Folder searched for the glove.6B.<dimension>d.txt files (the file names are
# built inside load_glove_embeddings_by_dimension).
glove_embeddings_directory = 'C:/Users/sures/OneDrive - DePaul University/Desktop/glove.6B'
# Vector sizes to load; the loops further down iterate over this list.
desired_dimensions = [50, 100, 200, 300]

# Load the GloVe embeddings for each desired dimension
# The result is indexed first by dimension, then by word:
# glove_embeddings[dimension][word] -> vector.
glove_embeddings = load_glove_embeddings_by_dimension(glove_embeddings_directory, desired_dimensions)

# Function to calculate cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's ``cosine`` computes the cosine *distance*, so the similarity is
    obtained as one minus that distance.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        ``1 - cosine(vec1, vec2)``.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

# Function to find the most similar word to a given word
def find_most_similar_word(word, embeddings, exclude_variants=True):
    """Describe the vocabulary word closest to ``word`` by cosine similarity.

    The lookup is case-insensitive on ``word``. The query word itself is never
    a candidate.

    Args:
        word: Query word.
        embeddings: dict mapping words to their vectors.
        exclude_variants: When true, also skip every vocabulary word that
            starts with the query word (e.g. ``dogs`` for ``dog``).

    Returns:
        A sentence naming the most similar word and its similarity, or a
        message string when the query word has no embedding or no candidate
        word remains after filtering.
    """
    query = word.lower()  # hoisted: the original lowered it on every iteration
    target_vector = embeddings.get(query)
    if target_vector is None:
        return f"No embedding found for '{word}'."

    similarities = []
    for w, vector in embeddings.items():
        candidate = w.lower()
        if candidate == query:
            continue
        if exclude_variants and candidate.startswith(query):
            continue
        similarities.append((w, cosine_similarity(target_vector, vector)))

    # max() finds the single best entry without sorting the whole vocabulary;
    # like the stable descending sort it replaces, it keeps the first of any
    # tied entries. default=None covers the case where every word was
    # filtered out, which previously raised IndexError on similarities[0].
    best = max(similarities, key=lambda pair: pair[1], default=None)
    if best is None:
        return f"No similar word found for '{word}'."

    most_similar_word, similarity = best
    return f"The most similar word to '{word}' is '{most_similar_word}' with a cosine similarity of {similarity:.4f}."

# Words for finding most similar
words = ['dog', 'whale', 'before', 'however', 'fabricate']

# Find most similar words for each input word for each desired dimension
# Output layout per dimension: a "Dimension: N" header, a blank line, one
# result sentence per query word, then a trailing blank line.
for dimension in desired_dimensions:
    print(f"Dimension: {dimension}")
    # Word -> vector table for this vector size.
    embeddings = glove_embeddings[dimension]
    print()
    for word in words:
        # exclude_variants is left at its default (True), so words that start
        # with the query word are skipped.
        result = find_most_similar_word(word, embeddings)
        print(result)
    print()

# Function to perform analogy using vector addition and subtraction
def analogy(a, b, a_star, embeddings, top_n=3, exclude_inputs=False):
    """Rank candidate answers to the analogy ``a : b :: a_star : ?``.

    The target vector is ``b - a + a_star``; vocabulary words are ranked by
    their cosine similarity to it. All three input words are looked up in
    lower case.

    Args:
        a: First word of the known pair.
        b: Second word of the known pair.
        a_star: Word whose counterpart is sought.
        embeddings: dict mapping words to their vectors.
        top_n: Number of best candidates to return (default 3, the value that
            was previously hard-coded).
        exclude_inputs: When true, ``a``, ``b`` and ``a_star`` themselves are
            not eligible as answers. Defaults to False, which keeps the
            original behaviour of ranking every vocabulary word.

    Returns:
        A list of up to ``top_n`` ``(word, similarity)`` tuples, best first.
        If any input word has no embedding, a message string is returned
        instead of a list, so callers need to check the type.
    """
    a_vec = embeddings.get(a.lower())
    b_vec = embeddings.get(b.lower())
    a_star_vec = embeddings.get(a_star.lower())

    if a_vec is None or b_vec is None or a_star_vec is None:
        return "One or more words not found in embeddings."

    target_vec = b_vec - a_vec + a_star_vec
    skipped = {a.lower(), b.lower(), a_star.lower()} if exclude_inputs else set()

    similarities = [
        (w, cosine_similarity(target_vec, vector))
        for w, vector in embeddings.items()
        if w.lower() not in skipped
    ]
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    # Clamp so a negative top_n yields an empty list rather than a tail slice.
    return similarities[:max(top_n, 0)]

# Analogies
analogies = [('dog', 'puppy', 'cat'), ('speak', 'speaker', 'sing'), ('France', 'French', 'England'), ('France', 'wine', 'England')]

# Perform analogies and find top candidates for each desired dimension
for dimension in desired_dimensions:
    print(f"Dimension: {dimension}")
    embeddings = glove_embeddings[dimension]
    print()
    for analogy_words in analogies:
        a, b, a_star = analogy_words
        candidates = analogy(a, b, a_star, embeddings)
        print(f"{a} : {b} :: {a_star} : ?")
        if isinstance(candidates, str):
            # analogy() returns a message string (not a list) when one of the
            # words has no embedding; unpacking it as (word, similarity)
            # pairs would raise, so print the message as-is.
            print(candidates)
        else:
            for candidate, similarity in candidates:
                print(f"Candidate: {candidate} - Similarity: {similarity:.4f}")
        print()
    print()

Dimension: 50

The most similar word to 'dog' is 'cat' with a cosine similarity of 0.9218.
The most similar word to 'whale' is 'shark' with a cosine similarity of 0.8336.
The most similar word to 'before' is 'after' with a cosine similarity of 0.9512.
The most similar word to 'however' is 'although' with a cosine similarity of 0.9801.
The most similar word to 'fabricate' is 'fabricating' with a cosine similarity of 0.7595.

Dimension: 100

The most similar word to 'dog' is 'cat' with a cosine similarity of 0.8798.
The most similar word to 'whale' is 'shark' with a cosine similarity of 0.7840.
The most similar word to 'before' is 'after' with a cosine similarity of 0.9246.
The most similar word to 'however' is 'although' with a cosine similarity of 0.9658.
The most similar word to 'fabricate' is 'invent' with a cosine similarity of 0.7040.

Dimension: 200

The most similar word to 'dog' is 'cat' with a cosine similarity of 0.7445.
The most similar word to 'whale' is 'humpback' with a co