# Mini Project 1 - Part 1_Jens Jung: Word Embeddings in Colab


In [1]:
# libraries
!pip install sentence-transformers openai gdown numpy pandas matplotlib
import numpy as np
import numpy.linalg as la
import pickle
import os
import gdown
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from openai import OpenAI
import pandas as pd
# API Key
# Placeholder value only: substitute your own key at runtime and never commit a real key.
# Note that this assignment overwrites any OPENAI_API_KEY already present in the environment.
os.environ["OPENAI_API_KEY"] = "INSERT_YOUR_API_KEY_HERE"





# 2. Helper Functions


In [3]:
def get_model_id_gdrive(model_type):
    """
    Returns the Google Drive IDs for the specified GloVe model type.

    Args:
        model_type: One of "25d", "50d" or "100d".

    Returns:
        Tuple (word_index_id, embeddings_id) of Google Drive file IDs.

    Raises:
        ValueError: If model_type is not one of the supported sizes.
    """
    # Each entry is (word_index_id, embeddings_id).
    # NOTE(review): the 100d entry previously carried an "Ids swapped" remark;
    # the values are kept exactly as they were -- confirm they point at the right files.
    drive_ids = {
        "25d": ("13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8", "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2"),
        "50d": ("1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9", "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ"),
        "100d": ("1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq", "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp"),
    }

    # Fail with a clear message instead of an UnboundLocalError at the return.
    if model_type not in drive_ids:
        supported = ", ".join(drive_ids)
        raise ValueError(
            f"Unsupported GloVe model type {model_type!r}; expected one of: {supported}"
        )

    return drive_ids[model_type]
def download_glove_embeddings_gdrive(model_type):
    """
    Fetch the GloVe word-index pickle and embedding matrix from Google Drive.

    Each file is written to the current working directory and is downloaded
    only when a file with the expected name does not already exist there.

    Args:
        model_type: GloVe size label forwarded to get_model_id_gdrive.
    """
    word_index_id, embeddings_id = get_model_id_gdrive(model_type)
    stem = str(model_type) + "_temp"

    # (label used in the progress message, Drive file id, local file name)
    pending = (
        ("word index", word_index_id, "word_index_dict_" + stem + ".pkl"),
        ("embeddings", embeddings_id, "embeddings_" + stem + ".npy"),
    )

    for label, drive_id, local_path in pending:
        if os.path.exists(local_path):
            continue  # already on disk, nothing to fetch
        print(f"Downloading {label} for {model_type}...")
        gdown.download(id=drive_id, output=local_path, quiet=False)
def load_glove_embeddings_gdrive(model_type):
    """
    Loads the downloaded GloVe embeddings.

    Reads "word_index_dict_<model_type>_temp.pkl" and
    "embeddings_<model_type>_temp.npy" from the current working directory.

    Args:
        model_type: GloVe size label used to build the two file names.

    Returns:
        Tuple (word_index_dict, embeddings): the unpickled object and the
        array loaded with np.load.

    Raises:
        FileNotFoundError: If either file has not been downloaded yet.
    """
    word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
    embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"

    # Use a context manager so the file handle is closed deterministically.
    # SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(word_index_temp, "rb") as word_index_file:
        word_index_dict = pickle.load(word_index_file, encoding="latin")
    embeddings = np.load(embeddings_temp)

    return word_index_dict, embeddings
def get_openai_embeddings(sentence, model_name="text-embedding-3-small"):
    """
    Embed a piece of text with the OpenAI embeddings endpoint.

    Args:
        sentence: Text passed as the `input` of the embeddings request.
        model_name: Name of the OpenAI embedding model to use.

    Returns:
        NumPy array built from the first embedding in the response. If the
        request (or reading the response) raises, the error is printed and a
        zero vector is returned instead: 3072 entries for
        "text-embedding-3-large", 1536 entries for any other model name.
    """
    # The client is created per call, using the key stored in the environment.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    try:
        reply = client.embeddings.create(input=sentence, model=model_name)
        vector = np.array(reply.data[0].embedding)
    except Exception as err:
        print(f"Error getting OpenAI embeddings: {err}")
        if model_name == "text-embedding-3-large":
            fallback_dim = 3072
        else:
            fallback_dim = 1536
        return np.zeros(fallback_dim)
    return vector
# Initialize Sentence Transformer
# Module-level "all-MiniLM-L6-v2" instance, built once when this cell runs;
# get_sentence_transformer_embeddings reuses it whenever that model name is requested.
st_model_mini = SentenceTransformer("all-MiniLM-L6-v2")
def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
    """
    Get Sentence Transformer embeddings.

    Args:
        sentence: Text to encode.
        model_name: Sentence Transformer model name. The default model reuses
            the module-level st_model_mini instance; any other name loads a
            fresh SentenceTransformer for this call.

    Returns:
        The vector returned by the model's encode(). If loading or encoding
        raises, the error is printed and a 384-entry zero vector is returned.
    """
    try:
        if model_name == "all-MiniLM-L6-v2":
            return st_model_mini.encode(sentence)
        temp_model = SentenceTransformer(model_name)
        return temp_model.encode(sentence)
    except Exception as e:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate, and report the failure instead of
        # hiding it (same style as get_openai_embeddings).
        print(f"Error getting Sentence Transformer embeddings: {e}")
        # NOTE(review): the fallback width is fixed at 384 regardless of
        # model_name -- confirm this matches any non-default model used.
        return np.zeros(384)
def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
    """
    Look up the GloVe vector of one word (case-insensitive).

    Args:
        word: Word to look up; it is lowercased before the lookup.
        word_index_dict: Mapping from lowercase word to a row index of `embeddings`.
        embeddings: Indexable collection of word vectors.
        model_type: Size label such as "50d"; the text before the first "d"
            gives the length of the fallback vector.

    Returns:
        The stored vector for the word, or a zero vector when the word is
        not in `word_index_dict`.
    """
    key = word.lower()
    if key not in word_index_dict:
        # Unknown word: fall back to zeros of the model's dimensionality.
        return np.zeros(int(model_type.split("d")[0]))
    return embeddings[word_index_dict[key]]

# 3. Tasks Implementation


In [4]:
def cosine_similarity(x, y):
    """
    Exponentiated cosine similarity between two vectors.

    Computes cos(x, y) = (x . y) / (|x| * |y|) and returns exp(cos(x, y)).
    When either vector has zero norm the cosine is undefined, so 0.0 is
    returned instead of dividing by zero.
    """
    length_x, length_y = la.norm(x), la.norm(y)

    # Guard against division by zero for all-zero vectors.
    if 0 in (length_x, length_y):
        return 0.0

    return np.exp(np.dot(x, y) / (length_x * length_y))

# Task II: Averaged GloVe Embeddings
We implement the function to calculate the sentence embedding by averaging the embeddings of its constituent words.

In [5]:
def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type="50d"):
    """
    Build a sentence embedding as the mean of its words' GloVe vectors.

    The sentence is split on whitespace only, so punctuation stays attached
    to its token. Every token is looked up with get_glove_embeddings, and the
    summed vectors are divided by the total number of tokens.

    Args:
        sentence: Text to embed.
        word_index_dict: Word-to-row mapping passed through to get_glove_embeddings.
        embeddings: Word vectors passed through to get_glove_embeddings.
        model_type: Size label such as "50d"; the text before the first "d"
            is the embedding dimensionality.

    Returns:
        The averaged vector, or a zero vector when the sentence has no tokens.
    """
    dim = int(model_type.split("d")[0])
    tokens = sentence.split()

    total = np.zeros(dim)
    if len(tokens) == 0:
        return total

    for token in tokens:
        total += get_glove_embeddings(token, word_index_dict, embeddings, model_type)

    return total / len(tokens)

# Task III: Sorted Cosine Similarity


In [6]:
def get_sorted_cosine_similarity(input_sentence, categories_str, embeddings_metadata):
    """
    Rank categories by exponentiated cosine similarity to the input sentence.

    Args:
        input_sentence: The search query/sentence.
        categories_str: Categories separated by single spaces (e.g., "Flowers Colors Cars").
        embeddings_metadata: Dict with model info. "embedding_model" selects the
            backend: "glove" (also needs "word_index_dict", "embeddings" and
            "model_type"), "openai" (needs "model_name"), and any other value
            is treated as a Sentence Transformer (needs "model_name").

    Returns:
        List of (category_index, score) tuples sorted by score, highest first.
        category_index is the position of the category in categories_str.split(" ").
    """
    categories = categories_str.split(" ")
    backend = embeddings_metadata["embedding_model"]

    # Pick a single-argument embedding function for the selected backend.
    if backend == "glove":
        word_index = embeddings_metadata["word_index_dict"]
        embeds = embeddings_metadata["embeddings"]
        m_type = embeddings_metadata["model_type"]

        def embed(text):
            return averaged_glove_embeddings_gdrive(text, word_index, embeds, m_type)
    elif backend == "openai":
        model_name = embeddings_metadata["model_name"]

        def embed(text):
            return get_openai_embeddings(text, model_name)
    else:  # Transformers
        model_name = embeddings_metadata["model_name"]

        def embed(text):
            return get_sentence_transformer_embeddings(text, model_name)

    # Embed the query once, then score it against every category.
    query_vec = embed(input_sentence)
    ranked = [
        (position, cosine_similarity(query_vec, embed(label)))
        for position, label in enumerate(categories)
    ]

    # Highest score first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# 4. Execution and Visualization


In [7]:
def plot_pie_chart(sorted_scores, categories_str, title):
    """
    Draw a pie chart of the similarity scores, one slice per category.

    Args:
        sorted_scores: Iterable of (category_index, score) pairs.
        categories_str: Categories separated by single spaces; the indices in
            sorted_scores refer to positions in this list.
        title: Title shown above the chart.
    """
    names = categories_str.split(" ")

    labels = []
    sizes = []
    for category_idx, value in sorted_scores:
        labels.append(names[category_idx])
        sizes.append(value)

    plt.figure(figsize=(6, 6))
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
    plt.title(title)
    plt.show()
def run_demo(categories_input, text_search_input, glove_dim="50d"):
    """
    Score one input text against the categories with every embedding model.

    Downloads (if needed) and loads the GloVe files for `glove_dim`, then runs
    GloVe, the Sentence Transformer, and the small and large OpenAI models
    through get_sorted_cosine_similarity, and prints a table with each model's
    top category.

    Args:
        categories_input: Categories separated by single spaces.
        text_search_input: Sentence to classify.
        glove_dim: GloVe size label, e.g. "50d".

    Returns:
        Dict mapping model label to its sorted list of (category_index, score).
    """
    print("--- Running Demo ---")
    print(f"Categories: {categories_input}")
    print(f"Input: '{text_search_input}'")
    print(f"GloVe Model: {glove_dim}")

    # 1. Setup Data
    download_glove_embeddings_gdrive(glove_dim)
    word_index, embeddings = load_glove_embeddings_gdrive(glove_dim)

    # 2. Run Models: (result key, progress message, metadata), in run order.
    model_runs = [
        (
            "GloVe",
            "\nCalculating GloVe...",
            {
                "embedding_model": "glove",
                "word_index_dict": word_index,
                "embeddings": embeddings,
                "model_type": glove_dim,
            },
        ),
        (
            "SentenceTransformer",
            "Calculating Sentence Transformer...",
            {"embedding_model": "transformers", "model_name": "all-MiniLM-L6-v2"},
        ),
        (
            "OpenAI_Small",
            "Calculating OpenAI Small...",
            {"embedding_model": "openai", "model_name": "text-embedding-3-small"},
        ),
        (
            "OpenAI_Large",
            "Calculating OpenAI Large...",
            {"embedding_model": "openai", "model_name": "text-embedding-3-large"},
        ),
    ]

    results = {}
    for result_key, progress_message, metadata in model_runs:
        print(progress_message)
        results[result_key] = get_sorted_cosine_similarity(
            text_search_input, categories_input, metadata
        )

    # 3. Display Results: one row per model, showing its best-scoring category.
    categories_list = categories_input.split(" ")
    summary_rows = []
    for model_name, ranking in results.items():
        best_idx, best_score = ranking[0]
        summary_rows.append({
            "Model": model_name,
            "Top Category": categories_list[best_idx],
            "Confidence": f"{best_score:.4f}",
        })

    df = pd.DataFrame(summary_rows)
    print("\n--- Summary Results ---")
    print(df)
    return results
# --- TEST 1: Basic Test ---
# Five candidate categories scored against a single mixed-topic sentence.
run_demo("Flowers Colors Cars Weather Food", "Roses are red, trucks are blue", glove_dim="50d")
# --- TEST 2: Sentiment ---
# Sentiment input 1: "The movie was upsetting"
print("\n\n--- Sentiment Test 1 ---")
run_demo("Positive Negative", "The movie was upsetting", glove_dim="50d")
# Sentiment input 2: "This is terrible"
print("\n\n--- Sentiment Test 2 ---")
run_demo("Positive Negative", "This is terrible", glove_dim="50d")

--- Running Demo ---
Categories: Flowers Colors Cars Weather Food
Input: 'Roses are red, trucks are blue'
GloVe Model: 50d
Downloading word index for 50d...


Downloading...
From: https://drive.google.com/uc?id=1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9
To: /content/word_index_dict_50d_temp.pkl
100%|██████████| 60.3M/60.3M [00:00<00:00, 99.9MB/s]


Downloading embeddings for 50d...


Downloading...
From (original): https://drive.google.com/uc?id=1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ
From (redirected): https://drive.google.com/uc?id=1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ&confirm=t&uuid=6a7af07b-085c-4238-a274-a8bad1911e50
To: /content/embeddings_50d_temp.npy
100%|██████████| 477M/477M [00:05<00:00, 93.5MB/s]



Calculating GloVe...
Calculating Sentence Transformer...
Calculating OpenAI Small...
Calculating OpenAI Large...

--- Summary Results ---
                 Model Top Category Confidence
0                GloVe       Colors     2.1371
1  SentenceTransformer      Flowers     1.6905
2         OpenAI_Small      Flowers     1.4881
3         OpenAI_Large      Flowers     1.4724


--- Sentiment Test 1 ---
--- Running Demo ---
Categories: Positive Negative
Input: 'The movie was upsetting'
GloVe Model: 50d

Calculating GloVe...
Calculating Sentence Transformer...
Calculating OpenAI Small...
Calculating OpenAI Large...

--- Summary Results ---
                 Model Top Category Confidence
0                GloVe     Positive     1.8110
1  SentenceTransformer     Negative     1.1408
2         OpenAI_Small     Negative     1.2863
3         OpenAI_Large     Negative     1.2693


--- Sentiment Test 2 ---
--- Running Demo ---
Categories: Positive Negative
Input: 'This is terrible'
GloVe Model: 50d

Ca

{'GloVe': [(0, np.float64(1.9139887701630862)),
  (1, np.float64(1.7827631592375468))],
 'SentenceTransformer': [(1, np.float32(1.3540865)),
  (0, np.float32(1.2986877))],
 'OpenAI_Small': [(1, np.float64(1.3417168345735198)),
  (0, np.float64(1.183129484034134))],
 'OpenAI_Large': [(1, np.float64(1.3203831599227975)),
  (0, np.float64(1.19677959468169))]}

# 5. Part C: Real-World Applications (Word Order Experiment)
Here we run the experiment described in Part C of the README. Using the categories "cinema hotel restaurant", we give each model two inputs that contain the same words in a different order and check whether its top category changes.

In [8]:
def run_part_c_experiment():
    """
    Part C: run the demo on two orderings of the same three words.

    Both inputs are scored against the categories "cinema hotel restaurant"
    using the 50d GloVe model plus the other models that run_demo evaluates.
    """
    categories = "cinema hotel restaurant"

    # Example Pair 1: the same words in two different orders.
    word_orderings = (
        "cinema hotel restaurant",
        "hotel restaurant cinema",
    )

    print("\n\n=== Part C: Word Order Experiment ===")
    print(f"Categories: {categories}")

    for number, sentence in enumerate(word_orderings, start=1):
        print(f"\nDistorted Sentence {number}: '{sentence}'")
        run_demo(categories, sentence, glove_dim="50d")
# Run the experiment (calls run_demo once for each of the two word orderings)
run_part_c_experiment()



=== Part C: Word Order Experiment ===
Categories: cinema hotel restaurant

Distorted Sentence 1: 'cinema hotel restaurant'
--- Running Demo ---
Categories: cinema hotel restaurant
Input: 'cinema hotel restaurant'
GloVe Model: 50d

Calculating GloVe...
Calculating Sentence Transformer...
Calculating OpenAI Small...
Calculating OpenAI Large...

--- Summary Results ---
                 Model Top Category Confidence
0                GloVe        hotel     2.5466
1  SentenceTransformer        hotel     2.0844
2         OpenAI_Small       cinema     2.0342
3         OpenAI_Large       cinema     1.9845

Distorted Sentence 2: 'hotel restaurant cinema'
--- Running Demo ---
Categories: cinema hotel restaurant
Input: 'hotel restaurant cinema'
GloVe Model: 50d

Calculating GloVe...
Calculating Sentence Transformer...
Calculating OpenAI Small...
Calculating OpenAI Large...

--- Summary Results ---
                 Model Top Category Confidence
0                GloVe        hotel     2.5466
1  Se