<a href="https://colab.research.google.com/github/MurthyKolluru/DBA/blob/main/Word2Vec_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Setup and Model Download (Direct GloVe .zip Load)
#
# This cell installs the helper packages, downloads and extracts the GloVe 6B
# archive, loads the 50-dimensional vectors into the `word_vectors` dictionary,
# and defines the `get_word_vector` / `cosine_similarity` helpers that the later
# cells rely on.

# IMPORTANT: Run this cell first. If Colab prompts you to restart the runtime
# after the installs, do so (Runtime -> Restart runtime in the Colab menu) and
# then run this cell again before moving on.

print("Starting installation and setup...")

# Install the packages imported below: numpy for the vector math, requests for
# the download, and patool as a general-purpose archive utility.
!pip install --quiet numpy requests
!pip install --quiet patool  # A utility to handle archives

import numpy as np
import requests
import os
import zipfile
import patoolib  # Fallback archive extractor. The visible cells only ever use zipfile, so this import is currently unused.

print("NumPy, requests, and patool installed.")

# Define the URL for the GloVe 6B dataset (this is a .zip file containing all dimensions)
# This is from the official Stanford NLP website
glove_zip_url = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_file = "glove.6B.zip"
glove_target_dir = "glove.6B" # Directory where the contents will be extracted

print(f"Downloading GloVe model from {glove_zip_url}...")

# Download the zip file if it doesn't exist.
if not os.path.exists(glove_zip_file):
    # Stream into a side file and rename it only once the transfer has finished.
    # Writing straight to `glove_zip_file` would leave a truncated archive behind
    # after an interrupted download, and the existence check above would then
    # treat that broken file as "already downloaded" on every later run.
    glove_zip_partial = glove_zip_file + ".part"
    try:
        # timeout=(connect, read) in seconds: without it a stalled connection
        # blocks the cell forever. The `with` block releases the connection.
        with requests.get(glove_zip_url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
            with open(glove_zip_partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic rename: the final filename only ever refers to a complete file.
        os.replace(glove_zip_partial, glove_zip_file)
        print("GloVe zip file downloaded successfully!")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading GloVe zip file: {e}")
        print("Please check your internet connection or the URL.")
        raise SystemExit("Failed to download GloVe model. Cannot proceed.")
    finally:
        # After a successful rename the side file no longer exists, so this only
        # cleans up after a failed or interrupted transfer.
        if os.path.exists(glove_zip_partial):
            os.remove(glove_zip_partial)
else:
    print("GloVe zip file already downloaded.")

# Unzip the file (skipped when the target directory already exists).
if not os.path.exists(glove_target_dir):
    print(f"Unzipping {glove_zip_file} to {glove_target_dir}...")
    # Extract into a staging directory and rename it once extraction has fully
    # succeeded. extractall() creates its destination before it finishes, so
    # extracting directly into `glove_target_dir` would make a half-finished
    # extraction look complete to the existence check above on the next run.
    # A staging directory left over from a failed attempt is simply overwritten.
    glove_staging_dir = glove_target_dir + ".partial"
    try:
        with zipfile.ZipFile(glove_zip_file, 'r') as zip_ref:
            zip_ref.extractall(glove_staging_dir)
        os.replace(glove_staging_dir, glove_target_dir)
        print("GloVe zip file unzipped successfully!")
    except zipfile.BadZipFile:
        print(f"Error: {glove_zip_file} is a bad zip file. Download might be corrupted.")
        # Drop the corrupt archive; otherwise the "already downloaded" check
        # would keep reusing it and re-running the cell could never recover.
        if os.path.exists(glove_zip_file):
            os.remove(glove_zip_file)
            print(f"Removed {glove_zip_file}. Re-run this cell to download a fresh copy.")
        raise SystemExit("Corrupted zip file. Cannot proceed.")
    except Exception as e:
        print(f"Error unzipping GloVe file: {e}")
        raise SystemExit("Failed to unzip GloVe model. Cannot proceed.")
else:
    print(f"GloVe model already unzipped in {glove_target_dir}.")

# Read the 50-dimensional embeddings from the extracted archive into memory.
glove_50d_file = os.path.join(glove_target_dir, "glove.6B.50d.txt")

print(f"Loading GloVe 50d model from {glove_50d_file} into memory... This may take a moment.")
word_vectors = {}  # token -> float32 numpy vector
try:
    with open(glove_50d_file, 'r', encoding='utf-8') as embedding_file:
        for record in embedding_file:
            # Each record is whitespace-separated: the token, then its components.
            fields = record.split()
            word_vectors[fields[0]] = np.array(fields[1:], dtype=np.float32)
    print("GloVe 50d model loaded successfully into dictionary!")
    # The dimension is read off whichever vector the dictionary yields first.
    sample_vector = next(iter(word_vectors.values()))
    print(f"Loaded {len(word_vectors)} word vectors, each with dimension {len(sample_vector)}.")
except FileNotFoundError:
    print(f"Error: {glove_50d_file} not found. Ensure it was downloaded and unzipped correctly.")
    raise SystemExit("GloVe 50d file not found. Cannot proceed.")
except Exception as load_error:
    # Any other failure (malformed record, empty file, ...) also stops the cell.
    print(f"Error loading GloVe 50d model: {load_error}")
    raise SystemExit("Error loading GloVe 50d model. Cannot proceed.")

# Define a function to get word vector for subsequent cells
def get_word_vector(word):
    """Look up the embedding stored for ``word`` in ``word_vectors``.

    The word is lowercased before the lookup because this notebook treats the
    GloVe vocabulary as uncased.

    Returns the stored vector, or ``None`` when the word is not in the vocabulary.
    """
    lookup_key = word.lower()
    if lookup_key in word_vectors:
        return word_vectors[lookup_key]
    return None

# Define a function for cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two numpy vectors.

    The result is the dot product divided by the product of the two Euclidean
    norms. If either vector has zero length the cosine is undefined, and 0.0 is
    returned instead.
    """
    numerator = np.dot(vec1, vec2)
    length_1, length_2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    # Guard against dividing by zero when either input is the zero vector.
    either_is_zero = length_1 == 0 or length_2 == 0
    return 0.0 if either_is_zero else numerator / (length_1 * length_2)

print("\nReady to perform word embedding operations.")

Starting installation and setup...
NumPy, requests, and patool installed.
Downloading GloVe model from https://nlp.stanford.edu/data/glove.6B.zip...
GloVe zip file already downloaded.
GloVe model already unzipped in glove.6B.
Loading GloVe 50d model from glove.6B/glove.6B.50d.txt into memory... This may take a moment.
GloVe 50d model loaded successfully into dictionary!
Loaded 400000 word vectors, each with dimension 50.

Ready to perform word embedding operations.


In [None]:
# Cell 2: Find Similar Words to a Given Word
# Relies on 'word_vectors', 'get_word_vector', and 'cosine_similarity'
# having been defined by the previous cell.

# The query word; swap in any other word (e.g., "mathematics", "computer").
input_word_similar = "father"

print(f"Finding words similar to: '{input_word_similar}'")

# Embedding of the query word (None when it is out of vocabulary).
input_vector = get_word_vector(input_word_similar)

if input_vector is not None:
    # Lowercase the query once so it can be compared against every vocabulary entry.
    query_key = input_word_similar.lower()

    # Score every vocabulary entry except the query itself, best match first.
    similarities = sorted(
        (
            (candidate, cosine_similarity(input_vector, candidate_vector))
            for candidate, candidate_vector in word_vectors.items()
            if candidate.lower() != query_key
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Report the ten highest-scoring neighbours.
    print(f"\nTop 10 words similar to '{input_word_similar}':")
    if not similarities:
        print("No similar words found (or vocabulary is too small for meaningful comparison).")
    else:
        for candidate, score in similarities[:10]:
            print(f"  {candidate}: {score:.4f}")
else:
    print(f"Error: '{input_word_similar}' not found in the model's vocabulary.")

Finding words similar to: 'father'

Top 10 words similar to 'father':
  son: 0.9529
  brother: 0.9323
  grandfather: 0.9146
  friend: 0.9048
  uncle: 0.8977
  mother: 0.8909
  daughter: 0.8905
  husband: 0.8807
  cousin: 0.8806
  wife: 0.8697


In [None]:
# Cell 3: Find Most Dissimilar Word in a List
# This cell assumes 'word_vectors', 'get_word_vector', and 'cosine_similarity'
# are available from the previous cells.

# Given a list of words
word_list_dissimilar = ["banana", "mango", "computer", "orange"] # You can change this list

print(f"Finding the odd one out in the list: {word_list_dissimilar}")

# Get vectors for all words in the list, handling missing words gracefully.
# `vectors_for_list[i]` is the embedding of `valid_words_in_list[i]`.
vectors_for_list = []
valid_words_in_list = []
missing_words = []

for word in word_list_dissimilar:
    vector = get_word_vector(word)
    if vector is not None:
        vectors_for_list.append(vector)
        valid_words_in_list.append(word)
    else:
        missing_words.append(word)

if missing_words:
    print(f"Warning: The following words were not found in the model's vocabulary and will be excluded: {missing_words}")

# At least three words are required. With only two, each word's "average
# similarity to the others" is the same single symmetric cosine value, so
# neither can be singled out and the first word would be reported arbitrarily.
if len(valid_words_in_list) < 3:
    print("Error: Not enough words with vector representations to find the odd one out.")
else:
    # To find the odd one out, we calculate the average similarity of each word
    # to all other words in the list. The word with the lowest average similarity
    # is considered the odd one out.
    lowest_avg_similarity = float('inf')
    odd_one_out = None

    for i, current_word in enumerate(valid_words_in_list):
        current_vector = vectors_for_list[i]

        # Similarity of the current word to every *other* word in the list.
        peer_similarities = [
            cosine_similarity(current_vector, other_vector)
            for j, other_vector in enumerate(vectors_for_list)
            if j != i
        ]
        avg_similarity = sum(peer_similarities) / len(peer_similarities)

        if avg_similarity < lowest_avg_similarity:
            lowest_avg_similarity = avg_similarity
            odd_one_out = current_word

    # `odd_one_out` stays None only if no average compared below infinity
    # (e.g. every average was NaN).
    if odd_one_out is not None:
        print(f"\nThe odd one out in the list is: '{odd_one_out}'")
    else:
        print("Could not determine the odd one out from the valid words.")

Finding the odd one out in the list: ['banana', 'mango', 'computer', 'orange']

The odd one out in the list is: 'computer'


In [None]:
# Cell 4: Demonstrate Vector Arithmetic
# This cell assumes 'word_vectors', 'get_word_vector', and 'cosine_similarity'
# are available from the previous cells.

# Perform operations like king - man + woman
# Expected conceptual result: queen, princess, etc.
positive_words = ['king', 'woman']
negative_words = ['man']

# Render the expression with an explicit sign in front of every subtracted
# term. Joining the negative words with ' + ' would print "a - b + c" for two
# negative words, and an empty negative list would leave a dangling " - ".
arithmetic_expression = (
    ' + '.join(positive_words) + ''.join(f" - {term}" for term in negative_words)
).strip()
print(f"Performing vector arithmetic: {arithmetic_expression}")

# Get vectors for positive words
pos_vectors = []
missing_pos_words = []
for word in positive_words:
    vec = get_word_vector(word)
    if vec is not None:
        pos_vectors.append(vec)
    else:
        missing_pos_words.append(word)

# Get vectors for negative words
neg_vectors = []
missing_neg_words = []
for word in negative_words:
    vec = get_word_vector(word)
    if vec is not None:
        neg_vectors.append(vec)
    else:
        missing_neg_words.append(word)

# Missing words only trigger a warning; the arithmetic below continues with
# whichever vectors were found.
if missing_pos_words or missing_neg_words:
    print("Warning: Some words for arithmetic not found in vocabulary:")
    if missing_pos_words:
        print(f"  Missing positive words: {missing_pos_words}")
    if missing_neg_words:
        print(f"  Missing negative words: {missing_neg_words}")

if not pos_vectors and not neg_vectors:
    print("Error: No valid words with vectors for arithmetic operation. Cannot proceed.")
else:
    # Calculate the resulting vector: sum of positive vectors minus sum of
    # negative vectors. The accumulator takes its shape from one of the vectors
    # already looked up, instead of copying the whole vocabulary into a list.
    reference_vector = (pos_vectors or neg_vectors)[0]
    result_vector = np.zeros(reference_vector.shape)
    if pos_vectors:
        result_vector += np.sum(pos_vectors, axis=0)
    if neg_vectors:
        result_vector -= np.sum(neg_vectors, axis=0)

    # Find the most similar words to the result vector
    similarities = []
    result_vector_norm = np.linalg.norm(result_vector)

    if result_vector_norm == 0:
        print("Error: Resulting vector is a zero vector. Cannot find similar words.")
    else:
        # Exclude the original input words from the results to avoid trivial
        # matches. The lowercased set is built once, not once per vocabulary word.
        excluded_words = {w.lower() for w in positive_words + negative_words}

        # Iterate through all words in our loaded model
        for word, vector in word_vectors.items():
            if word.lower() not in excluded_words:
                similarity = cosine_similarity(result_vector, vector)
                similarities.append((word, similarity))

        # Sort by similarity in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Show top 5 most similar results
        print("\nTop 5 results for vector arithmetic:")
        if similarities:
            for word, similarity in similarities[:5]:
                print(f"  {word}: {similarity:.4f}")
        else:
            print("No similar words found based on the arithmetic operation.")

Performing vector arithmetic: king + woman - man

Top 5 results for vector arithmetic:
  queen: 0.8610
  daughter: 0.7685
  prince: 0.7641
  throne: 0.7635
  princess: 0.7513
