In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import re
import itertools
from scipy.sparse import coo_matrix
import warnings
# Suppress every warning for the remainder of the session
warnings.filterwarnings('ignore')

# Set style and random seed
plt.style.use('seaborn-v0_8')
# Seed NumPy's global RNG, which the np.random.* calls later in this file draw from
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
# Create comprehensive text corpus for word embeddings
def create_text_corpus():
    """Return the built-in training corpus as a flat list of sentences.

    The sentences are grouped by topic below purely for readability; the
    returned list contains only the sentences, in topic order, eight per
    topic.
    """
    topics = (
        ("Technology and AI", (
            "artificial intelligence machine learning algorithms process data efficiently",
            "deep learning neural networks train on large datasets using gpu acceleration",
            "computer vision recognizes patterns in images and videos automatically",
            "natural language processing understands human communication and text analysis",
            "machine learning algorithms improve performance through experience and feedback",
            "artificial neural networks mimic biological brain processing and learning",
            "deep learning models require extensive computational resources and data",
            "computer algorithms solve complex problems using mathematical optimization",
        )),
        ("Science and Research", (
            "scientific research discovers new knowledge through experimental methods",
            "laboratory experiments test hypotheses using controlled conditions",
            "research scientists analyze data to understand natural phenomena",
            "scientific method involves observation hypothesis testing and conclusion",
            "experimental design controls variables to ensure reliable results",
            "peer review process validates scientific findings and methodology",
            "research publications share discoveries with scientific community",
            "data analysis reveals patterns and trends in experimental results",
        )),
        ("Mathematics and Logic", (
            "mathematical equations describe relationships between variables and constants",
            "statistical analysis interprets data patterns and significance levels",
            "probability theory models uncertainty and random events",
            "linear algebra operations manipulate vectors and matrices efficiently",
            "calculus studies rates of change and accumulation of quantities",
            "mathematical proofs establish logical foundations for theorems",
            "geometric shapes have properties defined by mathematical relationships",
            "number theory explores properties of integers and prime numbers",
        )),
        ("Business and Economics", (
            "business organizations create value through efficient resource allocation",
            "economic markets determine prices through supply and demand dynamics",
            "financial analysis evaluates investment opportunities and risks",
            "business strategy guides decision making and competitive positioning",
            "market research identifies customer needs and preferences",
            "economic growth depends on productivity and technological innovation",
            "business management coordinates resources to achieve organizational goals",
            "financial planning helps individuals and organizations manage money",
        )),
        ("Education and Learning", (
            "education systems develop knowledge and skills in students",
            "learning process involves acquiring understanding through study and practice",
            "teaching methods adapt to different learning styles and abilities",
            "educational research improves instructional techniques and outcomes",
            "student assessment measures learning progress and achievement",
            "curriculum design organizes knowledge into structured learning experiences",
            "educational technology enhances learning through digital tools",
            "lifelong learning continues throughout personal and professional development",
        )),
        ("Communication and Language", (
            "human communication uses language to share ideas and information",
            "written language preserves knowledge across time and distance",
            "oral communication enables immediate interaction and feedback",
            "language evolution reflects cultural and social changes",
            "translation bridges communication between different languages",
            "linguistic analysis studies structure and meaning in language",
            "communication skills improve personal and professional relationships",
            "digital communication connects people across geographical boundaries",
        )),
    )

    # Flatten topic by topic so the sentence order matches the listing above.
    return [sentence for _topic, sentences in topics for sentence in sentences]

# Preprocessing functions
def preprocess_text(text):
    """Normalise *text* into a list of lowercase word tokens.

    Steps: lowercase, delete every character that is neither an ASCII
    letter nor whitespace, split on whitespace, and drop words of two
    characters or fewer.

    Args:
        text: Raw input string.

    Returns:
        List of cleaned word strings (possibly empty).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters. The class must contain the
    # regex escape \s (whitespace); writing \\s in a raw string keeps a literal
    # backslash instead, which strips the spaces and glues all words together.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Split into words and remove very short ones
    return [word for word in text.split() if len(word) > 2]

def build_vocabulary(corpus, min_count=2):
    """Build vocabulary from corpus with minimum frequency threshold.

    Args:
        corpus: Iterable of raw text strings; each is tokenised with
            ``preprocess_text``.
        min_count: Words seen fewer than this many times are left out of
            the vocabulary (they still appear in the returned counts).

    Returns:
        Tuple ``(vocab, idx_to_word, word_counts)`` where ``vocab`` maps
        word -> index, ``idx_to_word`` is the inverse mapping, and
        ``word_counts`` is a ``Counter`` over every token in the corpus.
        Indices are contiguous: exactly ``0 .. len(vocab) - 1``.
    """
    word_counts = Counter()
    for text in corpus:
        word_counts.update(preprocess_text(text))

    # Filter first, then number the survivors. Enumerating before filtering
    # leaves gaps in the index space, so indices can exceed len(vocab) and
    # overflow any matrix that is sized by len(vocab).
    kept_words = [word for word, count in word_counts.items() if count >= min_count]
    vocab = {word: idx for idx, word in enumerate(kept_words)}

    # Create reverse mapping
    idx_to_word = dict(enumerate(kept_words))

    return vocab, idx_to_word, word_counts

# Initialize corpus and vocabulary
print("=== Building Text Corpus and Vocabulary ===")
corpus = create_text_corpus()
vocab, idx_to_word, word_counts = build_vocabulary(corpus, min_count=2)

print(f"Corpus size: {len(corpus)} documents")
print(f"Total unique words: {len(word_counts)}")
print(f"Vocabulary size (min_count=2): {len(vocab)}")
print(f"Most common words: {list(word_counts.most_common(10))}")

# Sample processed text
sample_text = corpus[0]
processed_sample = preprocess_text(sample_text)
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nSample text: {sample_text}")
print(f"Processed: {processed_sample}")


In [None]:
# Simplified Word2Vec Skip-gram implementation
class SimpleWord2Vec:
    """Simplified Word2Vec Skip-gram implementation for educational purposes.

    The model is trained with a full softmax over the vocabulary and plain
    stochastic gradient descent, one (center, context) pair at a time.
    """

    def __init__(self, vocab_size, embedding_dim=100, window_size=2, learning_rate=0.01):
        """Create a model with randomly initialised weights.

        Args:
            vocab_size: Number of words; word indices must lie in
                ``0 .. vocab_size - 1``.
            embedding_dim: Length of each word vector.
            window_size: Number of neighbours taken on each side of the
                center word when generating training pairs.
            learning_rate: SGD step size.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.learning_rate = learning_rate

        # Initialize word embeddings (input and output matrices)
        bound = 0.5 / embedding_dim
        # W_in: one row per word, shape (vocab_size, embedding_dim)
        self.W_in = np.random.uniform(-bound, bound, (vocab_size, embedding_dim))
        # W_out: one column per word, shape (embedding_dim, vocab_size)
        self.W_out = np.random.uniform(-bound, bound, (embedding_dim, vocab_size))

        # Average loss per epoch, appended by train()
        self.loss_history = []

    def softmax(self, x):
        """Compute softmax with numerical stability (max is subtracted first)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def forward_pass(self, center_word_idx, context_word_idx):
        """Forward pass for skip-gram model.

        Returns:
            Tuple ``(h, u, y_pred, loss)``: the center word's embedding, the
            raw output scores, their softmax, and the negative log
            likelihood of the true context word.
        """
        # Get center word embedding. Integer indexing returns a view, so h
        # aliases the row of W_in.
        h = self.W_in[center_word_idx]  # (embedding_dim,)

        # Compute output scores
        u = np.dot(h, self.W_out)  # (vocab_size,)

        # Apply softmax
        y_pred = self.softmax(u)

        # Compute loss (negative log likelihood); epsilon guards log(0)
        loss = -np.log(y_pred[context_word_idx] + 1e-10)

        return h, u, y_pred, loss

    def backward_pass(self, center_word_idx, context_word_idx, h, u, y_pred):
        """Backward pass with gradient descent.

        Applies one SGD step to ``W_out`` and to the center word's row of
        ``W_in`` using the values produced by ``forward_pass``. ``u`` is
        accepted for interface symmetry and is not used.
        """
        # Create one-hot vector for true context word
        y_true = np.zeros(self.vocab_size)
        y_true[context_word_idx] = 1

        # Compute both gradients before touching any weight, so each one is
        # evaluated at the pre-update parameters.
        dL_du = y_pred - y_true  # (vocab_size,)
        dL_dW_out = np.outer(h, dL_du)  # (embedding_dim, vocab_size)
        dL_dh = np.dot(self.W_out, dL_du)  # (embedding_dim,)

        # Update weights. dL_dW_out already has W_out's shape; transposing it
        # makes the subtraction fail whenever vocab_size != embedding_dim.
        self.W_out -= self.learning_rate * dL_dW_out
        self.W_in[center_word_idx] -= self.learning_rate * dL_dh

    def generate_training_pairs(self, corpus, vocab):
        """Generate (center_word, context_word) index pairs.

        Each text is tokenised with ``preprocess_text``; out-of-vocabulary
        words are dropped before the window is applied.
        """
        pairs = []

        for text in corpus:
            words = preprocess_text(text)
            # Convert words to indices
            word_indices = [vocab[word] for word in words if word in vocab]

            # Generate context pairs
            for i, center_idx in enumerate(word_indices):
                # Define context window, clipped to the sentence bounds
                start = max(0, i - self.window_size)
                end = min(len(word_indices), i + self.window_size + 1)

                for j in range(start, end):
                    if i != j:  # Skip center word itself
                        pairs.append((center_idx, word_indices[j]))

        return pairs

    def train(self, corpus, vocab, epochs=100):
        """Train the Word2Vec model.

        Appends one average-loss value per epoch to ``self.loss_history``
        and prints progress every 20 epochs.
        """
        print(f"Training Word2Vec on {len(corpus)} documents...")

        # Generate training pairs
        training_pairs = self.generate_training_pairs(corpus, vocab)
        print(f"Generated {len(training_pairs)} training pairs")

        # Training loop
        for epoch in range(epochs):
            total_loss = 0
            np.random.shuffle(training_pairs)

            for center_idx, context_idx in training_pairs:
                # Forward pass
                h, u, y_pred, loss = self.forward_pass(center_idx, context_idx)
                total_loss += loss

                # Backward pass
                self.backward_pass(center_idx, context_idx, h, u, y_pred)

            # max(..., 1) avoids a ZeroDivisionError when no pairs were generated
            avg_loss = total_loss / max(len(training_pairs), 1)
            self.loss_history.append(avg_loss)

            if (epoch + 1) % 20 == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")

        print("Training completed!")

    def get_word_vector(self, word_idx):
        """Get word embedding vector (the word's row of ``W_in``)."""
        return self.W_in[word_idx]

    def find_similar_words(self, word, vocab, idx_to_word, top_k=5):
        """Find most similar words using cosine similarity.

        Returns:
            Up to ``top_k`` ``(word, similarity)`` tuples, most similar
            first; an empty list when *word* is not in *vocab*.
        """
        if word not in vocab:
            return []

        word_idx = vocab[word]
        word_vec = self.get_word_vector(word_idx)
        word_norm = np.linalg.norm(word_vec)

        # Compute similarities with all words except the query itself
        similarities = []
        for idx in range(self.vocab_size):
            if idx == word_idx:
                continue
            other_vec = self.get_word_vector(idx)
            # Epsilon keeps the ratio finite for zero-length vectors
            similarity = np.dot(word_vec, other_vec) / (
                word_norm * np.linalg.norm(other_vec) + 1e-10
            )
            similarities.append((idx, similarity))

        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Return top k similar words
        return [(idx_to_word[idx], sim) for idx, sim in similarities[:top_k]]

# Train Word2Vec model
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\n=== Training Word2Vec Model ===")
w2v_model = SimpleWord2Vec(vocab_size=len(vocab), embedding_dim=50,
                          window_size=2, learning_rate=0.01)

w2v_model.train(corpus, vocab, epochs=100)

# Plot training loss
plt.figure(figsize=(10, 6))
plt.plot(w2v_model.loss_history)
plt.title('Word2Vec Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.grid(True, alpha=0.3)
plt.show()

print(f"Final training loss: {w2v_model.loss_history[-1]:.4f}")


In [None]:
# Simplified GloVe implementation
class SimpleGloVe:
    """Simplified GloVe implementation for educational purposes.

    Fits word vectors so that ``w_i . w~_j + b_i + b~_j`` approximates the
    log of the distance-weighted co-occurrence count ``X_ij``, using plain
    SGD over the non-zero cells of the co-occurrence matrix.
    """

    def __init__(self, vocab_size, embedding_dim=100, learning_rate=0.01,
                 x_max=100, alpha=0.75):
        """Create a model with randomly initialised vectors and biases.

        Args:
            vocab_size: Number of words; word indices must lie in
                ``0 .. vocab_size - 1``.
            embedding_dim: Length of each word vector.
            learning_rate: SGD step size.
            x_max: Co-occurrence value at which the weighting function
                saturates at 1.
            alpha: Exponent of the weighting function below ``x_max``.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.x_max = x_max
        self.alpha = alpha

        # Initialize word vectors and biases
        self.W = np.random.normal(0, 0.1, (vocab_size, embedding_dim))
        self.W_tilde = np.random.normal(0, 0.1, (vocab_size, embedding_dim))
        self.b = np.random.normal(0, 0.1, vocab_size)
        self.b_tilde = np.random.normal(0, 0.1, vocab_size)

        # Average weighted loss per epoch, appended by train()
        self.loss_history = []
        # Set by build_cooccurrence_matrix()
        self.cooccurrence_matrix = None

    def build_cooccurrence_matrix(self, corpus, vocab, window_size=5):
        """Build word co-occurrence matrix.

        Each text is tokenised with ``preprocess_text``; out-of-vocabulary
        words are dropped. Every pair within ``window_size`` positions
        contributes ``1 / distance`` to its cell.

        Returns:
            The ``scipy.sparse.coo_matrix`` that is also stored on
            ``self.cooccurrence_matrix``.
        """
        print("Building co-occurrence matrix...")

        # Initialize co-occurrence counts
        cooccur_counts = defaultdict(float)

        for text in corpus:
            words = preprocess_text(text)
            word_indices = [vocab[word] for word in words if word in vocab]

            # Count co-occurrences within window
            for i, center_idx in enumerate(word_indices):
                start = max(0, i - window_size)
                end = min(len(word_indices), i + window_size + 1)
                for j in range(start, end):
                    if i != j:
                        # Weight by inverse distance
                        cooccur_counts[(center_idx, word_indices[j])] += 1.0 / abs(i - j)

        # Convert to matrix format
        rows, cols, data = [], [], []
        for (i, j), count in cooccur_counts.items():
            rows.append(i)
            cols.append(j)
            data.append(count)

        self.cooccurrence_matrix = coo_matrix((data, (rows, cols)),
                                            shape=(self.vocab_size, self.vocab_size))

        print(f"Co-occurrence matrix: {self.cooccurrence_matrix.nnz} non-zero entries")
        return self.cooccurrence_matrix

    def weighting_function(self, x):
        """GloVe weighting function f(X_ij): ``(x / x_max) ** alpha`` below ``x_max``, else 1."""
        return np.where(x < self.x_max, (x / self.x_max) ** self.alpha, 1.0)

    def _update_pair(self, i, j, x_ij):
        """Apply one SGD step for cell (i, j) and return its weighted squared error."""
        # Compute current prediction
        prediction = np.dot(self.W[i], self.W_tilde[j]) + self.b[i] + self.b_tilde[j]

        # Compute weighted squared error; epsilon guards log(0)
        diff = prediction - np.log(x_ij + 1e-10)
        weight = self.weighting_function(x_ij)
        loss = weight * (diff ** 2)

        # Compute both vector gradients from the pre-update values. Updating
        # W[i] first and then reading it for W_tilde[j]'s gradient would mix
        # old and new parameters in the same step.
        grad_factor = 2 * weight * diff
        grad_w = grad_factor * self.W_tilde[j]
        grad_w_tilde = grad_factor * self.W[i]

        # Update parameters
        self.W[i] -= self.learning_rate * grad_w
        self.W_tilde[j] -= self.learning_rate * grad_w_tilde
        self.b[i] -= self.learning_rate * grad_factor
        self.b_tilde[j] -= self.learning_rate * grad_factor

        return float(loss)

    def train(self, corpus, vocab, epochs=100, window_size=5):
        """Train GloVe model.

        Appends one average-loss value per epoch to ``self.loss_history``
        and prints progress every 20 epochs.
        """
        print(f"Training GloVe on {len(corpus)} documents...")

        # Build co-occurrence matrix
        cooccur_matrix = self.build_cooccurrence_matrix(corpus, vocab, window_size)

        # Convert to dense format for training (in practice, use sparse operations)
        X = cooccur_matrix.toarray()

        # Locate the non-zero cells once instead of rescanning the whole
        # matrix every epoch; np.nonzero yields them in row-major order.
        rows, cols = np.nonzero(X > 0)
        num_pairs = len(rows)

        # Training loop
        for epoch in range(epochs):
            total_loss = 0

            for i, j in zip(rows, cols):
                total_loss += self._update_pair(i, j, X[i, j])

            avg_loss = total_loss / max(num_pairs, 1)
            self.loss_history.append(avg_loss)

            if (epoch + 1) % 20 == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")

        print("GloVe training completed!")

    def get_word_vector(self, word_idx):
        """Get final word embedding (average of W and W_tilde)"""
        return (self.W[word_idx] + self.W_tilde[word_idx]) / 2

    def find_similar_words(self, word, vocab, idx_to_word, top_k=5):
        """Find most similar words using cosine similarity.

        Returns:
            Up to ``top_k`` ``(word, similarity)`` tuples, most similar
            first; an empty list when *word* is not in *vocab*.
        """
        if word not in vocab:
            return []

        word_idx = vocab[word]
        word_vec = self.get_word_vector(word_idx)
        word_norm = np.linalg.norm(word_vec)

        # Compute similarities with all words except the query itself
        similarities = []
        for idx in range(self.vocab_size):
            if idx == word_idx:
                continue
            other_vec = self.get_word_vector(idx)
            # Epsilon keeps the ratio finite for zero-length vectors
            similarity = np.dot(word_vec, other_vec) / (
                word_norm * np.linalg.norm(other_vec) + 1e-10
            )
            similarities.append((idx, similarity))

        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Return top k similar words
        return [(idx_to_word[idx], sim) for idx, sim in similarities[:top_k]]

# Train GloVe model
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\n=== Training GloVe Model ===")
glove_model = SimpleGloVe(vocab_size=len(vocab), embedding_dim=50,
                         learning_rate=0.01)

glove_model.train(corpus, vocab, epochs=50, window_size=5)

# Plot training loss
plt.figure(figsize=(10, 6))
plt.plot(glove_model.loss_history)
plt.title('GloVe Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.grid(True, alpha=0.3)
plt.show()

print(f"Final training loss: {glove_model.loss_history[-1]:.4f}")


In [None]:
# Semantic similarity and analogy analysis
def analyze_word_similarities():
    """Print the nearest neighbours of a few probe words for both models.

    Reads the module-level ``vocab``, ``idx_to_word``, ``w2v_model`` and
    ``glove_model``. Probe words missing from ``vocab`` are skipped.
    """
    # "\n" (not "\\n") so blank lines are printed rather than a literal backslash-n
    print("\n=== Word Similarity Analysis ===")

    # Test words for similarity analysis
    test_words = ['learning', 'data', 'research', 'language', 'business', 'mathematical']
    models = (("Word2Vec", w2v_model), ("GloVe", glove_model))

    for word in test_words:
        if word not in vocab:
            continue

        print(f"\n--- Similar words to '{word}' ---")

        # Same report for each model: a label line, then one line per neighbour
        for label, model in models:
            neighbours = model.find_similar_words(word, vocab, idx_to_word, top_k=5)
            print(f"{label}:")
            for similar_word, similarity in neighbours:
                print(f"  {similar_word}: {similarity:.3f}")

def simple_analogy_test(model, word_a, word_b, word_c, vocab, idx_to_word, model_name):
    """Simple analogy test: word_a is to word_b as word_c is to ?

    Args:
        model: Object exposing ``get_word_vector(idx)``.
        word_a, word_b, word_c: The three known words of the analogy.
        vocab: Mapping word -> index.
        idx_to_word: Mapping index -> word; its entries are the candidate
            answers.
        model_name: Label used in the printed result line.

    Returns:
        ``(best_word, best_similarity)``, or ``None`` (with nothing printed)
        when any of the three words is missing from *vocab*.
    """
    query_words = (word_a, word_b, word_c)
    if not all(word in vocab for word in query_words):
        return None

    # Get word vectors
    vec_a, vec_b, vec_c = (model.get_word_vector(vocab[word]) for word in query_words)

    # Compute analogy vector: vec_b - vec_a + vec_c
    analogy_vec = vec_b - vec_a + vec_c
    analogy_norm = np.linalg.norm(analogy_vec)

    # Find closest word to analogy vector
    best_similarity = -1
    best_word = None

    # Walk the mapping itself rather than range(len(vocab)), so the search
    # does not raise KeyError when the indices are not exactly 0..n-1.
    for idx, word in idx_to_word.items():
        if word in query_words:  # Exclude input words
            continue
        vec = model.get_word_vector(idx)
        # Epsilon keeps the ratio finite for zero-length vectors
        similarity = np.dot(analogy_vec, vec) / (
            analogy_norm * np.linalg.norm(vec) + 1e-10
        )

        if similarity > best_similarity:
            best_similarity = similarity
            best_word = word

    print(f"{model_name}: {word_a} : {word_b} :: {word_c} : {best_word} (similarity: {best_similarity:.3f})")
    return best_word, best_similarity

# Run similarity analysis
analyze_word_similarities()

# Test simple analogies
# "\n" (not "\\n") so blank lines are printed rather than a literal backslash-n
print("\n=== Simple Analogy Tests ===")
# Each entry is (word_a, word_b, word_c, description); the description is a
# placeholder and is not used by the loop below.
analogy_tests = [
    ('data', 'analysis', 'research', 'what should be the fourth word?'),
    ('learning', 'education', 'training', 'what should be the fourth word?'),
    ('machine', 'artificial', 'human', 'what should be the fourth word?'),
]

for word_a, word_b, word_c, description in analogy_tests:
    print(f"\nAnalogy: {word_a} : {word_b} :: {word_c} : ?")
    simple_analogy_test(w2v_model, word_a, word_b, word_c, vocab, idx_to_word, "Word2Vec")
    simple_analogy_test(glove_model, word_a, word_b, word_c, vocab, idx_to_word, "GloVe")


In [None]:
# Visualization of word embeddings
def visualize_embeddings():
    """Visualize word embeddings using dimensionality reduction.

    Projects the vectors of the most frequent in-vocabulary words (taken
    from the top 30 of the module-level ``word_counts``) to 2-D with t-SNE,
    once per model, and shows them side by side.

    Returns:
        Tuple ``(w2v_vectors, glove_vectors)`` holding the full embedding
        matrices of both models, one row per vocabulary index.
    """
    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
    print("\n=== Embedding Visualization ===")

    # Get all word vectors for both models
    vocab_size = len(vocab)
    w2v_vectors = np.array([w2v_model.get_word_vector(i) for i in range(vocab_size)])
    glove_vectors = np.array([glove_model.get_word_vector(i) for i in range(vocab_size)])

    # Select subset of words for visualization
    common_words = [word for word, _count in word_counts.most_common(30) if word in vocab]
    word_indices = [vocab[word] for word in common_words]

    # Apply t-SNE for 2D visualization. Each model gets its own estimator
    # with identical settings so the two projections are comparable.
    print("Applying t-SNE for visualization...")
    perplexity = min(15, len(common_words) - 1)
    panels = []
    for title, vectors, scatter_kwargs in (
        ('Word2Vec Embeddings (t-SNE)', w2v_vectors, {}),
        ('GloVe Embeddings (t-SNE)', glove_vectors, {'color': 'orange'}),
    ):
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        coords = tsne.fit_transform(vectors[word_indices])
        panels.append((title, coords, scatter_kwargs))

    # Create visualization: one labelled scatter plot per model
    _fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    for ax, (title, coords, scatter_kwargs) in zip(axes, panels):
        ax.scatter(coords[:, 0], coords[:, 1], alpha=0.7, s=50, **scatter_kwargs)
        for (x_pos, y_pos), word in zip(coords, common_words):
            ax.annotate(word, (x_pos, y_pos),
                        xytext=(3, 3), textcoords='offset points',
                        fontsize=9, alpha=0.8)
        ax.set_title(title)
        ax.set_xlabel('t-SNE 1')
        ax.set_ylabel('t-SNE 2')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return w2v_vectors, glove_vectors

def compare_embedding_quality():
    """Compare embedding quality using various metrics.

    Prints the average vector norm of each model, then samples up to 100
    vocabulary indices, computes the cosine similarity of every sampled
    pair under both models, and plots and prints summary statistics.
    Reads the module-level ``vocab``, ``w2v_model`` and ``glove_model``.
    """

    def cosine(vec1, vec2):
        # Epsilon keeps the ratio finite for zero-length vectors
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-10)

    # "\n" (not "\\n") so blank lines are printed rather than a literal backslash-n
    print("\n=== Embedding Quality Comparison ===")

    # Get all embeddings
    vocab_size = len(vocab)
    w2v_vectors = np.array([w2v_model.get_word_vector(i) for i in range(vocab_size)])
    glove_vectors = np.array([glove_model.get_word_vector(i) for i in range(vocab_size)])

    # Compute average vector norms
    w2v_avg_norm = np.mean([np.linalg.norm(vec) for vec in w2v_vectors])
    glove_avg_norm = np.mean([np.linalg.norm(vec) for vec in glove_vectors])

    print("Average vector norms:")
    print(f"  Word2Vec: {w2v_avg_norm:.3f}")
    print(f"  GloVe: {glove_avg_norm:.3f}")

    # Compute similarity distributions
    print("\nComputing similarity distributions...")

    # Sample pairs for efficiency
    sample_size = min(100, vocab_size)
    sample_indices = np.random.choice(vocab_size, sample_size, replace=False)

    w2v_similarities = []
    glove_similarities = []

    # Every unordered pair of sampled indices, scored under both models
    for idx1, idx2 in itertools.combinations(sample_indices, 2):
        w2v_similarities.append(cosine(w2v_vectors[idx1], w2v_vectors[idx2]))
        glove_similarities.append(cosine(glove_vectors[idx1], glove_vectors[idx2]))

    # Plot similarity distributions
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.hist(w2v_similarities, bins=30, alpha=0.7, label='Word2Vec', density=True)
    plt.hist(glove_similarities, bins=30, alpha=0.7, label='GloVe', density=True)
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Density')
    plt.title('Distribution of Pairwise Similarities')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Statistics comparison: each value is [Word2Vec, GloVe]
    plt.subplot(1, 2, 2)
    stats_comparison = {
        'Mean Similarity': [np.mean(w2v_similarities), np.mean(glove_similarities)],
        'Std Similarity': [np.std(w2v_similarities), np.std(glove_similarities)],
        'Min Similarity': [np.min(w2v_similarities), np.min(glove_similarities)],
        'Max Similarity': [np.max(w2v_similarities), np.max(glove_similarities)]
    }

    x = np.arange(len(stats_comparison))
    width = 0.35

    w2v_stats = [pair[0] for pair in stats_comparison.values()]
    glove_stats = [pair[1] for pair in stats_comparison.values()]

    plt.bar(x - width/2, w2v_stats, width, label='Word2Vec', alpha=0.7)
    plt.bar(x + width/2, glove_stats, width, label='GloVe', alpha=0.7)

    plt.xlabel('Statistics')
    plt.ylabel('Value')
    plt.title('Similarity Statistics Comparison')
    plt.xticks(x, stats_comparison.keys(), rotation=45)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print numerical comparison
    print("\nNumerical Comparison:")
    for key, (w2v_value, glove_value) in stats_comparison.items():
        print(f"  {key}:")
        print(f"    Word2Vec: {w2v_value:.4f}")
        print(f"    GloVe: {glove_value:.4f}")
        print(f"    Difference: {abs(w2v_value - glove_value):.4f}")

# Run visualization and comparison
# embedding_vectors receives the (w2v_vectors, glove_vectors) tuple returned by visualize_embeddings()
embedding_vectors = visualize_embeddings()
compare_embedding_quality()


In [None]:
# Clustering analysis using word embeddings
def clustering_with_embeddings():
    """Perform clustering analysis using word embeddings.

    Runs K-means on both models' embeddings for each candidate k, scores
    each run with the silhouette coefficient, then clusters both models
    with Word2Vec's best k and prints the words in every cluster.
    Reads the module-level ``vocab``, ``idx_to_word``, ``w2v_model`` and
    ``glove_model``.

    Returns:
        Tuple ``(w2v_clusters, glove_clusters)`` of cluster labels, one
        per vocabulary index.

    Raises:
        ValueError: If the vocabulary is too small to yield any candidate
            k (fewer than 12 words).
    """
    # "\n" (not "\\n") so blank lines are printed rather than a literal backslash-n
    print("\n=== Clustering Analysis with Word Embeddings ===")

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # Get embeddings
    vocab_size = len(vocab)
    w2v_vectors = np.array([w2v_model.get_word_vector(i) for i in range(vocab_size)])
    glove_vectors = np.array([glove_model.get_word_vector(i) for i in range(vocab_size)])

    # Determine optimal number of clusters
    n_clusters_range = range(3, min(8, vocab_size//3))
    if len(n_clusters_range) == 0:
        # Without this guard np.argmax below fails on an empty list with an
        # unrelated-looking message.
        raise ValueError(
            f"Need a vocabulary of at least 12 words to evaluate k >= 3 clusters, got {vocab_size}"
        )

    w2v_silhouette_scores = []
    glove_silhouette_scores = []

    print("Finding optimal number of clusters...")
    for n_clusters in n_clusters_range:
        # Word2Vec clustering
        w2v_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(w2v_vectors)
        w2v_silhouette = silhouette_score(w2v_vectors, w2v_labels)
        w2v_silhouette_scores.append(w2v_silhouette)

        # GloVe clustering
        glove_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(glove_vectors)
        glove_silhouette = silhouette_score(glove_vectors, glove_labels)
        glove_silhouette_scores.append(glove_silhouette)

        print(f"  k={n_clusters}: Word2Vec silhouette={w2v_silhouette:.3f}, GloVe silhouette={glove_silhouette:.3f}")

    # Find optimal k
    optimal_k_w2v = n_clusters_range[np.argmax(w2v_silhouette_scores)]
    optimal_k_glove = n_clusters_range[np.argmax(glove_silhouette_scores)]

    print(f"\nOptimal clusters: Word2Vec k={optimal_k_w2v}, GloVe k={optimal_k_glove}")

    # Perform final clustering with optimal k
    final_k = optimal_k_w2v  # Use Word2Vec's optimal for comparison

    w2v_clusters = KMeans(n_clusters=final_k, random_state=42).fit_predict(w2v_vectors)
    glove_clusters = KMeans(n_clusters=final_k, random_state=42).fit_predict(glove_vectors)

    # Analyze clusters
    for model_label, labels in (("Word2Vec", w2v_clusters), ("GloVe", glove_clusters)):
        print(f"\n--- {model_label} Clusters (k={final_k}) ---")
        for cluster_id in range(final_k):
            cluster_words = [idx_to_word[i] for i, label in enumerate(labels) if label == cluster_id]
            print(f"Cluster {cluster_id}: {cluster_words}")

    # Plot silhouette scores
    plt.figure(figsize=(10, 6))
    plt.plot(n_clusters_range, w2v_silhouette_scores, 'o-', label='Word2Vec', linewidth=2)
    plt.plot(n_clusters_range, glove_silhouette_scores, 's-', label='GloVe', linewidth=2)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Clustering Quality vs Number of Clusters')
    plt.grid(True, alpha=0.3)

    # Mark optimal points
    plt.axvline(x=optimal_k_w2v, color='blue', linestyle='--', alpha=0.5, label=f'Word2Vec optimal (k={optimal_k_w2v})')
    plt.axvline(x=optimal_k_glove, color='orange', linestyle='--', alpha=0.5, label=f'GloVe optimal (k={optimal_k_glove})')

    # Build the legend only after every labelled artist exists; calling it
    # before the axvline calls leaves the "optimal" markers out of it.
    plt.legend()

    plt.show()

    return w2v_clusters, glove_clusters

def document_similarity_analysis():
    """Analyze document similarity using word embeddings.

    Represents each of the first 10 corpus documents as the mean of its
    in-vocabulary word vectors, computes pairwise cosine similarity under
    both models, shows the two similarity matrices as heatmaps, and prints
    the three most similar document pairs per model. Reads the module-level
    ``corpus``, ``vocab``, ``w2v_model`` and ``glove_model``.
    """
    # "\n" (not "\\n") so blank lines are printed rather than a literal backslash-n
    print("\n=== Document Similarity Analysis ===")

    # Create document vectors by averaging word embeddings
    def get_document_vector(text, model, vocab):
        word_vectors = [
            model.get_word_vector(vocab[word])
            for word in preprocess_text(text)
            if word in vocab
        ]
        if word_vectors:
            return np.mean(word_vectors, axis=0)
        # No known words: fall back to the zero vector
        return np.zeros(model.embedding_dim)

    def report_top_pairs(header, sim_matrix, pair_rows, pair_cols):
        # Print the three highest-scoring document pairs, best first
        similarities = sim_matrix[pair_rows, pair_cols]
        print(header)
        for idx in np.argsort(similarities)[-3:][::-1]:
            i, j = pair_rows[idx], pair_cols[idx]
            print(f"  Documents {i} & {j}: {similarities[idx]:.3f}")
            print(f"    Doc {i}: {corpus[i][:100]}...")
            print(f"    Doc {j}: {corpus[j][:100]}...")
            print()

    # Get document vectors for both models
    documents = corpus[:10]  # Use first 10 documents for analysis
    w2v_doc_vectors = np.array([get_document_vector(text, w2v_model, vocab) for text in documents])
    glove_doc_vectors = np.array([get_document_vector(text, glove_model, vocab) for text in documents])

    # Compute document similarity matrices
    w2v_doc_sim = cosine_similarity(w2v_doc_vectors)
    glove_doc_sim = cosine_similarity(glove_doc_vectors)

    # Visualize similarity matrices
    _fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    heatmaps = (
        ('Document Similarity (Word2Vec)', w2v_doc_sim),
        ('Document Similarity (GloVe)', glove_doc_sim),
    )
    for ax, (title, sim_matrix) in zip(axes, heatmaps):
        image = ax.imshow(sim_matrix, cmap='viridis', aspect='auto')
        ax.set_title(title)
        ax.set_xlabel('Document Index')
        ax.set_ylabel('Document Index')
        plt.colorbar(image, ax=ax)

    plt.tight_layout()
    plt.show()

    # Find most similar document pairs
    print("\nMost similar document pairs:")

    # Get upper triangle indices (excluding diagonal)
    pair_rows, pair_cols = np.triu_indices_from(w2v_doc_sim, k=1)

    report_top_pairs("\nWord2Vec top similar pairs:", w2v_doc_sim, pair_rows, pair_cols)
    # No leading newline here: the previous report already ends with a blank line
    report_top_pairs("GloVe top similar pairs:", glove_doc_sim, pair_rows, pair_cols)

# Run clustering and document analysis
# cluster_results receives the (w2v_clusters, glove_clusters) tuple returned by clustering_with_embeddings()
cluster_results = clustering_with_embeddings()
document_similarity_analysis()
