# Assignment 2: Bag-of-Words, TF-IDF, and Word2Vec Embeddings

This notebook demonstrates text vectorization techniques and word embeddings using scikit-learn and Gensim.

## 1. Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data
# NOTE(review): no cell visible in this notebook calls word_tokenize or reads
# `stopwords` (tokenization uses gensim's simple_preprocess and the vectorizers
# use stop_words='english'), so these downloads may be unnecessary - confirm.
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'matplotlib'", i.e. the import block
# above failed in the recorded run and nothing below it executed; matplotlib
# must be installed in the kernel environment for the notebook to run.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Imported only after the 'stopwords' download call above has been issued.
from nltk.corpus import stopwords

# Reached only when every import and download call above completed without raising.
print("All libraries imported successfully!")

ModuleNotFoundError: No module named 'matplotlib'

## 2. Load and Prepare Sample Documents

In [None]:
# Toy corpus: eight one-sentence documents that every later section
# vectorizes or tokenizes.
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with multiple layers",
    "Natural language processing is used in chatbots and translation",
    "Machine learning models require large amounts of training data",
    "Python is the most popular language for machine learning",
    "Data science involves statistics and machine learning",
    "Neural networks are inspired by biological neurons",
    "Text classification is a common natural language processing task"
]

# Show the corpus with 1-based numbering, then its size.
print("Sample Documents:")
print("-" * 60)
print("\n".join(f"{number}. {text}" for number, text in enumerate(documents, start=1)))

print(f"\nTotal documents: {len(documents)}")

## 3. Bag-of-Words: Count Occurrence

Create a bag-of-words model with raw word counts.

In [None]:
# Bag-of-Words with raw counts: lowercase the text, drop scikit-learn's
# built-in English stop words, learn the vocabulary and count each term.
count_vectorizer = CountVectorizer(lowercase=True, stop_words='english')
bow_count_matrix = count_vectorizer.fit_transform(documents)
n_docs, n_terms = bow_count_matrix.shape

print("Bag-of-Words: Count Occurrence")
print("=" * 60)
print(f"Matrix shape: {bow_count_matrix.shape}")
print(f"Number of documents: {n_docs}")
print(f"Number of unique words: {n_terms}")

# Learned vocabulary; reused below as the DataFrame column labels
feature_names = count_vectorizer.get_feature_names_out()
print(f"\nVocabulary (first 20 words): {feature_names[:20]}")

# Dense copy of the sparse count matrix, wrapped in a DataFrame
# (one row per document, one column per vocabulary term)
bow_count_dense = bow_count_matrix.toarray()
bow_df = pd.DataFrame(bow_count_dense, columns=feature_names)
print("\nBag-of-Words Count Matrix (first 5 documents, first 15 words):")
print(bow_df.iloc[:5, :15])

# Terms that actually occur in the first document, highest count first
print("\nWord counts in first document:")
first_doc_counts = bow_df.iloc[0].sort_values(ascending=False)
print(first_doc_counts.loc[first_doc_counts > 0])

## 4. Bag-of-Words: Normalized Count Occurrence

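Normalize each document's BoW vector. L1 normalization divides a row by the sum of its counts and yields term frequencies (each row sums to 1); L2 normalization divides a row by its Euclidean norm and yields unit-length vectors.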

In [None]:
# Row-normalize the raw BoW counts in two ways:
#   * L1: divide each row by the sum of its counts -> term frequencies (rows sum to 1)
#   * L2: divide each row by its Euclidean norm    -> unit-length vectors
# Only the L1 variant is a "term frequency"; the L2 variant is not.
from sklearn.preprocessing import normalize

# L2 normalization (divide each row by its L2 norm)
bow_normalized_l2 = normalize(bow_count_matrix, norm='l2')
bow_normalized_l2_dense = bow_normalized_l2.toarray()

# L1 normalization (divide by the sum of absolute values); counts are
# non-negative, so this is count / total-count-in-document.
bow_normalized_l1 = normalize(bow_count_matrix, norm='l1')
bow_normalized_l1_dense = bow_normalized_l1.toarray()

print("Bag-of-Words: Normalized Count Occurrence")
print("=" * 60)

# Create DataFrames for normalized BoW
bow_norm_df = pd.DataFrame(bow_normalized_l2_dense, columns=feature_names)

print("\nL2 Normalized BoW (first 5 documents, first 10 words):")
print(bow_norm_df.iloc[:5, :10])

print("\n\nComparison of raw counts vs normalized (Document 1):")
print("-" * 60)
comparison_df = pd.DataFrame({
    'Word': feature_names[:15],
    'Raw Count': bow_count_dense[0, :15],
    'L2 Normalized': bow_normalized_l2_dense[0, :15],
    'L1 Normalized': bow_normalized_l1_dense[0, :15]
})
# Only rows for words that occur in document 1 are informative.
print(comparison_df[comparison_df['Raw Count'] > 0])

# The header previously labelled L2 normalization as "Term Frequency";
# it is the L1 variant that produces term frequencies.
print("\n\nL2 Normalization (unit-length vectors) - Statistical Summary:")
print("-" * 60)
# Min/mean are taken over non-zero entries only; zeros would dominate otherwise.
nonzero_l2 = bow_normalized_l2_dense[bow_normalized_l2_dense > 0]
print(f"Min normalized value: {nonzero_l2.min():.6f}")
print(f"Max normalized value: {bow_normalized_l2_dense.max():.6f}")
print(f"Mean normalized value: {nonzero_l2.mean():.6f}")

# Show L2 norm (should be 1 for each document). Computed for all rows at once
# from the matrix itself, so the loop no longer depends on len(documents).
print("\nL2 norm for each document (should be ~1.0):")
row_norms = np.linalg.norm(bow_normalized_l2_dense, axis=1)
for doc_number, l2_norm in enumerate(row_norms, start=1):
    print(f"Document {doc_number}: {l2_norm:.6f}")

## 5. TF-IDF Vectorization

Compute TF-IDF (Term Frequency-Inverse Document Frequency) scores.

In [None]:
# Create TF-IDF Vectorizer with the same preprocessing as the BoW vectorizer
# (lowercasing, English stop words). All other options are left at their
# scikit-learn defaults, which means smoothed IDF and L2-normalized rows.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("TF-IDF Vectorization")
print("=" * 60)
print(f"Matrix shape: {tfidf_matrix.shape}")
print(f"Number of documents: {tfidf_matrix.shape[0]}")
print(f"Number of unique words: {tfidf_matrix.shape[1]}")

# Get feature names
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert to dense array
tfidf_dense = tfidf_matrix.toarray()

# Create DataFrame (one row per document, one column per term)
tfidf_df = pd.DataFrame(tfidf_dense, columns=tfidf_feature_names)

print("\nTF-IDF Matrix (first 5 documents, first 10 words):")
print(tfidf_df.iloc[:5, :10])

print("\n\nTop 10 Most Important Words (Highest TF-IDF scores) in Each Document:")
print("-" * 60)
for i, doc_text in enumerate(documents):
    print(f"\nDocument {i+1}: {doc_text[:50]}...")
    top_words = tfidf_df.iloc[i].nlargest(10)
    # Short documents have fewer than 10 scored terms; skip the zero padding.
    for word, score in top_words[top_words > 0].items():
        print(f"  {word:<20} : {score:.4f}")

# Show IDF values
print("\n\nInverse Document Frequency (IDF) Values:")
print("-" * 60)
# The values in idf_ come from scikit-learn's smoothed formula, not the
# textbook log(N / df); print the formula that matches the numbers shown.
print("IDF = ln((1 + total_docs) / (1 + docs_containing_term)) + 1  (scikit-learn default, smooth_idf=True)")
print(f"Total documents: {len(documents)}")

idf_values = tfidf_vectorizer.idf_
idf_dict = dict(zip(tfidf_feature_names, idf_values))
# Sort by IDF value, rarest terms (highest IDF) first
sorted_idf = sorted(idf_dict.items(), key=lambda x: x[1], reverse=True)
print("\nTop 15 words by IDF (most discriminative):")
for word, idf in sorted_idf[:15]:
    print(f"  {word:<20} : {idf:.4f}")

## 6. Comparison: BoW vs Normalized BoW vs TF-IDF

In [None]:
print("Comparison of Vectorization Methods")
print("=" * 80)

# Compare the three weightings for one document over the first 15 TF-IDF
# vocabulary terms.
doc_idx = 0
comparison_words = tfidf_feature_names[:15]

# Resolve each word's column separately in each vectorizer's own vocabulary.
# The BoW and TF-IDF vocabularies happen to coincide here (same settings), but
# indexing the BoW matrices with TF-IDF column numbers would silently pick the
# wrong columns if the two vectorizers were ever configured differently.
bow_columns = count_vectorizer.vocabulary_
tfidf_columns = tfidf_vectorizer.vocabulary_

print(f"\nDocument {doc_idx + 1}: {documents[doc_idx]}")
print("\n" + "-" * 80)
print(f"{'Word':<20} | {'BoW Count':<15} | {'BoW Normalized':<20} | {'TF-IDF':<15}")
print("-" * 80)

for word in comparison_words:
    bow_col = bow_columns.get(word)
    if bow_col is None:
        # Term unknown to the BoW vectorizer: it contributes nothing there.
        bow_count, bow_norm = 0, 0.0
    else:
        bow_count = bow_count_dense[doc_idx, bow_col]
        bow_norm = bow_normalized_l2_dense[doc_idx, bow_col]
    tfidf = tfidf_dense[doc_idx, tfidf_columns[word]]
    print(f"{word:<20} | {bow_count:<15.0f} | {bow_norm:<20.6f} | {tfidf:<15.6f}")

print("\n\nKey Differences:")
print("-" * 80)
print("1. BoW Count: Raw word frequencies - favors longer documents")
print("2. BoW Normalized: Term frequencies - accounts for document length")
print("3. TF-IDF: Weights terms by importance - reduces impact of common words")

## 7. Word2Vec: Training the Model

Train a Word2Vec model using Gensim on the tokenized corpus.

In [None]:
# Tokenize documents for Word2Vec. simple_preprocess lowercases and tokenizes;
# with its default settings it also drops very short tokens such as "a".
tokenized_docs = [simple_preprocess(doc) for doc in documents]

print("Tokenized Documents:")
print("=" * 60)
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")

# Hyperparameters shared by both models so Skip-gram and CBOW differ only in `sg`.
# Reproducibility: gensim documents that a deterministic run needs a fixed
# `seed` AND a single worker thread (multi-threaded training introduces
# ordering jitter). Across interpreter launches PYTHONHASHSEED must also be fixed.
W2V_SEED = 42
W2V_PARAMS = dict(
    vector_size=100,   # Dimension of word vectors
    window=5,          # Context window size
    min_count=1,       # Minimum word frequency (keep every word in this tiny corpus)
    workers=1,         # Single thread: required for reproducible training
    seed=W2V_SEED,     # Fixed RNG seed for initialisation and sampling
)

# Train Word2Vec model (Skip-gram)
print("\n\nTraining Word2Vec Model (Skip-gram)...")
print("=" * 60)

w2v_model_skipgram = Word2Vec(sentences=tokenized_docs, sg=1, **W2V_PARAMS)  # 1 = Skip-gram

print(f"Vocabulary size: {len(w2v_model_skipgram.wv)}")
print(f"Vector size: {w2v_model_skipgram.vector_size}")

# Get word vectors
print("\n\nWord Vectors (first 5 words):")
for word in list(w2v_model_skipgram.wv.index_to_key)[:5]:
    vector = w2v_model_skipgram.wv[word]
    print(f"{word}: {vector[:5]}... (showing first 5 dimensions)")

# Train Word2Vec model (CBOW)
print("\n\nTraining Word2Vec Model (CBOW - Continuous Bag of Words)...")
print("=" * 60)

w2v_model_cbow = Word2Vec(sentences=tokenized_docs, sg=0, **W2V_PARAMS)  # 0 = CBOW

print(f"Vocabulary size: {len(w2v_model_cbow.wv)}")

## 8. Word2Vec: Word Similarity

Find similar words and calculate similarity scores.

In [None]:
# Nearest neighbours of a few probe words in the Skip-gram embedding space
print("Most Similar Words (Skip-gram model):")
print("=" * 60)

skipgram_vectors = w2v_model_skipgram.wv
test_words = ['learning', 'neural', 'processing']

for word in test_words:
    # Guard first: report out-of-vocabulary probes and move on
    if word not in skipgram_vectors:
        print(f"\nWord '{word}' not in vocabulary")
        continue
    print(f"\nWords most similar to '{word}':")
    for similar_word, similarity in skipgram_vectors.most_similar(word, topn=5):
        print(f"  {similar_word:<20} : {similarity:.4f}")

# Pairwise similarity for hand-picked word pairs
print("\n\nWord-to-Word Similarity:")
print("=" * 60)

word_pairs = [
    ('machine', 'learning'),
    ('neural', 'networks'),
    ('python', 'language'),
    ('data', 'statistics'),
]

for word1, word2 in word_pairs:
    # Both words must be in the vocabulary before a similarity can be computed
    if not (word1 in skipgram_vectors and word2 in skipgram_vectors):
        print(f"One of the words '{word1}' or '{word2}' not in vocabulary")
        continue
    similarity = skipgram_vectors.similarity(word1, word2)
    print(f"Similarity('{word1}', '{word2}'): {similarity:.4f}")

## 9. Word2Vec: Document Representation

Create document embeddings by averaging word vectors.

In [None]:
# Create document embeddings by averaging word vectors
def get_doc_embedding(doc_tokens, model):
    """Return the mean word vector of a tokenized document.

    Args:
        doc_tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is found.
    """
    # Tokens missing from the model's vocabulary are simply skipped.
    known_vectors = [model.wv[token] for token in doc_tokens if token in model.wv]

    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

from sklearn.metrics.pairwise import cosine_similarity

# One averaged Skip-gram vector per document
print("Document Embeddings (averaged word vectors)")
print("=" * 60)

embedding_rows = []
for doc_number, doc_tokens in enumerate(tokenized_docs, start=1):
    doc_vector = get_doc_embedding(doc_tokens, w2v_model_skipgram)
    embedding_rows.append(doc_vector)
    print(f"Document {doc_number} embedding shape: {doc_vector.shape}")
    print(f"  First 10 dimensions: {doc_vector[:10]}")

# Stack the per-document vectors into a single 2-D array (documents x dimensions)
doc_embeddings = np.array(embedding_rows)

# Pairwise cosine similarity between all document embeddings
print("\n\nDocument-to-Document Similarity (Cosine similarity):")
print("=" * 60)

similarity_matrix = cosine_similarity(doc_embeddings)

# Tabular view of the full similarity matrix
sim_df = pd.DataFrame(similarity_matrix)
print("\nDocument Similarity Matrix:")
print(sim_df.round(3))

# Rank document pairs by similarity
print("\n\nMost Similar Document Pairs:")
print("-" * 60)

# Strict upper triangle (k=1): each unordered pair once, no self-pairs
pair_rows, pair_cols = np.triu_indices_from(similarity_matrix, k=1)
similarities = sorted(
    ((i, j, similarity_matrix[i, j]) for i, j in zip(pair_rows, pair_cols)),
    key=lambda pair: pair[2],
    reverse=True,
)

for doc1_idx, doc2_idx, sim in similarities[:5]:
    print(f"\nDocument {doc1_idx+1} & {doc2_idx+1}: {sim:.4f}")
    print(f"  Doc {doc1_idx+1}: {documents[doc1_idx][:50]}...")
    print(f"  Doc {doc2_idx+1}: {documents[doc2_idx][:50]}...")

## 10. Visualization: Word2Vec Embeddings

Visualize high-dimensional embeddings using dimensionality reduction.

In [None]:
# Project the Skip-gram word vectors to 2-D with t-SNE and plot them
from sklearn.manifold import TSNE

# Vocabulary and its embedding matrix, both taken from the same KeyedVectors
words = w2v_model_skipgram.wv.index_to_key
word_vectors = w2v_model_skipgram.wv.vectors

# Apply t-SNE for dimensionality reduction (100D -> 2D).
# Perplexity is capped at (vocabulary size - 1) so it stays below the sample count.
print("Reducing dimensionality using t-SNE...")
perplexity = min(30, len(words) - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
vectors_2d = tsne.fit_transform(word_vectors)

# Scatter plot via the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(vectors_2d[:, 0], vectors_2d[:, 1], alpha=0.6, s=50)

# Label every point with its word
for word, (x_coord, y_coord) in zip(words, vectors_2d):
    ax.annotate(word, (x_coord, y_coord), fontsize=9, ha='center')

ax.set_title("Word2Vec Embeddings - t-SNE Visualization")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("Visualization complete!")

## 11. Summary and Comparison

Summary of all techniques demonstrated.

In [None]:
# Final recap cell: prints a prose summary, a comparison table and a conclusion.
# The text below has been corrected on three points:
#   * the TF-IDF formula now states that scikit-learn's TfidfVectorizer (used in
#     section 5) applies a smoothed IDF and L2 row normalization;
#   * Word2Vec is no longer described as "context-aware": it learns from context
#     but yields one static vector per word;
#   * Word2Vec no longer claims to capture word order: context words inside the
#     window are treated as an unordered set.
print("SUMMARY OF TEXT VECTORIZATION TECHNIQUES")
print("=" * 80)

summary = """
1. BAG-OF-WORDS (BoW - Count Occurrence):
   - Representation: Count of each word in a document
   - Pros: Simple, interpretable, fast
   - Cons: Loses word order, large sparse vectors
   - Use case: Baseline approach, document classification
   
2. BAG-OF-WORDS (Normalized Count):
   - Representation: Term Frequency (TF) - normalized word counts
   - Pros: Accounts for document length, normalized values [0,1]
   - Cons: Still loses word order, ignores word importance
   - Use case: When document length varies significantly
   
3. TF-IDF (Term Frequency-Inverse Document Frequency):
   - Representation: TF * IDF - weights terms by importance
   - Formula (textbook): TF-IDF = (word_count/total_words) * log(total_docs/docs_with_word)
   - Note: scikit-learn's TfidfVectorizer uses raw counts as TF, a smoothed IDF
     ln((1 + total_docs)/(1 + docs_with_word)) + 1, and L2-normalizes each row
   - Pros: Reduces impact of common words, better for info retrieval
   - Cons: Still sparse, context-independent
   - Use case: Information retrieval, search engines, document similarity
   
4. WORD2VEC:
   - Representation: Dense word embeddings (100-300 dimensions)
   - Types: Skip-gram (predicts context from word) or CBOW (predicts word from context)
   - Pros: Captures semantic relationships, dense vectors, learned from word context
   - Cons: Requires more training data, harder to interpret, one static vector per word
   - Use case: Similarity search, analogies, downstream NLP tasks

COMPARISON TABLE:
"""

print(summary)

comparison_table = pd.DataFrame({
    'Technique': ['BoW Count', 'BoW Normalized', 'TF-IDF', 'Word2Vec'],
    'Dimensionality': ['Sparse (Vocab Size)', 'Sparse (Vocab Size)', 'Sparse (Vocab Size)', 'Dense (100-300)'],
    'Captures Order': ['No', 'No', 'No', 'No (local context window)'],
    'Computational Cost': ['Low', 'Low', 'Medium', 'High'],
    'Semantic Info': ['No', 'No', 'Weak', 'Strong'],
    'Best For': ['Baseline', 'Classification', 'Info Retrieval', 'Deep Learning']
})

# to_string(index=False) prints the table without the 0..3 row index.
print(comparison_table.to_string(index=False))

print("\n" + "=" * 80)
print("CONCLUSION:")
print("-" * 80)
print("""
- Choose BoW for simple, interpretable baselines
- Use TF-IDF for information retrieval and document ranking
- Use Word2Vec for semantic similarity and advanced NLP tasks
- Combine multiple techniques for best results in real-world applications
""")