In [1]:
# PROBLEM 1: Scratch implementation of BoW
def bow_scratch_implementation():
    """Build 1-gram and 2-gram bag-of-words vectors by hand (no scikit-learn).

    Three fixed example sentences are tokenised, a sorted vocabulary is built
    for single words and for adjacent word pairs, and each sentence is turned
    into a vector of occurrence counts. Everything is printed as it goes.

    Returns:
        ``(unigram_vectors, unigram_vocab, bigram_vectors, bigram_vocab)``.
        Each ``*_vectors`` value is a list holding one count list per
        sentence, aligned position-by-position with its vocabulary list.
    """

    print("🎯 PROBLEM 1: Scratch BoW Implementation")
    print("=" * 50)

    # The three sentences from the problem
    sentences = [
        "This movie is SOOOO funny!!!",
        "What a movie! I never",
        "best movie ever!!!!! this movie"
    ]

    print("Input sentences:")
    for number, text in enumerate(sentences, start=1):
        print(f"  {number}. {text}")

    def tokenize(text):
        # Letters/digits are lowercased, whitespace is kept, and every other
        # character becomes a space so that punctuation separates words.
        pieces = []
        for ch in text:
            keep = ch.isalnum() or ch.isspace()
            pieces.append(ch.lower() if keep else ' ')
        return ''.join(pieces).split()

    def count_vectors(documents):
        # Sorted vocabulary over all documents, then one count row per document.
        vocab = sorted({item for doc in documents for item in doc})
        rows = [[doc.count(item) for item in vocab] for doc in documents]
        return rows, vocab

    def bow_1gram(sentences):
        # 1-gram features are the tokens themselves.
        return count_vectors([tokenize(text) for text in sentences])

    def bow_2gram(sentences):
        # 2-gram features are adjacent token pairs joined with an underscore.
        documents = []
        for text in sentences:
            words = tokenize(text)
            pairs = [f"{left}_{right}" for left, right in zip(words, words[1:])]
            documents.append(pairs)
        return count_vectors(documents)

    # Calculate 1-gram and 2-gram
    bow_1gram_vectors, vocab_1gram = bow_1gram(sentences)
    bow_2gram_vectors, vocab_2gram = bow_2gram(sentences)

    report = (
        ("\n📊 1-gram BoW Results:", vocab_1gram, bow_1gram_vectors),
        ("\n📊 2-gram BoW Results:", vocab_2gram, bow_2gram_vectors),
    )
    for heading, vocab, vectors in report:
        print(heading)
        print(f"Vocabulary: {vocab}")
        for number, vector in enumerate(vectors, start=1):
            print(f"Sentence {number}: {vector}")

    return bow_1gram_vectors, vocab_1gram, bow_2gram_vectors, vocab_2gram

# Run Problem 1
# The returned count vectors and vocabularies are kept as notebook-level variables.
bow_1gram_vectors, vocab_1gram, bow_2gram_vectors, vocab_2gram = bow_scratch_implementation()

🎯 PROBLEM 1: Scratch BoW Implementation
Input sentences:
  1. This movie is SOOOO funny!!!
  2. What a movie! I never
  3. best movie ever!!!!! this movie

📊 1-gram BoW Results:
Vocabulary: ['a', 'best', 'ever', 'funny', 'i', 'is', 'movie', 'never', 'soooo', 'this', 'what']
Sentence 1: [0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0]
Sentence 2: [1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1]
Sentence 3: [0, 1, 1, 0, 0, 0, 2, 0, 0, 1, 0]

📊 2-gram BoW Results:
Vocabulary: ['a_movie', 'best_movie', 'ever_this', 'i_never', 'is_soooo', 'movie_ever', 'movie_i', 'movie_is', 'soooo_funny', 'this_movie', 'what_a']
Sentence 1: [0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0]
Sentence 2: [1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]
Sentence 3: [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0]


In [None]:
from sklearn.datasets import load_files
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Download stopwords if not available
# nltk.data.find raises LookupError when the corpus is not installed locally;
# only in that case is the download triggered.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Imported after the availability check above so the corpus is present when used.
from nltk.corpus import stopwords

# PROBLEM 2: TF-IDF Calculation with scikit-learn
def tfidf_calculation():
    """Vectorise the IMDB reviews with scikit-learn's ``TfidfVectorizer``.

    Loads the raw reviews from ``./aclImdb/train/`` and ``./aclImdb/test/``,
    fits a TF-IDF vocabulary (at most 5000 features, NLTK English stop words
    removed) on the training reviews only, and transforms both splits with
    it. Dataset sizes, the first feature names and a few TF-IDF values of the
    first training review are printed.

    Returns:
        ``(x_train_tfidf, x_test_tfidf, y_train, y_test, tfidf_vectorizer)``:
        the TF-IDF matrices of the two splits, the label arrays belonging to
        the same splits, and the fitted vectorizer.
    """

    print("🎯 PROBLEM 2: TF-IDF Calculation with scikit-learn")
    print("=" * 50)

    # Load IMDB dataset as instructed
    # NOTE(review): load_files treats every sub-folder as a class; presumably
    # only the neg/pos folders are present under train/ — confirm.
    train_review = load_files('./aclImdb/train/', encoding='utf-8')
    x_train, y_train = train_review.data, train_review.target

    test_review = load_files('./aclImdb/test/', encoding='utf-8')
    # The test labels must come from the test split; taking them from
    # train_review would score predictions against unrelated labels.
    x_test, y_test = test_review.data, test_review.target

    print(f"Training samples: {len(x_train)}")
    print(f"Test samples: {len(x_test)}")
    print(f"Labels: {train_review.target_names}")

    # Get English stopwords from NLTK
    stop_words = stopwords.words('english')
    print(f"Number of stopwords: {len(stop_words)}")
    print(f"First 10 stopwords: {stop_words[:10]}")

    # Create TF-IDF vectorizer with specified parameters
    tfidf_vectorizer = TfidfVectorizer(
        max_features=5000,           # Maximum vocabulary size
        stop_words=stop_words,       # NLTK stop words
        lowercase=True,              # Convert to lowercase
        token_pattern=r'(?u)\b\w+\b' # Token pattern (also keeps 1-character tokens)
    )

    # Fit on the training reviews only; the test reviews are transformed with
    # the vocabulary and IDF weights learned from the training split.
    print("\n🔄 Fitting TF-IDF vectorizer...")
    x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
    x_test_tfidf = tfidf_vectorizer.transform(x_test)

    print(f"TF-IDF matrix shape - Train: {x_train_tfidf.shape}")
    print(f"TF-IDF matrix shape - Test: {x_test_tfidf.shape}")

    # Show some feature names
    feature_names = tfidf_vectorizer.get_feature_names_out()
    print(f"\nFirst 20 feature names: {feature_names[:20]}")

    # Show TF-IDF values for first sample (densify just that one row)
    first_sample_tfidf = x_train_tfidf[0].toarray().flatten()
    non_zero_indices = first_sample_tfidf.nonzero()[0]

    print(f"\nFirst sample - Non-zero TF-IDF values:")
    for idx in non_zero_indices[:10]:  # Show first 10 non-zero values
        print(f"  {feature_names[idx]}: {first_sample_tfidf[idx]:.4f}")

    return x_train_tfidf, x_test_tfidf, y_train, y_test, tfidf_vectorizer

# Run Problem 2
# x_train_tfidf, x_test_tfidf, y_train and y_test are read as globals by
# classification_with_tfidf() in Problem 3.
x_train_tfidf, x_test_tfidf, y_train, y_test, tfidf_vectorizer = tfidf_calculation()

In [None]:
# PROBLEM 3: Learning with TF-IDF
def classification_with_tfidf():
    """Train and evaluate a logistic-regression classifier on TF-IDF features.

    First fits a classifier on the notebook-level ``x_train_tfidf`` /
    ``y_train`` produced by Problem 2 and reports accuracy and a
    classification report on ``x_test_tfidf`` / ``y_test``. Then re-vectorises
    the raw reviews with several ``max_features`` / ``ngram_range`` settings
    and prints the test accuracy of a fresh classifier for each.

    Returns:
        The classifier fitted on the Problem 2 TF-IDF matrix.
    """

    print("🎯 PROBLEM 3: Classification with TF-IDF")
    print("=" * 50)

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report

    # Use Logistic Regression as binary classifier
    classifier = LogisticRegression(random_state=42, max_iter=1000)

    print("🔄 Training classifier...")
    classifier.fit(x_train_tfidf, y_train)

    # Predict on test set
    y_pred = classifier.predict(x_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"✅ Test Accuracy: {accuracy:.4f}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

    # Experiment with different parameters
    print("\n🔬 Experimenting with different parameters:")

    # The raw review texts are not exported by Problem 2 (only the TF-IDF
    # matrices and labels are), so load the texts and their labels here
    # rather than relying on undefined globals.
    train_review = load_files('./aclImdb/train/', encoding='utf-8')
    test_review = load_files('./aclImdb/test/', encoding='utf-8')
    raw_train, train_labels = train_review.data, train_review.target
    raw_test, test_labels = test_review.data, test_review.target

    # Same stop-word list for every run; build it once.
    stop_words = stopwords.words('english')

    parameters_experiment = [
        {'max_features': 1000, 'ngram_range': (1, 1)},
        {'max_features': 5000, 'ngram_range': (1, 1)},
        {'max_features': 5000, 'ngram_range': (1, 2)},
        {'max_features': 10000, 'ngram_range': (1, 1)},
    ]

    for params in parameters_experiment:
        print(f"\nTesting: max_features={params['max_features']}, ngram_range={params['ngram_range']}")

        # Create new vectorizer with different parameters
        vectorizer = TfidfVectorizer(
            max_features=params['max_features'],
            ngram_range=params['ngram_range'],
            stop_words=stop_words,
            lowercase=True,
        )

        # Fit on the training texts only, then apply to the test texts
        x_train_new = vectorizer.fit_transform(raw_train)
        x_test_new = vectorizer.transform(raw_test)

        # Train and evaluate
        clf = LogisticRegression(random_state=42, max_iter=1000)
        clf.fit(x_train_new, train_labels)
        y_pred_new = clf.predict(x_test_new)
        acc = accuracy_score(test_labels, y_pred_new)

        print(f"  Accuracy: {acc:.4f}")

    return classifier

# Run Problem 3
# Relies on the TF-IDF matrices and labels created by the Problem 2 cell.
classifier = classification_with_tfidf()

In [None]:
# PROBLEM 4: Scratch TF-IDF Implementation
def tfidf_scratch_implementation():
    """Compute TF-IDF by hand for three fixed sentences, in two variants.

    * "Standard": TF is the term count divided by the sentence length and
      IDF is ``log(N / df)``.
    * "Scikit-learn": TF is the raw term count and IDF is
      ``log((1 + N) / (1 + df)) + 1``. No row normalisation is applied here.

    ``N`` is the number of sentences and ``df`` the number of sentences that
    contain the term. Both result matrices are printed with four decimals.

    Returns:
        ``(standard_tfidf_matrix, sklearn_tfidf_matrix)``: two lists with one
        weight list per sentence, ordered like the sorted vocabulary.
    """

    print("🎯 PROBLEM 4: Scratch TF-IDF Implementation")
    print("=" * 50)

    # The three sentences from the problem
    sentences = [
        "This movie is SOOOO funny!!!",
        "What a movie! I never",
        "best movie ever!!!!! this movie"
    ]

    print("Input sentences:")
    for number, text in enumerate(sentences, start=1):
        print(f"  {number}. {text}")

    def tokenize_all(texts):
        # Lowercase alphanumerics, keep whitespace, map anything else to a space.
        docs = []
        for text in texts:
            normalised = ''.join(
                ch.lower() if (ch.isalnum() or ch.isspace()) else ' '
                for ch in text
            )
            docs.append(normalised.split())
        return docs

    def build_vocabulary(docs):
        # Every distinct token across all documents, in sorted order.
        terms = set()
        for doc in docs:
            terms |= set(doc)
        return sorted(terms)

    def document_frequency(term, docs):
        # Number of documents in which the term occurs at least once.
        return sum(1 for doc in docs if term in doc)

    def weight_rows(tf_rows, idf_weights):
        # Element-wise TF * IDF for every document row.
        return [
            [tf * idf for tf, idf in zip(row, idf_weights)]
            for row in tf_rows
        ]

    def standard_tfidf(sentences):
        """Standard TF-IDF formula"""
        docs = tokenize_all(sentences)
        vocabulary = build_vocabulary(docs)
        n_docs = len(sentences)

        # TF: relative frequency of the term within its document
        tf_rows = []
        for doc in docs:
            doc_length = len(doc)
            tf_rows.append([doc.count(term) / doc_length for term in vocabulary])

        # IDF: log(N / df)
        idf_weights = []
        for term in vocabulary:
            df = document_frequency(term, docs)
            idf_weights.append(np.log(n_docs / df) if df > 0 else 0)

        return weight_rows(tf_rows, idf_weights), vocabulary

    def sklearn_tfidf(sentences):
        """Scikit-learn TF-IDF formula"""
        docs = tokenize_all(sentences)
        vocabulary = build_vocabulary(docs)
        n_docs = len(sentences)

        # TF: raw counts (same as BoW)
        tf_rows = [[doc.count(term) for term in vocabulary] for doc in docs]

        # IDF: smoothed variant, log((1 + N) / (1 + df)) + 1
        idf_weights = [
            np.log((1 + n_docs) / (1 + document_frequency(term, docs))) + 1
            for term in vocabulary
        ]

        return weight_rows(tf_rows, idf_weights), vocabulary

    # Calculate both versions
    standard_tfidf_matrix, vocab_standard = standard_tfidf(sentences)
    sklearn_tfidf_matrix, vocab_sklearn = sklearn_tfidf(sentences)

    report = (
        ("\n📊 Standard TF-IDF Results:", vocab_standard, standard_tfidf_matrix),
        ("\n📊 Scikit-learn TF-IDF Results:", vocab_sklearn, sklearn_tfidf_matrix),
    )
    for heading, vocab, matrix in report:
        print(heading)
        print(f"Vocabulary: {vocab}")
        for number, vector in enumerate(matrix, start=1):
            print(f"Sentence {number}: {[f'{val:.4f}' for val in vector]}")

    return standard_tfidf_matrix, sklearn_tfidf_matrix

# Run Problem 4
# numpy is imported here, after the function definition; that works because
# `np` is only looked up when tfidf_scratch_implementation() is called below.
import numpy as np
standard_tfidf_matrix, sklearn_tfidf_matrix = tfidf_scratch_implementation()

In [None]:
# PROBLEM 5: Preprocessing for Word2Vec
def preprocess_corpus():
    """Turn the first 1000 IMDB training reviews into lowercase token lists.

    Each review has URLs removed, every character that is not an ASCII letter
    or whitespace deleted, and is then lowercased and split on whitespace.
    The first three reviews are echoed before and after processing.

    Returns:
        A list with one token list per processed review.
    """

    print("🎯 PROBLEM 5: Corpus Preprocessing for Word2Vec")
    print("=" * 50)

    import re

    url_pattern = re.compile(r'http\S+')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

    def preprocess_text(text):
        """Preprocess individual text: remove special chars, lowercase, tokenize"""
        raw = text.decode('utf-8') if isinstance(text, bytes) else text
        # URLs go first so their letters are not left behind once the
        # punctuation is stripped.
        without_urls = url_pattern.sub('', raw)
        letters_only = non_letter_pattern.sub('', without_urls)
        return letters_only.lower().split()

    # Load IMDB data
    reviews = load_files('./aclImdb/train/', encoding='utf-8').data

    print(f"Original training samples: {len(reviews)}")

    # Preprocess all texts (first 1000 only, for demonstration)
    processed_corpus = []
    for position, review in enumerate(reviews[:1000]):
        review_tokens = preprocess_text(review)
        processed_corpus.append(review_tokens)

        if position >= 3:
            continue
        # Show first 3 examples
        print(f"\nSample {position+1}:")
        print(f"  Original: {review[:100]}...")
        print(f"  Processed: {review_tokens[:10]}...")

    print(f"\n✅ Preprocessed corpus: {len(processed_corpus)} documents")
    print(f"Total tokens in first document: {len(processed_corpus[0])}")

    return processed_corpus

# Run Problem 5
# processed_corpus is read as a global by train_word2vec() in Problem 6.
processed_corpus = preprocess_corpus()

In [None]:
# PROBLEM 6: Word2Vec Training
def train_word2vec():
    """Fit a gensim Word2Vec model on the notebook-level ``processed_corpus``.

    Uses 100-dimensional vectors, a context window of 5, ``min_count=5``,
    four workers and ``sg=1``. After training, the vocabulary size, a sample
    of the vocabulary and the nearest neighbours of a few probe words are
    printed.

    Returns:
        The trained ``Word2Vec`` model.
    """

    print("🎯 PROBLEM 6: Word2Vec Training")
    print("=" * 50)

    from gensim.models import Word2Vec

    hyperparameters = dict(
        vector_size=100,  # Dimension of word vectors
        window=5,         # Context window size
        min_count=5,      # Ignore words with lower frequency
        workers=4,        # Number of CPU cores
        sg=1,             # Skip-gram (1) vs CBOW (0)
    )

    # Train Word2Vec model
    print("🔄 Training Word2Vec model...")
    model = Word2Vec(sentences=processed_corpus, **hyperparameters)

    known_words = model.wv.key_to_index

    print("✅ Word2Vec model trained successfully!")
    print(f"Vocabulary size: {len(known_words)}")
    print(f"Vector dimension: {model.vector_size}")

    # Show some words from vocabulary
    vocab_words = list(known_words)[:20]
    print(f"Sample vocabulary: {vocab_words}")

    # Test word similarity
    for probe in ('movie', 'good', 'bad', 'story'):
        if probe not in known_words:
            print(f"'{probe}' not in vocabulary")
            continue
        print(f"\nWords similar to '{probe}':")
        for neighbour, score in model.wv.most_similar(probe, topn=5):
            print(f"  {neighbour}: {score:.4f}")

    return model

# Run Problem 6
# word2vec_model is read as a global by the Problem 7 and Problem 8 functions.
word2vec_model = train_word2vec()

In [None]:
# PROBLEM 7: Vector Visualization
def visualize_word_vectors():
    """Plot up to 50 vocabulary words of ``word2vec_model`` in 2-D using t-SNE.

    Takes the first 50 keys of the notebook-level model's vocabulary, reduces
    their vectors to two dimensions with t-SNE, and draws a labelled scatter
    plot. Afterwards the five nearest neighbours of a few probe words are
    printed.

    Returns:
        None. The function only plots and prints.
    """

    print("🎯 PROBLEM 7: Word Vector Visualization")
    print("=" * 50)

    import numpy as np
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    # Get words and vectors
    words = list(word2vec_model.wv.key_to_index.keys())[:50]  # First 50 words

    if len(words) >= 2:
        # TSNE needs a real 2-D array (it inspects X.shape); a plain list of
        # vectors is not enough.
        word_vectors = np.array([word2vec_model.wv[word] for word in words])

        # perplexity must be strictly smaller than the number of samples, so
        # cap it for small vocabularies.
        perplexity = min(15, len(words) - 1)

        # Apply t-SNE for 2D visualization
        print("🔄 Applying t-SNE dimensionality reduction...")
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        vectors_2d = tsne.fit_transform(word_vectors)

        # Create visualization
        plt.figure(figsize=(12, 10))
        plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], alpha=0.7)

        # Add word labels
        for i, word in enumerate(words):
            plt.annotate(word, xy=(vectors_2d[i, 0], vectors_2d[i, 1]),
                         xytext=(5, 2), textcoords='offset points',
                         fontsize=8, alpha=0.8)

        plt.title('Word2Vec Vector Visualization (t-SNE)')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.grid(True, alpha=0.3)
        plt.show()
    else:
        print("Not enough words in the vocabulary for a t-SNE plot")

    # Find similar words for specific examples
    test_words = ['good', 'bad', 'movie', 'story']
    for word in test_words:
        if word in word2vec_model.wv.key_to_index:
            print(f"\nMost similar to '{word}':")
            similar = word2vec_model.wv.most_similar(word, topn=5)
            for sim_word, score in similar:
                print(f"  {sim_word}: {score:.4f}")

# Run Problem 7
# Requires word2vec_model from the Problem 6 cell; nothing is returned.
visualize_word_vectors()

In [None]:
# PROBLEM 8: Classification with Word2Vec
def classification_with_word2vec():
    """Classify IMDB reviews from averaged Word2Vec vectors.

    Loads the first 1000 training reviews, represents each review as the mean
    of the vectors of its tokens that are in the vocabulary of the
    notebook-level ``word2vec_model`` (a zero vector when none are), fits a
    logistic-regression classifier on those vectors, and prints its accuracy
    on the same training data.

    Returns:
        ``(classifier, x_train_vectors)``: the fitted classifier and the
        array of document vectors, one row per review.
    """

    print("🎯 PROBLEM 8: Classification with Word2Vec")
    print("=" * 50)

    import re

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    import numpy as np

    # Load data
    train_review = load_files('./aclImdb/train/', encoding='utf-8')
    x_train, y_train = train_review.data[:1000], train_review.target[:1000]  # Use subset

    url_pattern = re.compile(r'http\S+')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

    def tokenize(text):
        # The tokenizer used to build the Word2Vec corpus is a nested function
        # of preprocess_corpus() and cannot be called from here, so the same
        # steps are repeated: drop URLs, drop non-letters, lowercase, split.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        text = url_pattern.sub('', text)
        text = non_letter_pattern.sub('', text)
        return text.lower().split()

    # Convert documents to vectors by averaging word vectors
    def document_to_vector(text, model):
        vectors = [
            model.wv[token]
            for token in tokenize(text)
            if token in model.wv.key_to_index
        ]
        if vectors:
            return np.mean(vectors, axis=0)
        # No known token at all: fall back to a zero vector of the right size.
        return np.zeros(model.vector_size)

    # Convert training data to vectors
    print("🔄 Converting documents to Word2Vec vectors...")
    x_train_vectors = np.array(
        [document_to_vector(text, word2vec_model) for text in x_train]
    )

    print(f"Training vectors shape: {x_train_vectors.shape}")

    # Train classifier
    print("🔄 Training classifier...")
    classifier = LogisticRegression(random_state=42, max_iter=1000)
    classifier.fit(x_train_vectors, y_train)

    # Simple train accuracy (for demonstration); this is measured on the data
    # the classifier was fitted on, not on a held-out set.
    train_pred = classifier.predict(x_train_vectors)
    train_accuracy = accuracy_score(y_train, train_pred)

    print(f"✅ Training Accuracy: {train_accuracy:.4f}")

    # Compare with pre-trained vectors (conceptual)
    print("\n💡 Using pre-trained vectors (conceptual):")
    print("  • Download pre-trained Word2Vec/FastText/GloVe vectors")
    print("  • Map words to pre-trained vectors instead of training")
    print("  • Often better performance with large pre-trained models")
    print("  • Examples: Google News Word2Vec, FastText Wikipedia vectors")

    return classifier, x_train_vectors

# Run Problem 8
# Requires word2vec_model from the Problem 6 cell.
classifier_w2v, x_train_vectors = classification_with_word2vec()