## 1. Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from tqdm import tqdm
import pickle
import os

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Load Dataset

In [2]:
# Load cleaned dataset
df = pd.read_csv('../../data/cleaned_label.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nSentiment distribution:")
print(df['sentiment_label'].value_counts())

Dataset shape: (50000, 2)

Columns: ['review_text', 'sentiment_label']

First few rows:
                                         review_text  sentiment_label
0  Once again Mr. Costner has dragged out a movie...                0
1  This is a pale imitation of 'Officer and a Gen...                0
2  Years ago, when DARLING LILI played on TV, it ...                0
3  I was looking forward to this movie. Trustwort...                0
4  First of all, I would like to say that I am a ...                0

Sentiment distribution:
sentiment_label
0    25000
1    25000
Name: count, dtype: int64


## 3. Tokenize Texts

In [3]:
# Simple tokenization (split by space)
def simple_tokenize(text):
    """Split text into words"""
    return str(text).lower().split()

# Tokenize all texts
print("Tokenizing texts...")
tokenized_texts = [simple_tokenize(text) for text in tqdm(df['review_text'])]

print(f"\nTotal documents: {len(tokenized_texts)}")
print(f"Sample tokenized text: {tokenized_texts[0][:20]}...")

Tokenizing texts...


100%|██████████| 50000/50000 [00:01<00:00, 40265.68it/s]


Total documents: 50000
Sample tokenized text: ['once', 'again', 'mr.', 'costner', 'has', 'dragged', 'out', 'a', 'movie', 'for', 'far', 'longer', 'than', 'necessary.', 'aside', 'from', 'the', 'terrific', 'sea', 'rescue']...





## 4. Train Word2Vec Model

In [4]:
# Word2Vec parameters
VECTOR_SIZE = 100      # Embedding dimension
WINDOW = 5             # Context window size
MIN_COUNT = 2          # Minimum word frequency
WORKERS = 4            # Number of threads
EPOCHS = 10            # Training epochs

print("Training Word2Vec model...")
print(f"Parameters:")
print(f"  - Vector size: {VECTOR_SIZE}")
print(f"  - Window: {WINDOW}")
print(f"  - Min count: {MIN_COUNT}")
print(f"  - Epochs: {EPOCHS}")
print()

# Train model
w2v_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS,
    epochs=EPOCHS,
    sg=0  # 0=CBOW, 1=Skip-gram
)

print("\nWord2Vec model trained successfully!")
print(f"Vocabulary size: {len(w2v_model.wv)}")

Training Word2Vec model...
Parameters:
  - Vector size: 100
  - Window: 5
  - Min count: 2
  - Epochs: 10



Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl


Word2Vec model trained successfully!
Vocabulary size: 149452


## 5. Generate Document Embeddings

In [5]:
def get_document_embedding(tokens, model):
    """
    Get document embedding by averaging word vectors
    
    Args:
        tokens: list of words
        model: trained Word2Vec model
        
    Returns:
        numpy array: document embedding
    """
    vectors = []
    for word in tokens:
        if word in model.wv:
            vectors.append(model.wv[word])
    
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    
    return np.mean(vectors, axis=0)

# Generate embeddings for all documents
print("Generating document embeddings...")
embeddings = []

for tokens in tqdm(tokenized_texts):
    embedding = get_document_embedding(tokens, w2v_model)
    embeddings.append(embedding)

embeddings = np.array(embeddings)

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Each document represented as {embeddings.shape[1]}-dimensional vector")

Generating document embeddings...


100%|██████████| 50000/50000 [00:15<00:00, 3151.89it/s]


Embeddings shape: (50000, 100)
Each document represented as 100-dimensional vector





## 6. Save Model and Embeddings

In [6]:
# Create output directory
output_dir = '../../models/word2vec'
os.makedirs(output_dir, exist_ok=True)

# Save Word2Vec model
model_path = os.path.join(output_dir, 'word2vec_imdb.model')
w2v_model.save(model_path)
print(f"Word2Vec model saved: {model_path}")

# Save embeddings
embeddings_path = os.path.join(output_dir, 'document_embeddings.npy')
np.save(embeddings_path, embeddings)
print(f"Document embeddings saved: {embeddings_path}")

# Save labels
labels_path = os.path.join(output_dir, 'labels.npy')
np.save(labels_path, df['sentiment_label'].values)
print(f"Labels saved: {labels_path}")

# Save metadata
metadata = {
    'vector_size': VECTOR_SIZE,
    'window': WINDOW,
    'min_count': MIN_COUNT,
    'epochs': EPOCHS,
    'vocab_size': len(w2v_model.wv),
    'num_documents': len(embeddings),
    'embedding_shape': embeddings.shape
}

metadata_path = os.path.join(output_dir, 'metadata.pkl')
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
print(f"Metadata saved: {metadata_path}")

print("\n" + "="*80)
print("All files saved successfully!")
print("="*80)
print(f"\nOutput directory: {output_dir}")
print(f"Files:")
print(f"  - word2vec_imdb.model      (Word2Vec model)")
print(f"  - document_embeddings.npy  (Document embeddings: {embeddings.shape})")
print(f"  - labels.npy               (Sentiment labels)")
print(f"  - metadata.pkl             (Model metadata)")

Word2Vec model saved: ../../models/word2vec/word2vec_imdb.model
Document embeddings saved: ../../models/word2vec/document_embeddings.npy
Labels saved: ../../models/word2vec/labels.npy
Metadata saved: ../../models/word2vec/metadata.pkl

All files saved successfully!

Output directory: ../../models/word2vec
Files:
  - word2vec_imdb.model      (Word2Vec model)
  - document_embeddings.npy  (Document embeddings: (50000, 100))
  - labels.npy               (Sentiment labels)
  - metadata.pkl             (Model metadata)


## 7. Test Word Similarities

In [7]:
# Test word similarities
print("Testing word similarities...\n")

test_words = ['good', 'bad', 'movie', 'film', 'great', 'terrible']

for word in test_words:
    if word in w2v_model.wv:
        print(f"Similar words to '{word}':")
        similar = w2v_model.wv.most_similar(word, topn=5)
        for similar_word, score in similar:
            print(f"  {similar_word:15s} {score:.4f}")
        print()
    else:
        print(f"'{word}' not in vocabulary\n")

Testing word similarities...

Similar words to 'good':
  decent          0.7669
  great           0.7581
  good,           0.7362
  bad             0.7054
  nice            0.6695

Similar words to 'bad':
  bad,            0.7703
  bad.            0.7347
  terrible        0.7218
  horrible        0.7061
  good            0.7054

Similar words to 'movie':
  film            0.9332
  movie,          0.8348
  flick           0.7746
  film,           0.7715
  movie.          0.7415

Similar words to 'film':
  movie           0.9332
  film,           0.8228
  movie,          0.7716
  flick           0.7451
  film;           0.7133

Similar words to 'great':
  wonderful       0.8205
  fantastic       0.7910
  fine            0.7849
  terrific        0.7624
  good            0.7581

Similar words to 'terrible':
  horrible        0.9129
  horrid          0.8171
  lousy           0.7659
  awful           0.7606
  awful,          0.7527

  decent          0.7669
  great           0.7581
  good,  

## 8. Load Saved Embeddings (Example)

In [8]:
# Example: How to load saved embeddings
print("Example - Loading saved embeddings:\n")

# Load Word2Vec model
loaded_model = Word2Vec.load(model_path)
print(f"✓ Word2Vec model loaded")
print(f"  Vocabulary size: {len(loaded_model.wv)}")

# Load embeddings
loaded_embeddings = np.load(embeddings_path)
print(f"\n✓ Document embeddings loaded")
print(f"  Shape: {loaded_embeddings.shape}")

# Load labels
loaded_labels = np.load(labels_path)
print(f"\n✓ Labels loaded")
print(f"  Shape: {loaded_labels.shape}")

# Load metadata
with open(metadata_path, 'rb') as f:
    loaded_metadata = pickle.load(f)
print(f"\n✓ Metadata loaded")
print(f"  {loaded_metadata}")

print("\n" + "="*80)
print("Word2Vec embedding generation complete!")
print("Use these embeddings for any classifier (Logistic Regression, SVM, etc.)")
print("="*80)

Example - Loading saved embeddings:

✓ Word2Vec model loaded
  Vocabulary size: 149452

✓ Document embeddings loaded
  Shape: (50000, 100)

✓ Labels loaded
  Shape: (50000,)

✓ Metadata loaded
  {'vector_size': 100, 'window': 5, 'min_count': 2, 'epochs': 10, 'vocab_size': 149452, 'num_documents': 50000, 'embedding_shape': (50000, 100)}

Word2Vec embedding generation complete!
Use these embeddings for any classifier (Logistic Regression, SVM, etc.)
✓ Word2Vec model loaded
  Vocabulary size: 149452

✓ Document embeddings loaded
  Shape: (50000, 100)

✓ Labels loaded
  Shape: (50000,)

✓ Metadata loaded
  {'vector_size': 100, 'window': 5, 'min_count': 2, 'epochs': 10, 'vocab_size': 149452, 'num_documents': 50000, 'embedding_shape': (50000, 100)}

Word2Vec embedding generation complete!
Use these embeddings for any classifier (Logistic Regression, SVM, etc.)
