In [None]:
!pip install gensim
# Made with help from GPT
import nltk
import random
from nltk.corpus import movie_reviews
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import numpy as np
from typing import List, Optional, Any


# Download required NLTK data
nltk.download('movie_reviews')
# nltk.download('punkt')




[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:

# Load and shuffle documents
documents : list[tuple[list[str], str]]  = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Extract only the review texts for training Word2Vec
sentences = [list(map(str.lower, movie_reviews.words(fileid)))
             for fileid in movie_reviews.fileids()]


In [3]:
# Train Word2Vec model (Skip-Gram)
embedding_dim = 300
w2v_model = Word2Vec(
    sentences,
    vector_size=embedding_dim,
    window=15,
    min_count=2,
    sg=1,  # Skip-Gram
    workers=8,
    epochs=10
)


In [4]:

def get_vector(word: str) -> Optional[np.ndarray]:
    """Fetch vector from Word2Vec model if word exists."""
    return w2v_model.wv[word] if word in w2v_model.wv else None

def get_review_vector_avg(words: List[str]) -> np.ndarray:
    """Compute average vector for a list of words."""
    vectors = [w2v_model.wv[w] for w in map(str.lower, words) if w in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim, dtype=np.float32)

def get_review_vector_max(words: List[str]) -> np.ndarray:
    """Compute max-pooled vector for a list of words."""
    vectors = [w2v_model.wv[w] for w in map(str.lower, words) if w in w2v_model.wv]
    return np.max(vectors, axis=0) if vectors else np.zeros(embedding_dim, dtype=np.float32)



# Prepare data
X_avg = []
X_max = []
y = []

for words, label in documents:
    X_avg.append(get_review_vector_avg(words))
    X_max.append(get_review_vector_max(words))
    y.append(1 if label == 'pos' else 0)


In [5]:

# Split data
X_avg_train, X_avg_test, y_train, y_test = train_test_split(X_avg, y, test_size=0.2, random_state=42)
X_max_train, X_max_test, _, _ = train_test_split(X_max, y, test_size=0.2, random_state=42)

# Train classifiers
clf_avg = LogisticRegression(max_iter=1000)
clf_avg.fit(X_avg_train, y_train)
y_pred_avg = clf_avg.predict(X_avg_test)

clf_max = LogisticRegression(max_iter=1000)
clf_max.fit(X_max_train, y_train)
y_pred_max = clf_max.predict(X_max_test)

# Results
print("Classification Report (Average Vector):")
print(classification_report(y_test, y_pred_avg))

print("Classification Report (Max Pooling Vector):")
print(classification_report(y_test, y_pred_max))

Classification Report (Average Vector):
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       212
           1       0.85      0.86      0.85       188

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400

Classification Report (Max Pooling Vector):
              precision    recall  f1-score   support

           0       0.75      0.70      0.72       212
           1       0.68      0.74      0.71       188

    accuracy                           0.72       400
   macro avg       0.72      0.72      0.72       400
weighted avg       0.72      0.72      0.72       400



In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

scale_data = True

# Split data
X_avg_train, X_avg_test, y_train, y_test = train_test_split(X_avg, y, test_size=0.2, random_state=42)
X_max_train, X_max_test, _, _ = train_test_split(X_max, y, test_size=0.2, random_state=42)

if scale_data:
    clf_avg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    clf_max = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
else:
    clf_avg = LogisticRegression(max_iter=1000)
    clf_max = LogisticRegression(max_iter=1000)

# Train classifiers
clf_avg.fit(X_avg_train, y_train)
y_pred_avg = clf_avg.predict(X_avg_test)

clf_max.fit(X_max_train, y_train)
y_pred_max = clf_max.predict(X_max_test)

# Results
print("Classification Report (Average Vector):")
print(classification_report(y_test, y_pred_avg))

print("Classification Report (Max Pooling Vector):")
print(classification_report(y_test, y_pred_max))


Classification Report (Average Vector):
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       212
           1       0.89      0.92      0.90       188

    accuracy                           0.91       400
   macro avg       0.91      0.91      0.91       400
weighted avg       0.91      0.91      0.91       400

Classification Report (Max Pooling Vector):
              precision    recall  f1-score   support

           0       0.72      0.71      0.71       212
           1       0.68      0.69      0.68       188

    accuracy                           0.70       400
   macro avg       0.70      0.70      0.70       400
weighted avg       0.70      0.70      0.70       400



In [7]:

import tensorflow as tf
from tensorflow.keras import Input, Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


def build_dense_model(input_dim: int) -> tf.keras.Model:
    model = Sequential([
        Input(shape=(input_dim,)),             # Cleanly define input shape
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')         # Binary classification
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)


In [8]:
# Convert lists to NumPy arrays
X_avg_train_np = np.array(X_avg_train)
X_avg_test_np = np.array(X_avg_test)
X_max_train_np = np.array(X_max_train)
X_max_test_np = np.array(X_max_test)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

print("Train labels:", np.unique(y_train_np, return_counts=True))
print("Test labels:", np.unique(y_test_np, return_counts=True))



Train labels: (array([0, 1]), array([788, 812]))
Test labels: (array([0, 1]), array([212, 188]))


In [9]:


if scale_data:
  from sklearn.preprocessing import StandardScaler

  scaler = StandardScaler()
  X_avg_train_np_scaled = scaler.fit_transform(X_avg_train_np)
  X_avg_test_np_scaled = scaler.transform(X_avg_test_np)
  print("Data Scaled!")
else:
  X_avg_train_np_scaled = X_avg_train_np
  X_avg_test_np_scaled = X_avg_test_np


# Train on average vectors
print("Training model on average vectors...")
model_avg = build_dense_model(embedding_dim)
# model_avg.fit(X_avg_train_np, y_train_np, epochs=10, batch_size=32, validation_split=0.1)
model_avg.fit(
    X_avg_train_np_scaled,
    y_train_np,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)


# Evaluate
print("Evaluating model on average vectors...")
loss, accuracy_avg = model_avg.evaluate(X_avg_test_np_scaled, y_test_np)
print(f"Test Accuracy (Average Vector): {accuracy_avg:.4f}")


Data Scaled!
Training model on average vectors...
Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5536 - loss: 0.7516 - val_accuracy: 0.7563 - val_loss: 0.5042
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7460 - loss: 0.5209 - val_accuracy: 0.8313 - val_loss: 0.3990
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7910 - loss: 0.4423 - val_accuracy: 0.8625 - val_loss: 0.3316
Epoch 4/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8857 - loss: 0.2999 - val_accuracy: 0.8813 - val_loss: 0.3086
Epoch 5/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8993 - loss: 0.2676 - val_accuracy: 0.8813 - val_loss: 0.2833
Epoch 6/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9241 - loss: 0.2004 - val_accuracy: 0.8875 - val_lo

In [10]:


if scale_data:
  from sklearn.preprocessing import StandardScaler
  scaler = StandardScaler()
  X_max_train_np_scaled = scaler.fit_transform(X_max_train_np)
  X_max_test_np_scaled = scaler.transform(X_max_test_np)
else:
  X_max_train_np_scaled = X_max_train_np
  X_max_test_np_scaled = X_max_test_np



# Train on max-pooled vectors
print("Training model on max-pooled vectors...")
model_max = build_dense_model(embedding_dim)
model_max.fit(
    X_max_train_np_scaled,
    y_train_np,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
print("Evaluating model on max-pooled vectors...")
loss, accuracy_max = model_max.evaluate(X_max_test_np_scaled, y_test_np)
print(f"Test Accuracy (Max Pooling Vector): {accuracy_max:.4f}")


Training model on max-pooled vectors...
Epoch 1/30
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5208 - loss: 0.7838 - val_accuracy: 0.5500 - val_loss: 0.7007
Epoch 2/30
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5751 - loss: 0.6815 - val_accuracy: 0.5625 - val_loss: 0.6896
Epoch 3/30
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6048 - loss: 0.6490 - val_accuracy: 0.6187 - val_loss: 0.6671
Epoch 4/30
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6723 - loss: 0.6131 - val_accuracy: 0.6375 - val_loss: 0.6370
Epoch 5/30
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7475 - loss: 0.5365 - val_accuracy: 0.6687 - val_loss: 0.6207
Epoch 6/30
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7625 - loss: 0.4896 - val_accuracy: 0.6562 - val_loss: 0.6352


In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, GlobalMaxPooling1D, Dense, Dropout

def build_sequence_model(seq_len: int, embedding_dim: int) -> tf.keras.Model:
    inputs = Input(shape=(seq_len, embedding_dim))
    x = Conv1D(64, kernel_size=3, activation='relu')(inputs)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(16, activation='relu')(x)
    x = Dropout(0.3)(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [12]:
def review_to_sequences(words: List[str], seq_len: int = 10) -> List[np.ndarray]:
    """Convert a list of words into non-overlapping sequences of word vectors."""
    vectors = [w2v_model.wv[w.lower()] for w in words if w.lower() in w2v_model.wv]
    sequences = []
    for i in range(0, len(vectors) - seq_len + 1, seq_len):
        seq = vectors[i:i + seq_len]
        sequences.append(np.stack(seq))  # shape: (seq_len, embedding_dim)
    return sequences


In [13]:
sequence_length = 15


# Shuffle and split into train/test by review (not sequences)
train_reviews, test_reviews = train_test_split(documents, test_size=0.2, random_state=42)
# Sequence training data from train_reviews
X_seq, y_seq = [], []

for words, label in train_reviews:
    sequences = review_to_sequences(words, seq_len=sequence_length)
    X_seq.extend(sequences)
    y_seq.extend([1 if label == 'pos' else 0] * len(sequences))

X_seq = np.array(X_seq)
y_seq = np.array(y_seq)


In [14]:
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

model_seq = build_sequence_model(sequence_length, embedding_dim)
model_seq.fit(
    X_seq_train, y_seq_train,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

loss, acc = model_seq.evaluate(X_seq_test, y_seq_test)
print(f"Sequence Model Accuracy: {acc:.4f}")


Epoch 1/30
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.5537 - loss: 0.6810 - val_accuracy: 0.6500 - val_loss: 0.6195
Epoch 2/30
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.6604 - loss: 0.6171 - val_accuracy: 0.6705 - val_loss: 0.6037
Epoch 3/30
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.6797 - loss: 0.5923 - val_accuracy: 0.6657 - val_loss: 0.6014
Epoch 4/30
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.7031 - loss: 0.5738 - val_accuracy: 0.6687 - val_loss: 0.5981
Epoch 5/30
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.7113 - loss: 0.5601 - val_accuracy: 0.6671 - val_loss: 0.5979
Epoch 6/30
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.7271 - loss: 0.5426 - val_accuracy: 0.6610 - val_loss: 0.6061
Epoch 7/30
[1m9

In [15]:
def predict_review(words: List[str], model, seq_len: int = 10) -> float:
    """Predict review sentiment by averaging predictions over all sequences."""
    sequences = review_to_sequences(words, seq_len)
    if not sequences:
        return 0.5  # neutral prediction
    sequences_np = np.array(sequences)
    preds = model.predict(sequences_np, verbose=0)
    return float(np.mean(preds))  # average prediction across sequences


In [16]:
sample_review_words = movie_reviews.words(movie_reviews.fileids('pos')[0])
score = predict_review(words, model_seq, seq_len=sequence_length)
print(f"Predicted sentiment score: {score:.3f} → {'Positive' if score >= 0.5 else 'Negative'}")


Predicted sentiment score: 0.680 → Positive


In [17]:
# Run review-level evaluation
correct = 0
for words, label in test_reviews:
    score = predict_review(words, model_seq, seq_len=sequence_length)
    pred_label = 'pos' if score >= 0.5 else 'neg'
    if pred_label == label:
        correct += 1

review_accuracy = correct / len(test_reviews)
print(f"Review-level accuracy (Case C): {review_accuracy:.4f}")


Review-level accuracy (Case C): 0.9050
