In [2]:
# Check IMDB dataset format
import torch
from torchtext.datasets import IMDB
import torchtext
import random

torchtext.disable_torchtext_deprecation_warning()

def inspect_imdb_data():
    print("Loading IMDB dataset...")
    train_data = IMDB(split='train')
    
    # Convert to list and count overall distribution
    all_samples = list(train_data)
    total_pos = sum(1 for label, _ in all_samples if label == 2)
    total_neg = sum(1 for label, _ in all_samples if label == 1)
    
    print(f"\nTotal dataset statistics:")
    print(f"Total samples: {len(all_samples)}")
    print(f"Total positive reviews: {total_pos}")
    print(f"Total negative reviews: {total_neg}")
    
    # Randomly select 10 samples for inspection
    print("\nInspecting 10 random samples:")
    random_samples = random.sample(all_samples, 10)
    
    for i, sample in enumerate(random_samples, 1):
        print(f"\nSample {i}:")
        true_label, review_text = sample
        
        sentiment = "Negative" if true_label == 1 else "Positive"
        
        print(f"True Label (raw): {true_label}")
        print(f"Sentiment: {sentiment}")
        print(f"Review preview: {review_text[:200]}...")
        print("-" * 80)
        
    # Calculate distribution in random samples
    sample_pos = sum(1 for label, _ in random_samples if label == 2)
    sample_neg = sum(1 for label, _ in random_samples if label == 1)
    print(f"\nIn these random samples:")
    print(f"Positive reviews: {sample_pos}")
    print(f"Negative reviews: {sample_neg}")

if __name__ == "__main__":
    inspect_imdb_data()



Loading IMDB dataset...


################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################




Total dataset statistics:
Total samples: 25000
Total positive reviews: 12500
Total negative reviews: 12500

Inspecting 10 random samples:

Sample 1:
True Label (raw): 1
Sentiment: Negative
Review preview: no redeeming qualities can possibly be expressed. i wish i could get my time back. nice skull face broad really smiles, bright at the camera when the disease has already wreaked enough havoc on the il...
--------------------------------------------------------------------------------

Sample 2:
True Label (raw): 2
Sentiment: Positive
Review preview: "A Guy Thing" may not be a classic, but it sure is a good, funny comedy. The plot focuses on Paul (Jason Lee), who wakes up the morning after his bachelor party with no memory and Becky (Julia Stiles)...
--------------------------------------------------------------------------------

Sample 3:
True Label (raw): 1
Sentiment: Negative
Review preview: I'm just quite disappointed with "Soul Survivors". It doesn't worth even a comment in this

In [3]:
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe
from torchtext.datasets import IMDB
import nltk
from nltk.tokenize import word_tokenize
import re
import numpy as np
from tqdm import tqdm
import os
import warnings
import torchtext
import random

# Disable torchtext deprecation warnings
torchtext.disable_torchtext_deprecation_warning()

def preprocess_text(text):
    """Text preprocessing function"""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and HTML tags
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Truncate or pad to fixed length
    if len(tokens) > 256:
        tokens = tokens[:256]
    else:
        tokens.extend(['<pad>'] * (256 - len(tokens)))
    return tokens

def text_to_features(tokens, glove):
    # Convert tokens to GloVe embeddings
    features = torch.zeros(256, 300)
    for i, token in enumerate(tokens[:256]):
        if token in glove.stoi:
            features[i] = glove.vectors[glove.stoi[token]]
    return features

class IMDbDataset(Dataset):
    def __init__(self, data_iter, glove, max_samples=None, split='train'):
        """
        Parameters:
            data_iter: IMDB data iterator
            glove: GloVe word vectors
            max_samples: Maximum samples per class (None means use all data)
            split: 'train' or 'test', used for display information
        """
        print(f"Preprocessing {split} dataset...")
        # Collect all samples and group by label
        pos_samples = []
        neg_samples = []
        all_data = list(data_iter)
        
        print(f"Separating positive and negative samples for {split} set...")
        for label, text in all_data:
            if label == 1:  # negative
                neg_samples.append((label, text))
            else:  # positive (label == 2)
                pos_samples.append((label, text))
                
        print(f"Total {split} samples - Positive: {len(pos_samples)}, Negative: {len(neg_samples)}")
        
        # If max_samples specified, ensure balanced classes
        if max_samples:
            samples_per_class = max_samples // 2
            pos_samples = random.sample(pos_samples, samples_per_class)
            neg_samples = random.sample(neg_samples, samples_per_class)
            print(f"Using {samples_per_class} samples per class for {split} set")
            
        # Combine and shuffle samples
        self.samples = []
        combined_samples = pos_samples + neg_samples
        random.shuffle(combined_samples)
        
        # Process samples
        print(f"Processing {split} samples...")
        for label, text in tqdm(combined_samples):
            try:
                tokens = preprocess_text(text)
                features = text_to_features(tokens, glove)
                label_tensor = torch.tensor(label - 1, dtype=torch.long)  # Convert 1->0, 2->1
                self.samples.append((features, label_tensor))
            except Exception as e:
                print(f"Error processing sample: {str(e)[:200]}")
                continue
        
        print(f"Successfully processed {len(self.samples)} {split} samples")
        
        # Verify final distribution
        final_pos = sum(1 for _, label in self.samples if label == 1)
        final_neg = sum(1 for _, label in self.samples if label == 0)
        print(f"Final {split} distribution - Positive: {final_pos}, Negative: {final_neg}")
        
    def __len__(self):
        return len(self.samples)
        
    def __getitem__(self, idx):
        return self.samples[idx]

class SentimentModel(nn.Module):
    def __init__(self):
        super(SentimentModel, self).__init__()
        # Feature extraction layers
        self.feature_extractor = nn.Sequential(
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(150, 64),
            nn.ReLU(),
        )
        # Classification layers
        self.classifier = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 2)
        )
        
    def forward(self, x):
        batch_size = x.size(0)
        token_features = []
        # Process each token in the sequence
        for i in range(x.size(1)):
            token_vec = x[:, i, :]
            token_features.append(self.feature_extractor(token_vec))
        
        token_features = torch.stack(token_features, dim=1)
        # Mean pooling over token features
        pooled_features = torch.mean(token_features, dim=1)
        
        output = self.classifier(pooled_features)
        return output

def evaluate_model(model, test_loader, criterion, device):
    """Evaluate model performance on test data"""
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    pos_correct = 0
    neg_correct = 0
    pos_total = 0
    neg_total = 0
    
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc='Evaluating'):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            test_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Track accuracy separately for positive and negative samples
            pos_mask = labels == 1
            neg_mask = labels == 0
            pos_correct += (predicted[pos_mask] == labels[pos_mask]).sum().item()
            neg_correct += (predicted[neg_mask] == labels[neg_mask]).sum().item()
            pos_total += pos_mask.sum().item()
            neg_total += neg_mask.sum().item()
    
    test_loss = test_loss / len(test_loader)
    accuracy = 100 * correct / total
    pos_accuracy = 100 * pos_correct / pos_total if pos_total > 0 else 0
    neg_accuracy = 100 * neg_correct / neg_total if neg_total > 0 else 0
    
    print('Test Results:')
    print(f'Loss: {test_loss:.4f}')
    print(f'Overall Accuracy: {accuracy:.2f}%')
    print(f'Positive Reviews Accuracy: {pos_accuracy:.2f}%')
    print(f'Negative Reviews Accuracy: {neg_accuracy:.2f}%')
    
    return accuracy

def train_model(model, train_loader, criterion, optimizer, device, num_epochs=5):
    """Train the model"""
    model.train()
    best_accuracy = 0.0
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        pos_correct = 0
        neg_correct = 0
        pos_total = 0
        neg_total = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for inputs, labels in progress_bar:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            
            # Forward pass
            optimizer.zero_grad(set_to_none=True)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            # Track metrics
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Track accuracy by class
            pos_mask = labels == 1
            neg_mask = labels == 0
            pos_correct += (predicted[pos_mask] == labels[pos_mask]).sum().item()
            neg_correct += (predicted[neg_mask] == labels[neg_mask]).sum().item()
            pos_total += pos_mask.sum().item()
            neg_total += neg_mask.sum().item()
            
            # Clean up GPU memory if using MPS
            if device.type == 'mps':
                torch.mps.empty_cache()
            
            progress_bar.set_postfix({
                'loss': f'{running_loss/(total//labels.size(0)):.4f}',
                'acc': f'{100*correct/total:.2f}%'
            })
            
        # Print epoch statistics
        epoch_acc = 100 * correct/total
        print(f'Epoch {epoch+1}:')
        print(f'Loss: {running_loss/len(train_loader):.4f}')
        print(f'Overall Accuracy: {epoch_acc:.2f}%')
        print(f'Positive Reviews Accuracy: {100*pos_correct/pos_total:.2f}%')
        print(f'Negative Reviews Accuracy: {100*neg_correct/neg_total:.2f}%')
        
        if epoch_acc > best_accuracy:
            best_accuracy = epoch_acc
            print(f'New best accuracy: {best_accuracy:.2f}%')

def save_weights_txt(model, output_dir='weights'):
    """Save model weights in text format"""
    os.makedirs(output_dir, exist_ok=True)
    state_dict = model.state_dict()
    for name, param in state_dict.items():
        param_numpy = param.detach().cpu().numpy()
        filename = os.path.join(output_dir, f"{name.replace('.', '_')}.txt")
        param_numpy.tofile(filename, sep='\n', format='%.8e')
        print(f"Saved {name} to {filename}")

def save_weights_binary(model, output_dir='weights_bin'):
    """Save model weights in binary format"""
    os.makedirs(output_dir, exist_ok=True)
    state_dict = model.state_dict()
    for name, param in state_dict.items():
        param_numpy = param.detach().cpu().numpy()
        filename = os.path.join(output_dir, f"{name.replace('.', '_')}.bin")
        param_numpy.tofile(filename)
        print(f"Saved {name} to {filename}")

def main(train_samples=1000, test_samples=200):
    """
    Main function
    Parameters:
        train_samples: Total number of training samples (will be split equally between positive/negative)
        test_samples: Total number of test samples (will be split equally between positive/negative)
    """
    # Set random seed for reproducibility
    random.seed(42)
    torch.manual_seed(42)
    
    # Set device (MPS/CUDA/CPU)
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS device")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA device")
    else:
        device = torch.device("cpu")
        print("Using CPU device")
    
    # Download NLTK data if needed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print("Downloading NLTK punkt tokenizer...")
        nltk.download('punkt', quiet=True)
    
    # Load datasets and word vectors
    print("Loading IMDb dataset...")
    train_data = IMDB(split='train')
    test_data = IMDB(split='test')
    
    print("Loading GloVe vectors...")
    glove = GloVe(name='6B', dim=300)
    
    # Create datasets and dataloaders
    print("Creating data loaders...")
    train_dataset = IMDbDataset(train_data, glove, max_samples=train_samples, split='train')
    test_dataset = IMDbDataset(test_data, glove, max_samples=test_samples, split='test')
    
    if len(train_dataset) == 0 or len(test_dataset) == 0:
        print("No valid samples were processed. Exiting...")
        return
        
    train_loader = DataLoader(
        train_dataset, 
        batch_size=8,
        shuffle=True,
        num_workers=0,
        pin_memory=True
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=0,
        pin_memory=True
    )
    
    # Initialize model and training components
    print("Initializing model...")
    model = SentimentModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Train and evaluate
    print("Starting training...")
    try:
        train_model(model, train_loader, criterion, optimizer, device)
        
        print("\nEvaluating on test set...")
        test_accuracy = evaluate_model(model, test_loader, criterion, device)
        
    except RuntimeError as e:
        print(f"Error during training: {e}")
        print("Try reducing batch size or model size if you're running out of memory")
        return
        
    # Save model weights
    print("Saving model weights...")
    save_weights_txt(model)
    save_weights_binary(model)
    
    print("Done!")

if __name__ == "__main__":
    # Modify these parameters to control training/test set sizes
    main(train_samples=1000, test_samples=200)

Using MPS device
Loading IMDb dataset...
Loading GloVe vectors...
Creating data loaders...
Preprocessing train dataset...
Separating positive and negative samples for train set...
Total train samples - Positive: 12500, Negative: 12500
Using 500 samples per class for train set
Processing train samples...


100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 1012.60it/s]


Successfully processed 1000 train samples
Final train distribution - Positive: 500, Negative: 500
Preprocessing test dataset...
Separating positive and negative samples for test set...
Total test samples - Positive: 12500, Negative: 12500
Using 100 samples per class for test set
Processing test samples...


100%|███████████████████████████████████████| 200/200 [00:00<00:00, 1193.16it/s]

Successfully processed 200 test samples
Final test distribution - Positive: 100, Negative: 100
Initializing model...





Starting training...


Epoch 1/5: 100%|█████| 125/125 [00:22<00:00,  5.65it/s, loss=0.6893, acc=52.20%]


Epoch 1:
Loss: 0.6893
Overall Accuracy: 52.20%
Positive Reviews Accuracy: 72.60%
Negative Reviews Accuracy: 31.80%
New best accuracy: 52.20%


Epoch 2/5: 100%|█████| 125/125 [00:21<00:00,  5.68it/s, loss=0.6038, acc=68.20%]


Epoch 2:
Loss: 0.6038
Overall Accuracy: 68.20%
Positive Reviews Accuracy: 64.60%
Negative Reviews Accuracy: 71.80%
New best accuracy: 68.20%


Epoch 3/5: 100%|█████| 125/125 [00:22<00:00,  5.60it/s, loss=0.4904, acc=77.80%]


Epoch 3:
Loss: 0.4904
Overall Accuracy: 77.80%
Positive Reviews Accuracy: 78.00%
Negative Reviews Accuracy: 77.60%
New best accuracy: 77.80%


Epoch 4/5: 100%|█████| 125/125 [00:22<00:00,  5.55it/s, loss=0.3932, acc=82.20%]


Epoch 4:
Loss: 0.3932
Overall Accuracy: 82.20%
Positive Reviews Accuracy: 82.20%
Negative Reviews Accuracy: 82.20%
New best accuracy: 82.20%


Epoch 5/5: 100%|█████| 125/125 [00:22<00:00,  5.56it/s, loss=0.3664, acc=83.70%]


Epoch 5:
Loss: 0.3664
Overall Accuracy: 83.70%
Positive Reviews Accuracy: 86.60%
Negative Reviews Accuracy: 80.80%
New best accuracy: 83.70%

Evaluating on test set...


Evaluating: 100%|███████████████████████████████| 25/25 [00:01<00:00, 21.36it/s]


Test Results:
Loss: 0.4308
Overall Accuracy: 80.50%
Positive Reviews Accuracy: 75.00%
Negative Reviews Accuracy: 86.00%
Saving model weights...
Saved feature_extractor.0.weight to weights/feature_extractor_0_weight.txt
Saved feature_extractor.0.bias to weights/feature_extractor_0_bias.txt
Saved feature_extractor.3.weight to weights/feature_extractor_3_weight.txt
Saved feature_extractor.3.bias to weights/feature_extractor_3_bias.txt
Saved classifier.0.weight to weights/classifier_0_weight.txt
Saved classifier.0.bias to weights/classifier_0_bias.txt
Saved classifier.3.weight to weights/classifier_3_weight.txt
Saved classifier.3.bias to weights/classifier_3_bias.txt
Saved feature_extractor.0.weight to weights_bin/feature_extractor_0_weight.bin
Saved feature_extractor.0.bias to weights_bin/feature_extractor_0_bias.bin
Saved feature_extractor.3.weight to weights_bin/feature_extractor_3_weight.bin
Saved feature_extractor.3.bias to weights_bin/feature_extractor_3_bias.bin
Saved classifier.0.w