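# IMDB Reviews: Overfitting and Underfitting

Note: this notebook does not load the IMDB dataset; it uses synthetic feature vectors (a stand-in for, e.g., TF-IDF features) to illustrate underfitting and overfitting.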

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import matplotlib.pyplot as plt

## Generate Synthetic Data

In [None]:
# Seed the NumPy and PyTorch RNGs so every run produces the same data
np.random.seed(42)
torch.manual_seed(42)

# Synthetic stand-in for text features (e.g., TF-IDF vectors):
# each sample is a vector of standard-normal values
n_samples = 1000
n_features = 100

X = torch.randn(n_samples, n_features)
# Binary label: 1 when the first two features sum to a positive value, else 0
y = ((X[:, 0] + X[:, 1]) > 0).long()

# 80/20 train/test split taken as contiguous row ranges
train_size = int(0.8 * n_samples)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Training batches are reshuffled each epoch; evaluation order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

## Models of Different Complexities

In [None]:
class SimpleModel(nn.Module):
    """Single linear layer; the lowest-capacity ("underfit") model here.

    Maps the input features straight to class logits with no hidden layer
    and no non-linearity.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output logits. Defaults to 2, which matches
            the original hard-coded binary output.
    """

    def __init__(self, input_size: int = 100, num_classes: int = 2) -> None:
        super().__init__()
        self.fc = nn.Linear(input_size, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw class logits for ``x`` (last dim must be ``input_size``).

        No softmax is applied; the last dimension of the result has
        ``num_classes`` entries.
        """
        return self.fc(x)

class MediumModel(nn.Module):
    """Two-hidden-layer MLP; the mid-capacity ("well-fit") model here.

    Architecture: ``input_size -> 64 -> 32 -> num_classes`` with a ReLU after
    each hidden layer.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output logits. Defaults to 2, which matches
            the original hard-coded binary output.
    """

    def __init__(self, input_size: int = 100, num_classes: int = 2) -> None:
        super().__init__()
        # Layers are created in fc1, fc2, fc3 order so seeded weight
        # initialisation is unchanged.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw class logits for ``x`` (last dim must be ``input_size``).

        No softmax is applied; the last dimension of the result has
        ``num_classes`` entries.
        """
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

class ComplexModel(nn.Module):
    """Three-hidden-layer MLP; the highest-capacity ("overfit") model here.

    Architecture: ``input_size -> 256 -> 256 -> 256 -> num_classes`` with a
    ReLU after each hidden layer and no regularisation layers.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output logits. Defaults to 2, which matches
            the original hard-coded binary output.
    """

    def __init__(self, input_size: int = 100, num_classes: int = 2) -> None:
        super().__init__()
        # Layers are created in fc1..fc4 order so seeded weight
        # initialisation is unchanged.
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw class logits for ``x`` (last dim must be ``input_size``).

        No softmax is applied; the last dimension of the result has
        ``num_classes`` entries.
        """
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.fc4(hidden)

## Train and Compare Models

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _run_epoch(model, loader, loss_fn, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_pct)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in eval mode and autograd is
    disabled for the pass.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch)
            loss = loss_fn(outputs, y_batch)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = y_batch.size(0)
            # loss_fn returns the batch mean, so weight it by the batch size;
            # otherwise a short final batch counts as much as a full one.
            loss_sum += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == y_batch).sum().item()
            seen += batch_size

    return loss_sum / seen, 100 * correct / seen


def train_model(model, train_loader, test_loader, epochs=100, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, evaluating after each epoch.

    The model is moved to the module-level ``device`` and updated in place.

    Args:
        model: ``nn.Module`` producing class logits.
        train_loader: Iterable of ``(inputs, targets)`` batches to train on.
        test_loader: Iterable of ``(inputs, targets)`` batches evaluated after
            each training epoch.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate (defaults to the original hard-coded 0.001).

    Returns:
        ``(train_losses, test_losses, train_accs, test_accs)``: four lists
        with one entry per epoch. Losses are per-sample means and accuracies
        are percentages in ``[0, 100]``. Training metrics are accumulated
        while the weights are still being updated within the epoch.

    Raises:
        ZeroDivisionError: If either loader yields no samples.
    """
    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses, test_losses = [], []
    train_accs, test_accs = [], []

    for _ in range(epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, loss_fn, optimizer)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        test_loss, test_acc = _run_epoch(model, test_loader, loss_fn)
        test_losses.append(test_loss)
        test_accs.append(test_acc)

    return train_losses, test_losses, train_accs, test_accs

# Fit each architecture in turn (simple, then medium, then complex). The
# generator is consumed lazily, so each model is constructed immediately
# before it is trained.
simple_results, medium_results, complex_results = (
    train_model(model_cls(), train_loader, test_loader, epochs=100)
    for model_cls in (SimpleModel, MediumModel, ComplexModel)
)

## Compare Results

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Derive the x-axis from the recorded history instead of hard-coding 100
# epochs, so the plots still work if the training epoch count changes.
# All three models are assumed to have been trained for the same number of epochs.
epochs_range = range(1, len(simple_results[0]) + 1)

# Each results tuple is (train_losses, test_losses, train_accs, test_accs).
model_runs = [
    ('Simple', simple_results),
    ('Medium', medium_results),
    ('Complex', complex_results),
]
loss_panels = [
    (axes[0, 0], 'Simple Model (Underfitting)'),
    (axes[0, 1], 'Medium Model (Well-fit)'),
    (axes[1, 0], 'Complex Model (Overfitting)'),
]

# Train-vs-test loss curves, one panel per model
for (ax, title), (name, results) in zip(loss_panels, model_runs):
    ax.plot(epochs_range, results[0], label=f'{name} Train')
    ax.plot(epochs_range, results[1], label=f'{name} Test')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)

# Test accuracy of all three models on a shared panel
acc_ax = axes[1, 1]
for name, results in model_runs:
    acc_ax.plot(epochs_range, results[3], label=f'{name} Test Acc')
acc_ax.set_title('Test Accuracy Comparison')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_xlabel('Epoch')
acc_ax.legend()
acc_ax.grid(True)

plt.tight_layout()
plt.show()