# Text Classification with PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import numpy as np
import matplotlib.pyplot as plt

## Text Representation

In [None]:
class SimpleTextDataset(Dataset):
    """Synthetic text classification dataset.

    Each sample is a fixed-length sequence of random integer token ids paired
    with a random binary label. Sequences and labels are drawn independently
    of each other.

    Args:
        num_samples: Number of (sequence, label) pairs to generate.
        vocab_size: Token ids are drawn from ``[0, vocab_size)``; note that
            id 0 is therefore a regular, generated token.
        seq_length: Number of token ids in every sequence.
        seed: Optional seed for a private NumPy generator, making the data
            reproducible independently of any other random draws. When
            ``None`` (the default) the global ``np.random`` state is used,
            so ``np.random.seed`` keeps controlling the data as before.
    """

    def __init__(self, num_samples=200, vocab_size=100, seq_length=20, seed=None):
        self.num_samples = num_samples
        self.vocab_size = vocab_size
        self.seq_length = seq_length

        # A private RandomState keeps a seeded dataset from disturbing (or
        # being disturbed by) the global NumPy random stream.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Generate synthetic sequences: one row of token ids per sample
        self.sequences = rng.randint(0, vocab_size, (num_samples, seq_length))
        # Binary labels (0 or 1), one per sample
        self.labels = rng.randint(0, 2, num_samples)

    def __len__(self):
        """Return the number of samples requested at construction."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(token_ids, label)`` pair of long tensors."""
        tokens = torch.tensor(self.sequences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return tokens, label

# Build the synthetic corpus; vocab_size is reused when the model is created
vocab_size, seq_length = 100, 20
dataset = SimpleTextDataset(
    num_samples=200,
    vocab_size=vocab_size,
    seq_length=seq_length,
)

# Random 80/20 partition: the training share is rounded down, the rest is test
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size]
)

# Batches of 32; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

## Text Classification Model

In [None]:
class TextClassificationNet(nn.Module):
    """Embedding -> LSTM -> two linear layers, producing one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits (default 2: binary classification).
        dropout: Dropout probability applied after the first linear layer.
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``
            (its embedding starts at zero and is not updated by gradients);
            pass ``None`` to treat every id as a regular token.
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_dim=64,
                 num_classes=2, dropout=0.3, padding_idx=0):
        super().__init__()
        # Layers are created in the same order and under the same attribute
        # names as before, so state_dict keys and weight-init order are stable.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 32)
        self.fc2 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to (batch, num_classes) logits."""
        embedded = self.embedding(x)  # (batch, seq_len, embedding_dim)
        # The final (h_n, c_n) states are not needed; only per-step outputs are used.
        lstm_out, _ = self.lstm(embedded)
        # Output at the last time step. No length masking is applied, so every
        # position of the sequence (padding included) feeds into this vector.
        last_output = lstm_out[:, -1, :]
        hidden = self.dropout(self.relu(self.fc1(last_output)))
        return self.fc2(hidden)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Instantiate the classifier, move its parameters to that device, and show its layers
model = TextClassificationNet(vocab_size=vocab_size)
model = model.to(device)
print(model)

## Training

In [None]:
# Cross-entropy computed directly on the logits returned by the model
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 1e-3
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def train_epoch(model, train_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module mapping an input batch to per-class logits.
        train_loader: Iterable of ``(x, y)`` tensor batches.
        loss_fn: Callable ``loss_fn(outputs, y)`` returning a scalar tensor.
            Assumed to return the mean over the batch (the default reduction
            of ``nn.CrossEntropyLoss``), which is what the averaging relies on.
        optimizer: Optimizer stepping the parameters of ``model``.
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy_percent)`` over all samples seen in the epoch,
        or ``(0.0, 0.0)`` if the loader yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        outputs = model(x)
        loss = loss_fn(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a smaller final batch does
        # not count as much as a full one in the epoch average.
        batch_size = y.size(0)
        loss_sum += loss.item() * batch_size
        total += batch_size
        correct += (outputs.argmax(dim=1) == y).sum().item()

    if total == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    return loss_sum / total, 100 * correct / total

def evaluate(model, test_loader, loss_fn, device):
    """Measure loss and accuracy of ``model`` on ``test_loader`` without training.

    Args:
        model: Module mapping an input batch to per-class logits.
        test_loader: Iterable of ``(x, y)`` tensor batches.
        loss_fn: Callable ``loss_fn(outputs, y)`` returning a scalar tensor.
            Assumed to return the mean over the batch (the default reduction
            of ``nn.CrossEntropyLoss``), which is what the averaging relies on.
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy_percent)`` over all evaluated samples, or
        ``(0.0, 0.0)`` if the loader yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)

            # Weight each batch mean by its size so a smaller final batch
            # does not count as much as a full one in the overall average.
            batch_size = y.size(0)
            loss_sum += loss_fn(outputs, y).item() * batch_size
            total += batch_size
            correct += (outputs.argmax(dim=1) == y).sum().item()

    if total == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    return loss_sum / total, 100 * correct / total

epochs = 10

# Per-epoch metric history; one entry is appended to each list every epoch
train_losses, test_losses = [], []
train_accs, test_accs = [], []

for epoch in range(epochs):
    # Fit on the training loader first, then score the updated model on the test loader
    train_loss, train_acc = train_epoch(model, train_loader, loss_fn, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, loss_fn, device)

    train_losses.append(train_loss)
    train_accs.append(train_acc)
    test_losses.append(test_loss)
    test_accs.append(test_acc)

    # Three-line progress report for this epoch
    print(
        f'Epoch {epoch+1}/{epochs}',
        f'  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%',
        f'  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%',
        sep='\n',
    )

## Results

In [None]:
# Two side-by-side panels: loss on the left, accuracy on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: train vs. test loss per epoch
axes[0].plot(train_losses, label='Train')
axes[0].plot(test_losses, label='Test')
axes[0].set(title='Loss', xlabel='Epoch', ylabel='Loss')
axes[0].legend()
axes[0].grid(True)

# Right panel: train vs. test accuracy per epoch
axes[1].plot(train_accs, label='Train')
axes[1].plot(test_accs, label='Test')
axes[1].set(title='Accuracy', xlabel='Epoch', ylabel='Accuracy (%)')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()