In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm
import math
import gc
import os
import sys

def safe_cuda_init():
    """Pick the compute device, preferring CUDA and falling back to CPU.

    When CUDA is usable, the GPU name, compute capability and total memory
    are printed and this process is capped at 50% of the GPU memory.

    Returns:
        torch.device: ``cuda`` when initialization succeeds, otherwise ``cpu``
        (CUDA missing, or any exception raised while initializing it).
    """
    cpu_device = torch.device('cpu')

    if torch.cuda.is_available():
        try:
            # Force eager initialization so driver problems surface here.
            torch.cuda.init()

            device = torch.device('cuda')
            props = torch.cuda.get_device_properties(device)
            for line in (
                f"Using GPU: {props.name}",
                f"Compute Capability: {props.major}.{props.minor}",
                f"GPU Memory: {props.total_memory / 1024**3:.2f} GB",
            ):
                print(line)

            # Conservative cap: use only half of the GPU memory.
            torch.cuda.set_per_process_memory_fraction(0.5)
        except Exception as e:
            print(f"Error initializing CUDA: {e}")
            print("Falling back to CPU.")
            return cpu_device
        return device

    print("CUDA is not available. Using CPU.")
    return cpu_device

class TransformerClassifier(nn.Module):
    """Small pre-norm Transformer encoder with masked mean pooling and a 2-class head.

    Args:
        vocab_size: number of rows in the token embedding table; every input
            id must be smaller than this.
        d_model: embedding / hidden width.
        nhead: number of attention heads.
        num_layers: number of stacked encoder layers.
        dim_feedforward: width of the feed-forward sublayer.
        dropout: dropout probability used throughout the model.
        max_len: longest sequence the positional encoding supports.
    """

    def __init__(self, vocab_size, d_model=64, nhead=2, num_layers=2,
                 dim_feedforward=256, dropout=0.1, max_len=256):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len=max_len)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 2)
        )

        self.init_weights()

    def init_weights(self):
        """Initialize the embedding uniformly in [-0.02, 0.02] and the head with Xavier/zeros."""
        initrange = 0.02
        nn.init.uniform_(self.embedding.weight.data, -initrange, initrange)
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight.data)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias.data)

    def forward(self, x, attention_mask=None):
        """Classify a batch of token-id sequences.

        Args:
            x: integer token ids, batch-first (the encoder is built with
                ``batch_first=True``).
            attention_mask: optional tensor with the same leading two
                dimensions as ``x``; non-zero marks real tokens, zero marks
                padding.

        Returns:
            Tensor of shape (batch, 2) with unnormalized class logits.
        """
        x = self.embedding(x)
        x = self.pos_encoder(x)

        padding_mask = None
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
            x = x.masked_fill(~attention_mask.unsqueeze(-1), 0)
            # The encoder expects True at positions that must be ignored.
            padding_mask = ~attention_mask

        # Without the key-padding mask, real tokens would attend to padding.
        x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)

        if attention_mask is not None:
            # Zero padded positions before pooling; masked_fill (rather than a
            # multiply) also discards any non-finite values at those positions.
            token_counts = attention_mask.sum(dim=1, keepdim=True).float()
            x = x.masked_fill(~attention_mask.unsqueeze(-1), 0).sum(dim=1)
            x = x / token_counts.clamp(min=1e-9)
        else:
            x = x.mean(dim=1)

        x = self.classifier(x)
        return x

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout.

    Args:
        d_model: embedding width; both even and odd widths are supported.
        dropout: dropout probability applied after the encodings are added.
        max_len: longest sequence length for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=256):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # column, so trim div_term to match the odd-index slice.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # Buffer: follows .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])`` for ``x`` of shape (batch, seq_len, d_model).

        Raises:
            RuntimeError: if ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the positional encoding "
                f"max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Args:
        texts: indexable collection of review texts.
        labels: indexable collection of integer labels, aligned with ``texts``.
        tokenizer: callable accepting the text plus the keyword arguments
            used in ``__getitem__`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        raw_text, target = str(self.texts[idx]), self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack samples itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def _evaluate(model, data_loader, criterion, device):
    """Return ``(mean_loss, accuracy)`` of ``model`` over ``data_loader`` without updating weights."""
    model.eval()
    total_loss = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            total_loss += criterion(outputs, labels).item()
            correct += (torch.argmax(outputs, dim=1) == labels).sum().item()
            seen += labels.size(0)
    # max(..., 1) keeps an empty loader from dividing by zero.
    return total_loss / max(len(data_loader), 1), correct / max(seen, 1)


def train_model(batch_size=2, num_epochs=5, learning_rate=0.0001,
                train_size=1000, val_size=200, seed=42):
    """Train the Transformer classifier on an IMDB subset and checkpoint the best epoch.

    Each epoch runs one training pass and one validation pass. Whenever the
    validation accuracy improves, a checkpoint is written to ``best_model.pt``.

    Args:
        batch_size: samples per batch for both loaders.
        num_epochs: number of passes over the training subset.
        learning_rate: AdamW learning rate.
        train_size: number of training reviews to use.
        val_size: number of test-split reviews to use for validation.
        seed: seed for the shuffle applied before the subsets are taken.

    Raises:
        Exception: anything raised during setup or training is printed and
            re-raised, except CUDA out-of-memory errors, which abort only the
            current epoch.
    """
    try:
        # Safe CUDA initialization
        device = safe_cuda_init()

        print("\nLoading IMDB dataset...")
        dataset = load_dataset("stanfordnlp/imdb")

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        # Create datasets with reduced max_length
        max_length = 256

        # NOTE: the IMDB splits appear to be stored grouped by label, so a
        # plain leading slice can contain a single class. Shuffle first so the
        # subset has both classes.
        print("Preparing training dataset...")
        train_split = dataset['train'].shuffle(seed=seed)
        train_split = train_split.select(range(min(train_size, len(train_split))))
        train_dataset = IMDBDataset(
            train_split['text'],
            train_split['label'],
            tokenizer,
            max_length=max_length
        )

        print("Preparing validation dataset...")
        val_split = dataset['test'].shuffle(seed=seed)
        val_split = val_split.select(range(min(val_size, len(val_split))))
        val_dataset = IMDBDataset(
            val_split['text'],
            val_split['label'],
            tokenizer,
            max_length=max_length
        )

        # Create data loaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=0
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0
        )

        print("\nInitializing model...")
        model = TransformerClassifier(
            # len(tokenizer) also counts added tokens, so every id the
            # tokenizer can emit has an embedding row.
            vocab_size=len(tokenizer),
            d_model=64,      # Smaller model
            nhead=2,         # Reduced attention heads
            num_layers=2     # Minimal layers
        )

        print("Moving model to device...")
        model = model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

        print("\nStarting training loop...")
        best_val_accuracy = 0

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            try:
                for batch in tqdm(train_loader, desc="Training"):
                    # Move batch to device
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)

                    optimizer.zero_grad()
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                    optimizer.step()

                    train_loss += loss.item()
                    predictions = torch.argmax(outputs, dim=1)
                    train_correct += (predictions == labels).sum().item()
                    train_total += labels.size(0)

                avg_train_loss = train_loss / max(len(train_loader), 1)
                train_accuracy = train_correct / max(train_total, 1)
                print(f"\nTraining Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

                val_loss, val_accuracy = _evaluate(model, val_loader, criterion, device)
                print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

                if val_accuracy > best_val_accuracy:
                    best_val_accuracy = val_accuracy
                    # Save CPU copies of the weights; the live model stays on
                    # its device instead of being moved back and forth.
                    cpu_state = {
                        name: tensor.detach().cpu()
                        for name, tensor in model.state_dict().items()
                    }
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': cpu_state,
                        'optimizer_state_dict': optimizer.state_dict(),
                        'train_accuracy': train_accuracy,
                        'val_accuracy': val_accuracy,
                    }, 'best_model.pt')

            except RuntimeError as e:
                if "out of memory" in str(e):
                    # Best effort: abandon this epoch, release memory, and
                    # let the next epoch try again.
                    print("WARNING: out of memory")
                    optimizer.zero_grad(set_to_none=True)
                    if device.type == 'cuda':
                        gc.collect()
                        torch.cuda.empty_cache()
                else:
                    print(f"RuntimeError: {str(e)}")
                    raise

    except Exception as e:
        print(f"Error during training: {str(e)}")
        raise

# Entry point: run training with the default hyperparameters when this
# code is executed as a script or notebook cell (not when imported).
if __name__ == "__main__":
    train_model()

Using device: cuda




RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm
import math
import gc
import os

# PART 1: Data Preparation
class IMDBDataset(Dataset):
    """Wrap parallel text/label collections and tokenize lazily on access.

    Args:
        texts: indexable collection of review texts.
        labels: indexable collection of integer labels, aligned with ``texts``.
        tokenizer: callable invoked with the text and the padding/truncation
            keyword arguments shown in ``__getitem__``; its result must offer
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: fixed sequence length after padding / truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors keyed for the training loop."""
        review = str(self.texts[idx])
        target = self.labels[idx]

        tokens = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # squeeze(0) removes the size-1 batch axis added by return_tensors='pt'.
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        label_tensor = torch.tensor(target, dtype=torch.long)

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label_tensor,
        )

def prepare_data(batch_size=2, max_length=256, train_size=1000, val_size=200, seed=42):
    """Load IMDB, build shuffled train/validation subsets, and wrap them in DataLoaders.

    Args:
        batch_size: samples per batch for both loaders.
        max_length: token length each review is padded / truncated to.
        train_size: number of reviews taken from the ``train`` split.
        val_size: number of reviews taken from the ``test`` split.
        seed: seed for the shuffle applied before the subsets are taken.

    Returns:
        tuple: ``(train_loader, val_loader, tokenizer)``.
    """
    # Load the IMDB dataset
    dataset = load_dataset("stanfordnlp/imdb")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # NOTE: the IMDB splits appear to be stored grouped by label, so a plain
    # leading slice can contain a single class. Shuffle first so both subsets
    # contain both classes.
    train_split = dataset['train'].shuffle(seed=seed)
    train_split = train_split.select(range(min(train_size, len(train_split))))
    val_split = dataset['test'].shuffle(seed=seed)
    val_split = val_split.select(range(min(val_size, len(val_split))))

    # Prepare training and validation datasets
    train_dataset = IMDBDataset(
        train_split['text'],
        train_split['label'],
        tokenizer,
        max_length=max_length
    )

    val_dataset = IMDBDataset(
        val_split['text'],
        val_split['label'],
        tokenizer,
        max_length=max_length
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    return train_loader, val_loader, tokenizer

# PART 2: Model Definition
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout.

    Args:
        d_model: embedding width; both even and odd widths are supported.
        dropout: dropout probability applied after the encodings are added.
        max_len: longest sequence length for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=256):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # column, so trim div_term to match the odd-index slice.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # Buffer: follows .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])`` for ``x`` of shape (batch, seq_len, d_model).

        Raises:
            RuntimeError: if ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the positional encoding "
                f"max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

class TransformerClassifier(nn.Module):
    """Small pre-norm Transformer encoder with masked mean pooling and a 2-class head.

    Args:
        vocab_size: number of rows in the token embedding table; every input
            id must be smaller than this.
        d_model: embedding / hidden width.
        nhead: number of attention heads.
        num_layers: number of stacked encoder layers.
        dim_feedforward: width of the feed-forward sublayer.
        dropout: dropout probability used throughout the model.
        max_len: longest sequence the positional encoding supports.
    """

    def __init__(self, vocab_size, d_model=64, nhead=2, num_layers=2,
                 dim_feedforward=256, dropout=0.1, max_len=256):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len=max_len)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 2)
        )

        self.init_weights()

    def init_weights(self):
        """Initialize the embedding uniformly in [-0.02, 0.02] and the head with Xavier/zeros."""
        initrange = 0.02
        nn.init.uniform_(self.embedding.weight.data, -initrange, initrange)
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight.data)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias.data)

    def forward(self, x, attention_mask=None):
        """Classify a batch of token-id sequences.

        Args:
            x: integer token ids, batch-first (the encoder is built with
                ``batch_first=True``).
            attention_mask: optional tensor with the same leading two
                dimensions as ``x``; non-zero marks real tokens, zero marks
                padding.

        Returns:
            Tensor of shape (batch, 2) with unnormalized class logits.
        """
        x = self.embedding(x)
        x = self.pos_encoder(x)

        padding_mask = None
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
            x = x.masked_fill(~attention_mask.unsqueeze(-1), 0)
            # The encoder expects True at positions that must be ignored.
            padding_mask = ~attention_mask

        # Without the key-padding mask, real tokens would attend to padding.
        x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)

        if attention_mask is not None:
            # Zero padded positions before pooling; masked_fill (rather than a
            # multiply) also discards any non-finite values at those positions.
            token_counts = attention_mask.sum(dim=1, keepdim=True).float()
            x = x.masked_fill(~attention_mask.unsqueeze(-1), 0).sum(dim=1)
            x = x / token_counts.clamp(min=1e-9)
        else:
            x = x.mean(dim=1)

        x = self.classifier(x)
        return x

# PART 3: Training Procedure
def _evaluate(model, data_loader, criterion, device):
    """Return ``(mean_loss, accuracy)`` of ``model`` over ``data_loader`` without updating weights."""
    model.eval()
    total_loss = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            total_loss += criterion(outputs, labels).item()
            correct += (torch.argmax(outputs, dim=1) == labels).sum().item()
            seen += labels.size(0)
    # max(..., 1) keeps an empty loader from dividing by zero.
    return total_loss / max(len(data_loader), 1), correct / max(seen, 1)


def train_model(batch_size=2, num_epochs=5, learning_rate=0.0001):
    """Train the Transformer classifier and report train/validation metrics per epoch.

    Args:
        batch_size: samples per batch, forwarded to ``prepare_data``.
        num_epochs: number of passes over the training loader.
        learning_rate: AdamW learning rate.
    """
    # Prepare data
    train_loader, val_loader, tokenizer = prepare_data(batch_size=batch_size)

    # Use the GPU when one is available, otherwise the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize model and move to device.
    # len(tokenizer) also counts added tokens, so every id the tokenizer can
    # emit has an embedding row.
    model = TransformerClassifier(vocab_size=len(tokenizer))
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    best_val_accuracy = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        model.train()

        train_loss = 0
        train_correct = 0
        train_total = 0

        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
            optimizer.step()

            train_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            train_correct += (predictions == labels).sum().item()
            train_total += labels.size(0)
            # No per-batch gc.collect(): a full collection on every step is
            # slow, and these locals are rebound on the next iteration anyway.

        avg_train_loss = train_loss / max(len(train_loader), 1)
        train_accuracy = train_correct / max(train_total, 1)
        print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

        # Measure generalization on the held-out loader after every epoch.
        val_loss, val_accuracy = _evaluate(model, val_loader, criterion, device)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        best_val_accuracy = max(best_val_accuracy, val_accuracy)

    print(f"\nBest Validation Accuracy: {best_val_accuracy:.4f}")

# Entry point: run training with the default hyperparameters when this
# code is executed as a script or notebook cell (not when imported).
if __name__ == "__main__":
    train_model()


NameError: name 'device' is not defined