In [2]:
import pandas as pd
import numpy as np
import torch
import kagglehub
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
import time
import logging

In [3]:
path = kagglehub.dataset_download("raj713335/twittesentimentanalysis")

print("Path to dataset files:", path)

tweets = pd.read_csv(f"{path}/tweets.csv", encoding = "ISO-8859-1", names= ["Target", "ID", "Date", "Query", "User", "Text"], header=None)

tweets.head()

Path to dataset files: C:\Users\keep_\.cache\kagglehub\datasets\raj713335\twittesentimentanalysis\versions\1


Unnamed: 0,Target,ID,Date,Query,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [14]:
import torch
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import gc
import time

class TweetDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=32):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors=None
        )

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def prepare_data(df, tokenizer, config):
    # Convert targets from 0,4 to 0,1
    df['Target'] = df['Target'].map({0: 0, 4: 1})
    
    # Sample balanced data
    df_neg = df[df['Target'] == 0]
    df_pos = df[df['Target'] == 1]
    
    n_samples = min(len(df_neg), len(df_pos)) // config['portion']
    
    df_neg = df_neg.sample(n=n_samples, random_state=42)
    df_pos = df_pos.sample(n=n_samples, random_state=42)
    
    df_sampled = pd.concat([df_neg, df_pos]).sample(frac=1, random_state=42)
    
    # Create dataset
    dataset = TweetDataset(
        texts=df_sampled['Text'].values,
        targets=df_sampled['Target'].values,
        tokenizer=tokenizer,
        max_len=config['max_len']
    )
    
    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=0  # No parallel loading for debugging
    )
    
    return dataloader

def train_model(model, dataloader, config, device):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    
    for epoch in range(config['epochs']):
        print(f"\nEpoch {epoch + 1}/{config['epochs']}")
        running_loss = 0.0
        
        progress_bar = tqdm(dataloader, desc=f'Training')
        start_time = time.time()
        
        for batch_idx, batch in enumerate(progress_bar):
            # Move data to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            # Forward pass
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )
            
            # Backward pass
            outputs.loss.backward()
            optimizer.step()
            
            # Update metrics
            running_loss += outputs.loss.item()
            avg_loss = running_loss / (batch_idx + 1)
            
            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{avg_loss:.4f}',
                'speed': f'{batch_idx / (time.time() - start_time):.1f} it/s'
            })
            
            # Clear cache periodically
            if batch_idx % 100 == 0:
                torch.cuda.empty_cache()

def main():
    # Configuration
    CONFIG = {
        'max_len': 32,          # Reduced for memory efficiency
        'batch_size': 8,        # Small batch size for GTX 1650 Ti
        'portion': 100,         # Use 1/100th of data
        'epochs': 2,            # Start with 1 epoch
        'learning_rate': 2e-5
    }
    
    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()
    
    # Load data
    print("Loading data...")
    df = pd.read_csv(
        f"{path}/tweets.csv",
        encoding="ISO-8859-1",
        names=["Target", "ID", "Date", "Query", "User", "Text"],
        header=None
    )
    print(f"Total samples: {len(df)}")
    
    # Initialize model and tokenizer
    print("\nInitializing model and tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )
    
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Prepare data
    print("\nPreparing data...")
    dataloader = prepare_data(df, tokenizer, CONFIG)
    print(f"Number of batches: {len(dataloader)}")
    
    # Train model
    print("\nStarting training...")
    train_model(model, dataloader, CONFIG, device)
    
    # Clean up
    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()

Loading data...
Total samples: 1600000

Initializing model and tokenizer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Preparing data...
Number of batches: 2000

Starting training...

Epoch 1/2


Training:   2%|▏         | 48/2000 [00:10<07:03,  4.61it/s, loss=0.6956, speed=4.6 it/s]


KeyboardInterrupt: 

In [17]:
import torch
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
import time
import os
from datetime import datetime

class TweetDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=48):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors=None
        )

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def prepare_data(df, tokenizer, config):
    # Convert targets from 0,4 to 0,1
    df['Target'] = df['Target'].map({0: 0, 4: 1})
    
    # Sample balanced data
    df_neg = df[df['Target'] == 0]
    df_pos = df[df['Target'] == 1]
    
    n_samples = min(len(df_neg), len(df_pos)) // config['portion']
    
    df_neg = df_neg.sample(n=n_samples, random_state=42)
    df_pos = df_pos.sample(n=n_samples, random_state=42)
    
    df_sampled = pd.concat([df_neg, df_pos]).sample(frac=1, random_state=42)
    
    # Split into train and validation
    train_df, val_df = train_test_split(df_sampled, test_size=0.1, random_state=42)
    
    # Create datasets
    train_dataset = TweetDataset(
        texts=train_df['Text'].values,
        targets=train_df['Target'].values,
        tokenizer=tokenizer,
        max_len=config['max_len']
    )
    
    val_dataset = TweetDataset(
        texts=val_df['Text'].values,
        targets=val_df['Target'].values,
        tokenizer=tokenizer,
        max_len=config['max_len']
    )
    
    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=0
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=config['batch_size'],
        shuffle=False,
        num_workers=0
    )
    
    return train_loader, val_loader

def save_checkpoint(model, optimizer, scheduler, epoch, loss, accuracy, config, checkpoint_dir):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}_{timestamp}.pt')
    
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
        'loss': loss,
        'accuracy': accuracy,
        'config': config
    }, checkpoint_path)
    
    print(f"Checkpoint saved: {checkpoint_path}")
    return checkpoint_path

def train_model(model, train_loader, val_loader, config, device, checkpoint_dir):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    
    # Calculate total steps and warmup
    total_steps = len(train_loader) * config['epochs']
    warmup_steps = total_steps // 10
    
    # Create scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )
    
    best_val_loss = float('inf')
    best_val_accuracy = 0.0
    
    for epoch in range(config['epochs']):
        print(f"\nEpoch {epoch + 1}/{config['epochs']}")
        
        # Training phase
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc='Training')
        start_time = time.time()
        
        for batch_idx, batch in enumerate(progress_bar):
            # Move data to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            # Forward pass
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )
            
            # Backward pass
            outputs.loss.backward()
            optimizer.step()
            scheduler.step()
            
            # Update metrics
            running_loss += outputs.loss.item()
            avg_loss = running_loss / (batch_idx + 1)
            
            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{avg_loss:.4f}',
                'speed': f'{batch_idx / (time.time() - start_time):.1f} it/s',
                'lr': f'{scheduler.get_last_lr()[0]:.2e}'
            })
            
            # Clear cache periodically
            if batch_idx % 100 == 0:
                torch.cuda.empty_cache()
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        print("\nRunning validation...")
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = batch['targets'].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=targets
                )
                
                val_loss += outputs.loss.item()
                predictions = torch.argmax(outputs.logits, dim=1)
                correct += (predictions == targets).sum().item()
                total += targets.size(0)
        
        # Calculate epoch metrics
        epoch_val_loss = val_loss / len(val_loader)
        epoch_val_accuracy = 100 * correct / total
        
        print(f"\nEpoch {epoch + 1} Results:")
        print(f"Training Loss: {avg_loss:.4f}")
        print(f"Validation Loss: {epoch_val_loss:.4f}")
        print(f"Validation Accuracy: {epoch_val_accuracy:.2f}%")
        
        # Save checkpoint if best model
        if epoch_val_accuracy > best_val_accuracy:
            best_val_accuracy = epoch_val_accuracy
            best_val_loss = epoch_val_loss
            save_checkpoint(
                model, optimizer, scheduler, epoch + 1,
                best_val_loss, best_val_accuracy,
                config, checkpoint_dir
            )

def main():
    # Configuration
    CONFIG = {
        'max_len': 48,
        'batch_size': 16,
        'portion': 10,
        'epochs': 3,
        'learning_rate': 2e-5
    }
    
    # Create checkpoint directory
    checkpoint_dir = 'model_checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()
    
    # Load data
    print("Loading data...")
    df = pd.read_csv(
        f"{path}/tweets.csv",
        encoding="ISO-8859-1",
        names=["Target", "ID", "Date", "Query", "User", "Text"],
        header=None
    )
    print(f"Total samples: {len(df)}")
    
    # Initialize model and tokenizer
    print("\nInitializing model and tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )
    
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Prepare data
    print("\nPreparing data...")
    train_loader, val_loader = prepare_data(df, tokenizer, CONFIG)
    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")
    
    # Train model
    print("\nStarting training...")
    train_model(model, train_loader, val_loader, CONFIG, device, checkpoint_dir)
    
    # Clean up
    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [9]:
import torch
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
import time
import os
from datetime import datetime, timedelta
import psutil
import numpy as np
import GPUtil
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TweetDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=48):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors=None
        )

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long)
        }

class MetricsTracker:
    def __init__(self):
        self.start_time = None
        self.epoch_start_time = None
        self.epoch_times = []
        self.training_times = []
        self.evaluation_times = []
        self.training_samples = 0
        self.evaluation_samples = 0
        self.accuracies = []
        self.resource_usage = []
        
    def start_training(self):
        self.start_time = time.time()
        
    def start_epoch(self):
        self.epoch_start_time = time.time()
        
    def end_epoch(self):
        epoch_time = time.time() - self.epoch_start_time
        self.epoch_times.append(epoch_time)
        
    def add_batch_training_time(self, batch_time, batch_size):
        self.training_times.append(batch_time)
        self.training_samples += batch_size
        
    def add_batch_evaluation_time(self, batch_time, batch_size):
        self.evaluation_times.append(batch_time)
        self.evaluation_samples += batch_size
        
    def add_accuracy(self, accuracy):
        self.accuracies.append(accuracy)
        
    def add_resource_usage(self, cpu_percent, ram_percent, gpu_percent, gpu_memory):
        self.resource_usage.append({
            'cpu': cpu_percent,
            'ram': ram_percent,
            'gpu_util': gpu_percent,
            'gpu_memory': gpu_memory
        })
        
    def get_metrics(self):
        total_training_time = time.time() - self.start_time
        avg_training_time = sum(self.training_times) / self.training_samples
        avg_evaluation_time = sum(self.evaluation_times) / self.evaluation_samples
        avg_epoch_time = sum(self.epoch_times) / len(self.epoch_times)
        
        # Calculate average resource usage
        avg_resources = {
            'cpu': np.mean([r['cpu'] for r in self.resource_usage]),
            'ram': np.mean([r['ram'] for r in self.resource_usage]),
            'gpu_util': np.mean([r['gpu_util'] for r in self.resource_usage]),
            'gpu_memory': np.mean([r['gpu_memory'] for r in self.resource_usage])
        }
        
        return {
            'avg_training_time_per_sample': avg_training_time,
            'avg_evaluation_time_per_sample': avg_evaluation_time,
            'first_epoch_accuracy': self.accuracies[0],
            'final_accuracy': self.accuracies[-1],
            'avg_epoch_time': avg_epoch_time,
            'total_training_time': total_training_time,
            'avg_resources': avg_resources
        }

def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    
    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

def get_system_resources():
    # CPU
    cpu_percent = psutil.cpu_percent(interval=0.1)
    
    # RAM
    ram = psutil.virtual_memory()
    ram_percent = ram.percent
    
    # GPU
    gpu = GPUtil.getGPUs()[0]
    gpu_util = gpu.load * 100
    gpu_memory = gpu.memoryUtil * 100
    
    return {
        'cpu_percent': cpu_percent,
        'ram_percent': ram_percent,
        'gpu_util': gpu_util,
        'gpu_memory': gpu_memory
    }

def prepare_data(df, tokenizer, config):
    df['Target'] = df['Target'].map({0: 0, 4: 1})
    
    # Faster sampling
    n_samples = len(df) // (2 * config['portion'])
    df_sampled = pd.concat([
        df[df['Target'] == 0].sample(n=n_samples, random_state=42),
        df[df['Target'] == 1].sample(n=n_samples, random_state=42)
    ]).sample(frac=1, random_state=42)
    
    train_df, val_df = train_test_split(df_sampled, test_size=0.1, random_state=42)
    
    # Create datasets with minimal overhead
    train_dataset = TweetDataset(
        texts=train_df['Text'].values,
        targets=train_df['Target'].values,
        tokenizer=tokenizer,
        max_len=config['max_len']
    )
    
    val_dataset = TweetDataset(
        texts=val_df['Text'].values,
        targets=val_df['Target'].values,
        tokenizer=tokenizer,
        max_len=config['max_len']
    )
    
    # Efficient DataLoaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=0,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=config['batch_size'] * 2,  # Larger batches for validation
        shuffle=False,
        num_workers=0,
        pin_memory=True
    )
    
    return train_loader, val_loader

def save_checkpoint(model, optimizer, scheduler, epoch, loss, accuracy, config, checkpoint_dir):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}_{timestamp}.pt')
    
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
        'loss': loss,
        'accuracy': accuracy,
        'config': config
    }, checkpoint_path)
    
    print(f"Checkpoint saved: {checkpoint_path}")
    return checkpoint_path

def train_model(model, train_loader, val_loader, config, device, checkpoint_dir):
    metrics = MetricsTracker()
    metrics.start_training()
    
    # Get initial model size
    model_size_mb = get_model_size(model)
    print(f"\nModel Size: {model_size_mb:.2f} MB")
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=len(train_loader) // 10,
        num_training_steps=len(train_loader) * config['epochs']
    )
    
    best_val_accuracy = 0.0
    
    for epoch in range(config['epochs']):
        metrics.start_epoch()
        print(f"\nEpoch {epoch + 1}/{config['epochs']}")
        
        # Training phase
        model.train()
        running_loss = 0.0
        
        for batch_idx, batch in enumerate(tqdm(train_loader, desc='Training')):
            batch_start = time.time()
            
            # Move data to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            # Forward and backward pass
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )
            
            outputs.loss.backward()
            optimizer.step()
            scheduler.step()
            
            # Record metrics
            batch_time = time.time() - batch_start
            metrics.add_batch_training_time(batch_time, len(input_ids))
            running_loss += outputs.loss.item()
            
            # Monitor resources every 10 batches
            if batch_idx % 10 == 0:
                resources = get_system_resources()
                metrics.add_resource_usage(
                    resources['cpu_percent'],
                    resources['ram_percent'],
                    resources['gpu_util'],
                    resources['gpu_memory']
                )
        
        # Validation phase
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                batch_start = time.time()
                
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = batch['targets'].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=targets
                )
                
                batch_time = time.time() - batch_start
                metrics.add_batch_evaluation_time(batch_time, len(input_ids))
                
                val_loss += outputs.loss.item()
                predictions = torch.argmax(outputs.logits, dim=1)
                correct += (predictions == targets).sum().item()
                total += targets.size(0)
        
        # Calculate epoch metrics
        epoch_accuracy = 100 * correct / total
        metrics.add_accuracy(epoch_accuracy)
        metrics.end_epoch()
        
        # Print epoch results
        print(f"\nEpoch {epoch + 1} Results:")
        print(f"Training Loss: {running_loss/len(train_loader):.4f}")
        print(f"Validation Accuracy: {epoch_accuracy:.2f}%")
        
        # Save if best model
        if epoch_accuracy > best_val_accuracy:
            best_val_accuracy = epoch_accuracy
            save_checkpoint(model, optimizer, scheduler, epoch + 1,
                          val_loss/len(val_loader), epoch_accuracy,
                          config, checkpoint_dir)
    
    # Calculate and print final metrics
    final_metrics = metrics.get_metrics()
    print("\nFinal Training Metrics:")
    print(f"Average Training Time per Sample: {final_metrics['avg_training_time_per_sample']*1000:.2f} ms")
    print(f"Average Evaluation Time per Sample: {final_metrics['avg_evaluation_time_per_sample']*1000:.2f} ms")
    print(f"First Epoch Accuracy: {final_metrics['first_epoch_accuracy']:.2f}%")
    print(f"Final Accuracy: {final_metrics['final_accuracy']:.2f}%")
    print(f"Average Epoch Time: {final_metrics['avg_epoch_time']:.2f} seconds")
    print(f"Total Training Time: {final_metrics['total_training_time']:.2f} seconds")
    print(f"Model Size: {model_size_mb:.2f} MB")
    print("\nAverage Resource Usage:")
    print(f"CPU Usage: {final_metrics['avg_resources']['cpu']:.1f}%")
    print(f"RAM Usage: {final_metrics['avg_resources']['ram']:.1f}%")
    print(f"GPU Utilization: {final_metrics['avg_resources']['gpu_util']:.1f}%")
    print(f"GPU Memory Usage: {final_metrics['avg_resources']['gpu_memory']:.1f}%")
    
    # Save metrics to file
    metrics_path = os.path.join(checkpoint_dir, 'training_metrics.txt')
    with open(metrics_path, 'w') as f:
        f.write("Training Metrics:\n")
        for key, value in final_metrics.items():
            if key == 'avg_resources':
                f.write("\nAverage Resource Usage:\n")
                for resource, usage in value.items():
                    f.write(f"{resource}: {usage:.1f}%\n")
            else:
                f.write(f"{key}: {value}\n")
        f.write(f"\nModel Size: {model_size_mb:.2f} MB\n")
    
    return final_metrics

def main():
    # Configuration
    CONFIG = {
        'max_len': 48,
        'batch_size': 16, 
        'portion': 1,
        'epochs': 3,
        'learning_rate': 2e-5
    }
    
    # Create checkpoint directory
    checkpoint_dir = 'model_checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()
    
    # Load data
    print("Loading data...")
    df = pd.read_csv(
        f"{path}/tweets.csv",
        encoding="ISO-8859-1",
        names=["Target", "ID", "Date", "Query", "User", "Text"],
        header=None
    )
    print(f"Total samples: {len(df)}")
    
    # Initialize model and tokenizer
    print("\nInitializing model and tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )
    
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Prepare data
    print("\nPreparing data...")
    train_loader, val_loader = prepare_data(df, tokenizer, CONFIG)
    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")
    
    # Train model
    print("\nStarting training...")
    train_model(model, train_loader, val_loader, CONFIG, device, checkpoint_dir)
    
    # Clean up
    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()

Loading data...
Total samples: 1600000

Initializing model and tokenizer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Preparing data...
Number of training batches: 90000
Number of validation batches: 5000

Starting training...

Model Size: 255.42 MB

Epoch 1/3


Training: 100%|██████████| 90000/90000 [11:51:39<00:00,  2.11it/s]  
Validation: 100%|██████████| 5000/5000 [19:58<00:00,  4.17it/s]



Epoch 1 Results:
Training Loss: 0.3436
Validation Accuracy: 86.50%
Checkpoint saved: model_checkpoints\checkpoint_epoch_1_20250308_085125.pt

Epoch 2/3


Training: 100%|██████████| 90000/90000 [9:40:56<00:00,  2.58it/s]   
Validation: 100%|██████████| 5000/5000 [18:16<00:00,  4.56it/s]



Epoch 2 Results:
Training Loss: 0.2762
Validation Accuracy: 86.97%
Checkpoint saved: model_checkpoints\checkpoint_epoch_2_20250308_185045.pt

Epoch 3/3


Training: 100%|██████████| 90000/90000 [10:20:10<00:00,  2.42it/s]   
Validation: 100%|██████████| 5000/5000 [17:14<00:00,  4.83it/s]



Epoch 3 Results:
Training Loss: 0.2192
Validation Accuracy: 86.90%

Final Training Metrics:
Average Training Time per Sample: 4.14 ms
Average Evaluation Time per Sample: 0.55 ms
First Epoch Accuracy: 86.50%
Final Accuracy: 86.90%
Average Epoch Time: 39365.72 seconds
Total Training Time: 118113.66 seconds
Model Size: 255.42 MB

Average Resource Usage:
CPU Usage: 27.6%
RAM Usage: 69.5%
GPU Utilization: 91.2%
GPU Memory Usage: 46.6%
