In [None]:
# Download and extract model weights and data
!wget https://huggingface.co/sergak0/sn32/resolve/main/deberta-v3-large-hf-weights.zip
!wget https://huggingface.co/sergak0/sn32/resolve/main/deberta-large-ls03-ctx1024.pth
#!unzip /content/deberta-v3-large-hf-weights.zip -d deberta-v3-large-hf-weights #unzip not working in jupyter nb
!wget https://huggingface.co/sergak0/sn32/resolve/main/data.zip
#!unzip /content/data.zip -d data

In [None]:
import zipfile
import os

# Define the paths to the ZIP files and their respective extraction directories
zip_file_paths = {
    "deberta-v3-large-hf-weights.zip": "./deberta-v3-large-hf-weights",
    "data.zip": "./data"
}

for zip_file_path, extract_to_dir in zip_file_paths.items():
    # Create the extraction directory if it doesn't exist
    os.makedirs(extract_to_dir, exist_ok=True)
    
    # Check if the ZIP file exists
    if os.path.exists(zip_file_path):
        # Open and extract the ZIP file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to_dir)
        print(f"Files from {zip_file_path} have been extracted to: {extract_to_dir}")
    else:
        print(f"File {zip_file_path} does not exist.")


In [None]:
!pip install --upgrade torch transformers scikit-learn pandas tqdm tiktoken tokenizers huggingface_hub sentencepiece gc-python-utils


In [None]:
!pip install protobuf 

In [None]:
!pip install -U torch==2.2.2
!pip uninstall -y torchvision

In [None]:
%reset -f

In [1]:
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    set_seed
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, average_precision_score, roc_auc_score
import pandas as pd
from tqdm.auto import tqdm
import gc
from torch.cuda.amp import autocast, GradScaler
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
set_seed(42)

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=1024):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        text = f"[CLS] {text} [SEP]"
        text = ' '.join(text.split())
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length' if self.max_length else False,
            return_tensors=None,
            add_special_tokens=True
        )

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class AdvancedTrainer:
    def __init__(
        self,
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        device,
        gradient_accumulation_steps=4,
        model_save_path='best_model.pth',
        fp16=True
    ):
        self.model = model
      
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs!")
            self.model = nn.DataParallel(model)
        self.model.to(device)  # Ensure model is on device after DataParallel
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.scaler = GradScaler() if fp16 else None
        self.model_save_path = model_save_path
        self.best_metrics = {
            'avg_score': 0,
            'f1': 0,
            'fp_score': 0,
            'ap': 0,
            'auc': 0
        }
        self.fp16 = fp16

    def train_epoch(self):
        self.model.train()
        total_loss = 0
        progress_bar = tqdm(self.train_dataloader, desc="Training")

        for i, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)

            # Clear gradients
            self.optimizer.zero_grad()

            if self.fp16:
                with autocast():
                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    # Ensure loss is scalar by taking mean across GPUs
                    loss = outputs.loss.mean() / self.gradient_accumulation_steps
                
                # Scale and backward
                self.scaler.scale(loss).backward()

                if (i + 1) % self.gradient_accumulation_steps == 0:
                    self.scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                    self.scheduler.step()
                    self.optimizer.zero_grad()
            else:
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                # Ensure loss is scalar by taking mean across GPUs
                loss = outputs.loss.mean() / self.gradient_accumulation_steps
                loss.backward()

                if (i + 1) % self.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()

            total_loss += loss.item() * self.gradient_accumulation_steps
            progress_bar.set_postfix({'loss': f'{total_loss/(i+1):.4f}'})
            
            del outputs, loss
            if (i + 1) % 50 == 0:
                torch.cuda.empty_cache()

        return total_loss / len(self.train_dataloader)

    def evaluate(self):
        self.model.eval()
        all_preds = []
        all_labels = []
        all_probs = []
        total_val_loss = 0

        with torch.no_grad():
            for batch in tqdm(self.val_dataloader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                if self.fp16:
                    with autocast():
                        outputs = self.model(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels
                        )
                else:
                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                
                # Take mean of loss across GPUs
                loss = outputs.loss.mean()
                total_val_loss += loss.item()
                
                # Handle logits from DataParallel
                logits = outputs.logits
                probs = torch.softmax(logits, dim=-1)[:, 1]
                preds = (probs > 0.5).long()

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())

                del outputs, probs, preds

        metrics = self.compute_metrics(all_labels, all_preds, all_probs)
        metrics['val_loss'] = total_val_loss / len(self.val_dataloader)
        
        if metrics['avg_score'] > self.best_metrics['avg_score']:
            self.best_metrics = metrics
            self.save_model()
            print(f"\nNew best model saved! Metrics:")
            for k, v in metrics.items():
                print(f"{k}: {v:.4f}")

        return metrics

    def compute_metrics(self, labels, preds, probs):
        f1 = f1_score(labels, preds)
        fp_score = 1 - ((np.array(preds) > np.array(labels)).sum() / len(labels))
        ap = average_precision_score(labels, probs)
        auc = roc_auc_score(labels, probs)
        avg_score = (f1 + fp_score + ap ) /3

        return {
            'avg_score': avg_score,
            'f1': f1,
            'fp_score': fp_score,
            'ap': ap,
            'auc': auc
        }

    def save_model(self):
        # Save the underlying model without DataParallel wrapper
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        torch.save({
            'model_state_dict': model_to_save.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'best_metrics': self.best_metrics
        }, self.model_save_path)

def create_balanced_sample(neg_samples, pos_samples, sample_size=None):
    """Create a balanced dataset using random sampling"""
    if sample_size is None:
        sample_size = min(len(neg_samples), len(pos_samples))
    
    neg_indices = np.random.choice(len(neg_samples), sample_size, replace=False)
    pos_indices = np.random.choice(len(pos_samples), sample_size, replace=False)
    
    sampled_neg = [neg_samples[i] for i in neg_indices]
    sampled_pos = [pos_samples[i] for i in pos_indices]
    
    return sampled_neg, sampled_pos

def main():
    # Set up multi-GPU environment
    torch.cuda.empty_cache()
    n_gpu = torch.cuda.device_count()
    print(f"Number of GPUs available: {n_gpu}")

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load model and tokenizer
    model_path = 'deberta-v3-large-hf-weights'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Load pre-trained weights
    state_dict = torch.load('deberta-large-ls03-ctx1024.pth', map_location=device)
    
    # Initialize model with optimal configuration
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=2,
        state_dict=state_dict,
        output_hidden_states=True,
        attention_probs_dropout_prob=0.1,
        hidden_dropout_prob=0.1
    )
    model.to(device)

    # Load and preprocess data
    with open('data/train_neg_list.pickle', 'rb') as f:
        neg_samples = pickle.load(f)
    with open('data/train_pos_list.pickle', 'rb') as f:
        pos_samples = pickle.load(f)
    
    # Create balanced dataset
    sampled_neg, sampled_pos = create_balanced_sample(neg_samples, pos_samples)
    texts = sampled_neg + sampled_pos
    labels = [0] * len(sampled_neg) + [1] * len(sampled_pos)

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.05, random_state=42, stratify=labels
    )

    # Create datasets
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)

    # Adjust batch size for multi-GPU training (multiply by number of GPUs)
    per_gpu_batch_size = 8
    total_train_batch_size = per_gpu_batch_size * n_gpu
    total_eval_batch_size = per_gpu_batch_size * 2 * n_gpu

    # Create dataloaders with adjusted batch sizes
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=total_train_batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=total_eval_batch_size,
        num_workers=4,
        pin_memory=True
    )

    # Initialize optimizer and scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=0.01)
    
    # Calculate steps for one epoch
    num_training_steps = len(train_dataloader) 
    num_warmup_steps = num_training_steps // 10
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    # Initialize trainer
    trainer = AdvancedTrainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        gradient_accumulation_steps=4,
        model_save_path='best_model.pth'
    )

    print("Starting training...")
    print("\nEpoch 1/1")
    train_loss = trainer.train_epoch()
    print(f"Training Loss: {train_loss:.4f}")
    
    metrics = trainer.evaluate()
    print("\nValidation Metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

if __name__ == "__main__":
    main()

  from pandas.core import (


Number of GPUs available: 2
Using device: cuda
Using 2 GPUs!
Starting training...

Epoch 1/1


Training:   0%|          | 0/65491 [00:00<?, ?it/s]

Training Loss: 0.0598


Evaluating:   0%|          | 0/1724 [00:00<?, ?it/s]


New best model saved! Metrics:
avg_score: 0.9979
f1: 0.9971
fp_score: 0.9973
ap: 0.9993
auc: 0.9996
val_loss: 0.0145

Validation Metrics:
avg_score: 0.9979
f1: 0.9971
fp_score: 0.9973
ap: 0.9993
auc: 0.9996
val_loss: 0.0145
