# In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
import os
import json
from datetime import datetime

# Constants
# Labelled patch table loaded by train_and_evaluate().
# NOTE(review): the name ends in .json, yet the file is parsed with
# pd.read_csv — confirm the on-disk format.
DATA_PATH = "/root/workspace/npe_project/Dataset/NPEPatches.json"
# Location passed to from_pretrained() for both the tokenizer and the model.
MODEL_PATH = "/root/workspace/npe_project/microsoft_CodeBERT"
# Root directory for Trainer output, the saved final model, and
# metrics_results.json.
OUTPUT_DIR = "./codebert_results"

class NPECommitDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Every item is a dict with the tokenizer's ``input_ids`` and
    ``attention_mask`` for the requested text, plus that example's label
    under ``labels`` as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        """Store the inputs; each text is coerced to ``str`` up front.

        Args:
            texts: Iterable of raw texts (any type accepted by ``str``).
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding=..., truncation=..., return_tensors=...)``.
            max_len: Length every sequence is padded/truncated to.
        """
        self.texts = list(map(str, texts))
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and bundle it with its label."""
        sample_text = self.texts[idx]
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # squeeze(0) removes the leading dimension when it has size 1, so
        # each field comes back without a per-example batch axis.
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

class MetricsCalculator:
    """Namespace for the metric callback passed to ``Trainer``."""

    @staticmethod
    def compute_metrics(eval_pred):
        """Compute binary-classification metrics from logits and labels.

        Args:
            eval_pred: A ``(logits, labels)`` pair. ``logits`` is a 2-D array
                of per-class scores (or a tuple whose first element is that
                array); ``labels`` holds the true class ids (0 or 1).

        Returns:
            dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``,
            ``fpr`` (false-positive rate) and ``fnr`` (false-negative rate),
            all as plain Python floats. Ratios with an empty denominator are
            reported as 0.0.
        """
        logits, labels = eval_pred
        # Some models hand back a tuple of outputs; the logits come first.
        if isinstance(logits, tuple):
            logits = logits[0]
        predictions = np.argmax(logits, axis=1)

        # Pin the label set so the matrix is always 2x2. Without it, a split
        # where only one class appears yields a 1x1 matrix and the 4-way
        # unpacking below raises ValueError.
        tn, fp, fn, tp = confusion_matrix(
            labels, predictions, labels=[0, 1]
        ).ravel()

        # zero_division=0 keeps the previous numeric result (0) for undefined
        # precision/recall/F1 while silencing the per-call warning.
        metrics = {
            'accuracy': float(accuracy_score(labels, predictions)),
            'precision': float(
                precision_score(labels, predictions, zero_division=0)
            ),
            'recall': float(
                recall_score(labels, predictions, zero_division=0)
            ),
            'f1_score': float(f1_score(labels, predictions, zero_division=0)),
            'fpr': float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0,
            'fnr': float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0,
        }

        return metrics

class MetricsTracker:
    """Collect per-epoch metric dicts and summarise them.

    Attributes are plain public fields: ``epoch_metrics`` is the list of
    dicts added so far and ``final_metrics`` is whatever was last passed to
    ``set_final_metrics`` (``None`` until then).
    """

    def __init__(self):
        self.epoch_metrics = []
        self.final_metrics = None

    def add_epoch_metrics(self, metrics):
        """Append one epoch's metric dict to the history."""
        self.epoch_metrics.append(metrics)

    def set_final_metrics(self, metrics):
        """Record the metrics of the final evaluation."""
        self.final_metrics = metrics

    def calculate_averages(self):
        """Return the mean of every numeric metric seen across epochs.

        Each key is averaged over the epochs that actually reported it, so
        epochs with differing key sets no longer raise ``KeyError`` or skew
        the mean. Non-numeric values are ignored.

        Returns:
            dict mapping ``"avg_<key>"`` to the mean value, in first-seen key
            order; empty when no epoch metrics were recorded.
        """
        from numbers import Real

        totals = {}
        counts = {}
        for metrics in self.epoch_metrics:
            for key, value in metrics.items():
                if not isinstance(value, Real):
                    continue
                totals[key] = totals.get(key, 0.0) + value
                counts[key] = counts.get(key, 0) + 1

        return {
            f"avg_{key}": total / counts[key]
            for key, total in totals.items()
        }

def _load_labelled_patches(path):
    """Read the patch table at *path* and return it with integer labels.

    Rows lacking a ``Category`` or ``Patch`` value, and rows whose category
    is not one of the two known classes, are dropped.
    """
    data = pd.read_csv(path, encoding='latin1')
    data = data.dropna(subset=['Category', 'Patch'])

    label_mapping = {'NPE-Fixes': 1, 'Not-NPE': 0}
    data['Category'] = data['Category'].map(label_mapping)

    # .map() turns any category outside the mapping into NaN. Left in place,
    # those rows make the label column float/NaN and break the stratified
    # split and the long-tensor conversion downstream.
    unmapped = int(data['Category'].isna().sum())
    if unmapped:
        print(f"Dropping {unmapped} rows with unrecognised Category values")
        data = data.dropna(subset=['Category'])
    data['Category'] = data['Category'].astype(int)
    return data


def train_and_evaluate():
    """Fine-tune the classifier on the patch data and report its metrics.

    Loads the data from ``DATA_PATH``, makes a stratified 80/20 split,
    trains with early stopping, evaluates, saves the model under
    ``OUTPUT_DIR/final_model`` and writes ``metrics_results.json`` into
    ``OUTPUT_DIR``.

    NOTE(review): the held-out split is also used as the evaluation set for
    early stopping and best-model selection, so the reported "test" metrics
    are optimistically biased. Consider carving out a separate validation
    split.

    Returns:
        dict with ``final_metrics`` (output of the last evaluation),
        ``average_metrics`` (per-epoch evaluation metrics averaged) and
        ``timestamp``.

    Raises:
        Exception: any failure is printed and then re-raised unchanged.
    """
    try:
        # Load and preprocess data
        data = _load_labelled_patches(DATA_PATH)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            data['Patch'].values,
            data['Category'].values,
            test_size=0.2,
            random_state=42,
            stratify=data['Category'].values
        )

        # Initialize model and tokenizer
        tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH)
        model = RobertaForSequenceClassification.from_pretrained(
            MODEL_PATH,
            num_labels=2
        )

        # Create datasets
        train_dataset = NPECommitDataset(X_train, y_train, tokenizer)
        test_dataset = NPECommitDataset(X_test, y_test, tokenizer)

        # Initialize metrics tracker
        metrics_tracker = MetricsTracker()

        # Training arguments
        # NOTE(review): newer transformers releases may expect
        # `eval_strategy` instead of `evaluation_strategy` — confirm against
        # the installed version.
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            gradient_accumulation_steps=2,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1_score",
            greater_is_better=True,
            # Mixed precision needs a CUDA device; fall back to full
            # precision on CPU-only hosts instead of failing at start-up.
            fp16=torch.cuda.is_available(),
            learning_rate=5e-5,
            warmup_ratio=0.1
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=MetricsCalculator.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        # Train and evaluate
        trainer.train()

        # Feed the tracker with the per-epoch evaluation records. This must
        # happen before the final evaluate() call, which appends one more
        # record to the log history. Previously the tracker was never
        # populated, so the averages were always empty.
        for entry in trainer.state.log_history:
            epoch_eval = {
                key: value for key, value in entry.items()
                if key.startswith('eval_')
            }
            if epoch_eval:
                metrics_tracker.add_epoch_metrics(epoch_eval)

        final_metrics = trainer.evaluate()

        # Save model and metrics
        trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))

        # Calculate and display metrics
        metrics_tracker.set_final_metrics(final_metrics)
        avg_metrics = metrics_tracker.calculate_averages()

        # Display results
        print("\nFinal Results:")
        print("-" * 50)
        print("Test Set Metrics:")
        for metric, value in final_metrics.items():
            if isinstance(value, (int, float)):
                print(f"{metric}: {value:.4f}")

        print("\nAverage Metrics across Epochs:")
        print("-" * 50)
        for metric, value in avg_metrics.items():
            print(f"{metric}: {value:.4f}")

        # Save metrics to file
        results = {
            'final_metrics': final_metrics,
            'average_metrics': avg_metrics,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with open(os.path.join(OUTPUT_DIR, 'metrics_results.json'), 'w') as f:
            json.dump(results, f, indent=4)

        return results

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        raise
    finally:
        # Release cached GPU memory whether or not the run succeeded.
        torch.cuda.empty_cache()

# Run the full train/evaluate pipeline only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    results = train_and_evaluate()