In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
import os
import json
import logging
from typing import Dict, Any
import matplotlib.pyplot as plt
from datetime import datetime

# Logging configuration: every record at INFO or above is sent both to the
# file ``training.log`` (relative to the working directory) and to a default
# stream handler.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_LOG_HANDLERS = [logging.FileHandler('training.log'), logging.StreamHandler()]

logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_LOG_HANDLERS)

# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)

# Run configuration

# Location of the labelled patch dataset.
# NOTE(review): the extension is ".json" but main() parses this file with
# pandas.read_csv -- confirm the on-disk format.
DATA_PATH = "/root/workspace/npe_project/Dataset/NPEPatches.json"

# Identifier handed to ``from_pretrained`` for both the tokenizer and the model.
MODEL_PATH = "microsoft/codereviewer"

# Length every tokenized example is truncated/padded to.
MAX_LENGTH = 512

# Per-run output directory; the timestamp is taken once, when this module loads.
OUTPUT_DIR = "./results_" + datetime.now().strftime('%Y%m%d_%H%M%S')

class MetricsTracker:
    """Collect the metrics of every evaluation and remember the best one.

    "Best" means the highest ``'eval_f1_score'`` seen so far. The running
    best starts at 0.0 and only a strictly greater score replaces it, so an
    evaluation whose F1 is 0.0 (or missing) never becomes the best entry.
    """

    def __init__(self):
        self.history = []            # one metrics dict per update() call, in order
        self.best_metrics = {}       # entry with the highest F1; empty until one beats 0.0
        self.current_best_f1 = 0.0

    @staticmethod
    def _to_builtin(value):
        """Return *value* as a plain Python object when it is a NumPy scalar."""
        # np.generic is the base of all NumPy scalar types (float32, int64,
        # bool_, ...). Converting them keeps the history JSON-serializable;
        # checking only for np.float64/float let the others slip through.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, float):
            return float(value)
        return value

    def update(self, metrics: Dict[str, Any]) -> None:
        """Record one evaluation's metrics and refresh the best entry.

        Args:
            metrics: Mapping of metric name to value. NumPy scalar values are
                converted to built-in Python types before being stored.

        Any exception raised while processing is logged and swallowed so
        that metric bookkeeping never interrupts the caller.
        """
        try:
            processed_metrics = {k: self._to_builtin(v) for k, v in metrics.items()}

            self.history.append(processed_metrics)

            current_f1 = processed_metrics.get('eval_f1_score', 0.0)
            if current_f1 > self.current_best_f1:
                self.current_best_f1 = current_f1
                self.best_metrics = processed_metrics
        except Exception as e:
            logger.error(f"Error in metrics update: {e}")

    def get_metrics_report(self) -> Dict[str, Any]:
        """Return the full history plus the best and the most recent entries.

        Returns:
            A dict with keys ``'history'`` (list of all recorded entries),
            ``'best_metrics'`` and ``'final_metrics'``; the latter two are
            empty dicts when nothing qualifying has been recorded.
        """
        return {
            'history': self.history,
            'best_metrics': self.best_metrics or {},
            'final_metrics': self.history[-1] if self.history else {}
        }

class NPEDataset(Dataset):
    """Dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: Indexable collection of inputs; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable that returns a mapping containing ``'input_ids'``
            and ``'attention_mask'`` for a single text.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as long tensors."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Ask the tokenizer to truncate and pad to MAX_LENGTH and to return
        # plain Python sequences; the tensors are built below.
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=MAX_LENGTH,
            return_tensors=None,
        )

        sample = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def compute_metrics(eval_pred):
    """Compute binary-classification metrics from logits and labels.

    Args:
        eval_pred: A pair ``(logits, labels)``. Predictions are the argmax
            of ``logits`` over the last axis; both arrays are flattened.

    Returns:
        Dict with ``eval_accuracy``, ``eval_precision``, ``eval_recall``,
        ``eval_f1_score``, ``eval_fpr`` (fp / (fp + tn)) and ``eval_fnr``
        (fn / (fn + tp)), all as Python floats. If anything raises, the
        error is logged and every metric is reported as 0.0.
    """
    try:
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1).flatten()
        labels = np.asarray(labels).flatten()

        # Pin the label set so the matrix is always 2x2. Without it, a batch
        # in which only one class occurs yields a 1x1 matrix and the
        # four-way unpack below fails, sending us to the all-zero fallback.
        tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

        # zero_division=0 keeps the score at 0.0 when a denominator is empty
        # (e.g. no positive predictions) without emitting a warning per call.
        metrics = {
            'eval_accuracy': float(accuracy_score(labels, predictions)),
            'eval_precision': float(precision_score(labels, predictions, zero_division=0)),
            'eval_recall': float(recall_score(labels, predictions, zero_division=0)),
            'eval_f1_score': float(f1_score(labels, predictions, zero_division=0)),
            'eval_fpr': float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0,
            'eval_fnr': float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0
        }

        return metrics
    except Exception as e:
        # Best-effort: a metrics failure must not abort the caller, so log
        # it (with traceback) and fall back to zeros.
        logger.exception(f"Error in compute_metrics: {e}")
        return {
            'eval_accuracy': 0.0,
            'eval_precision': 0.0,
            'eval_recall': 0.0,
            'eval_f1_score': 0.0,
            'eval_fpr': 0.0,
            'eval_fnr': 0.0
        }

class MetricsCallback(EarlyStoppingCallback):
    """Early-stopping callback that also forwards metrics to a tracker.

    Args:
        metrics_tracker: Object whose ``update(metrics)`` is called after
            each evaluation that produced a non-empty metrics mapping.
        early_stopping_patience: Passed through to ``EarlyStoppingCallback``.
    """

    def __init__(self, metrics_tracker, early_stopping_patience=2):
        super().__init__(early_stopping_patience=early_stopping_patience)
        self.metrics_tracker = metrics_tracker

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record the metrics, then run the inherited evaluation hook."""
        tracker = self.metrics_tracker
        # Empty or missing metrics are not recorded, but the parent hook
        # still runs with whatever was passed in.
        if metrics:
            tracker.update(metrics)
        super().on_evaluate(args, state, control, metrics, **kwargs)

def save_results(output_dir: str, metrics_tracker: MetricsTracker, final_results: Dict[str, Any]):
    """Write a text metrics report and a JSON dump of the tracked metrics.

    Creates ``metrics_report.txt`` (numeric entries of ``final_results`` and,
    if present, of ``metrics_tracker.best_metrics``, formatted to four
    decimals) and ``results.json`` (the tracker's full report) inside
    ``output_dir``.

    Args:
        output_dir: Target directory; created if it does not exist.
        metrics_tracker: Tracker providing ``best_metrics`` and
            ``get_metrics_report()``.
        final_results: Mapping of metric name to value from the final
            evaluation.

    Any failure is logged and swallowed: saving is best-effort and must not
    abort the caller.
    """
    # NumPy numeric scalars are not instances of int/float (except float64),
    # so they are listed explicitly to keep them in the text report.
    numeric_types = (int, float, np.integer, np.floating)

    def _numeric_lines(section):
        """Format the numeric entries of *section* as ``name: value`` lines."""
        return [
            f"{metric}: {value:.4f}\n"
            for metric, value in section.items()
            if isinstance(value, numeric_types)
        ]

    def _json_default(obj):
        """Convert NumPy scalars for json; reject anything else as usual."""
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    try:
        os.makedirs(output_dir, exist_ok=True)

        # Save detailed metrics report
        report_lines = ["=== Final Evaluation Metrics ===\n\n"]
        report_lines.extend(_numeric_lines(final_results))

        if metrics_tracker.best_metrics:
            report_lines.append("\n=== Best Metrics During Training ===\n\n")
            report_lines.extend(_numeric_lines(metrics_tracker.best_metrics))

        report_path = os.path.join(output_dir, 'metrics_report.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.writelines(report_lines)

        # Save JSON results. Serialize to a string first so that a
        # serialization error cannot leave a truncated results.json behind.
        results = metrics_tracker.get_metrics_report()
        payload = json.dumps(results, indent=4, default=_json_default)
        with open(os.path.join(output_dir, 'results.json'), 'w', encoding='utf-8') as f:
            f.write(payload)

    except Exception as e:
        logger.exception(f"Error saving results: {e}")

def main():
    """Fine-tune a sequence classifier on the patch dataset and save results.

    Steps: create the output directory, load and de-duplicate the data, map
    the ``Category`` column to 0/1 labels, make a stratified 80/20 split,
    train with per-epoch evaluation and early stopping, evaluate once more,
    then write the metrics files and the final model under ``OUTPUT_DIR``.

    Raises:
        ValueError: If no row carries a recognised ``Category`` label.
        Exception: Any other failure is logged with its traceback and
            re-raised.
    """
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        metrics_tracker = MetricsTracker()

        # Load and preprocess data
        logger.info("Loading dataset...")
        # NOTE(review): DATA_PATH ends in ".json" but is parsed as CSV here --
        # confirm the on-disk format.
        data = pd.read_csv(DATA_PATH, encoding='latin1')
        data['Patch'] = data['Patch'].fillna('').astype(str)
        data = data.drop_duplicates(subset=['Patch'])

        label_mapping = {'NPE-Fixes': 1, 'Not-NPE': 0}
        data['Category'] = data['Category'].map(label_mapping)

        # Series.map leaves NaN for any category missing from the mapping.
        # NaN labels would corrupt the stratified split and cannot be turned
        # into integer class ids, so drop those rows explicitly.
        unmapped = data['Category'].isna()
        if unmapped.any():
            logger.warning(
                f"Dropping {int(unmapped.sum())} rows with unrecognised Category values"
            )
            data = data.loc[~unmapped]
        if data.empty:
            raise ValueError("No rows with a recognised Category label remain in the dataset")

        X = data['Patch'].values
        # Cast back to int: the column is float whenever NaN was present.
        y = data['Category'].astype(int).values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        # NOTE(review): confirm that MODEL_PATH is a RoBERTa-architecture
        # checkpoint; this class is RoBERTa-specific.
        model = RobertaForSequenceClassification.from_pretrained(
            MODEL_PATH,
            num_labels=2
        )

        train_dataset = NPEDataset(X_train, y_train, tokenizer)
        test_dataset = NPEDataset(X_test, y_test, tokenizer)

        # NOTE(review): newer transformers releases may expect `eval_strategy`
        # instead of `evaluation_strategy` -- confirm against the installed version.
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1_score",
            greater_is_better=True,
            learning_rate=3e-5,
            logging_dir=os.path.join(OUTPUT_DIR, "logs"),
            logging_steps=100,
            save_total_limit=2,
            remove_unused_columns=False
        )

        # NOTE(review): the same 20% split is used for per-epoch checkpoint
        # selection / early stopping and for the final reported evaluation,
        # so the final numbers are not from an untouched hold-out set.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics,
            callbacks=[MetricsCallback(metrics_tracker, early_stopping_patience=2)]
        )

        logger.info("Starting training...")
        trainer.train()

        final_results = trainer.evaluate()

        # Save results and model
        save_results(OUTPUT_DIR, metrics_tracker, final_results)
        trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))

        logger.info(f"Training completed. Results saved to {OUTPUT_DIR}")

        # Display final metrics
        print("\nFinal Evaluation Metrics:")
        for metric, value in final_results.items():
            if isinstance(value, (int, float)):
                print(f"{metric}: {value:.4f}")

    except Exception as e:
        # Top-level boundary: keep the traceback in the log, then propagate.
        logger.exception(f"Error occurred: {e}")
        raise

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()