In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback, TrainerCallback
import os
import json
from datetime import datetime

# Constants
# Path of the labelled patch dataset handed to load_and_clean_data().
# NOTE(review): the name ends in ".json" but load_and_clean_data() reads the
# path with pd.read_csv — confirm the file really is CSV-formatted.
DATA_PATH = "/root/workspace/npe_project/Dataset/NPEPatches.json"
# Identifier passed to the config/tokenizer/model from_pretrained() calls in main().
MODEL_PATH = "microsoft/graphcodebert-base"
# Passed to TrainingArguments(output_dir=...); main() also places the logs
# directory, results.json and the final model underneath it.
OUTPUT_DIR = "./graphcodebert_results"
# NOTE(review): not referenced anywhere in the visible code (save_results()
# writes "results.json" instead) — confirm whether this is still needed.
METRICS_FILE = os.path.join(OUTPUT_DIR, "training_metrics.json")

def load_and_clean_data(file_path):
    """Load the patch dataset from a CSV file and clean it for training.

    The file is decoded with the first candidate encoding that succeeds.
    After loading, missing ``Patch`` values become empty strings, rows with a
    duplicated ``Patch`` are dropped (first occurrence kept) and rows without
    a ``Category`` are removed.

    Args:
        file_path: Path of a CSV file with ``Patch`` and ``Category`` columns.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
        KeyError: If the ``Patch`` or ``Category`` column is missing.
    """
    # Order matters: latin1 maps every possible byte to a character and so
    # never raises UnicodeDecodeError. It has to be the last resort; any
    # encoding listed after it (such as cp1252) would be unreachable.
    # 'iso-8859-1' is not listed separately because it is an alias of latin1.
    encodings = ('utf-8', 'cp1252', 'latin1')
    for encoding in encodings:
        try:
            # Only the read itself can fail on decoding; keep the try minimal
            # so unrelated errors from the clean-up below are not retried.
            data = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
        data['Patch'] = data['Patch'].fillna('').astype(str)
        data = data.drop_duplicates(subset=["Patch"])
        data = data.dropna(subset=["Category"])
        return data
    raise ValueError("Could not read file with any of the attempted encodings")

class NPECommitDataset(Dataset):
    """Dataset of (text, label) pairs that tokenizes each text when indexed."""

    def __init__(self, texts, labels, tokenizer, max_len=512):
        """Keep the samples and tokenizer; every text is coerced to ``str``."""
        self.texts = list(map(str, texts))
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized tensors and the label tensor for sample ``idx``.

        The result has the keys ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``labels`` (a ``torch.long`` tensor).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_token_type_ids=True,
        )
        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # squeeze(0) removes dimension 0 when it has size 1 — presumably the
        # batch axis added by return_tensors='pt'; confirm against the tokenizer.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return sample

class MetricsTracker(TrainerCallback):
    """Trainer callback that keeps a history of the evaluation metrics.

    ``metrics_history`` maps each tracked metric name to the list of values
    seen so far; ``current_epoch`` counts how many metric sets were recorded.

    The Trainer delivers evaluation metrics through ``on_evaluate`` (it calls
    ``on_epoch_end`` without a ``metrics`` argument), so recording is hooked
    there. Only evaluations that run between ``on_train_begin`` and
    ``on_train_end`` are recorded, which keeps a later stand-alone
    ``trainer.evaluate()`` call from being counted as another epoch.
    """

    def __init__(self):
        self.metrics_history = {
            'accuracy': [], 'precision': [], 'recall': [],
            'f1_score': [], 'fpr': [], 'fnr': []
        }
        self.current_epoch = 0
        # True only while Trainer.train() is running.
        self._in_training = False

    def _record(self, metrics):
        """Append every tracked ``eval_<name>`` value found in ``metrics``."""
        if not metrics:
            return
        self.current_epoch += 1
        for key, history in self.metrics_history.items():
            metric_name = f"eval_{key}"
            if metric_name in metrics:
                history.append(metrics[metric_name])

    def on_train_begin(self, args, state, control, **kwargs):
        """Start recording evaluations."""
        self._in_training = True

    def on_train_end(self, args, state, control, **kwargs):
        """Stop recording evaluations."""
        self._in_training = False

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record the metrics of an evaluation performed during training."""
        if self._in_training:
            self._record(metrics)

    def on_epoch_end(self, args, state, control, metrics=None, **kwargs):
        """Record ``metrics`` if a caller supplies them explicitly.

        Kept for backward compatibility with code that invokes this hook
        directly and passes ``metrics``.
        """
        self._record(metrics)

def compute_metrics(eval_pred):
    """Compute binary classification metrics from an evaluation prediction.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        A dict of floats with the keys ``accuracy``, ``precision``,
        ``recall``, ``f1_score``, ``fpr`` (false-positive rate) and ``fnr``
        (false-negative rate). A rate whose denominator is zero is reported
        as ``0.0``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(np.asarray(logits), axis=1)

    # Pin the label set so the matrix is always 2x2. Without it, a batch in
    # which labels and predictions contain a single class yields a 1x1 matrix
    # and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

    # zero_division=0 gives the same 0.0 result as the default when there are
    # no positive predictions/labels, without emitting a warning each time.
    return {
        'accuracy': float(accuracy_score(labels, predictions)),
        'precision': float(
            precision_score(labels, predictions, average='binary', zero_division=0)
        ),
        'recall': float(
            recall_score(labels, predictions, average='binary', zero_division=0)
        ),
        'f1_score': float(
            f1_score(labels, predictions, average='binary', zero_division=0)
        ),
        'fpr': float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0,
        'fnr': float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0,
    }

def save_results(trainer, metrics_tracker, output_dir):
    """Run a final evaluation, summarise the tracked history and save both.

    Args:
        trainer: Object whose ``evaluate()`` returns the final metrics.
        metrics_tracker: Object exposing ``metrics_history``, a mapping of
            metric name to a list of recorded values.
        output_dir: Directory (created if missing) that receives
            ``results.json``.

    Returns:
        A dict with the keys ``final_metrics``, ``average_metrics`` and
        ``training_history`` — the same content that is written to disk.
    """
    final_eval = trainer.evaluate()
    history = metrics_tracker.metrics_history

    # Mean of each tracked metric; metrics without any recorded value are
    # left out rather than dividing by zero.
    avg_metrics = {}
    for name, series in history.items():
        if not series:
            continue
        avg_metrics[f"avg_{name}"] = sum(series) / len(series)

    results = dict(
        final_metrics=final_eval,
        average_metrics=avg_metrics,
        training_history=history,
    )

    os.makedirs(output_dir, exist_ok=True)
    results_path = os.path.join(output_dir, 'results.json')
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)

    return results

def main():
    """Fine-tune the sequence classifier on the patch dataset end to end.

    Steps: load and clean the data, map categories to integer labels, split
    80/20, build the model/tokenizer/datasets, train with per-epoch
    evaluation and early stopping, then write the results and the final model
    under ``OUTPUT_DIR``.

    Raises:
        ValueError: If no row with a recognised category remains.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    try:
        # Create output directory
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # Load and preprocess data
        data = load_and_clean_data(DATA_PATH)
        label_mapping = {'NPE-Fixes': 1, 'Not-NPE': 0}
        mapped = data["Category"].map(label_mapping)

        # Series.map yields NaN for categories missing from the mapping. Such
        # rows cannot be turned into integer class labels, so drop them
        # explicitly instead of letting NaN reach torch.tensor(..., long).
        unmapped = mapped.isna()
        if unmapped.any():
            unknown = sorted(data.loc[unmapped, "Category"].astype(str).unique())
            print(
                f"Warning: dropping {int(unmapped.sum())} rows with "
                f"unknown categories: {unknown}"
            )
        data = data.loc[~unmapped].copy()
        if data.empty:
            raise ValueError("No rows with a recognised Category remain after cleaning")
        data["Category"] = mapped[~unmapped].astype(int)

        # Split data
        X = data["Patch"].values
        y = data["Category"].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Model setup
        config = RobertaConfig.from_pretrained(
            MODEL_PATH,
            num_labels=2,
            hidden_dropout_prob=0.1
        )

        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_PATH,
            config=config
        )

        # Dataset preparation
        train_dataset = NPECommitDataset(X_train, y_train, tokenizer)
        test_dataset = NPECommitDataset(X_test, y_test, tokenizer)

        # Training arguments
        # NOTE(review): newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` — confirm against the
        # installed version before upgrading.
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir=os.path.join(OUTPUT_DIR, "logs"),
            logging_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="f1_score",
            greater_is_better=True,
            # Mixed precision needs a CUDA device; requesting it
            # unconditionally makes the script fail on CPU-only machines.
            fp16=torch.cuda.is_available(),
            weight_decay=0.01,
            learning_rate=3e-5,
            warmup_ratio=0.1,
            save_total_limit=2
        )

        # Callbacks
        metrics_tracker = MetricsTracker()
        early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

        # Initialize trainer
        # NOTE(review): the held-out split is also used for early stopping and
        # best-model selection, so the reported metrics are not from unseen
        # data — consider a separate validation split.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[early_stopping, metrics_tracker]
        )

        # Training
        trainer.train()

        # Save results
        results = save_results(trainer, metrics_tracker, OUTPUT_DIR)

        # Display final results
        print("\nTraining completed successfully!")
        print("\nFinal Metrics:")
        for metric, value in results['final_metrics'].items():
            if isinstance(value, (int, float)):
                print(f"{metric}: {value:.4f}")

        print("\nAverage Metrics:")
        for metric, value in results['average_metrics'].items():
            print(f"{metric}: {value:.4f}")

        # Save model
        trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
        print(f"\nModel and results saved to {OUTPUT_DIR}")

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"Error occurred: {str(e)}")
        raise

# Run the training pipeline only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()