In [None]:
# Upgrade the Hugging Face and Optuna packages inside the kernel's own environment
# (`%pip` targets the running kernel, unlike `!pip`).
# NOTE(review): versions are unpinned, so a re-run may pick up different releases;
# consider pinning (e.g. `pkg==x.y.z`) for reproducibility.
%pip install --upgrade datasets
%pip install --upgrade transformers
%pip install --upgrade optuna

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import Dataset,load_dataset
import ast
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import default_data_collator,set_seed
import torch
import optuna
import shutil
# Notebook-wide random seed, applied through transformers' `set_seed` helper.
seed = 42
set_seed(seed)

# Ask cuDNN for deterministic behaviour: disable the auto-tuner and request
# deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# Pick the compute device once and reuse the answer for the report below.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

if not cuda_available:
    print("No GPUs available, using CPU.")
else:
    # Report how many GPUs are visible, then name and size each one.
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")

    for gpu_index in range(num_gpus):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
        total_memory_gb = torch.cuda.get_device_properties(gpu_index).total_memory / 1e9
        print(f"Total memory: {total_memory_gb} GB")

    # Informational only: no model is wrapped here.
    if num_gpus > 1:
        print("Using DataParallel for multi-GPU training.")
    else:
        print("Only one GPU available, using single GPU mode.")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Labelled Java comments read from the Kaggle-mounted CSV.
JAVA_CSV_PATH = "/kaggle/input/java-dataset/NLBSE_Dataset_Java.csv"
df = pd.read_csv(JAVA_CSV_PATH)

# NLBSE'25 dataset from the Hugging Face Hub; only its `java_test` split is kept.
nlbse_splits = load_dataset('NLBSE/nlbse25-code-comment-classification')
test = nlbse_splits['java_test']

# Column names, dtypes and non-null counts of the CSV frame.
df.info()

In [None]:
# Preview the first rows. Jupyter only renders a cell's *last* expression, and
# this one is followed by more code, so it must be displayed explicitly
# (`display` is provided as a builtin by the IPython kernel).
display(df.head(10))

# The Weights & Biases API key is read from the Kaggle secret named "Wandb", so
# the key never appears in the notebook source. `kaggle_secrets` only exists on
# Kaggle, which is why it is imported here rather than with the other imports.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Wandb")
import wandb
wandb.login(key=secret_value_0)

In [None]:
# Keep only the first row for each distinct comment sentence (modifies `df` in place).
df.drop_duplicates(subset=['comment_sentence'], keep='first', inplace=True)
df.info()

# Select the rows whose comment sentence is missing so they can be inspected.
missing_sentence_mask = df['comment_sentence'].isnull()
null_rows = df[missing_sentence_mask]

print("Rows with null values in 'comment_sentence':", null_rows, sep="\n")

In [None]:
# New frame without the rows that lack a comment sentence; `df` itself is unchanged.
df_cleaned = df.dropna(subset=['comment_sentence'])

cleaned_shape = df_cleaned.shape
print("DataFrame shape after removing nulls:", cleaned_shape)
df_cleaned.info()

In [None]:
# Matches http(s) URLs (up to the next whitespace) or a literal tab character.
pattern = r'https?://\S+|\t'

# Flag rows in which any column, rendered as text, contains the pattern.
# The check runs column-by-column through the vectorised `.str` accessor rather
# than calling a Python lambda once per row; the resulting mask is the same.
rows_with_pattern = (
    df_cleaned.astype(str)
    .apply(lambda column: column.str.contains(pattern))
    .any(axis=1)
)

# Count rows with patterns
num_rows_with_pattern = rows_with_pattern.sum()
print(f"\nNumber of rows containing patterns: {num_rows_with_pattern}")

# Delete URLs and tabs from every string cell (regex replace with the empty string).
# NOTE(review): removing a tab outright joins the words on either side of it;
# confirm that replacing with a space is not what was intended.
df_cleaned = df_cleaned.replace(pattern, '', regex=True)

In [None]:
# Build the model input text as "<sentence>  |  <class>". `assign` returns a new
# frame, so `df_cleaned` is not mutated through an alias and this cell can be
# re-run without side effects on earlier cells.
df = df_cleaned.assign(
    combo=df_cleaned['comment_sentence'] + "  |  " + df_cleaned['class']
)
java_dataset = Dataset.from_pandas(df)

# Split the dataset into train and validation subsets (80/20), reusing the
# notebook-wide `seed` so every source of randomness is controlled in one place.
train_test_split = java_dataset.train_test_split(test_size=0.2, seed=seed)

# Extract train and validation datasets.
# Note: `java_test` is the held-out 20% used for validation during training;
# the benchmark test split lives in the separate `test` variable.
java_train = train_test_split['train']
java_test = train_test_split['test']

# Category names, one per position of the multi-hot label vector.
java_labels = ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational']
data_collator = default_data_collator
num_labels = len(java_labels)


In [None]:
# Initialize global variables
# Where the best model found by the search is written.
best_model_path = "./best_model"  # Directory to store the best model
# Lowest evaluation loss seen so far; starts at +inf so any real loss beats it.
best_loss = float('inf')  # Tracks the best evaluation loss

# Start from a clean slate: discard a best-model directory left by an earlier run.
stale_model_present = os.path.exists(best_model_path)
if stale_model_present:
    shutil.rmtree(best_model_path)

# Define a function to clear a directory's contents
def clear_directory(directory):
    """Empty ``directory`` while keeping the directory itself.

    If ``directory`` does not exist it is created (including parents) and the
    function returns. Otherwise every entry inside it is deleted: files and
    symbolic links are unlinked, sub-directories are removed recursively.
    A failure on one entry is printed and does not stop the remaining deletions.
    """
    if os.path.exists(directory):
        for entry_name in os.listdir(directory):
            target = os.path.join(directory, entry_name)
            try:
                unlinkable = os.path.isfile(target) or os.path.islink(target)
                if unlinkable:
                    os.unlink(target)  # plain file or symlink
                elif os.path.isdir(target):
                    shutil.rmtree(target)  # real sub-directory
            except Exception as err:
                # Best effort: report and carry on with the next entry.
                print(f"Failed to delete {target}. Reason: {err}")
    else:
        os.makedirs(directory)

# Define the evaluation function
def evaluate(new_dataset, model, tokenizer, labels, batch_size=16, device='cuda', threshold=0.5):
    """Run batched multi-label inference and report per-label precision/recall/F1.

    Args:
        new_dataset: Mapping-like object exposing a ``'combo'`` column (input
            texts) and a ``'labels'`` column (one multi-hot vector per text).
        model: Sequence-classification model whose output has a ``.logits``
            attribute. It is moved to ``device`` and left in eval mode.
        tokenizer: Callable accepting a list of texts plus ``truncation``,
            ``padding`` and ``return_tensors`` keywords, returning
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        labels: Label names; position ``i`` names column ``i`` of the label matrix.
        batch_size: Number of texts per forward pass.
        device: Torch device used for inference.
        threshold: Probability above which a label counts as predicted.

    Returns:
        ``(metrics_df, avg_runtime)``: a DataFrame with columns ``label``,
        ``precision``, ``recall`` and ``f1`` (one row per label), and the mean
        wall-clock seconds per batch (``0.0`` when the dataset is empty). The
        timing covers the host-to-device copy and the forward pass, not
        tokenization.
    """
    # Move model to device and switch off training-only behaviour (e.g. dropout);
    # without eval() a model handed over in train mode gives noisy predictions.
    model.to(device)
    model.eval()

    # Materialise the text column as a plain list: tokenizers validate the input
    # type, and some `datasets` versions return a lazy column object instead.
    texts = list(new_dataset['combo'])
    raw_labels = new_dataset['labels']
    if not isinstance(raw_labels, (list, tuple, np.ndarray, torch.Tensor)):
        raw_labels = list(raw_labels)
    true_labels = np.array(raw_labels)

    num_batches = (len(texts) + batch_size - 1) // batch_size
    predictions = []

    if texts:
        # Tokenize once; tensors stay on the CPU and only the current batch is
        # copied to the device, so the whole dataset never has to fit in GPU memory.
        inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

    start_time = time.time()

    # Perform inference in batches
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            stop = start + batch_size
            outputs = model(
                input_ids=input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device),
            )
            preds = (outputs.logits.sigmoid() > threshold).int().cpu().numpy()
            predictions.append(preds)

    end_time = time.time()
    # Guard against an empty dataset, where there are no batches to average over.
    avg_runtime = (end_time - start_time) / num_batches if num_batches else 0.0

    if predictions:
        predictions = np.vstack(predictions)
    else:
        # Empty input: use (0, n_labels) matrices so the metric loop yields zeros.
        predictions = np.zeros((0, len(labels)), dtype=int)
        true_labels = np.zeros((0, len(labels)), dtype=int)

    # Evaluate metrics for each label
    metrics = []
    for i, label in enumerate(labels):
        tp = np.sum((true_labels[:, i] == 1) & (predictions[:, i] == 1))
        fp = np.sum((true_labels[:, i] == 0) & (predictions[:, i] == 1))
        fn = np.sum((true_labels[:, i] == 1) & (predictions[:, i] == 0))

        # Each ratio falls back to 0 when its denominator is 0.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

        metrics.append({'label': label, 'precision': precision, 'recall': recall, 'f1': f1})

    # Explicit columns keep the frame well-formed even when `labels` is empty.
    metrics_df = pd.DataFrame(metrics, columns=['label', 'precision', 'recall', 'f1'])
    average_f1 = metrics_df['f1'].mean()
    print("Average F1 on test set:", average_f1)
    return metrics_df, avg_runtime

# Load the tokenizer that belongs to the RoBERTa-large checkpoint.
MODEL_CHECKPOINT = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Define the model initialization function
def model_init():
    """Return a freshly loaded RoBERTa-large classifier configured for multi-label output.

    The classification head is sized from the module-level ``num_labels``.
    Every call loads the pretrained checkpoint again.
    """
    checkpoint = "FacebookAI/roberta-large"
    head_config = {
        "num_labels": num_labels,
        "problem_type": "multi_label_classification",
    }
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, **head_config)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the ``combo`` texts of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded to exactly 128 tokens.
    """
    fixed_length_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(examples["combo"], **fixed_length_options)

# Tokenize the training split first, then the validation split, in batched mode.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (java_train, java_test)
)

# Convert labels to tensors
def encode_labels(examples):
    """Convert one example's ``labels`` field into a float32 tensor.

    Args:
        examples: A single example (mapping) with a ``'labels'`` entry that is
            either an already-parsed sequence of numbers or a string rendering
            of a flat list, e.g. ``"[1 0 0]"`` or ``"[1, 0, 0]"``.

    Returns:
        ``{'labels': tensor}`` where ``tensor`` has dtype ``torch.float32``.
        The input mapping is not modified.
    """
    raw_labels = examples['labels']
    if isinstance(raw_labels, str):
        # Treat brackets and commas as whitespace, then parse each remaining
        # token. Blindly replacing " " with "," (the previous approach) yields
        # invalid literals such as "[1,,0]" for comma- or multi-space-separated
        # input. Only flat lists are supported.
        separators = str.maketrans("[](),", "     ")
        labels = [ast.literal_eval(token) for token in raw_labels.translate(separators).split()]
    else:
        labels = raw_labels
    # Convert labels to tensors
    labels = torch.tensor(labels, dtype=torch.float32)
    return {'labels': labels}

# Parse the label column of each split, one example at a time.
tokenized_train = tokenized_train.map(encode_labels)
tokenized_test = tokenized_test.map(encode_labels)

# Format datasets for PyTorch: expose only these columns, as torch tensors.
torch_columns = ["input_ids", "attention_mask", "labels"]
for formatted_split in (tokenized_train, tokenized_test):
    formatted_split.set_format(type="torch", columns=torch_columns)

# Define compute_metrics function for Trainer
def compute_metrics(pred):
    """Micro-averaged precision/recall/F1 for multi-label predictions.

    Args:
        pred: A pair ``(logits, labels)`` of arrays with matching shapes.

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f1"``.

    Raises:
        optuna.exceptions.TrialPruned: when the micro F1 is exactly zero.
    """
    logits, labels = pred
    print(f"Logits Shape: {logits.shape}, Labels Shape: {labels.shape}")
    # sigmoid(x) > 0.5 is equivalent to x > 0, so threshold the logits directly.
    # This avoids the overflow that `np.exp(-logits)` hits for large negative
    # logits in a hand-rolled sigmoid.
    preds = (logits > 0).astype(int)  # Threshold for multi-label classification
    # zero_division=0 keeps the "no predicted/true positives" case explicit
    # (score 0) instead of relying on the warn-and-default behaviour.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="micro", zero_division=0
    )
    if f1 == 0:
        print("F1 score is zero. Resetting the trial.")
        raise optuna.exceptions.TrialPruned()  # Prune the trial to reset it
    return {"precision": precision, "recall": recall, "f1": f1}

# Define the Optuna objective function
def optuna_objective(trial):
    """Optuna objective: fine-tune one model and return its evaluation loss.

    Samples a learning rate, weight decay and batch size from ``trial``, trains
    a fresh model for 5 epochs, and evaluates it on ``tokenized_test``. If the
    loss beats the module-level ``best_loss``, the model is saved to
    ``best_model_path`` and ``best_loss`` is updated.

    The per-trial output/log directories are always removed and cached GPU
    memory is released, even when training fails or the trial is pruned.

    Args:
        trial: The ``optuna`` trial supplying hyper-parameter suggestions.

    Returns:
        The evaluation loss of the trained model (lower is better).

    Raises:
        optuna.exceptions.TrialPruned: propagated from ``compute_metrics`` when
            an evaluation yields a zero F1 score.
    """
    global best_loss, best_model_path
    import gc
    import inspect

    # Define hyperparameter search space.
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16, 32])

    # Define unique temporary directories for each trial
    temp_output_dir = f"./temp_results_bert_base_trial_{trial.number}"
    temp_logs_dir = f"./temp_logs_bert_base_trial_{trial.number}"

    # Create temporary directories
    os.makedirs(temp_output_dir, exist_ok=True)
    os.makedirs(temp_logs_dir, exist_ok=True)

    trainer = None
    try:
        # The notebook installs transformers with --upgrade, and two keyword
        # names changed across releases. Pick whichever the installed version
        # accepts instead of hard-coding one spelling.
        args_params = inspect.signature(TrainingArguments.__init__).parameters
        eval_strategy_key = (
            "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
        )
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = (
            "processing_class" if "processing_class" in trainer_params else "tokenizer"
        )

        # Define TrainingArguments
        training_args = TrainingArguments(
            output_dir=temp_output_dir,  # Unique temporary directory for each trial
            save_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=5,
            weight_decay=weight_decay,
            logging_dir=temp_logs_dir,
            logging_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=1,  # Keep only the best checkpoint
            **{eval_strategy_key: "epoch"},
        )

        # Initialize Trainer
        trainer = Trainer(
            model_init=model_init,  # Ensures a fresh model for each trial
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test,
            compute_metrics=compute_metrics,
            data_collator=default_data_collator,
            **{tokenizer_key: tokenizer},
        )

        # Perform training
        trainer.train()

        # Evaluate on the validation set
        eval_results = trainer.evaluate()
        eval_loss = eval_results["eval_loss"]
        print(f"Trial {trial.number} - Evaluation Loss: {eval_loss}")

        # Check if this trial has the best loss so far
        if eval_loss < best_loss:
            best_loss = eval_loss
            print(f"Trial {trial.number} achieved the new best loss: {best_loss}. Saving the model.")

            # Remove existing best_model directory if it exists
            if os.path.exists(best_model_path):
                shutil.rmtree(best_model_path)

            # Save the best model through the trainer.
            trainer.save_model(best_model_path)
        else:
            print(f"Trial {trial.number} did not beat the best loss: {best_loss}.")

        return eval_loss
    finally:
        # Clean up the temporary directories to save space. This runs on the
        # failure/pruned path too; ignore_errors covers a directory that is
        # already gone.
        shutil.rmtree(temp_output_dir, ignore_errors=True)
        shutil.rmtree(temp_logs_dir, ignore_errors=True)

        # Drop the reference to this trial's trainer (and its model) and release
        # cached GPU memory so the next trial starts with as much room as possible.
        trainer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Run Optuna search. The sampler is seeded with the notebook-wide `seed` so the
# sequence of suggested hyper-parameters is reproducible; TPE is also Optuna's
# default sampler, so the search algorithm itself is unchanged.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(optuna_objective, n_trials=8)

# Display best hyperparameters
print("Best Hyperparameters:", study.best_params)
print("Best Evaluation Loss:", study.best_value)

# Reload the model that the objective saved for the lowest evaluation loss.
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
best_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

# Score the reloaded best model on the `test` split.
metrics_df, avg_runtime = evaluate(
    new_dataset=test,
    model=best_model,
    tokenizer=best_tokenizer,
    labels=java_labels,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

print("Metrics of the best model:")
print(metrics_df)
