In [None]:
%pip install --upgrade datasets
%pip install --upgrade transformers
%pip install --upgrade optuna

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import Dataset,load_dataset
import ast
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import default_data_collator
import torch
import optuna
import shutil
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the dataset CSV from the Kaggle input directory, then print a summary
# of its columns, dtypes and non-null counts.
df = pd.read_csv(
    "/kaggle/input/nlbse-25-dataset/NLBSE_Dataset_Python.csv"
)
df.info()

In [None]:
# Authenticate with Weights & Biases. The API key is read from the Kaggle
# secret named "Wandb", so this cell only works inside a Kaggle environment.
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Wandb")
wandb.login(key=secret_value_0)

# Preview the first rows. This must be the last expression of the cell:
# Jupyter only renders the value of a cell's final expression, so a bare
# `df.head(10)` placed earlier in the cell is silently discarded.
df.head(10)

In [None]:
# Keep only the first occurrence of each distinct comment sentence.
df.drop_duplicates(subset=['comment_sentence'], keep='first', inplace=True)
df.info()

# Show the rows that have no comment sentence at all.
null_rows = df.loc[df['comment_sentence'].isna()]
print("Rows with null values in 'comment_sentence':", null_rows, sep="\n")

In [None]:
# Drop every row whose comment sentence is missing, then report what remains.
df_cleaned = df.dropna(axis=0, subset=['comment_sentence'])
print("DataFrame shape after removing nulls:", df_cleaned.shape)
df_cleaned.info()

In [None]:
# Regex matching a literal '*' or a '//' sequence.
pattern = r'\*|//'

# Flag rows in which any column, compared as text, contains the pattern.
# The scan runs column by column through pandas' string methods rather than
# calling a Python lambda once per row; the resulting boolean mask is the same.
rows_with_pattern = (
    df_cleaned.astype(str)
    .apply(lambda column: column.str.contains(pattern, regex=True))
    .any(axis=1)
)

# Count rows with patterns
num_rows_with_pattern = rows_with_pattern.sum()
print(f"\nNumber of rows containing patterns: {num_rows_with_pattern}")

# Remove `//` or `*` from all columns
df_cleaned = df_cleaned.replace(pattern, '', regex=True)

In [None]:
# Continue with the cleaned frame and build the model input text:
# the comment sentence and its class name joined by "  |  ".
df = df_cleaned
df['combo'] = df['comment_sentence'] + "  |  " + df['class']

# Wrap the frame in a Hugging Face Dataset and hold out 20% of it (fixed seed).
python_dataset = Dataset.from_pandas(df)
train_test_split = python_dataset.train_test_split(test_size=0.2, seed=42)
python_train, python_test = train_test_split['train'], train_test_split['test']

# Names of the label categories.
python_labels = ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary']

In [None]:
# One classifier output per label name.
num_labels = len(python_labels)
# Batches are assembled with Hugging Face's default data collator.
data_collator = default_data_collator

# Load model
def model_init():
    """Build a fresh multi-label sequence classifier and move it to `device`.

    Loads the "google-bert/bert-large-uncased" checkpoint with a classification
    head of `num_labels` outputs, configured for multi-label classification.

    Returns:
        The newly loaded model, placed on `device`.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        "google-bert/bert-large-uncased",
        problem_type="multi_label_classification",
        num_labels=num_labels,
    )
    return classifier.to(device)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")


# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the `combo` text of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: batch of examples containing a "combo" entry.

    Returns:
        The tokenizer's output for the batch.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["combo"], **encoding_options)


# Apply the tokenizer to both splits, in batches (train first, then test).
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (python_train, python_test)
)

# Convert labels to tensors
def encode_labels(examples):
    """Convert one example's `labels` field into a float32 tensor.

    The field may be a string such as "[0 1 0 0 1]" or "[0, 1, 0, 0, 1]",
    or an already-parsed sequence of numbers.

    Args:
        examples: a single example with a "labels" entry. It is not modified.

    Returns:
        dict with one key, "labels", holding a float32 tensor.

    Raises:
        ValueError: if the string contains a token that is not a number.
    """
    raw_labels = examples['labels']
    if isinstance(raw_labels, str):
        # Split on brackets, commas and any run of whitespace. Substituting a
        # comma for every single space would turn "[0, 1]" into "[0,,1]" and
        # "[0  1]" into "[0,,1]", neither of which can be parsed.
        tokens = (
            raw_labels.replace('[', ' ')
            .replace(']', ' ')
            .replace(',', ' ')
            .split()
        )
        labels = [float(token) for token in tokens]
    else:
        labels = raw_labels
    # Convert labels to tensors
    labels = torch.tensor(labels, dtype=torch.float32)
    return {'labels': labels}
    
# Parse the label field of every example in both splits.
tokenized_train = tokenized_train.map(encode_labels)
tokenized_test = tokenized_test.map(encode_labels)

# Format datasets for PyTorch: expose only the model inputs and the labels.
torch_columns = ["input_ids", "attention_mask", "labels"]
for split in (tokenized_train, tokenized_test):
    split.set_format(type="torch", columns=torch_columns)


# train_dataloader = DataLoader(tokenized_train, batch_size=32, shuffle=True)
# test_dataloader = DataLoader(tokenized_test, batch_size=32, shuffle=False)
def clear_directory(directory):
    """Delete everything inside `directory`, keeping the directory itself.

    Files and symbolic links are unlinked; sub-directories are removed
    recursively. A failure on one entry is printed and the remaining entries
    are still processed.

    Args:
        directory: path of an existing directory to empty.
    """
    for entry in os.listdir(directory):
        file_path = os.path.join(directory, entry)
        try:
            if os.path.islink(file_path) or os.path.isfile(file_path):
                # Plain file or symlink: unlink it (a link's target is untouched).
                os.unlink(file_path)
                continue
            if os.path.isdir(file_path):
                # Real sub-directory: remove it with all of its contents.
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

# Define evaluation metrics
def compute_metrics(pred):
    """Compute micro-averaged precision, recall and F1 for multi-label output.

    Args:
        pred: a pair that unpacks into (logits, labels).

    Returns:
        dict with the keys "precision", "recall" and "f1".
    """
    logits, labels = pred
    print(f"Logits Shape: {logits.shape}, Labels Shape: {labels.shape}")
    # The logistic sigmoid turns each logit into a probability.
    probabilities = 1 / (1 + np.exp(-logits))
    # A label counts as predicted when its probability is above 0.5.
    predictions = (probabilities > 0.5).astype(int)
    scores = precision_recall_fscore_support(labels, predictions, average="micro")
    # `scores` is (precision, recall, f1, support); zip drops the fourth item.
    return dict(zip(("precision", "recall", "f1"), scores))
    
def optuna_objective(trial):
    """Optuna objective: fine-tune one model with the trial's hyperparameters.

    Samples a learning rate, weight decay and batch size from `trial`, trains
    for 8 epochs with evaluation and checkpointing after every epoch, reloads
    the checkpoint with the lowest evaluation loss and evaluates it on
    `tokenized_test`.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The "eval_loss" value of the final evaluation (the study minimizes it).
    """
    import inspect  # only needed here, to probe the installed transformers API

    output_dir = "./temp_results_bert_large"  # Temporary directory for checkpoints
    logging_dir = "./temp_logs_bert_large"

    # Start every trial with empty scratch directories. These have to be the
    # same paths that are handed to TrainingArguments below; clearing any
    # other directory leaves the checkpoints of earlier trials on disk.
    for scratch_dir in (output_dir, logging_dir):
        if os.path.exists(scratch_dir):
            clear_directory(scratch_dir)
        else:
            os.makedirs(scratch_dir)

    # Define hyperparameter search space
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16, 32])

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy` and Trainer's `tokenizer` argument to `processing_class`.
    # The installed version is not pinned, so use whichever name it accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_kwarg = (
        "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
    )
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    # Initialize Trainer with current trial parameters
    training_args = TrainingArguments(
        output_dir=output_dir,
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=8,
        weight_decay=weight_decay,
        logging_dir=logging_dir,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=3,  # Keep at most three checkpoints on disk
        **{eval_strategy_kwarg: "epoch"},
    )

    trainer = Trainer(
        model_init=model_init,  # This makes sure we load the base model at each trial
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    # Perform training
    trainer.train()
    # Evaluate the reloaded best checkpoint on the held-out split
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)
    eval_loss = eval_results["eval_loss"]
    # Optuna will minimize this
    return eval_loss

# Run Optuna search
# Seed the sampler so that the sequence of suggested hyperparameters is the
# same on every run (42 matches the seed used for the train/test split).
# TPESampler is the sampler Optuna would pick by default for this study.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(optuna_objective, n_trials=16)

# Display best hyperparameters
print("Best Hyperparameters:", study.best_params)
print("Best Evaluation Loss:", study.best_value)