In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import Dataset,load_dataset
import ast
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import default_data_collator,TrainerCallback,set_seed
import torch
seed = 42
set_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlbse-datasets/NLBSE_Dataset_Java.csv
/kaggle/input/nlbse-datasets/NLBSE_Dataset_Python.csv
/kaggle/input/nlbse-datasets/NLBSE_Dataset_Pharo.csv
Using device: cuda


In [2]:
df = pd.read_csv('/kaggle/input/nlbse-datasets/NLBSE_Dataset_Python.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10170 entries, 0 to 10169
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             10170 non-null  object
 1   comment_sentence  10169 non-null  object
 2   labels            10170 non-null  object
 3   types             10170 non-null  object
dtypes: object(4)
memory usage: 317.9+ KB


In [4]:
df.head(10)
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Wandb")
import wandb

# Replace YOUR_API_KEY with your actual API key
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
df.drop_duplicates(subset=['comment_sentence'], keep='first', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8176 entries, 0 to 10168
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             8176 non-null   object
 1   comment_sentence  8175 non-null   object
 2   labels            8176 non-null   object
 3   types             8176 non-null   object
dtypes: object(4)
memory usage: 319.4+ KB


In [6]:
null_rows = df[df['comment_sentence'].isnull()]

print("Rows with null values in 'comment_sentence':")
print(null_rows)

Rows with null values in 'comment_sentence':
               class comment_sentence       labels types
9172  MultiValueDict              NaN  [1 0 0 0 0]    ru


In [7]:
df_cleaned = df.dropna(subset=['comment_sentence'])
print("DataFrame shape after removing nulls:", df_cleaned.shape)
df_cleaned.info()

DataFrame shape after removing nulls: (8175, 4)
<class 'pandas.core.frame.DataFrame'>
Index: 8175 entries, 0 to 10168
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             8175 non-null   object
 1   comment_sentence  8175 non-null   object
 2   labels            8175 non-null   object
 3   types             8175 non-null   object
dtypes: object(4)
memory usage: 319.3+ KB


In [8]:
pattern = r'https?://\S+|\t'
rows_with_pattern = df_cleaned.apply(lambda row: row.astype(str).str.contains(pattern).any(), axis=1)

# Count rows with patterns
num_rows_with_pattern = rows_with_pattern.sum()
print(f"\nNumber of rows containing patterns: {num_rows_with_pattern}")

# Remove `//` or `*` from all columns
df_cleaned = df_cleaned.replace(pattern, '', regex=True)


Number of rows containing patterns: 4


In [9]:
df = df_cleaned
df['combo'] = df['comment_sentence'] +"  |  "+  df['class']
python_dataset = Dataset.from_pandas(df)
# Split the dataset into train and validation subsets
train_test_split = python_dataset.train_test_split(test_size=0.2, seed=42)

# Extract train and validation datasets
python_train = train_test_split['train']
python_test = train_test_split['test']
python_labels =['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary']

In [10]:
# Use Hugging Face's default data collator
data_collator = default_data_collator
num_labels = len(python_labels)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["combo"], truncation=True, padding="max_length", max_length=128)
tokenized_train = python_train.map(tokenize_function, batched=True)
tokenized_test = python_test.map(tokenize_function, batched=True)

# Convert labels to tensors
def encode_labels(examples):
    if isinstance(examples['labels'], str):
        examples["labels"]=examples["labels"].replace(" ", ",")
        labels = ast.literal_eval(examples['labels'])
    else:
        labels = examples['labels']
    # Convert labels to tensors
    labels = torch.tensor(labels, dtype=torch.float32)
    return {'labels': labels}
tokenized_train = tokenized_train.map(encode_labels)
tokenized_test = tokenized_test.map(encode_labels)

# Format datasets for PyTorch
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=num_labels,
    problem_type="multi_label_classification"
).to(device)

# Define evaluation metrics
def compute_metrics(pred):
    logits, labels = pred
    preds = (logits > 0.5).astype(int)  # Multi-label threshold
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    return {"precision": precision, "recall": recall, "f1": f1}


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1.959052459030173e-05,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=24,
    weight_decay= 0.08787938306842205,
    logging_dir="./logs1",
    logging_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_loss",  # Use validation loss to select the best model
    greater_is_better=False,  # Lower validation loss is better
)


# Custom callback to monitor F1 score and restart training if F1 is zero
class RestartIfF1ZeroCallback(TrainerCallback):
    def __init__(self, trainer, train_dataset, eval_dataset, tokenizer, data_collator):
        self.trainer = trainer
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.tokenizer = tokenizer
        self.data_collator = data_collator

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # Check F1 score
        if "eval_f1" in metrics and metrics["eval_f1"] == 0.0:
            print("F1 score is zero. Restarting training...")
            
            # Reinitialize model
            self.trainer.model = AutoModelForSequenceClassification.from_pretrained(
                "roberta-base",
                num_labels=num_labels,
                problem_type="multi_label_classification"
            ).to(device)

            # Restart training
            self.trainer.train_dataset = self.train_dataset
            self.trainer.eval_dataset = self.eval_dataset
            self.trainer.tokenizer = self.tokenizer
            self.trainer.data_collator = self.data_collator

            self.trainer.train()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Add custom callback
trainer.add_callback(RestartIfF1ZeroCallback(trainer, tokenized_train, tokenized_test, tokenizer, data_collator))

# Train the model
trainer.train()

# Save the model
trainer.save_model("./roberta-python-multi-label")
tokenizer.save_pretrained("./roberta-python-multi-label")

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/6540 [00:00<?, ? examples/s]

Map:   0%|          | 0/1635 [00:00<?, ? examples/s]

Map:   0%|          | 0/6540 [00:00<?, ? examples/s]

Map:   0%|          | 0/1635 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mmathparenaai[0m ([33mmathparenaai-bangladesh-university-of-engineering-and-te[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241210_180627-vf0a4840[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./results1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/mathparenaai-bangladesh-university-of-engineering-and-te/huggingface[0

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.427,0.287109,0.832049,0.607424,0.702211
2,0.2564,0.197557,0.877843,0.759843,0.814591
3,0.1611,0.17882,0.890177,0.820585,0.853965
4,0.1043,0.100738,0.939858,0.896513,0.917674
5,0.0693,0.09248,0.951688,0.919573,0.935355
6,0.0514,0.078151,0.959054,0.935321,0.947039
7,0.0365,0.074806,0.955556,0.943195,0.949335
8,0.0269,0.065833,0.964612,0.950506,0.957507
9,0.0228,0.082467,0.957931,0.947694,0.952785
10,0.0153,0.066911,0.967503,0.954443,0.960929


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

Evaluation Results: {'eval_loss': 0.056582484394311905, 'eval_precision': 0.9768883878241262, 'eval_recall': 0.9746906636670416, 'eval_f1': 0.9757882882882882, 'eval_runtime': 22.1709, 'eval_samples_per_second': 73.745, 'eval_steps_per_second': 1.173, 'epoch': 24.0}


In [11]:
import numpy as np
import pandas as pd
import time
import torch

def evaluate_roberta(new_dataset, model, tokenizer, labels, batch_size=16, device='cuda'):
    """
    Evaluate a fine-tuned RoBERTa-large model on a new dataset.

    Args:
        new_dataset (dict): A dictionary with 'combo' (input texts) and 'labels' (one-hot encoded labels).
        model (transformers.PreTrainedModel): A trained RoBERTa model.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer corresponding to the model.
        labels (list): List of label categories.
        batch_size (int): Batch size for evaluation.
        device (str): Device to use ('cuda' or 'cpu').

    Returns:
        pd.DataFrame: DataFrame containing precision, recall, and F1-score for each label.
        float: Average runtime per batch.
    """
    # Move model to device
    model.to(device)

    # Prepare data
    texts = new_dataset['combo']
    true_labels = np.array(new_dataset['labels'])

    # Tokenize the inputs
    inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Split into batches
    num_batches = (len(texts) + batch_size - 1) // batch_size
    predictions = []
    start_time = time.time()

    # Perform inference in batches
    for i in range(num_batches):
        batch_input_ids = input_ids[i * batch_size: (i + 1) * batch_size]
        batch_attention_mask = attention_mask[i * batch_size: (i + 1) * batch_size]

        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            logits = outputs.logits
            preds = (logits.sigmoid() > 0.5).int().cpu().numpy()
            predictions.append(preds)
            
    end_time = time.time()
    avg_runtime = (end_time - start_time) / num_batches

    # Concatenate predictions
    predictions = np.vstack(predictions)

    # Evaluate metrics for each label
    metrics = []
    for i, label in enumerate(labels):
        tp = np.sum((true_labels[:, i] == 1) & (predictions[:, i] == 1))
        fp = np.sum((true_labels[:, i] == 0) & (predictions[:, i] == 1))
        fn = np.sum((true_labels[:, i] == 1) & (predictions[:, i] == 0))
        tn = np.sum((true_labels[:, i] == 0) & (predictions[:, i] == 0))

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

        metrics.append({'label': label, 'precision': precision, 'recall': recall, 'f1': f1})

    # Convert metrics to DataFrame
    metrics_df = pd.DataFrame(metrics)
    return metrics_df, avg_runtime

In [12]:
import re
# Use the model and tokenizer from the training session
trained_model = model  # Replace with your trained model instance
trained_tokenizer = tokenizer  # Replace with your tokenizer instance
test = load_dataset('NLBSE/nlbse25-code-comment-classification')['python_test']
# def clean_text(example):
#     if "combo" in example:  # Replace "text" with the relevant column name
#         example["combo"] = re.sub(r'https?://\S+|\t', '', example["combo"], flags=re.MULTILINE)
#         example["combo"] = example["combo"].strip()  # Remove leading/trailing spaces
#     return example
# test = test.map(clean_text)
# Define label names
labels =  ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary']

# Evaluate the model
metrics, avg_runtime = evaluate_roberta(test, trained_model, trained_tokenizer, labels)

print("Evaluation Metrics:")
print(metrics)
print(f"Average runtime per batch: {avg_runtime:.4f} seconds")
from huggingface_hub import login

# Retrieve API token from Kaggle secrets
import os
huggingface_token = user_secrets.get_secret("Huggingface")

login(token=huggingface_token)
trained_model.push_to_hub("NLBSE-Python-final")
trained_tokenizer.push_to_hub("NLBSE-Python-final")

README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

Evaluation Metrics:
              label  precision    recall        f1
0             Usage   0.793388  0.793388  0.793388
1        Parameters   0.852459  0.812500  0.832000
2  DevelopmentNotes   0.428571  0.292683  0.347826
3            Expand   0.683333  0.640625  0.661290
4           Summary   0.688172  0.780488  0.731429
Average runtime per batch: 0.1064 seconds


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MushfiqurRR/NLBSE-Python-final/commit/3b4ec0480ee4cc68232fbd2927ed778920fb3aac', commit_message='Upload tokenizer', commit_description='', oid='3b4ec0480ee4cc68232fbd2927ed778920fb3aac', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MushfiqurRR/NLBSE-Python-final', endpoint='https://huggingface.co', repo_type='model', repo_id='MushfiqurRR/NLBSE-Python-final'), pr_revision=None, pr_num=None)