In [24]:
import torch
import numpy as np
from datasets import load_dataset, Features, Sequence, Value
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Pretrained checkpoint that is fine-tuned separately for every language.
MODEL_NAME = "distilbert-base-uncased"

# Hugging Face Hub dataset; the script reads its "<language>_train" and
# "<language>_test" splits.
dataset_name = "NLBSE/nlbse25-code-comment-classification"

# Comment categories per language. The position of a category in its list is
# the index used for it in the label map and in the per-category metrics.
# NOTE(review): presumably this order mirrors the column order of the
# dataset's "labels" vectors -- confirm against the dataset card.
language_categories = dict(
    java=[
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    python=[
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    pharo=[
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Classreferences",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
)

# Languages are processed in the order in which they are declared above.
languages = list(language_categories)

# Evaluation metrics collected per language, keyed by language name.
all_results = {}

# The dataset object holds the splits of every language and the tokenizer
# depends only on MODEL_NAME, so both are loaded once up front instead of
# being reloaded on every loop iteration.
raw_ds = load_dataset(dataset_name)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

for lang in languages:
    # Category names of this language; their list position is the label index.
    all_labels = language_categories[lang]
    num_labels = len(all_labels)

    print(f"\n=== Processing language: {lang} ===")
    train_ds = raw_ds[f"{lang}_train"]
    test_ds = raw_ds[f"{lang}_test"]

    print("Columns in training set:", train_ds.column_names)
    print("First training example:", train_ds[0])
    print("All labels:", all_labels)

    # Category name -> index, and the reverse lookup index -> category name.
    label_map = {lbl: idx for idx, lbl in enumerate(all_labels)}
    inverse_label_map = {idx: lbl for lbl, idx in label_map.items()}

    print("label map:", label_map)

    def encode_batch(examples):
        """Tokenize a batch of comment sentences and attach float label vectors.

        Args:
            examples: Batch dictionary handed over by ``Dataset.map`` with
                ``batched=True``. The ``"comment_sentence"`` and ``"labels"``
                columns are read.

        Returns:
            A dictionary with ``"input_ids"``, ``"attention_mask"`` and
            ``"labels"``, each a plain Python list with one entry per example.
        """
        # No padding and no tensor conversion at this stage. ``map`` is called
        # with batches of 16 while training uses batches of 8, so padding to
        # the longest sentence of a map batch does not give training batches a
        # uniform length anyway; it only stores extra pad tokens. Padding is
        # left to batch-collation time.
        encodings = tokenizer(examples["comment_sentence"], truncation=True)

        # The model is built with problem_type="multi_label_classification",
        # so the targets are provided as floats rather than integers.
        labels = [[float(value) for value in row] for row in examples["labels"]]

        return {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": labels,
        }

    # The only columns the model consumes; everything else is dropped below.
    model_columns = ["input_ids", "attention_mask", "labels"]

    proc_train = train_ds.map(encode_batch, batched=True, batch_size=16)
    proc_test = test_ds.map(encode_batch, batched=True, batch_size=16)

    # Remove every column that is not a model input. Deriving the list from
    # the dataset itself (rather than hard-coding the metadata column names)
    # keeps this step working if the dataset gains or loses a metadata column.
    proc_train = proc_train.remove_columns(
        [col for col in proc_train.column_names if col not in model_columns]
    )
    proc_test = proc_test.remove_columns(
        [col for col in proc_test.column_names if col not in model_columns]
    )

    # Pin the column types explicitly: integer token ids / mask, float labels.
    features = Features({
        "input_ids": Sequence(Value("int64")),
        "attention_mask": Sequence(Value("int64")),
        "labels": Sequence(Value("float32"))
    })

    proc_train = proc_train.cast(features)
    proc_test = proc_test.cast(features)

    proc_train.set_format(type="torch", columns=model_columns)
    proc_test.set_format(type="torch", columns=model_columns)

    # Sanity output: the label vector should have one float entry per category.
    print("num_labels:", num_labels)
    print("Shape of a sample's label vector:", proc_train[0]["labels"].shape)
    print("Dtype of a sample's label vector:", proc_train[0]["labels"].dtype)

    # A fresh model per language, because the number of categories differs.
    # The label maps are stored in the model config so that predictions can be
    # reported by category name instead of generic "LABEL_<n>" placeholders.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        problem_type="multi_label_classification",
        id2label=inverse_label_map,
        label2id=label_map,
    )

    def compute_metrics(eval_pred):
        """Compute multi-label metrics from raw model outputs.

        Args:
            eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds
                the raw logits and ``labels`` the reference 0/1 indicators,
                both with one column per category of ``all_labels``.

        Returns:
            Dictionary of plain floats with ``"accuracy"``,
            ``"micro_precision"``, ``"micro_recall"``, ``"micro_f1"`` and, for
            every category, ``"<category>_precision"``, ``"<category>_recall"``
            and ``"<category>_f1"``.
        """
        predictions, labels = eval_pred

        # sigmoid(z) > 0.5 holds exactly when z > 0, so the logits are
        # thresholded directly. This avoids the overflow warnings np.exp emits
        # for large negative logits and the rounding of tiny positive logits
        # to a probability of exactly 0.5.
        preds = (np.asarray(predictions) > 0).astype(int)

        micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
            labels, preds, average='micro', zero_division=0
        )

        # average=None yields one score per label column.
        per_cat_precision, per_cat_recall, per_cat_f1, _ = precision_recall_fscore_support(
            labels, preds, average=None, zero_division=0
        )

        # Convert NumPy scalars to built-in floats so the result is a plain,
        # serialisable dictionary.
        metrics = {
            "accuracy": float(accuracy_score(labels, preds)),
            "micro_precision": float(micro_precision),
            "micro_recall": float(micro_recall),
            "micro_f1": float(micro_f1),
        }
        for cat, cat_precision, cat_recall, cat_f1 in zip(
            all_labels, per_cat_precision, per_cat_recall, per_cat_f1
        ):
            metrics[f"{cat}_precision"] = float(cat_precision)
            metrics[f"{cat}_recall"] = float(cat_recall)
            metrics[f"{cat}_f1"] = float(cat_f1)
        return metrics

    # A single epoch keeps this demonstration run short.
    # Evaluation during training stays at the TrainingArguments default
    # (disabled), so the `evaluation_strategy` keyword -- renamed to
    # `eval_strategy` in newer transformers releases -- is not passed at all.
    # This keeps the call valid across library versions. Checkpoint saving is
    # switched off explicitly.
    training_args = TrainingArguments(
        output_dir=f"./{lang}_results",
        save_strategy="no",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        learning_rate=5e-5,
        weight_decay=0.01,
        logging_dir=f"./{lang}_logs"
    )

    # NOTE(review): recent transformers versions appear to warn that the
    # `tokenizer` argument is superseded by `processing_class`; it is kept
    # here for compatibility with older releases -- confirm the target version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=proc_train,
        eval_dataset=proc_test,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print("Starting training for language:", lang)
    trainer.train()
    print("Training done for language:", lang)

    # evaluate() without arguments scores the eval_dataset given above,
    # i.e. the test split of the current language.
    print(f"Running evaluation on the test set for {lang}...")
    results = trainer.evaluate()
    print(f"Evaluation results for {lang}:", results)

    # Keep the metrics for the summary printed after the loop.
    all_results[lang] = results

# Final recap: one section per language, one indented line per metric,
# followed by a blank separator line.
print("\n=== Summary of Results for All Languages ===")
for lang in all_results:
    print(f"Language: {lang}")
    lang_metrics = all_results[lang]
    for metric_name in lang_metrics:
        metric_value = lang_metrics[metric_name]
        print(f"  {metric_name}: {metric_value}")
    print()


=== Processing language: java ===
Columns in training set: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels']
First training example: {'index': 0, 'class': 'Abfss.java', 'comment_sentence': 'azure blob file system implementation of abstractfilesystem.', 'partition': 0, 'combo': 'azure blob file system implementation of abstractfilesystem. | Abfss.java', 'labels': [1, 0, 0, 0, 0, 0, 0]}
All labels: ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational']
Label2ID: {'summary': 0, 'Ownership': 1, 'Expand': 2, 'usage': 3, 'Pointer': 4, 'deprecation': 5, 'rational': 6}


Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7614 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1725 [00:00<?, ? examples/s]

num_labels: 7
Shape of a sample's label vector: torch.Size([7])
Dtype of a sample's label vector: torch.float32


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training for language: java


  0%|          | 0/952 [00:00<?, ?it/s]

{'loss': 0.1631, 'grad_norm': 0.5765820145606995, 'learning_rate': 2.373949579831933e-05, 'epoch': 0.53}
{'train_runtime': 2473.7667, 'train_samples_per_second': 3.078, 'train_steps_per_second': 0.385, 'train_loss': 0.13865949726906143, 'epoch': 1.0}
Training done for language: java
Running evaluation on the test set for java...


  0%|          | 0/216 [00:00<?, ?it/s]

Evaluation results for java: {'eval_loss': 0.12433961778879166, 'eval_accuracy': 0.8034782608695652, 'eval_micro_precision': 0.8865519439133206, 'eval_micro_recall': 0.8008059873344847, 'eval_micro_f1': 0.8415003024803388, 'eval_summary_precision': 0.8887688984881209, 'eval_summary_recall': 0.922645739910314, 'eval_summary_f1': 0.9053905390539054, 'eval_Ownership_precision': 1.0, 'eval_Ownership_recall': 1.0, 'eval_Ownership_f1': 1.0, 'eval_Expand_precision': 0.5, 'eval_Expand_recall': 0.058823529411764705, 'eval_Expand_f1': 0.10526315789473684, 'eval_usage_precision': 0.9010695187165776, 'eval_usage_recall': 0.7819025522041764, 'eval_usage_f1': 0.8372670807453416, 'eval_Pointer_precision': 0.8848167539267016, 'eval_Pointer_recall': 0.9184782608695652, 'eval_Pointer_f1': 0.9013333333333333, 'eval_deprecation_precision': 1.0, 'eval_deprecation_recall': 0.6, 'eval_deprecation_f1': 0.75, 'eval_rational_precision': 0.16666666666666666, 'eval_rational_recall': 0.029411764705882353, 'eval_ra

Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Map:   0%|          | 0/406 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1884 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/406 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


num_labels: 5
Shape of a sample's label vector: torch.Size([5])
Dtype of a sample's label vector: torch.float32


  trainer = Trainer(


Starting training for language: python


  0%|          | 0/236 [00:00<?, ?it/s]

{'train_runtime': 170.0886, 'train_samples_per_second': 11.077, 'train_steps_per_second': 1.388, 'train_loss': 0.4555435827222921, 'epoch': 1.0}
Training done for language: python
Running evaluation on the test set for python...


  0%|          | 0/51 [00:00<?, ?it/s]

Evaluation results for python: {'eval_loss': 0.41218048334121704, 'eval_accuracy': 0.33497536945812806, 'eval_micro_precision': 0.7526881720430108, 'eval_micro_recall': 0.3211009174311927, 'eval_micro_f1': 0.45016077170418006, 'eval_Usage_precision': 0.7605633802816901, 'eval_Usage_recall': 0.4462809917355372, 'eval_Usage_f1': 0.5625, 'eval_Parameters_precision': 0.7551020408163265, 'eval_Parameters_recall': 0.2890625, 'eval_Parameters_f1': 0.4180790960451977, 'eval_DevelopmentNotes_precision': 0.0, 'eval_DevelopmentNotes_recall': 0.0, 'eval_DevelopmentNotes_f1': 0.0, 'eval_Expand_precision': 0.0, 'eval_Expand_recall': 0.0, 'eval_Expand_f1': 0.0, 'eval_Summary_precision': 0.7424242424242424, 'eval_Summary_recall': 0.5975609756097561, 'eval_Summary_f1': 0.6621621621621622, 'eval_runtime': 4.2252, 'eval_samples_per_second': 96.09, 'eval_steps_per_second': 12.07, 'epoch': 1.0}

=== Processing language: pharo ===
Columns in training set: ['index', 'class', 'comment_sentence', 'partition', 

Map:   0%|          | 0/1298 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1298 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/289 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


num_labels: 7
Shape of a sample's label vector: torch.Size([7])
Dtype of a sample's label vector: torch.float32


  trainer = Trainer(


Starting training for language: pharo


  0%|          | 0/163 [00:00<?, ?it/s]

{'train_runtime': 175.7398, 'train_samples_per_second': 7.386, 'train_steps_per_second': 0.928, 'train_loss': 0.36342822115845475, 'epoch': 1.0}
Training done for language: pharo
Running evaluation on the test set for pharo...


  0%|          | 0/37 [00:00<?, ?it/s]

Evaluation results for pharo: {'eval_loss': 0.27968451380729675, 'eval_accuracy': 0.36678200692041524, 'eval_micro_precision': 0.7371794871794872, 'eval_micro_recall': 0.38205980066445183, 'eval_micro_f1': 0.5032822757111597, 'eval_Keyimplementationpoints_precision': 0.0, 'eval_Keyimplementationpoints_recall': 0.0, 'eval_Keyimplementationpoints_f1': 0.0, 'eval_Example_precision': 0.732824427480916, 'eval_Example_recall': 0.8067226890756303, 'eval_Example_f1': 0.768, 'eval_Responsibilities_precision': 0.5384615384615384, 'eval_Responsibilities_recall': 0.1346153846153846, 'eval_Responsibilities_f1': 0.2153846153846154, 'eval_Classreferences_precision': 0.0, 'eval_Classreferences_recall': 0.0, 'eval_Classreferences_f1': 0.0, 'eval_Intent_precision': 1.0, 'eval_Intent_recall': 0.4, 'eval_Intent_f1': 0.5714285714285714, 'eval_Keymessages_precision': 0.0, 'eval_Keymessages_recall': 0.0, 'eval_Keymessages_f1': 0.0, 'eval_Collaborators_precision': 0.0, 'eval_Collaborators_recall': 0.0, 'eval_