In [1]:
%pip install datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed
import torch
import numpy as np
import pandas as pd
import time
from datasets import load_dataset
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from huggingface_hub import login
from torch.profiler import profile, ProfilerActivity

set_seed(42)


def _label_scores(true_col, pred_col):
    """Return (precision, recall, f1) for one binary label column.

    Each ratio falls back to 0 when its denominator is zero.
    """
    tp = np.sum((true_col == 1) & (pred_col == 1))
    fp = np.sum((true_col == 0) & (pred_col == 1))
    fn = np.sum((true_col == 1) & (pred_col == 0))

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


def evaluate_roberta(metrics, new_dataset, model, tokenizer, labels, language,
                     batch_size=32, device='cuda', num_runs=10):
    """Benchmark a multi-label classifier and append per-label scores to ``metrics``.

    The whole ``new_dataset['combo']`` column is tokenized once, then the model
    is run over it in batches ``num_runs`` times. Every batch is executed under
    ``torch.profiler`` so that FLOPs can be summed, and is timed around the
    forward pass, the sigmoid/0.5 thresholding and the copy back to the host.

    Args:
        metrics: List that receives one dict per label with the keys
            ``language``, ``label``, ``precision``, ``recall`` and ``f1``.
            It is mutated in place.
        new_dataset: Mapping-like object exposing a ``'combo'`` column (texts)
            and a ``'labels'`` column (one row of 0/1 flags per sample).
        model: Model callable as ``model(input_ids=..., attention_mask=...)``
            whose result has a ``.logits`` attribute.
        tokenizer: Callable used as
            ``tokenizer(texts, truncation=True, padding=True, return_tensors="pt")``.
        labels: Label names; position ``i`` is matched with column ``i`` of the
            ground truth and of the predictions.
        language: Value stored under ``'language'`` in every appended dict.
        batch_size: Number of samples per forward pass.
        device: Device the model and the tokenized inputs are moved to.
        num_runs: How many times the full dataset is evaluated; time and FLOPs
            are averaged over these runs.

    Returns:
        Tuple ``(average_time, average_GFLOPs)``: the mean per-run time in
        seconds and the mean per-run profiler FLOP count divided by 1e9.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    model.to(device)
    model.eval()

    # Prepare data
    texts = new_dataset['combo']
    true_labels = np.array(new_dataset['labels'])  # indexed below as [sample, label]

    # Tokenize everything up front so tokenization is not part of the timing.
    encoded = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    batch_starts = range(0, len(texts), batch_size)

    # Only request CUDA activity when it can actually be collected.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    total_time_across_runs = 0.0
    total_flops_across_runs = 0.0
    final_predictions = None

    for _ in range(num_runs):
        run_predictions = []
        for start in batch_starts:
            stop = start + batch_size
            batch_input_ids = input_ids[start:stop]
            batch_attention_mask = attention_mask[start:stop]

            # Profile *each batch* within this run. The timed region sits
            # inside the profiler context, so it includes whatever overhead
            # the profiler adds to the forward pass.
            with profile(activities=activities, with_flops=True) as prof:
                with torch.no_grad():
                    started = time.perf_counter()
                    outputs = model(input_ids=batch_input_ids,
                                    attention_mask=batch_attention_mask)
                    # .cpu() copies the result to the host before the clock stops.
                    preds = (outputs.logits.sigmoid() > 0.5).int().cpu().numpy()
                    total_time_across_runs += time.perf_counter() - started

            total_flops_across_runs += sum(
                event.flops for event in prof.key_averages() if event.flops is not None
            )
            run_predictions.append(preds)

        # Metrics are computed from the predictions of the last completed run.
        final_predictions = np.vstack(run_predictions)

    # Average runtime and FLOPs over all runs
    average_time = total_time_across_runs / num_runs
    average_flops = total_flops_across_runs / num_runs

    for i, label in enumerate(labels):
        precision, recall, f1 = _label_scores(true_labels[:, i], final_predictions[:, i])
        metrics.append({
            'language': language,
            'label': label,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    average_GFLOPs = average_flops / 1e9

    return average_time, average_GFLOPs

# Languages to evaluate and, per language, the label names handed to
# evaluate_roberta (which pairs name i with label column i).
langs = ['java', 'python', 'pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}

ds = load_dataset('NLBSE/nlbse25-code-comment-classification')

# Per-label result rows from every language end up in this one list.
metrics = []
# Despite the names, these hold the SUM over languages of each language's
# per-run average time / GFLOPs.
average_time = 0
average_GFLOPS = 0

for lan in langs:
    # Tokenizer and model are loaded from the same per-language checkpoint.
    checkpoint = f"MushfiqurRR/NLBSE-{lan.capitalize()}-final"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    test_data = ds[f'{lan}_test']
    labels_data = labels[lan]

    lang_time, lang_gflops = evaluate_roberta(
        metrics, test_data, model, tokenizer, labels_data, lan
    )
    average_time += lang_time
    average_GFLOPS += lang_gflops

    torch.cuda.empty_cache()

# Turn the collected rows into a table and report the aggregate figures.
metrics = pd.DataFrame(metrics)
metrics.reset_index(drop=True, inplace=True)
print(metrics)

avg_f1 = metrics['f1'].mean()
print("f1", avg_f1)
print("average time", average_time)
print("average gflops", average_GFLOPS)
# Upper bounds used by score() to normalise the FLOPs and runtime terms.
max_avg_flops = 5000
max_avg_runtime = 5


def score(avg_f1, avg_runtime, avg_flops):
    """Combine F1, runtime and FLOPs into a single weighted score.

    F1 contributes with weight 0.6. Runtime and FLOPs each contribute with
    weight 0.2 through the fraction of their bound (``max_avg_runtime`` /
    ``max_avg_flops``) left unused, clamped at 0 once the bound is exceeded.
    """
    runtime_headroom = (max_avg_runtime - avg_runtime) / max_avg_runtime
    flops_headroom = (max_avg_flops - avg_flops) / max_avg_flops

    f1_term = 0.6 * avg_f1
    runtime_term = 0.2 * max(0, runtime_headroom)
    flops_term = 0.2 * max(0, flops_headroom)

    return f1_term + runtime_term + flops_term
# Report the combined score rounded to two decimals.
final_score = score(avg_f1, average_time, average_GFLOPS)
print(round(final_score, 2))

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

   language                    label  precision    recall        f1
0      java                  summary   0.904488  0.881166  0.892675
1      java                Ownership   1.000000  1.000000  1.000000
2      java                   Expand   0.439252  0.460784  0.449761
3      java                    usage   0.921951  0.877030  0.898930
4      java                  Pointer   0.806452  0.951087  0.872818
5      java              deprecation   0.818182  0.600000  0.692308
6      java                 rational   0.268293  0.323529  0.293333
7    python                    Usage   0.793388  0.793388  0.793388
8    python               Parameters   0.852459  0.812500  0.832000
9    python         DevelopmentNotes   0.428571  0.292683  0.347826
10   python                   Expand   0.683333  0.640625  0.661290
11   python                  Summary   0.688172  0.780488  0.731429
12    pharo  Keyimplementationpoints   0.733333  0.511628  0.602740
13    pharo                  Example   0.921739 