In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed
import torch
import numpy as np
import pandas as pd
import time
from datasets import load_dataset
from kaggle_secrets import UserSecretsClient
# Ask cuDNN for deterministic algorithms and turn off its benchmark-based
# autotuner, so repeated runs of this notebook are comparable.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from huggingface_hub import login
from torch.profiler import profile, ProfilerActivity


# Set seed for reproducibility
set_seed(42)

# Huggingface login for token:
# fetch the secret stored under the name "Huggingface" in Kaggle's secret
# store and pass it to huggingface_hub.login() as the access token.
user_secrets = UserSecretsClient()
Huggingface = user_secrets.get_secret("Huggingface")
login(token=Huggingface)

def _precision_recall_f1(true_col, pred_col):
    """Return ``(precision, recall, f1)`` for one column of 0/1 labels.

    Any ratio whose denominator is zero is reported as 0.0.
    """
    tp = np.sum((true_col == 1) & (pred_col == 1))
    fp = np.sum((true_col == 0) & (pred_col == 1))
    fn = np.sum((true_col == 1) & (pred_col == 0))

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1


def evaluate_roberta(metrics,new_dataset, model, tokenizer, labels, language, batch_size=16, device='cuda'):
    """Run multi-label inference on a dataset and record quality and cost.

    The texts in ``new_dataset['combo']`` are tokenized in one call (so every
    batch is padded to the same length), pushed through ``model`` in batches
    of ``batch_size``, and thresholded at ``sigmoid(logit) > 0.5``.  Each
    batch runs under ``torch.profiler.profile(with_flops=True)``; the wall
    clock is read inside that profiler context, so the reported latency
    includes whatever overhead the profiler adds.

    Args:
        metrics: List that receives one dict per label with the keys
            ``language``, ``label``, ``precision``, ``recall`` and ``f1``.
            It is mutated in place.
        new_dataset: Mapping-like object with a ``'combo'`` entry (the input
            texts) and a ``'labels'`` entry.  The labels are indexed as
            ``[:, i]`` and compared against 0 and 1, so they must form a 2-D
            0/1 matrix with one column per entry of ``labels``.
        model: Model callable as ``model(input_ids=..., attention_mask=...)``
            whose result has a ``.logits`` attribute.  It is moved to
            ``device`` and switched to eval mode for the duration of the
            call; its previous train/eval mode is restored afterwards.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors when invoked with ``return_tensors="pt"``.
        labels: Label names, in the same order as the label columns.
        language: Value stored under ``'language'`` in every metrics row.
        batch_size: Number of samples per forward pass (must be >= 1).
        device: Torch device used for the model and the inputs.

    Returns:
        Tuple ``(average_time, average_GFLOPs)``: seconds per sample and
        profiler-reported GFLOPs per sample.

    Raises:
        ValueError: If the dataset is empty or ``batch_size`` is below 1.
    """
    # Prepare data
    texts = new_dataset['combo']
    num_texts = len(texts)
    # Fail early with a clear message instead of an opaque np.vstack /
    # division error further down.
    if num_texts == 0:
        raise ValueError("evaluate_roberta() needs a non-empty dataset")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    true_labels = np.array(new_dataset['labels'])

    model.to(device)

    # Tokenize the inputs
    inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Only request CUDA activity when a GPU is actually present; built once
    # because it does not change between batches.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    predictions = []
    total_time = 0.0
    total_flops = 0  # To accumulate FLOPs

    # Dropout and similar layers must be inactive during evaluation; remember
    # the caller's mode so it can be put back afterwards.
    was_training = model.training
    model.eval()
    try:
        # Perform inference in batches with FLOPs profiling
        for start in range(0, num_texts, batch_size):
            stop = start + batch_size
            batch_input_ids = input_ids[start:stop]
            batch_attention_mask = attention_mask[start:stop]

            with profile(activities=activities, with_flops=True) as p:
                with torch.no_grad():
                    begin = time.perf_counter()
                    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
                    logits = outputs.logits
                    # The copy to host memory happens before the clock is
                    # read again, so device work is part of the measurement.
                    preds = (logits.sigmoid() > 0.5).int().cpu().numpy()
                    total_time += time.perf_counter() - begin

            # Accumulate FLOPs for the batch
            total_flops += sum(k.flops for k in p.key_averages() if k.flops is not None)
            predictions.append(preds)
    finally:
        model.train(was_training)

    # Concatenate predictions
    predictions = np.vstack(predictions)

    # Evaluate metrics for each label
    for i, label in enumerate(labels):
        precision, recall, f1 = _precision_recall_f1(true_labels[:, i], predictions[:, i])
        metrics.append({'language': language, 'label': label, 'precision': precision, 'recall': recall, 'f1': f1})

    # Calculate average time and GFLOPs
    average_time = total_time / num_texts
    print("Avg runtime in seconds:", average_time)
    average_GFLOPs = (total_flops / 1e9) / num_texts
    print("Average GFLOPs:", average_GFLOPs)
    return average_time,average_GFLOPs


# Languages to evaluate and, for each one, the category names in the order
# used for the columns of that language's label matrix.
langs = ['java', 'python', 'pharo']
labels = {
    'java': [
        'summary', 'Ownership', 'Expand', 'usage',
        'Pointer', 'deprecation', 'rational',
    ],
    'python': [
        'Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary',
    ],
    'pharo': [
        'Keyimplementationpoints', 'Example', 'Responsibilities',
        'Classreferences', 'Intent', 'Keymessages', 'Collaborators',
    ],
}

ds = load_dataset('NLBSE/nlbse25-code-comment-classification')

# One metrics row per (language, label) is appended by evaluate_roberta.
# The two cost figures are summed over the languages, not averaged again.
metrics = []
average_time = 0
average_GFLOPS = 0
for lan in langs:
    # Tokenizer and classifier come from the same per-language checkpoint.
    checkpoint = f"MushfiqurRR/NLBSE-{lan.capitalize()}-final"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    test_data = ds[f'{lan}_test']
    labels_data = labels[lan]

    A_T, A_GF = evaluate_roberta(metrics, test_data, model, tokenizer, labels_data, lan)
    average_time += A_T
    average_GFLOPS += A_GF

    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()

# Turn the collected rows into a table and take the unweighted mean F1
# across every (language, label) pair.
metrics = pd.DataFrame(metrics).reset_index(drop=True)
print(metrics)
avg_f1 = metrics['f1'].mean()
# Upper bounds used to normalise the two cost terms of the score.
# NOTE(review): the caller passes GFLOPs, so 5000 is presumably in GFLOPs
# and 5 in seconds - confirm against the scoring rules.
max_avg_flops = 5000
max_avg_runtime = 5


def score(avg_f1, avg_runtime, avg_flops):
    """Combine quality and cost into a single number.

    The result is ``0.6 * avg_f1`` plus 0.2 times the unused fraction of the
    runtime budget plus 0.2 times the unused fraction of the FLOPs budget,
    where the budgets are the module-level ``max_avg_runtime`` and
    ``max_avg_flops``.  A cost above its budget makes that term negative.
    """
    runtime_headroom = (max_avg_runtime - avg_runtime) / max_avg_runtime
    flops_headroom = (max_avg_flops - avg_flops) / max_avg_flops
    return 0.6 * avg_f1 + 0.2 * runtime_headroom + 0.2 * flops_headroom
# Print the combined score for the whole run, rounded to two decimals.
print(round(score(avg_f1, average_time, average_GFLOPS), 2))


Avg runtime in seconds: 0.0866441339686297
Average GFLOPs: 309.265448448
Avg runtime in seconds: 0.005903605169850617
Average GFLOPs: 22.351215727
Avg runtime in seconds: 0.010901392537417296
Average GFLOPs: 33.223759013
   language                    label  precision    recall        f1
0      java                  summary   0.904488  0.881166  0.892675
1      java                Ownership   1.000000  1.000000  1.000000
2      java                   Expand   0.439252  0.460784  0.449761
3      java                    usage   0.921951  0.877030  0.898930
4      java                  Pointer   0.806452  0.951087  0.872818
5      java              deprecation   0.818182  0.600000  0.692308
6      java                 rational   0.268293  0.323529  0.293333
7    python                    Usage   0.793388  0.793388  0.793388
8    python               Parameters   0.852459  0.812500  0.832000
9    python         DevelopmentNotes   0.428571  0.292683  0.347826
10   python                   E