In [2]:
!pip install transformers datasets nltk rouge-score sacrebleu sentence-transformers sentencepiece fsspec==2025.3.2 bert-score --quiet
!pip install indic-nlp-library camel-tools

  Building wheel for camel-kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for camel-kenlm: filename=camel_kenlm-2025.4.8-cp311-cp311-linux_x86_64.whl size=3455544 sha256=8e429ca5fa3f1e3161ed885fad5bc1d69136273ccc6cb509e363fd6650d55c8a
  Stored in directory: /root/.cache/pip/wheels/19/b9/62/8559aee1915ae6690fcc902a972a9ba0ff46d3ee67fea2aa44
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=ea14e4c73188159e834127af8ce8bdeb393496ae9e0ee1d494c1fb96a6ac88d5
  Stored in directory: /root/.cache/pip/wheels/1a/b0/8c/4b75c4116c31f83c8f9f047231251e13cc74481cca4a78a9ce
Successfully built camel-kenlm docopt
Installing collected packages: morfessor, docopt, camel-kenlm, pyrsistent, muddler, tokenizers, sphinx-argparse, transformers, indic-nlp-library, camel-tools
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      

In [None]:
!pip install unbabel-comet
!pip install evaluate

In [None]:
# ─── IMPORTS ─────────────────────────────────────────────
import os
import json
import hashlib
import numpy as np
import torch
import pandas as pd
import os
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
import sacrebleu
from sentence_transformers import util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
from bert_score import score as bert_score

from nltk.tokenize import word_tokenize
from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from camel_tools.tokenizers.word import simple_word_tokenize
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from evaluate import load as evaluate_load

In [None]:
# Install required packages
import nltk
nltk.download('wordnet')
import nltk
nltk.download('punkt')
import nltk
nltk.download('punkt_tab')


In [8]:
# ─── CONFIGURATION ──────────────────────────────────────────────
summary_dir = "data/code_summaries"
backtranslation_dir = "backtranslations_cache"
os.makedirs(backtranslation_dir, exist_ok=True)


# Mapping for summary field name → Display name
json_field_to_lang = {
    "chinese":     "Chinese",
    "french":      "French",
    "spanish":     "Spanish",
    "portuguese":  "Portuguese",
    "arabic":      "Arabic",
    "hindi":       "Hindi"
}

# Mapping for Display name → M2M-100 language code (used for backtranslation)
m2m_lang_map = {
    "Chinese":     "zh",
    "French":      "fr",
    "Spanish":     "es",
    "Portuguese":  "pt",
    "Arabic":      "ar",
    "Hindi":       "hi"
}

# Load M2M-100 model
m2m_model_name = "facebook/m2m100_418M"
tok_m2m = AutoTokenizer.from_pretrained(m2m_model_name)
model_m2m = AutoModelForSeq2SeqLM.from_pretrained(m2m_model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_m2m = model_m2m.to(device)

# Caches
embedding_model     = None
bertscore_model     = None
bertscore_tokenizer = None
side_tokenizer      = None
side_model          = None

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
2025-05-17 15:50:41.303183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747497041.484493      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747497041.534782      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Cloning into 'codeclarity'...
remote: Enumerating objects: 572, done.[K
remote: Counting objects: 100% (154/154), done.[K
remote: Compressing objects: 100% (129/129), done.[K
remote: Total 572 (delta 104), reused 24 (delta 24), pack-reused 418 (from 2)[K
Receiving objects: 100% (572/572), 11.16 MiB | 17.71 MiB/s, done.
Resolving deltas: 100% (315/315), done.


In [10]:
# ─── BACK-TRANSLATION FUNCTION ─────────────────────────────────────────────
def backtranslate_with_m2m(text, src_name):
    key = f"{src_name}_{hashlib.md5(text.encode()).hexdigest()}"
    cache_file = os.path.join(backtranslation_dir, key + ".txt")
    if os.path.exists(cache_file):
        return open(cache_file, 'r', encoding='utf-8').read()

    src_lang = m2m_lang_map.get(src_name)
    tgt_lang = "en"  # Always translating into English

    if src_lang is None:
        return text

    tok_m2m.src_lang = src_lang
    inputs = tok_m2m(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    generated_tokens = model_m2m.generate(**inputs, forced_bos_token_id=tok_m2m.get_lang_id(tgt_lang))
    output_text = tok_m2m.decode(generated_tokens[0], skip_special_tokens=True)

    with open(cache_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    return output_text

# ─── METRIC FUNCTIONS ──────────────────────────────────────────────────────
def compute_bertscore(refs, hyps):
    P, R, F1 = bert_score(
        hyps,
        refs,
        model_type="xlm-roberta-large",
        lang="en",
        rescale_with_baseline=False
    )
    return {
        "precision": round(P.mean().item(), 4),
        "recall":    round(R.mean().item(), 4),
        "f1":        round(F1.mean().item(), 4)
    }

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [None]:
# ─── MEAN POOLING (for SIDE) ───────────────────────────────────────────────
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)


def compute_side_score(codes, hyps):
    global side_tokenizer, side_model
    if side_model is None:
        checkpoint = "/103080"  # <-- Change if needed
        side_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        side_model     = AutoModel.from_pretrained(checkpoint)
        if torch.cuda.is_available(): side_model = side_model.cuda()
        side_model.eval()
    scores = []
    for code, summ in zip(codes, hyps):
        enc = side_tokenizer([code, summ], padding=True, truncation=True, return_tensors="pt")
        if torch.cuda.is_available(): enc = {k:v.cuda() for k,v in enc.items()}
        with torch.no_grad(): out = side_model(**enc)
        pooled = mean_pooling(out, enc['attention_mask'])
        normed = torch.nn.functional.normalize(pooled, p=2, dim=1)
        scores.append(util.pytorch_cos_sim(normed[0], normed[1]).item())
    return round(float(np.mean(scores)),4)

def compute_meteor_score(refs, hyps):
    sc = []
    for r,h in zip(refs, hyps):
        rt = word_tokenize(r.lower()); ht = word_tokenize(h.lower())
        sc.append(meteor_score([rt], ht))
    return round(float(np.mean(sc)),4)

def compute_chrf_score(refs, hyps):
  refs = [r.lower() for r in refs]
  hyps = [h.lower() for h in hyps]

  res = sacrebleu.corpus_chrf(hyps, [refs], word_order=2)
  return round(res.score / 100, 4)

## ----BLEU METRIC-----------
def compute_bleu_sacre(refs, hyps, lang_name):
    lang_name = lang_name.lower()

    # Define tokenizer per language
    tokenizer_map = {
        "chinese": "zh",
        "french": "13a",
        "portuguese": "13a",
        "arabic": "intl",
        "hindi": "intl",
        "spanish": "13a"
    }

    # Default tokenizer if language not found
    tokenizer = tokenizer_map.get(lang_name, "13a")

    # Compute BLEU-4
    score = sacrebleu.corpus_bleu(hyps, [refs], tokenize=tokenizer)
    return round(score.score / 100, 4) # Normalize to 0–1 like nltk



def tokenize(text, lang):
    lang = lang.lower()
    if lang == "chinese":
        return list(text.strip())
    elif lang == "arabic":
        return simple_word_tokenize(text)
    elif lang == "hindi":
        return trivial_tokenize(text, lang='hi')
    elif lang in ["french", "portuguese"]:
        return word_tokenize(text, language=lang)
    else:
        return text.strip().split()


def compute_bleu_nltk(refs_tokenized, hyps_tokenized):
    smoothie = SmoothingFunction().method1
    score = corpus_bleu(
        refs_tokenized,
        hyps_tokenized,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie
    )
    return round(score, 4)

## ----COMET METRIC-----------

comet = evaluate_load("comet", config_name="Unbabel/wmt22-comet-da")

def compute_comet_score(sources, references, hypotheses, batch_size=8, gpus=0):
    result = comet.compute(
        sources=sources,
        predictions=hypotheses,
        references=references,
    )
    per_example = result.get("scores", [])
    mean_score = float(np.mean(per_example)) if per_example else 0.0
    return round(mean_score, 4), per_example


# COMPUTE ALL METRICS
def compute_all_metrics(codes, refs, hyps, lang_name, code_lang):
    print(f"  Computing backtranslation-based metrics for {lang_name}...")
    bt = [backtranslate_with_m2m(h, lang_name) for h in hyps]
    smoothie = SmoothingFunction().method4
    refs_tokenized = [[tokenize(r, lang_name)] for r in refs]
    hyps_tokenized = [tokenize(b, lang_name) for b in bt]

    # Compute BLEU using tokenized inputs
    bleu_nltk = compute_bleu_nltk(refs_tokenized, hyps_tokenized)

    bleu_sacre = compute_bleu_sacre(refs, bt, lang_name)
    bleu_diff = round(abs(bleu_nltk - bleu_sacre), 4)

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rl = [scorer.score(r, b)['rougeL'].fmeasure for r, b in zip(refs, bt)]
    comet_mean, comet_per_example = compute_comet_score(
        sources=hyps,
        references=refs,
        hypotheses=bt
    )

    return {
        "bleu4_nltk": round(bleu_nltk, 4),
        "bleu4_sacrebleu": bleu_sacre,
        "bleu4_diff": bleu_diff,
        "rougeL": round(np.mean(rl), 4),
        "meteor": compute_meteor_score(refs, bt),
        "chrf++": compute_chrf_score(refs, bt),
        "side_bt": compute_side_score(codes, bt),
        "comet_mean": comet_mean,
        "comet_per_example": comet_per_example
    }



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]


📦 Evaluating model: qwen2.5-coder-7b-instruct

📄 File: php_summary_all_languages_Qwen2.5-Coder-7B-Instruct.json | Programming language: php

🔍 Evaluating Chinese summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Chinese...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Chinese → English:
    BERTScore (F1): 0.8702
    SIDE Original (multilingual): 0.8066
    SIDE Backtranslated (English): 0.8895
    BLEU-4 NLTK (BT): 0.3267
    BLEU-4 SacreBLEU (BT): 0.0744
    BLEU Difference: 0.2523
    ROUGE-L (BT): 0.332
    METEOR (BT): 0.2537
    CHRF++ (BT): 0.3171
    COMET mean score: 0.7435
------------------------------------------------------------

🔍 Evaluating French summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for French...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for French → English:
    BERTScore (F1): 0.8726
    SIDE Original (multilingual): 0.8294
    SIDE Backtranslated (English): 0.893
    BLEU-4 NLTK (BT): 0.0537
    BLEU-4 SacreBLEU (BT): 0.0539
    BLEU Difference: 0.0002
    ROUGE-L (BT): 0.316
    METEOR (BT): 0.2276
    CHRF++ (BT): 0.3039
    COMET mean score: 0.7406
------------------------------------------------------------

🔍 Evaluating Spanish summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Spanish...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Spanish → English:
    BERTScore (F1): 0.884
    SIDE Original (multilingual): 0.798
    SIDE Backtranslated (English): 0.8915
    BLEU-4 NLTK (BT): 0.0568
    BLEU-4 SacreBLEU (BT): 0.0743
    BLEU Difference: 0.0175
    ROUGE-L (BT): 0.3249
    METEOR (BT): 0.2601
    CHRF++ (BT): 0.3198
    COMET mean score: 0.7365
------------------------------------------------------------

🔍 Evaluating Portuguese summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Portuguese...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Portuguese → English:
    BERTScore (F1): 0.8854
    SIDE Original (multilingual): 0.8377
    SIDE Backtranslated (English): 0.8773
    BLEU-4 NLTK (BT): 0.0976
    BLEU-4 SacreBLEU (BT): 0.0989
    BLEU Difference: 0.0013
    ROUGE-L (BT): 0.352
    METEOR (BT): 0.2791
    CHRF++ (BT): 0.3368
    COMET mean score: 0.7588
------------------------------------------------------------

🔍 Evaluating Arabic summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Arabic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Arabic → English:
    BERTScore (F1): 0.8558
    SIDE Original (multilingual): 0.6306
    SIDE Backtranslated (English): 0.8426
    BLEU-4 NLTK (BT): 0.0543
    BLEU-4 SacreBLEU (BT): 0.0543
    BLEU Difference: 0.0
    ROUGE-L (BT): 0.2813
    METEOR (BT): 0.217
    CHRF++ (BT): 0.273
    COMET mean score: 0.7161
------------------------------------------------------------

🔍 Evaluating Hindi summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Hindi...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Hindi → English:
    BERTScore (F1): 0.8573
    SIDE Original (multilingual): 0.4914
    SIDE Backtranslated (English): 0.8189
    BLEU-4 NLTK (BT): 0.0643
    BLEU-4 SacreBLEU (BT): 0.0779
    BLEU Difference: 0.0136
    ROUGE-L (BT): 0.2451
    METEOR (BT): 0.2384
    CHRF++ (BT): 0.2508
    COMET mean score: 0.6829
------------------------------------------------------------

📄 File: ruby_summary_all_languages_Qwen2.5-Coder-7B-Instruct.json | Programming language: ruby

🔍 Evaluating Chinese summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Chinese...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Chinese → English:
    BERTScore (F1): 0.8708
    SIDE Original (multilingual): 0.8381
    SIDE Backtranslated (English): 0.8489
    BLEU-4 NLTK (BT): 0.3379
    BLEU-4 SacreBLEU (BT): 0.0925
    BLEU Difference: 0.2454
    ROUGE-L (BT): 0.3392
    METEOR (BT): 0.2862
    CHRF++ (BT): 0.3192
    COMET mean score: 0.7284
------------------------------------------------------------

🔍 Evaluating French summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for French...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for French → English:
    BERTScore (F1): 0.8761
    SIDE Original (multilingual): 0.8176
    SIDE Backtranslated (English): 0.8782
    BLEU-4 NLTK (BT): 0.0608
    BLEU-4 SacreBLEU (BT): 0.0674
    BLEU Difference: 0.0066
    ROUGE-L (BT): 0.3204
    METEOR (BT): 0.224
    CHRF++ (BT): 0.2913
    COMET mean score: 0.7344
------------------------------------------------------------

🔍 Evaluating Spanish summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Spanish...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Spanish → English:
    BERTScore (F1): 0.8868
    SIDE Original (multilingual): 0.8224
    SIDE Backtranslated (English): 0.8733
    BLEU-4 NLTK (BT): 0.0485
    BLEU-4 SacreBLEU (BT): 0.0807
    BLEU Difference: 0.0322
    ROUGE-L (BT): 0.3267
    METEOR (BT): 0.2465
    CHRF++ (BT): 0.3031
    COMET mean score: 0.733
------------------------------------------------------------

🔍 Evaluating Portuguese summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Portuguese...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Portuguese → English:
    BERTScore (F1): 0.8851
    SIDE Original (multilingual): 0.8384
    SIDE Backtranslated (English): 0.8911
    BLEU-4 NLTK (BT): 0.0953
    BLEU-4 SacreBLEU (BT): 0.1072
    BLEU Difference: 0.0119
    ROUGE-L (BT): 0.3512
    METEOR (BT): 0.2743
    CHRF++ (BT): 0.3287
    COMET mean score: 0.7494
------------------------------------------------------------

🔍 Evaluating Arabic summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Arabic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Arabic → English:
    BERTScore (F1): 0.8599
    SIDE Original (multilingual): 0.728
    SIDE Backtranslated (English): 0.8701
    BLEU-4 NLTK (BT): 0.0787
    BLEU-4 SacreBLEU (BT): 0.0787
    BLEU Difference: 0.0
    ROUGE-L (BT): 0.2859
    METEOR (BT): 0.2311
    CHRF++ (BT): 0.2773
    COMET mean score: 0.7172
------------------------------------------------------------

🔍 Evaluating Hindi summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Hindi...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



📊 Results for Hindi → English:
    BERTScore (F1): 0.8638
    SIDE Original (multilingual): 0.5664
    SIDE Backtranslated (English): 0.8467
    BLEU-4 NLTK (BT): 0.0936
    BLEU-4 SacreBLEU (BT): 0.1118
    BLEU Difference: 0.0182
    ROUGE-L (BT): 0.2815
    METEOR (BT): 0.2657
    CHRF++ (BT): 0.2693
    COMET mean score: 0.7207
------------------------------------------------------------

📄 File: javascript_summary_all_languages_Qwen2.5-Coder-7B-Instruct.json | Programming language: javascript

🔍 Evaluating Chinese summaries...
  ➤ Computing BERTScore...
  ➤ Computing direct multilingual embedding similarity (SIDE)...
  Computing backtranslation-based metrics for Chinese...


In [None]:
# ─── MAIN EVALUATION ─────────────────────────────────────
def run_evaluation():
    base_dir = "codeclarity/generated_code_summaries/final-run-v2"
    all_results = []

    for model_folder in os.listdir(base_dir):
        model_path = os.path.join(base_dir, model_folder)
        if not os.path.isdir(model_path):
            continue  # Skip files

        print(f"\n Evaluating model: {model_folder}")

        for fname in os.listdir(model_path):
            if not fname.endswith('.json'):
                continue
            if "all_languages_combined" in fname:
                continue  # Skip combined file

            summary_path = os.path.join(model_path, fname)
            code_lang = fname.split('_')[0]
            print(f"\nFile: {fname} | Programming language: {code_lang}")

            with open(summary_path, encoding='utf-8') as f:
                data = json.load(f)

            # Extract codes from the data
            codes = [d.get('code', '') for d in data] # Assuming 'code' is the field name for code snippets
            refs = [d['summary_english'] for d in data]

            for field, lang_name in json_field_to_lang.items():
                if f'summary_{field}' not in data[0]:
                    print(f"  Field summary_{field} not found in dataset. Skipping {lang_name}.")
                    continue

                hyps = [d.get(f'summary_{field}', '') for d in data]
                if not any(hyps):
                    print(f"  All summaries for {lang_name} are empty or missing. Skipping.")
                    continue

                print(f"\n Evaluating {lang_name} summaries...")
                print("  ➤ Computing BERTScore...")
                bert = compute_bertscore(refs, hyps)
                print("  ➤ Computing direct multilingual embedding similarity (SIDE)...")
                side_original = compute_side_score(codes, hyps) # SIDE also needs codes
                # Pass codes to compute_all_metrics
                metrics = compute_all_metrics(codes, refs, hyps, lang_name, code_lang)
                side_drop = round(side_original - metrics["side_bt"], 4)


                print(f"\nResults for {lang_name} → English:")
                print(f"    BERTScore (F1): {bert['f1']}")
                print(f"    SIDE Original (multilingual): {side_original}")
                print(f"    SIDE Backtranslated (English): {metrics['side_bt']}")
                print(f"    BLEU-4 NLTK (BT): {metrics['bleu4_nltk']}")
                print(f"    BLEU-4 SacreBLEU (BT): {metrics['bleu4_sacrebleu']}")
                print(f"    BLEU Difference: {metrics['bleu4_diff']}")

                print(f"    ROUGE-L (BT): {metrics['rougeL']}")
                print(f"    METEOR (BT): {metrics['meteor']}")
                print(f"    CHRF++ (BT): {metrics['chrf++']}")
                print(f"    COMET mean score: {metrics['comet_mean']}")
                print("-" * 60)

                for i, entry in enumerate(data):
                    sample_id = entry.get("id", f"{code_lang}_{i}")
                    generated_summary = entry.get(f'summary_{field}', '')
                    backtranslated_summary = backtranslate_with_m2m(generated_summary, lang_name)

                    result = {
                        "sample_id": sample_id,
                        "model_folder_name": model_folder,
                        "model_name": entry.get("model_name", model_folder),
                        "programming_language": code_lang,
                        "language": lang_name,
                        "reference_summary": entry.get("summary_english", ""),
                        "generated_summary": generated_summary,
                        "backtranslated_summary": backtranslated_summary,
                        "bertscore_f1": bert["f1"],
                        "bertscore_precision": bert["precision"],
                        "bertscore_recall": bert["recall"],
                        "side_original": side_original,
                        "side_bt": metrics["side_bt"],
                        "side_drop": side_drop,
                        "bleu4_nltk": metrics["bleu4_nltk"],
                        "bleu4_sacrebleu": metrics["bleu4_sacrebleu"],
                        "bleu4_diff": metrics["bleu4_diff"],
                        "rougeL": metrics["rougeL"],
                        "meteor": metrics["meteor"],
                        "chrf++": metrics["chrf++"],
                        "comet_mean": metrics['comet_mean'],
                        "comet_example_score": metrics['comet_per_example'][i]
                    }

                    all_results.append(result)

    # ─── SAVE RESULTS ──────────────────────────────────────────────────────
    os.makedirs(backtranslation_dir, exist_ok=True)
    json_out = os.path.join(backtranslation_dir, 'all_scores.json')
    csv_out = os.path.join(backtranslation_dir, 'all_scores.csv')

    with open(json_out, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)

    df = pd.DataFrame(all_results)
    df.to_csv(csv_out, index=False)

    print(f"\n✅ All scores saved to:\n- JSON: {json_out}\n- CSV: {csv_out}")


if __name__ == "__main__":
    run_evaluation()