In [23]:
# Mount Google Drive to access your files
from google.colab import drive
drive.mount('/content/gdrive')  # safer path


# Navigate to your folder (change the path to match your Google Drive structure)


Mounted at /content/gdrive


In [28]:
%cd /content/gdrive/MyDrive/CodeClarity

/content/gdrive/MyDrive/CodeClarity


In [29]:
# List files in the directory to confirm everything is accessible
!ls -la

total 2119
drwx------ 2 root root   4096 Apr 23 19:47 backtranslation_cache
drwx------ 2 root root   4096 Apr 23 19:47 backtranslation_cache_old
drwx------ 2 root root   4096 Apr 23 22:06 backtranslations_cache
-rw------- 1 root root 136830 Apr 16 19:31 codesearchnet_summary_english.json
-rw------- 1 root root 296089 Apr  6 17:22 codesearchnet_summary_french.json
-rw------- 1 root root 284262 Apr  6 17:37 codesearchnet_summary_german.json
-rw------- 1 root root 580711 Apr  6 17:06 codesearchnet_summary_hindi.json
-rw------- 1 root root 299366 Apr  6 17:29 codesearchnet_summary_portuguese.json
-rw------- 1 root root 295609 Apr  6 17:14 codesearchnet_summary_spanish.json
-rw------- 1 root root  19196 Apr 24 04:03 Code_Summary_Evaluation.ipynb
-rw------- 1 root root 210451 Apr 23 20:41 Code_Summary_Generation.ipynb
drwx------ 2 root root   4096 Apr 22 17:13 __pycache__
-rw------- 1 root root  15097 Apr 22 16:50 run_evaluation.py
-rw------- 1 root root   4041 Apr 17 18:00 run_generation.py

In [30]:
# Install required packages
!pip install --upgrade pip --quiet
!pip install transformers datasets nltk rouge-score sacrebleu sentence-transformers sentencepiece fsspec==2025.3.2 --quiet

In [31]:
# Depends from user to user
!pip install transformers sentencepiece nltk rouge sacrebleu sentence-transformers
import nltk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Check if GPU is available
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU model: {torch.cuda.get_device_name(0)}")

GPU available: False


In [34]:
import os
import json
import numpy as np
import torch
import hashlib
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
import sacrebleu
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM



# Directories
summary_dir = "/content/gdrive/MyDrive/CodeClarity/summaries/"
backtranslation_dir = "/content/gdrive/MyDrive/CodeClarity/backtranslations_cache/"
os.makedirs(backtranslation_dir, exist_ok=True)

# Language code mapping for NLLB model
lang_code_map = {
    "Spanish": "spa_Latn",
    "Mandarin Chinese": "zho_Hans",
    "Arabic": "arb_Arab",
    "Swahili": "swh_Latn",
    "Yoruba": "yor_Latn",
    "Tamil": "tam_Taml",
    "Hindi": "hin_Deva",
    "Portuguese": "por_Latn",
    "Filipino": "fil_Latn",
    "French": "fra_Latn"
}

# Load NLLB model for backtranslation
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Cache for embedding model (SIDE)
embedding_model = None
# Cache for BERTScore model
bertscore_model = None
bertscore_tokenizer = None

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)

def compute_bertscore(ref_file, cand_file, language):
    global bertscore_model, bertscore_tokenizer

    # Determine if files are JSONL or JSON
    with open(ref_file, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()
        is_jsonl = not first_line.startswith('[')

    # Load data according to file format
    if is_jsonl:
        with open(ref_file, 'r', encoding='utf-8') as f:
            ref_data = [json.loads(line) for line in f]
        with open(cand_file, 'r', encoding='utf-8') as f:
            cand_data = [json.loads(line) for line in f]
    else:
        with open(ref_file, 'r', encoding='utf-8') as f:
            ref_data = json.load(f)
        with open(cand_file, 'r', encoding='utf-8') as f:
            cand_data = json.load(f)

    refs = [item['summary'] for item in ref_data]
    cands = [item['summary'] for item in cand_data]

    if bertscore_model is None or bertscore_tokenizer is None:
        model_name = "xlm-roberta-large"
        print(f"Loading {model_name} for BERTScore...")
        bertscore_tokenizer = AutoTokenizer.from_pretrained(model_name)
        bertscore_model = AutoModel.from_pretrained(model_name)
        if torch.cuda.is_available():
            bertscore_model = bertscore_model.cuda()

    bertscore_model.eval()

    precision_scores = []
    recall_scores = []
    f1_scores = []

    batch_size = 16 if torch.cuda.is_available() else 8

    for i in range(0, len(refs), batch_size):
        batch_refs = refs[i:i+batch_size]
        batch_cands = cands[i:i+batch_size]

        with torch.no_grad():
            ref_inputs = bertscore_tokenizer(batch_refs, return_tensors="pt", padding=True, truncation=True, max_length=512)
            cand_inputs = bertscore_tokenizer(batch_cands, return_tensors="pt", padding=True, truncation=True, max_length=512)

            if torch.cuda.is_available():
                ref_inputs = {k: v.cuda() for k, v in ref_inputs.items()}
                cand_inputs = {k: v.cuda() for k, v in cand_inputs.items()}

            ref_embeddings = bertscore_model(**ref_inputs).last_hidden_state
            cand_embeddings = bertscore_model(**cand_inputs).last_hidden_state

        for j in range(len(batch_refs)):
            ref_mask = ref_inputs['attention_mask'][j].bool()
            cand_mask = cand_inputs['attention_mask'][j].bool()
            ref_emb = ref_embeddings[j, ref_mask] / ref_embeddings[j, ref_mask].norm(dim=1, keepdim=True)
            cand_emb = cand_embeddings[j, cand_mask] / cand_embeddings[j, cand_mask].norm(dim=1, keepdim=True)
            sim_matrix = torch.matmul(ref_emb, cand_emb.transpose(0, 1))
            precision = sim_matrix.max(dim=0)[0].mean().item()
            recall = sim_matrix.max(dim=1)[0].mean().item()
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    return {
        "precision": round(sum(precision_scores) / len(precision_scores), 4),
        "recall": round(sum(recall_scores) / len(recall_scores), 4),
        "f1": round(sum(f1_scores) / len(f1_scores), 4)
    }

def compute_side_score(references, candidates):
    global embedding_model
    if embedding_model is None:
        print("Loading embedding model for SIDE...")
        embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        if torch.cuda.is_available():
            embedding_model = embedding_model.to("cuda")

    batch_size = 32 if torch.cuda.is_available() else 16
    ref_embeddings = embedding_model.encode(references, batch_size=batch_size)
    cand_embeddings = embedding_model.encode(candidates, batch_size=batch_size)

    scores = [
        np.dot(ref_emb, cand_emb) / (np.linalg.norm(ref_emb) * np.linalg.norm(cand_emb))
        for ref_emb, cand_emb in zip(ref_embeddings, cand_embeddings)
    ]

    return round(sum(scores) / len(scores), 4)

def compute_meteor_score(references, candidates):
    return round(sum(meteor_score([r.split()], c.split()) for r, c in zip(references, candidates)) / len(references), 4)

def compute_chrf_score(references, candidates):
    chrf = sacrebleu.corpus_chrf(candidates, [references], word_order=2)
    return round(chrf.score, 4)

def backtranslate_with_nllb(text, source_lang, code_lang):
    """Translate text from source language to English using NLLB model with caching."""
    global model, tokenizer, lang_code_map

    # 1. Get the correct source language code
    source_lang_code = lang_code_map.get(source_lang)
    if not source_lang_code:
        print(f"Warning: Language {source_lang} not found in language map. Skipping translation.")
        return text  # Return original text as fallback

    target_lang_code = "eng_Latn"  # Always translate to English

    # Create unique cache key
    text_hash = hashlib.md5(text.encode('utf-8')).hexdigest()
    cache_file = os.path.join(backtranslation_dir, f"{code_lang.lower()}_{source_lang.lower()}_{text_hash}.txt")

    # Return from cache if exists
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            return f.read()

    try:
        forced_bos_token_id = tokenizer.lang_code_to_id[target_lang_code]

        inputs = tokenizer(text, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_length=512
        )

        result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

        # Cache translation
        with open(cache_file, 'w', encoding='utf-8') as f:
            f.write(result)

        return result
    except Exception as e:
        print(f"Translation error: {e}")
        # Fallback to a simple echo if translation fails
        return f"[TRANSLATION ERROR: {text[:30]}...]"

def get_cached_backtranslation(text, lang, code_lang):
    """Get backtranslation with caching"""
    os.makedirs(backtranslation_dir, exist_ok=True)
    cache_key = hashlib.sha256((lang + ":" + text).encode("utf-8")).hexdigest()
    cache_path = os.path.join(backtranslation_dir, f"{cache_key}.json")

    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)["bt"]
    else:
        bt = backtranslate_with_nllb(text, lang, code_lang)
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump({"bt": bt}, f)
        return bt

def compute_all_metrics(references, candidates, lang, code_lang):
    print(f"Backtranslating {len(candidates)} summaries from {lang}...")
    translated_candidates = []
    for i, text in enumerate(candidates):
        bt_text = get_cached_backtranslation(text, lang, code_lang)
        translated_candidates.append(bt_text)
        if (i+1) % 10 == 0:
            print(f"  Progress: {i+1}/{len(candidates)}")

    print("Computing all metrics...")

    smoothie = SmoothingFunction().method4
    ref_list = [[ref.split()] for ref in references]
    cand_list = [cand.split() for cand in translated_candidates]
    bleu_score = corpus_bleu(ref_list, cand_list, smoothing_function=smoothie)

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(r, c)['rougeL'].fmeasure for r, c in zip(references, translated_candidates)]
    avg_rouge = sum(rouge_scores) / len(rouge_scores)

    meteor = compute_meteor_score(references, translated_candidates)
    chrf = compute_chrf_score(references, translated_candidates)
    side = compute_side_score(references, translated_candidates)

    return {
        "bleu_bt": round(bleu_score, 4),
        "rougeL_bt": round(avg_rouge, 4),
        "meteor_bt": meteor,
        "chrf_bt": chrf,
        "side_bt": side
    }


def run_evaluation():

    # Process each programming language in a loop
    for code_lang in ["php", "python", "java", "go", "javascript", "ruby"]:
        print(f"\nEvaluating {code_lang} summaries...")

        languages = [
            "Spanish", "Mandarin Chinese", "Arabic", "Swahili", "Yoruba",
            "Tamil", "Hindi", "Portuguese", "Filipino", "French"
        ]

        # Try .jsonl extension for reference file
        ref_file = os.path.join(summary_dir, f"summary_{code_lang}_english.jsonl")
        if not os.path.exists(ref_file):
            print(f"Reference file not found for {code_lang}. Skipping this language.")
            continue

        # Determine file format (JSONL or JSON)
        with open(ref_file, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            is_jsonl = not first_line.startswith('[')

        # Load references
        if is_jsonl:
            with open(ref_file, "r", encoding="utf-8") as f:
                ref_data = [json.loads(line) for line in f]
        else:
            with open(ref_file, "r", encoding="utf-8") as f:
                ref_data = json.load(f)

        references = [entry["summary"] for entry in ref_data]
        results_for_lang = []

        for lang in languages:
            # Try jsonl extension
            lang_file = os.path.join(summary_dir, f"summary_{code_lang}_{lang.lower()}.jsonl")
            if not os.path.exists(lang_file):
                print(f"  Missing summary file for {code_lang}/{lang}. Skipping.")
                continue

            print(f"\n  Evaluating {code_lang} summaries in {lang}...")

            # Load candidates in the same format as references
            if is_jsonl:
                with open(lang_file, "r", encoding="utf-8") as f:
                    lang_data = [json.loads(line) for line in f]
            else:
                with open(lang_file, "r", encoding="utf-8") as f:
                    lang_data = json.load(f)

            candidates = [entry["summary"] for entry in lang_data]

            print(f"  Computing BERTScore...")
            bert_result = compute_bertscore(ref_file, lang_file, lang)

            print(f"  Computing direct multilingual embedding similarity...")
            direct_side = compute_side_score(references, candidates)

            print(f"  Computing backtranslation-based metrics...")
            metrics = compute_all_metrics(references, candidates, lang, code_lang)

            result = {
                "programming_language": code_lang,
                "language": lang,
                "bertscore_f1": bert_result["f1"],
                "bertscore_precision": bert_result["precision"],
                "bertscore_recall": bert_result["recall"],
                "direct_side": direct_side,
                **metrics
            }

            print(f"\n  Results for {lang} → English:")
            print(result)

            results_for_lang.append(result)

        # Save results for this programming language
        lang_results_file = os.path.join(backtranslation_dir, f"scores_{code_lang}.json")
        with open(lang_results_file, "w") as f:
            json.dump(results_for_lang, f, indent=2, cls=NumpyEncoder)

        print(f"Scores for {code_lang} saved to: {lang_results_file}")

    # Combine all results at the end
    all_results = []
    for code_lang in ["php", "python", "java", "go", "javascript", "ruby"]:
        lang_results_file = os.path.join(backtranslation_dir, f"scores_{code_lang}.json")
        if os.path.exists(lang_results_file):
            with open(lang_results_file, "r") as f:
                all_results.extend(json.load(f))

    # Save all combined results
    all_results_file = os.path.join(backtranslation_dir, "all_scores.json")
    with open(all_results_file, "w") as f:
        json.dump(all_results, f, indent=2, cls=NumpyEncoder)

    print(f"\nAll scores saved to: {all_results_file}")

# Run evaluation
run_evaluation()


Evaluating php summaries...

  Evaluating php summaries in Spanish...
  Computing BERTScore...
Loading xlm-roberta-large for BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

  Computing direct multilingual embedding similarity...
Loading embedding model for SIDE...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  Computing backtranslation-based metrics...
Backtranslating 1 summaries from Spanish...
Translation error: NllbTokenizerFast has no attribute lang_code_to_id
Computing all metrics...

  Results for Spanish → English:
{'programming_language': 'php', 'language': 'Spanish', 'bertscore_f1': 0.9777, 'bertscore_precision': 0.9765, 'bertscore_recall': 0.9789, 'direct_side': np.float32(0.6896), 'bleu_bt': 0, 'rougeL_bt': 0.0067, 'meteor_bt': 0.0, 'chrf_bt': 0.9639, 'side_bt': np.float32(0.509)}
  Missing summary file for php/Mandarin Chinese. Skipping.

  Evaluating php summaries in Arabic...
  Computing BERTScore...
  Computing direct multilingual embedding similarity...
  Computing backtranslation-based metrics...
Backtranslating 1 summaries from Arabic...
Translation error: NllbTokenizerFast has no attribute lang_code_to_id
Computing all metrics...

  Results for Arabic → English:
{'programming_language': 'php', 'language': 'Arabic', 'bertscore_f1': 0.9737, 'bertscore_precision': 0.9723, '