Evaluate summarization quality by measuring text-overlap using BELU. Summary is back-translated to English and compared against the English summary.


In [None]:
# save GEMINI_API_KEY in .env file
# call `huggingface-cli login` to login to huggingface
import os
import dotenv
import json

dotenv.load_dotenv()

True

In [25]:
# english
with open("./codesearchnet_summary_english.json", "r") as f:
    dataset = json.load(f)
    for sample in dataset:
        sample["summary_english"] = sample["summary"]

file_paths = {
    "french": "./codesearchnet_summary_french.json",
    "german": "./codesearchnet_summary_german.json",
    "hindi": "./codesearchnet_summary_hindi.json",
    "portuguese": "./codesearchnet_summary_portuguese.json",
    "spanish": "./codesearchnet_summary_spanish.json",
}
for lang, file_path in file_paths.items():
    with open(file_path, "r") as f:
        dataset_lang = json.load(f)
        for idx, sample in enumerate(dataset_lang):
            dataset[idx][f"summary_{lang}"] = sample["summary"]
            
dataset[0]

{'code': 'def addidsuffix(self, idsuffix, recursive = True):\n        """Appends a suffix to this element\'s ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""\n        if self.id: self.id += idsuffix\n        if recursive:\n            for e in self:\n                try:\n                    e.addidsuffix(idsuffix, recursive)\n                except Exception:\n                    pass',
 'docstring': "Appends a suffix to this element's ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`",
 'summary': 'Sure! Let’s break down what this function does in simple terms:\n\n1. **Purpose**: The function `addidsuffix` adds a suffix (a piece of text) to the `id` of an element. It can also optionally add the same suffix to the `id` of all its child elements.\n\n2. **Parameters**:\n   - `idsuffix`: This is the text you want to append to the `i

In [24]:
from google import genai
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

In [26]:
# calculate back translation of summary to english

prompt_tmpl = """
Translate the following code from {source} to {target}
Don't include any other text than the translation.
Text: {text}
Translation:
"""

languages = ["french", "german", "hindi", "portuguese", "spanish"]

for sample in dataset:
    for source in languages:
        summary = sample[f"summary_{source}"]
        summary_back = client.models.generate_content(
            model="gemini-2.0-flash", 
            contents=prompt_tmpl.format(source=source, target="english", text=summary)
        )
        sample[f"summary_{source}_english"] = summary_back.text

In [27]:
# calculate blue between english and backtranslated summary
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

def calculate_bleu(reference, candidate):
    """Calculate BLEU score between reference and candidate strings"""
    reference_tokens = [word_tokenize(reference.lower())]
    candidate_tokens = word_tokenize(candidate.lower())
    return sentence_bleu(reference_tokens, candidate_tokens)

# Calculate BLEU score between original and translated text
languages = ["french", "german", "hindi", "portuguese", "spanish"]

for sample in dataset:
    for lang in languages:
        text1 = sample["summary_english"]
        text2 = sample[f"summary_{lang}_english"]
        score = calculate_bleu(text1, text2)
        sample[f"bleu_summary_{lang}"] = score


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [28]:
import pandas as pd

columns = ["sample_id"] + [f"bleu_{lang}" for lang in languages]
data = []
for idx, sample in enumerate(dataset):
    record = [idx]
    for lang in languages:
        record.append(sample[f"bleu_summary_{lang}"])
    data.append(record)

df_bleu = pd.DataFrame(data, columns=columns)

In [29]:
df_bleu.to_csv("bleu_summary.csv", index=False)

In [30]:
df_bleu.head()

Unnamed: 0,sample_id,bleu_french,bleu_german,bleu_hindi,bleu_portuguese,bleu_spanish
0,0,0.101082,0.115502,0.104754,0.19347,0.22731
1,1,0.249721,0.216234,0.227322,0.223811,0.26924
2,2,0.376477,0.24321,0.225984,0.340312,0.348646
3,3,0.391,0.337511,0.261976,0.465752,0.437511
4,4,0.300351,0.225045,0.18837,0.3529,0.335318


In [31]:
df_bleu.describe()

Unnamed: 0,sample_id,bleu_french,bleu_german,bleu_hindi,bleu_portuguese,bleu_spanish
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,49.5,0.263333,0.228706,0.1644995,0.276547,0.286424
std,29.011492,0.068812,0.073908,0.06747216,0.073292,0.070914
min,0.0,0.101082,0.072219,5.980519e-79,0.103755,0.116892
25%,24.75,0.218369,0.180035,0.1147524,0.230398,0.243659
50%,49.5,0.263531,0.233202,0.1656304,0.279988,0.289129
75%,74.25,0.306948,0.27737,0.2165171,0.330416,0.332025
max,99.0,0.395623,0.423066,0.3490217,0.465752,0.473574


In [19]:
df_bleu.max()

sample_id          99
source        english
target          hindi
bleu         0.946186
dtype: object