In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
# Display the kernel's working directory: relative paths such as "." are
# resolved against it, so this records where the notebook was run from.
os.getcwd()

'/workspace/Approach2/shalaka/files'

In [3]:
import numpy as np
from evaluate import load
from sentence_transformers import SentenceTransformer, util

# Heavy scorer objects, keyed by name, kept alive between calls so that
# evaluating many files does not rebuild the same objects every time.
_SCORER_CACHE = {}


def _load_once(key, factory):
    """Return the cached object for ``key``, creating it with ``factory()`` on first use."""
    if key not in _SCORER_CACHE:
        _SCORER_CACHE[key] = factory()
    return _SCORER_CACHE[key]


def compute_translation_scores(predictions, references, lang='en'):
    """
    Compute reference-based translation evaluation scores using:
    1. BERTScore F1 with the `google/muril-base-cased` model (4 layers)
    2. LaBSE cosine similarity between each prediction and its own reference

    Both values are means over all (prediction, reference) pairs.

    Args:
        predictions (list of str): Model outputs
        references (list of str): Reference translations, aligned by position
            with `predictions`
        lang (str): Language code forwarded to BERTScore ('en', 'hi', etc.)

    Returns:
        dict: {'muril_score': float, 'labse_cosine': float}
            Both values are NaN when the inputs are empty.

    Raises:
        AssertionError: if `predictions` and `references` differ in length.
    """
    assert len(predictions) == len(references), "Predictions and references must have the same length"

    # Nothing to average: report NaN instead of pushing empty batches through
    # the models. This check runs before any model is loaded.
    if len(predictions) == 0:
        nan = float('nan')
        return {'muril_score': nan, 'labse_cosine': nan}

    # ---------- BERTScore (MuRIL) ----------
    # The metric object is created once and reused across calls.
    bertscore = _load_once("bertscore", lambda: load("bertscore"))
    bert_results = bertscore.compute(
        predictions=predictions,
        references=references,
        model_type='google/muril-base-cased',
        num_layers=4,
        lang=lang
    )
    muril_f1_mean = float(np.mean(bert_results['f1']))

    # ---------- LaBSE Cosine Similarity ----------
    model = _load_once(
        "labse", lambda: SentenceTransformer('sentence-transformers/LaBSE')
    )
    pred_emb = model.encode(predictions, convert_to_tensor=True)
    ref_emb = model.encode(references, convert_to_tensor=True)
    # Only the matching pairs are needed, so compute row-wise cosine similarity
    # directly instead of building the full N x N matrix and reading its
    # diagonal. The clamp guards against division by zero for a zero vector.
    pred_unit = pred_emb / pred_emb.norm(dim=1, keepdim=True).clamp_min(1e-12)
    ref_unit = ref_emb / ref_emb.norm(dim=1, keepdim=True).clamp_min(1e-12)
    labse_cosine_mean = float((pred_unit * ref_unit).sum(dim=1).mean())

    return {'muril_score': muril_f1_mean, 'labse_cosine': labse_cosine_mean}

In [6]:
import os
import pandas as pd
# CSV folder is current directory
csv_folder = "."
# Columns every evaluation CSV must provide: model output and ground truth.
required_columns = ("prediction", "gt")

# Collect results
results = []

# sorted() makes the processing order deterministic; os.listdir order is arbitrary.
for file_name in sorted(os.listdir(csv_folder)):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(csv_folder, file_name)
    df = pd.read_csv(file_path)

    # The folder may contain unrelated CSVs; skip them with a message rather
    # than aborting the whole run on a KeyError.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"Skipping {file_name}: missing column(s) {missing}")
        continue

    # Blank cells are read as NaN floats, which the text encoders cannot take.
    # A row without a reference cannot be scored, so it is dropped; a blank
    # prediction is kept as an empty string so it still counts against the model.
    scored = df.dropna(subset=['gt'])
    predictions = scored['prediction'].fillna('').astype(str).tolist()
    references = scored['gt'].astype(str).tolist()

    # Compute translation scores
    scores = compute_translation_scores(predictions, references, lang='en')

    # Append results with file info
    results.append({
        "file": file_name,
        "scores": scores
    })

# Sort results alphabetically by file name
results_sorted = sorted(results, key=lambda x: x['file'])

# Print sorted results
for r in results_sorted:
    print(f"File: {r['file']}, Scores: {r['scores']}")

File: approach1_bert_outputs.csv, Scores: {'muril_score': 0.7805719828164136, 'labse_cosine': 0.9199128746986389}
File: approach1_mpnet_outputs.csv, Scores: {'muril_score': 0.7812708218892416, 'labse_cosine': 0.9209620952606201}
File: approach1_t5_outputs.csv, Scores: {'muril_score': 0.7837716495549237, 'labse_cosine': 0.9230378270149231}
File: approach2_combined_2x.csv, Scores: {'muril_score': 0.7794243373252727, 'labse_cosine': 0.9119709730148315}
File: approach2_combined_x.csv, Scores: {'muril_score': 0.776060202607402, 'labse_cosine': 0.9046943783760071}
File: approach2_with_punct.csv, Scores: {'muril_score': 0.7685058194178122, 'labse_cosine': 0.9145123362541199}
File: approach2_without_punct.csv, Scores: {'muril_score': 0.7829732663101621, 'labse_cosine': 0.9121865630149841}
File: cadence_outputs.csv, Scores: {'muril_score': 0.780904061264462, 'labse_cosine': 0.9209813475608826}
File: deepseek_outputs.csv, Scores: {'muril_score': 0.7765315219208047, 'labse_cosine': 0.919695317745