In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.getcwd()

'/workspace/Approach1/perf-outputs'

In [6]:
labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModel.from_pretrained("sentence-transformers/LaBSE")

muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModel.from_pretrained("google/muril-base-cased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labse_model.to(device)
muril_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(197285, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [7]:
def compute_ref_based_scores(mt_texts, ref_texts, method="max"):
    results = []
    
    for i, mt in enumerate(mt_texts):
        ref = ref_texts[i]
        
        labse_ref_scores = []
        muril_ref_scores = []
        
        # --- LaBSE ---
        inputs = labse_tokenizer([ref, mt], padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            embeddings = labse_model(**inputs).pooler_output
        labse_ref_mt = F.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)).item()

        # --- MuRIL ---
        inputs = muril_tokenizer([ref, mt], padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = muril_model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
        muril_ref_mt = F.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)).item()

        results.append({
            "mt": mt,
            "labse_ref_mt": labse_ref_mt,
            "muril_ref_mt": muril_ref_mt
        })
    
    df_scores = pd.DataFrame(results)
    
    # System-level averages
    system_scores = {
        "labse_ref_mt": df_scores["labse_ref_mt"].mean(),
        "muril_ref_mt": df_scores["muril_ref_mt"].mean()
    }
    
    return system_scores

In [8]:
labse = {}
muril = {}

In [3]:
import numpy as np
from evaluate import load
from sentence_transformers import SentenceTransformer, util

def compute_translation_scores(predictions, references, lang='en'):
    """
    Compute reference-based translation evaluation scores using:
    1. BERTScore (MuRIL)
    2. LaBSE cosine similarity
    
    Args:
        predictions (list of str): Model outputs
        references (list of str): Reference translations
        lang (str): Language code for BERTScore ('en', 'hi', etc.)
    
    Returns:
        dict: {'muril_f1': float, 'labse_cosine': float}
    """
    assert len(predictions) == len(references), "Predictions and references must have the same length"

    # ---------- BERTScore (MuRIL) ----------
    bertscore = load("bertscore")
    bert_results = bertscore.compute(
        predictions=predictions,
        references=references,
        model_type='google/muril-base-cased',
        num_layers=4,
        lang=lang
    )
    muril_f1_mean = float(np.mean(bert_results['f1']))

    # ---------- LaBSE Cosine Similarity ----------
    model = SentenceTransformer('sentence-transformers/LaBSE')
    pred_emb = model.encode(predictions, convert_to_tensor=True)
    ref_emb = model.encode(references, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(pred_emb, ref_emb)
    # Take diagonal (each prediction with its reference)
    labse_cosine_mean = float(cosine_sim_matrix.diag().mean())

    return {'muril_score': muril_f1_mean, 'labse_cosine': labse_cosine_mean}

## T5 Punctuation Restoration

In [6]:
file_name = "approach1_eng_to_eng_t5_outputs_punct_restor_data.csv"
mode = "t5_punct"

In [7]:
df = pd.read_csv(file_name)

predictions = df['prediction'].tolist()
references = df['gt'].tolist()

scores = compute_ref_based_scores(predictions, references)
labse[mode] = scores['labse_ref_mt']
muril[mode] = scores['muril_ref_mt']

print(f"LaBSE and MuRIL Scores for {mode}:")
print(scores)

LaBSE and MuRIL Scores for t5_punct:
{'labse_ref_mt': 0.9895218774622835, 'muril_ref_mt': 0.9998911888104498}


## T5 Seq-to-Seq

In [4]:
file_name = "approach1_eng_to_eng_t5_outputs_mar_data.csv"
mode = "t5"

In [5]:
df = pd.read_csv(file_name)
df.head()


Unnamed: 0,prediction,src,gt
0,"Chanting, the choir raised the volume as the c...",Chanting the choir raised the volume as the ce...,"Chanting, the choir raised the volume as the c..."
1,A six-month-old calf was submitted for examina...,A six-month-old calf was submitted for examina...,A six-month-old calf was submitted for examina...
2,Planning authorities should provide alternativ...,Planning authorities should provide alternativ...,Planning authorities should provide alternativ...
3,"As the machine develops, the forms we use to r...",As the machine develops the forms we use to re...,"As the machine develops, the forms we use to r..."
4,"As mentioned, first impressions can be mislead...","As mentioned, first impressions can be mislead...","As mentioned first, impressions can be mislead..."


In [6]:
predictions = df['prediction'].tolist()
references = df['gt'].tolist()

compute_translation_scores(predictions, references)

{'muril_score': 0.9779256262161113, 'labse_cosine': 0.9862358570098877}

## IndicTrans2 Sentences Meant

In [10]:
file_name = "sent_meant_outputs.csv"
mode = "original_meant"

In [11]:
df = pd.read_csv(file_name)

predictions = df['prediction'].tolist()
references = df['gemini'].tolist()

scores = compute_ref_based_scores(predictions, references)
labse[mode] = scores['labse_ref_mt']
muril[mode] = scores['muril_ref_mt']

print(f"LaBSE and MuRIL Scores for {mode}:")
print(scores)

LaBSE and MuRIL Scores for original_meant:
{'labse_ref_mt': 0.9312928307939459, 'muril_ref_mt': 0.9983996759962153}


## Cadence Approach1

In [7]:
file_name = "cadence_outputs.csv"
mode = "cadence"

In [10]:
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,src,prediction_mar,gt_mar
0,Chanting the choir raised the volume as the ce...,उत्सवी गायकांनी प्रार्थनेचा उच्चार केल्याने गा...,"धर्मगुरू प्रार्थना म्हणत असताना, घोष करणाऱ्या ..."
1,A six-month-old calf was submitted for examina...,जन्मानंतर लगेचच अस्तित्वात असलेल्या चारही पाया...,तपासणीसाठी आणलेल्या सहा महिन्यांच्या एका वासरा...
2,Planning authorities should provide alternativ...,नियोजन अधिकाऱ्यांनी छोट्या व्यवसायांसाठी पर्या...,नियोजन प्राधिकरणांनी लहान व्यवसायांसाठी पर्याय...
3,As the machine develops the forms we use to re...,जसजसे मशीन विकसित होईल तसतसे आम्ही मागील प्रकल...,"जसजशी यंत्रणा विकसित होईल, तसतसे मागील प्रकल्प..."
4,"As mentioned, first impressions can be mislead...","नमूद केल्याप्रमाणे, पहिली छाप दिशाभूल करणारी अ...","आधी सांगितल्याप्रमाणे, पहिली छाप फसवी असू शकते."


In [9]:
predictions = df['prediction'].tolist()
references = df['gt'].tolist()

compute_translation_scores(predictions, references)

KeyError: 'prediction'

## Cadence Punctuation Restoration

In [12]:
file_name = "approach1_eng_to_eng_cadence_outputs_punct_restor_data.csv"
mode = "cadence_approach1"

In [13]:
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,prediction,src,gt
0,Chanting the choir raised the volume as the ce...,Chanting the choir raised the volume as the ce...,"Chanting, the choir raised the volume as the c..."
1,A six-month-old calf was submitted for examina...,A six-month-old calf was submitted for examina...,A six-month-old calf was submitted for examina...
2,Planning authorities should provide alternativ...,Planning authorities should provide alternativ...,Planning authorities should provide alternativ...
3,As the machine develops the forms we use to re...,As the machine develops the forms we use to re...,"As the machine develops, the forms we use to r..."
4,"As mentioned, first impressions can be mislead...","As mentioned, first impressions can be mislead...","As mentioned first, impressions can be mislead..."


In [14]:
predictions = df['prediction'].tolist()
references = df['gt'].tolist()

compute_translation_scores(predictions, references)

{'muril_score': 0.9736823285067523, 'labse_cosine': 0.9833676218986511}

## DeepSeek

In [15]:
file_name = "shalaka_deepseek_outputs.csv"
mode = "deepseek"

In [16]:
df = pd.read_csv(file_name)

predictions = df['prediction'].tolist()
references = df['gt'].tolist()

scores = compute_ref_based_scores(predictions, references)
labse[mode] = scores['labse_ref_mt']
muril[mode] = scores['muril_ref_mt']

print(f"LaBSE and MuRIL Scores for {mode}:")
print(scores)

LaBSE and MuRIL Scores for deepseek:
{'labse_ref_mt': 0.9197003432997951, 'muril_ref_mt': 0.9981370844222881}


In [19]:
for key, val in labse.items():
    print(f"{key} : {val:.4f}")

cadence : 0.9210
cadence_approach1 : 0.9834
deepseek : 0.9197


In [17]:
for key, val in muril.items():
    print(f"{key} : {val:.6f}")

cadence : 0.998364
cadence_approach1 : 0.999806
deepseek : 0.998137
