In [1]:
# # ============================================================================
# # CELL 4: Authenticate HuggingFace
# # ============================================================================

import os

# SECURITY: a literal HuggingFace token used to be hardcoded in this cell.
# Anything committed to a notebook must be treated as leaked -- revoke that
# token at https://huggingface.co/settings/tokens and create a new one.
# The token is now resolved only from Kaggle secrets or from the environment.
HF_TOKEN = None

try:
    # Imported inside the `try` so the cell still runs when `kaggle_secrets`
    # is not importable (e.g. when the notebook is executed outside Kaggle).
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HUGGINGFACE_TOKEN")
    print("\n✓ Loaded HF token from Kaggle secrets")
except Exception as exc:  # `Exception`, not bare `except`: Ctrl-C still interrupts
    # Only the exception type is printed so no secret material can end up
    # in the cell output.
    HF_TOKEN = os.environ.get("HF_TOKEN")
    print(
        f"\n⚠️  Kaggle secret 'HUGGINGFACE_TOKEN' unavailable ({type(exc).__name__}); "
        "falling back to the HF_TOKEN environment variable"
    )

if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN
else:
    # `os.environ[...] = None` would raise TypeError, so leave the variable
    # unset and let later downloads proceed unauthenticated.
    print("⚠️  No HuggingFace token found; continuing without authentication")



⚠️  Using hardcoded token (not recommended for production)


In [2]:
# ============================================================
# STEP 0: Install & Imports (run once per session)
# ============================================================
!pip install -q transformers IndicTransToolkit sentencepiece accelerate

import torch
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

# ============================================================
# STEP 1: Device & Model Setup
# ============================================================
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# Translation direction used for back-translation: Hindi -> English.
src_lang, tgt_lang = "hin_Deva", "eng_Latn"
model_name = "ai4bharat/indictrans2-indic-en-1B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Arguments shared by both loading attempts below.
# Half precision is used only on the GPU; the CPU path stays in float32.
_model_load_kwargs = dict(
    trust_remote_code=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)

# FlashAttention-2 is an optional speed-up. It is requested only on CUDA, and
# if the library rejects it the model is loaded again with the default
# attention implementation, so nobody has to edit this cell by hand.
# NOTE(review): the exception types raised for an unavailable flash-attn
# backend are assumed to be ImportError/ValueError -- confirm against the
# installed transformers version.
model = None
if DEVICE == "cuda":
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            attn_implementation="flash_attention_2",
            **_model_load_kwargs,
        )
    except (ImportError, ValueError) as exc:
        print(f"flash_attention_2 unavailable ({exc}); using the default attention implementation")

if model is None:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **_model_load_kwargs)

model = model.to(DEVICE)

# Inference only: switch the model to eval mode.
model.eval()
ip = IndicProcessor(inference=True)

# ============================================================
# STEP 2: Load Dataset & Sample Rows
# ============================================================
# >>> CHANGE THIS to your **actual file name** inside the dataset folder
# e.g. "/kaggle/input/hindidataset/data.csv"  or  ".../train.csv" etc.
# Path of the CSV inside the attached Kaggle dataset folder; edit this to
# match the actual file name.
FILE_PATH = "/kaggle/input/hindidataset/translated_dataset.csv"

df = pd.read_csv(FILE_PATH)

# Show what the file actually contains before validating it.
print("Columns:", df.columns)

required_cols = [
    "input", "input_hi",
    "instruction", "instruction_hi",
    "output", "output_hi",
]

# Stop at the first required column the CSV does not provide.
_absent = [name for name in required_cols if name not in df.columns]
if _absent:
    col = _absent[0]
    raise ValueError(f"Missing required column: {col}")

# Evaluate on at most N rows, remembering each row's position in the full file.
N = 2000
df["orig_index"] = df.index
_too_many_rows = len(df) > N
df_sample = (
    df.sample(N, random_state=42) if _too_many_rows else df
).reset_index(drop=True)

print(f"Total rows: {len(df)}, Using rows: {len(df_sample)}")

# ============================================================
# STEP 3: Helper – Back-Translation Function (Hindi -> English)
# ============================================================
from tqdm import tqdm

def backtranslate_hindi_to_english(sentences, batch_size=16, max_length=256):
    """Back-translate Hindi sentences into English, one batch at a time.

    Uses the notebook-level ``ip``, ``tokenizer``, ``model``, ``DEVICE``,
    ``src_lang`` and ``tgt_lang`` objects, which must exist before the call.

    Args:
        sentences: Iterable of source sentences. Any element that is not a
            ``str`` (for example a NaN read by pandas) is replaced by ``""``.
        batch_size: Number of sentences per ``model.generate`` call; must be
            at least 1.
        max_length: Forwarded to ``model.generate`` as ``max_length``. It is
            not passed to the tokenizer, whose truncation uses its own limit.

    Returns:
        list: Post-processed translations, one per input sentence, in the
        same order as ``sentences``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail with a clear message instead of a ZeroDivisionError further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    sentences = [s if isinstance(s, str) else "" for s in sentences]
    all_outputs = []

    # Nothing to translate: skip the progress bar and the model entirely.
    if not sentences:
        print("✔ Back-translation complete!")
        return all_outputs

    total = len(sentences)
    num_batches = (total + batch_size - 1) // batch_size

    # The context manager closes the bar even when a batch raises.
    with tqdm(total=num_batches, desc="Back-translating", ncols=100) as pbar:
        for start in range(0, total, batch_size):
            batch_sents = sentences[start:start + batch_size]

            batch_proc = ip.preprocess_batch(
                batch_sents,
                src_lang=src_lang,
                tgt_lang=tgt_lang,
            )

            inputs = tokenizer(
                batch_proc,
                truncation=True,
                padding="longest",
                return_tensors="pt",
                return_attention_mask=True,
            ).to(DEVICE)

            # Inference only: no gradients are needed.
            with torch.no_grad():
                generated_tokens = model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=0,
                    max_length=max_length,
                    num_beams=5,
                    num_return_sequences=1,
                )

            decoded = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

            decoded = ip.postprocess_batch(decoded, lang=tgt_lang)
            all_outputs.extend(decoded)

            pbar.update(1)

    print("✔ Back-translation complete!")
    return all_outputs


# ============================================================
# STEP 4: Apply Back-Translation to input_hi, instruction_hi & output_hi
# ============================================================
# One entry per Hindi column to back-translate:
# (progress message, source column, destination column).
_BACKTRANSLATION_JOBS = (
    ("Back-translating input_hi ...", "input_hi", "input_hi_backtrans"),
    ("Back-translating instruction_hi ...", "instruction_hi", "instruction_hi_backtrans"),
    ("Back-translating output_hi ...", "output_hi", "output_hi_backtrans"),
)

for _banner, _src_col, _dst_col in _BACKTRANSLATION_JOBS:
    print(_banner)
    _hindi_texts = df_sample[_src_col].tolist()
    # batch_size can be adjusted if needed
    df_sample[_dst_col] = backtranslate_hindi_to_english(_hindi_texts, batch_size=16)


# Quick sanity check: originals next to their back-translations.
_preview_cols = [
    "orig_index",
    "input", "input_hi", "input_hi_backtrans",
    "instruction", "instruction_hi", "instruction_hi_backtrans",
]
print(df_sample[_preview_cols].head())


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m546.1/546.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB

2025-11-26 13:03:37.489017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764162217.688717      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764162217.746947      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/759k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Columns: Index(['input', 'output', 'instruction', 'output_token_count',
       'instruction_hi', 'input_hi', 'output_hi'],
      dtype='object')
Total rows: 16936, Using rows: 2000
Back-translating input_hi ...


Back-translating: 100%|███████████████████████████████████████████| 125/125 [02:02<00:00,  1.02it/s]


✔ Back-translation complete!
Back-translating instruction_hi ...


Back-translating: 100%|███████████████████████████████████████████| 125/125 [01:43<00:00,  1.21it/s]


✔ Back-translation complete!
Back-translating output_hi ...


Back-translating: 100%|███████████████████████████████████████████| 125/125 [47:11<00:00, 22.65s/it]

✔ Back-translation complete!
   orig_index                                              input  \
0        3997                         Capital Gains in an S Corp   
1        9870  Pay off car loan entirely or leave $1 until th...   
2        1889  Will a credit card issuer cancel an account if...   
3       12031  What happens to people without any retirement ...   
4        8991          View asset/holdings breakdown within fund   

                                            input_hi  \
0                           एस कॉर्प में पूंजीगत लाभ   
1  कार लोन का पूरी तरह से भुगतान करें या लोन अवधि...   
2  क्या क्रेडिट कार्ड जारीकर्ता किसी खाते को रद्द...   
3  बिना किसी सेवानिवृत्ति बचत के लोगों का क्या हो...   
4  निधि के भीतर परिसंपत्ति/होल्डिंग्स का टूटना देखें   

                                  input_hi_backtrans  \
0                            Capital Gains in S Corp   
1  Pay off the car loan in full or leave $1 at th...   
2  Will the credit card issuer cancel an account ...   
3




In [3]:
# Write the sampled rows, including the back-translation columns, to the
# Kaggle working directory without the DataFrame index.
OUTPUT_PATH = "/kaggle/working/backtranslation_results.csv"
df_sample.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print("Saved to:", OUTPUT_PATH)


Saved to: /kaggle/working/backtranslation_results.csv


In [3]:
!pip install -q sentence-transformers

import numpy as np
from sentence_transformers import SentenceTransformer
import torch

# Choose the device for the sentence-embedding model.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device for embeddings:", DEVICE)

# Sentence-embedding model used to compare original and back-translated text.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(embed_model_name, device=DEVICE)


Using device for embeddings: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
def compute_embeddings(texts, batch_size=64):
    """Encode texts with the notebook-level ``embedder``.

    Args:
        texts: List of strings; any non-string entry (e.g. NaN) is encoded
            as the empty string.
        batch_size: Batch size forwarded to ``embedder.encode``.

    Returns:
        numpy array of shape (len(texts), dim) holding normalized embeddings.
    """
    # Substitute "" for NaNs and any other non-string values.
    cleaned = []
    for item in texts:
        cleaned.append(item if isinstance(item, str) else "")

    encode_options = dict(
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        # normalized vectors make cosine similarity equal to the dot product
        normalize_embeddings=True,
    )
    return embedder.encode(cleaned, **encode_options)


In [5]:
# Encode the original English input and its back-translation (in that order).
emb_input = compute_embeddings(df_sample["input"].tolist())
emb_input_bt = compute_embeddings(df_sample["input_hi_backtrans"].tolist())

# Row-wise dot product; with normalized embeddings this is the cosine similarity.
cos_sim_input = (emb_input * emb_input_bt).sum(axis=1)
df_sample["sim_input"] = cos_sim_input

print("Input similarity stats:")
print(df_sample["sim_input"].describe())


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Input similarity stats:
count    2000.000000
mean        0.945073
std         0.070747
min         0.345920
25%         0.923802
50%         0.970547
75%         0.994579
max         1.000000
Name: sim_input, dtype: float64


In [6]:
# Encode the original English instruction and its back-translation (in that order).
emb_instr = compute_embeddings(df_sample["instruction"].tolist())
emb_instr_bt = compute_embeddings(df_sample["instruction_hi_backtrans"].tolist())

# Row-wise dot product; with normalized embeddings this is the cosine similarity.
cos_sim_instr = (emb_instr * emb_instr_bt).sum(axis=1)
df_sample["sim_instruction"] = cos_sim_instr

print("Instruction similarity stats:")
print(df_sample["sim_instruction"].describe())


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Instruction similarity stats:
count    2000.000000
mean        0.884392
std         0.096992
min         0.706102
25%         0.853444
50%         0.922874
75%         0.957251
max         0.974493
Name: sim_instruction, dtype: float64


In [7]:
# Encode the original English output and its back-translation (in that order).
emb_output = compute_embeddings(df_sample["output"].tolist())
emb_output_bt = compute_embeddings(df_sample["output_hi_backtrans"].tolist())

# Row-wise dot product; with normalized embeddings this is the cosine similarity.
cos_sim_output = (emb_output * emb_output_bt).sum(axis=1)
df_sample["sim_output"] = cos_sim_output

print("Output similarity stats:")
print(df_sample["sim_output"].describe())


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Output similarity stats:
count    2000.000000
mean        0.939458
std         0.051087
min         0.094861
25%         0.922519
50%         0.950360
75%         0.971436
max         1.000000
Name: sim_output, dtype: float64


In [8]:
# Destination of the final report inside the Kaggle working directory.
out_path = "/kaggle/working/translation_similarity_results_all.csv"

# Per field: original text, Hindi text, back-translation, similarity score.
_input_cols = ["input", "input_hi", "input_hi_backtrans", "sim_input"]
_instruction_cols = ["instruction", "instruction_hi", "instruction_hi_backtrans", "sim_instruction"]
_output_cols = ["output", "output_hi", "output_hi_backtrans", "sim_output"]

cols_to_save = ["orig_index", *_input_cols, *_instruction_cols, *_output_cols]

_report = df_sample[cols_to_save]
_report.to_csv(out_path, index=False)
print("Saved similarity results to:", out_path)


Saved similarity results to: /kaggle/working/translation_similarity_results_all.csv
