In [None]:
!pip install transformers



In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Checkpoint shared by the tokenizer and the seq2seq model (Facebook M2M100)
model_name = "facebook/m2m100_418M"

# Tokenizer and model weights are both pulled from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wrap the already-loaded model/tokenizer pair in a translation pipeline.
# NOTE(review): `translator` is not referenced by the cells visible here, which
# call `model.generate` directly — confirm whether it is still needed.
translator = pipeline(task="translation", tokenizer=tokenizer, model=model)

# Input (English) and output (Hindi / round-tripped English) locations
input_file = "/content/gemma-2b-it_recipes.txt"
translated_file = "translated_recipe_hi.txt"
back_translated_file = "back_translated_recipe.txt"

# Pull every line of the English source into memory; line endings are kept
# here and stripped later, when each line is translated.
with open(input_file, encoding="utf-8") as f:
    recipes = list(f)

# Translate and back-translate
# Each non-blank input line is translated EN -> HI and then HI -> EN. The two
# result lists stay index-aligned: entry i of each list comes from the same
# source line.
translated_recipes = []
back_translated_recipes = []

for recipe in recipes:
    source_text = recipe.strip()
    if not source_text:
        # Blank lines carry no content to translate; skip them entirely.
        continue

    # Step 1: English → Hindi
    # M2M100 prefixes the encoder input with a source-language token taken from
    # `tokenizer.src_lang`, so it must be set before every encode call.
    tokenizer.src_lang = "en"
    encoded_en = tokenizer(source_text, return_tensors="pt")
    translated_ids = model.generate(**encoded_en, forced_bos_token_id=tokenizer.get_lang_id("hi"))
    translated = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)[0]
    translated_recipes.append(translated)

    # Step 2: Hindi → English (Back Translation)
    # Switch the source language; otherwise the Hindi text would be tagged as
    # English on the encoder side and the back-translation quality suffers.
    tokenizer.src_lang = "hi"
    encoded_hi = tokenizer(translated, return_tensors="pt")
    back_translated_ids = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("en"))
    back_translated = tokenizer.batch_decode(back_translated_ids, skip_special_tokens=True)[0]
    back_translated_recipes.append(back_translated)

    # Print for monitoring
    print(f"\nOriginal: {source_text}")
    print(f"Translated (EN → HI): {translated}")
    print(f"Back-Translated (HI → EN): {back_translated}")

# Persist both result lists, one entry per line: first the Hindi translations,
# then the back-translated English text.
for out_path, out_lines in (
    (translated_file, translated_recipes),
    (back_translated_file, back_translated_recipes),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

# Tell the user where the two files ended up
print(f"\nTranslation and back-translation completed! Files saved:\n- Hindi Recipes: '{translated_file}'\n- Back-Translated English Recipes: '{back_translated_file}'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Device set to use cpu



Original: Recipe 1:
Translated (EN → HI): नुस्खा 1 :
Back-Translated (HI → EN): The recipe 1 :

Original: **The Dish:**
Translated (EN → HI): * डिश के बारे में: *
Back-Translated (HI → EN): About the dish: *

Original: **Spicy Carrot Bread with Strawberry Glaze**
Translated (EN → HI): ** स्ट्रॉबेरी ग्लेज़ के साथ स्वादिष्ट कार्बोहाइड्रेट रोटी**
Back-Translated (HI → EN): ** delicious carbohydrate bread with strawberry glaze**

Original: **Ingredients:**
Translated (EN → HI): * सामग्री : *
Back-Translated (HI → EN): • Materials : *

Original: * 1 loaf (12 slices) bread, sliced into 1/2-inch thick slices
Translated (EN → HI): * 1 लहसुन (12 स्लाइड्स) रोटी, 1/2 इंच मोटी स्लाइड्स में स्लाइड
Back-Translated (HI → EN): * 1 cheese (12 slides) bread, slide in 1/2 inch thick slides

Original: * 1 large carrot, diced
Translated (EN → HI): * 1 बड़ा कार्बोहाइड्रेट, कटा हुआ
Back-Translated (HI → EN): * 1 large carbohydrate, cut

Original: * 1/2 cup chocolate chips
Translated (EN → HI): 1/2 कप चॉकलेट

In [None]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [None]:
import sacrebleu
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Locations of the reference text and of its round-tripped counterpart
original_file = "/content/gemma-2b-it_recipes.txt"  # Original English recipes
back_translated_file = "back_translated_recipe.txt"  # Back-translated English recipes


def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the UTF-8 text file at `path`."""
    with open(path, "r", encoding="utf-8") as handle:
        return [text for text in (raw.strip() for raw in handle) if text]


original_recipes = _read_nonblank_lines(original_file)
back_translated_recipes = _read_nonblank_lines(back_translated_file)

# The metrics below pair the two lists by position, so the lengths must agree
assert len(original_recipes) == len(back_translated_recipes), "Mismatch in number of lines!"

# Corpus-level metrics: the back-translations act as hypotheses and the
# original lines as the single reference stream.
hypotheses = back_translated_recipes
references = [original_recipes]

# BLEU
bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(f"\nBLEU Score: {bleu.score:.2f}")

# chrF
chrf = sacrebleu.corpus_chrf(hypotheses, references)
print(f"CHRF Score: {chrf.score:.2f}")

# Cosine Similarity Evaluation using LaBSE
# NOTE(review): this rebinds the notebook-global `model`, which the translation
# cell also uses for the M2M100 network; re-running that cell after this one
# would pick up the sentence encoder instead.
model = SentenceTransformer("sentence-transformers/LaBSE")

def get_similarity(text1, text2):
    """Return the cosine similarity between the LaBSE embeddings of two texts.

    Args:
        text1: First sentence.
        text2: Second sentence.

    Returns:
        A Python float, computed as ``1 - scipy cosine distance`` of the two
        sentence embeddings.
    """
    # Encode both texts in a single call and request NumPy output: SciPy's
    # cosine() needs host arrays, and a tensor that lives on an accelerator
    # cannot be converted implicitly.
    emb1, emb2 = model.encode([text1, text2], convert_to_numpy=True)
    return float(1 - cosine(emb1, emb2))

# Score every original line against the back-translation at the same position
similarities = []
for orig, back in zip(original_recipes, back_translated_recipes):
    similarities.append(get_similarity(orig, back))

# Arithmetic mean over all line pairs
average_similarity = sum(similarities) / len(similarities)

print(f"Average Cosine Similarity: {average_similarity:.4f}")



BLEU Score: 34.17
CHRF Score: 53.83


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Average Cosine Similarity: 0.8364


In [None]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install --upgrade transformers

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl (178.7 MB)
[2K   

Collecting transformers
  Downloading transformers-4.50.2-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.50.2-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.0
    Uninstalling transformers-4.50.0:
[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [None]:
!pip install laserembeddings

[0mCollecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl.metadata (5.1 kB)
Collecting numpy<2.0.0,>=1.15.4 (from laserembeddings)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses==0.0.35 (from laserembeddings)
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m859.8/859.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting subword-nmt<0.4.0,>=0.3.6 (from laserembeddings)
  Downloading subword_nmt-0.3.8-py3-none-any.whl.metadata (9.2 kB)
Collecting torch<2.0.0,>=1.0.1.post2 (from laserembeddings)
  Downloading torch-1.13.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting transliterate==1.10.2 (from laserembeddings)
  Download

In [None]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install --upgrade transformers

[0mLooking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Using cached https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Using cached https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl (178.7 MB)
Using cached https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp311-cp311-linux_x86_64.whl (1.8 MB)
Using cached https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl (1.7 MB)
[0mInstalling collected packages: torch, torchvision, torchaudio
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. T

In [None]:
# Download the pre-trained LASER files into the installed laserembeddings
# package's data directory; the LASER embedding cell below depends on them.
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.11/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sentence_transformers import SentenceTransformer
from laserembeddings import Laser
from sklearn.metrics.pairwise import cosine_similarity

# Load Recipe Texts
def load_text(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of strings, one per line that still has content after
        surrounding whitespace is stripped, in file order.
    """
    kept = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            cleaned = raw.strip()
            if cleaned:
                kept.append(cleaned)
    return kept

# Read the English source lines and their Hindi translations (English first)
english_recipes, hindi_recipes = (
    load_text(path)
    for path in ("/content/gemma-2b-it_recipes.txt", "/content/translated_recipe_hi.txt")
)

# Lines are paired by position further down, so both files must be equally long
assert len(english_recipes) == len(hindi_recipes), "Mismatch in number of recipes!"

# ------------------- LaBSE -------------------
def get_labse_embeddings(sentences, batch_size=32):
    """Embed sentences with the `sentence-transformers/LaBSE` checkpoint.

    The sentence vector is the mean of the last-layer token states, taken over
    real tokens only. Padding positions are excluded through the attention
    mask, so a sentence's embedding does not depend on how much padding the
    longest sentence in its batch forced onto it.

    Args:
        sentences: Sequence of strings to embed.
        batch_size: Number of sentences per forward pass. Keeps memory bounded
            instead of pushing the whole corpus through the model at once.

    Returns:
        A NumPy array with one row per input sentence and one column per
        hidden dimension of the model.
    """
    model_name = "sentence-transformers/LaBSE"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    sentences = list(sentences)
    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

    if not pooled_batches:
        # No sentences: return an empty matrix with the right number of columns.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return torch.cat(pooled_batches, dim=0).numpy()

# ------------------- XLM-R -------------------
def get_xlmr_embeddings(sentences, batch_size=16):
    """Embed sentences with the `xlm-roberta-large` checkpoint.

    The sentence vector is the mean of the last-layer token states, taken over
    real tokens only. Padding positions are excluded through the attention
    mask; averaging them in would pull every sentence in a padded batch toward
    the same padding representation.

    Args:
        sentences: Sequence of strings to embed.
        batch_size: Number of sentences per forward pass. Keeps memory bounded
            instead of pushing the whole corpus through the model at once.

    Returns:
        A NumPy array with one row per input sentence and one column per
        hidden dimension of the model.
    """
    model_name = "xlm-roberta-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    sentences = list(sentences)
    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

    if not pooled_batches:
        # No sentences: return an empty matrix with the right number of columns.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return torch.cat(pooled_batches, dim=0).numpy()

# ------------------- LASER -------------------
def get_laser_embeddings(sentences, lang="en"):
    """Embed sentences with LASER via the `laserembeddings` package.

    Args:
        sentences: List of strings to embed.
        lang: Language code handed to `Laser.embed_sentences`. LASER does not
            detect the language itself; the code selects the tokenization
            applied before encoding, so pass the language the sentences are
            actually written in (for example "hi" for Hindi). Defaults to
            "en", the value this helper previously hard-coded.

    Returns:
        Whatever `Laser.embed_sentences` returns for the given sentences
        (one embedding row per sentence).
    """
    laser = Laser()
    return laser.embed_sentences(sentences, lang=lang)

# ------------------- Compute Similarity -------------------
def compute_similarity(embeddings1, embeddings2):
    """Cosine similarity between corresponding rows of two embedding matrices.

    Row i of the result compares row i of `embeddings1` with row i of
    `embeddings2`. Only these paired similarities are computed, so memory
    stays linear in the number of rows instead of materialising the full
    all-pairs matrix and reading its diagonal.

    Args:
        embeddings1: 2-D array-like, one embedding per row.
        embeddings2: 2-D array-like with the same number of columns.

    Returns:
        1-D NumPy array of length ``min(len(embeddings1), len(embeddings2))``.
        A pair that contains an all-zero vector scores 0.0.

    Raises:
        ValueError: If either input is not 2-D or the column counts differ.
    """
    matrices = []
    for raw in (embeddings1, embeddings2):
        mat = np.asarray(raw)
        if mat.ndim != 2:
            raise ValueError(f"Expected a 2-D array of embeddings, got {mat.ndim} dimension(s)")
        if not np.issubdtype(mat.dtype, np.floating):
            mat = mat.astype(np.float64)
        matrices.append(mat)
    left, right = matrices
    if left.shape[1] != right.shape[1]:
        raise ValueError(
            f"Embedding dimensions differ: {left.shape[1]} != {right.shape[1]}"
        )

    # Match the length of the diagonal of a rectangular similarity matrix.
    pair_count = min(left.shape[0], right.shape[0])
    left, right = left[:pair_count], right[:pair_count]

    dots = np.einsum("ij,ij->i", left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Leave the score at 0 wherever a norm is zero instead of dividing by zero.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

# ------------------- Run All Models -------------------
# For each encoder: embed the English lines, then the Hindi lines, then score
# the pairs that share a position.
both_languages = (english_recipes, hindi_recipes)

labse_en, labse_hi = (get_labse_embeddings(texts) for texts in both_languages)
labse_sim = compute_similarity(labse_en, labse_hi)

xlmr_en, xlmr_hi = (get_xlmr_embeddings(texts) for texts in both_languages)
xlmr_sim = compute_similarity(xlmr_en, xlmr_hi)

# NOTE(review): the Hindi lines are embedded with whatever language code
# get_laser_embeddings applies by default ("en" in this notebook) — confirm
# that is acceptable for Hindi input.
laser_en, laser_hi = (get_laser_embeddings(texts) for texts in both_languages)
laser_sim = compute_similarity(laser_en, laser_hi)

# ------------------- Display Results -------------------
# One report entry per aligned line pair, with the score from each encoder
per_line_rows = zip(english_recipes, hindi_recipes, labse_sim, xlmr_sim, laser_sim)
for idx, (en_text, hi_text, labse_score, xlmr_score, laser_score) in enumerate(per_line_rows):
    print(f"🔹 Recipe {idx+1}")
    print(f"   English: {en_text}")
    print(f"   Hindi: {hi_text}")
    print(f"   🔹 LaBSE Similarity: {labse_score:.4f}")
    print(f"   🔹 XLM-R Similarity: {xlmr_score:.4f}")
    print(f"   🔹 LASER Similarity: {laser_score:.4f}")
    print("-" * 50)

# ------------------- Summary -------------------
# Mean score per encoder across all line pairs
print("\n✅ Average Similarity Scores:")
for label, scores in (("LaBSE", labse_sim), ("XLM-R", xlmr_sim), ("LASER", laser_sim)):
    print(f"   🔹 {label}: {np.mean(scores):.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

🔹 Recipe 1
   English: Recipe 1:
   Hindi: नुस्खा 1 :
   🔹 LaBSE Similarity: 0.9398
   🔹 XLM-R Similarity: 0.9960
   🔹 LASER Similarity: 0.8767
--------------------------------------------------
🔹 Recipe 2
   English: **The Dish:**
   Hindi: * डिश के बारे में: *
   🔹 LaBSE Similarity: 0.8229
   🔹 XLM-R Similarity: 0.9961
   🔹 LASER Similarity: 0.8010
--------------------------------------------------
🔹 Recipe 3
   English: **Spicy Carrot Bread with Strawberry Glaze**
   Hindi: ** स्ट्रॉबेरी ग्लेज़ के साथ स्वादिष्ट कार्बोहाइड्रेट रोटी**
   🔹 LaBSE Similarity: 0.8793
   🔹 XLM-R Similarity: 0.9979
   🔹 LASER Similarity: 0.8314
--------------------------------------------------
🔹 Recipe 4
   English: **Ingredients:**
   Hindi: * सामग्री : *
   🔹 LaBSE Similarity: 0.8864
   🔹 XLM-R Similarity: 0.9951
   🔹 LASER Similarity: 0.8545
--------------------------------------------------
🔹 Recipe 5
   English: * 1 loaf (12 slices) bread, sliced into 1/2-inch thick slices
   Hindi: * 1 लहसुन (12 स्ल