# 1. Installing Dependencies

In [None]:
!pip install transformers sentencepiece langdetect -q


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/981.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


### `transformers`: for using MarianMT translation models

### `sentencepiece`: needed by some translation tokenizers

### `langdetect`: for automatically detecting the input text's language

In [None]:
# Importing libraries
from langdetect import detect
from transformers import MarianMTModel, MarianTokenizer


In [None]:
# Input text whose language we want to identify
text = "आप कैसे हैं?"

# langdetect's algorithm is non-deterministic by default, which shows up most
# on short inputs like this one. Fixing the seed makes repeated runs of this
# cell report the same language code.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Detect the source language (detect() returns a language code string)
src_lang = detect(text)
print("Detected Language Code:", src_lang)


Detected Language Code: hi


### "hi" is the ISO 639-1 language code for Hindi.

# 2. Language Setup and Supported Translation Models

In [None]:
from transformers import MarianTokenizer, MarianMTModel

# Human-readable language name -> language code used in the model names
language_codes = dict(
    English="en",
    Hindi="hi",
    French="fr",
    German="de",
    Spanish="es",
)

# Directions that have a direct Helsinki-NLP MarianMT checkpoint.
# Every checkpoint name follows the pattern "Helsinki-NLP/opus-mt-<src>-<tgt>",
# so the mapping is derived from the list of (source, target) pairs.
available_pairs = {
    (src, tgt): f"Helsinki-NLP/opus-mt-{src}-{tgt}"
    for src, tgt in (
        ("hi", "en"),
        ("en", "hi"),
        ("en", "fr"),
        ("fr", "en"),
        ("en", "de"),
        ("de", "en"),
        ("en", "es"),
        ("es", "en"),
    )
}




In [32]:
# Already-loaded (tokenizer, model) pairs, keyed by Hugging Face model name.
# Re-running this cell resets the cache.
_MODEL_CACHE = {}


def load_model(model_name):
    """Return the MarianMT tokenizer and model for ``model_name``.

    Instantiating a checkpoint with ``from_pretrained`` is expensive, and
    ``run_translation`` calls this function on every translation (twice for a
    pivot translation), so each ``model_name`` is loaded once and then served
    from ``_MODEL_CACHE``.

    Args:
        model_name: Hugging Face model identifier, e.g. a value of
            ``available_pairs``.

    Returns:
        A ``(tokenizer, model)`` tuple. Repeated calls with the same name
        return the same objects.
    """
    if model_name not in _MODEL_CACHE:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _MODEL_CACHE[model_name] = (tokenizer, model)
    return _MODEL_CACHE[model_name]

In [33]:
# Run one piece of text through the named MarianMT checkpoint
def run_translation(text, model_name):
    """Translate ``text`` with the MarianMT model called ``model_name``.

    The tokenizer and model are obtained from ``load_model``. The text is
    tokenized into padded PyTorch tensors, passed to ``generate``, and the
    first generated sequence is decoded with special tokens stripped.
    """
    marian_tokenizer, marian_model = load_model(model_name)
    encoded = marian_tokenizer(text, return_tensors="pt", padding=True)
    generated = marian_model.generate(**encoded)
    first_sequence = generated[0]
    return marian_tokenizer.decode(first_sequence, skip_special_tokens=True)

In [34]:
# Main translation logic handles direct and two-step translations based on available models

def translate_text(text, src_lang_name, tgt_lang_name):
    """Translate ``text`` between two languages named in ``language_codes``.

    Routing, in order:
      1. Either language name is unknown -> an explanatory message.
      2. Source and target are the same language -> ``text`` is returned
         unchanged (no model is loaded, and the text is not degraded by a
         needless round trip through English).
      3. A direct model exists in ``available_pairs`` -> one translation.
      4. Neither side is English -> pivot: source -> English -> target,
         provided both legs exist in ``available_pairs``.
      5. Otherwise -> an explanatory message.

    Args:
        text: The text to translate.
        src_lang_name: Source language name, a key of ``language_codes``.
        tgt_lang_name: Target language name, a key of ``language_codes``.

    Returns:
        The translated text, or a human-readable message when the request
        cannot be served. Problems are reported through the return value,
        not by raising.
    """
    src_lang = language_codes.get(src_lang_name)
    tgt_lang = language_codes.get(tgt_lang_name)

    if not src_lang or not tgt_lang:
        return "One or both selected languages are not supported."

    # Nothing to translate when both sides are the same language.
    if src_lang == tgt_lang:
        return text

    direct_model = available_pairs.get((src_lang, tgt_lang))
    if direct_model is not None:
        print(f"Using direct model: {direct_model}")
        return run_translation(text, direct_model)

    # Pivoting through English only makes sense when neither side is English.
    if src_lang == "en" or tgt_lang == "en":
        return "Sorry, no model available for that translation direction."

    # Use two-step: source -> English -> target
    to_en_key = (src_lang, "en")
    from_en_key = ("en", tgt_lang)
    if to_en_key not in available_pairs or from_en_key not in available_pairs:
        return "Two-step translation path not available."

    print(f" Using two-step: {to_en_key} then {from_en_key}")
    english_text = run_translation(text, available_pairs[to_en_key])
    return run_translation(english_text, available_pairs[from_en_key])

In [None]:
# Example usage:
# print(translate_text("मुझे खाना पसंद है", "Hindi", "French"))

In [None]:
# Demo: English -> Hindi, a direction that has a direct entry in available_pairs
print(translate_text("I hate books", "English", "Hindi"))

🔁 Using direct model: Helsinki-NLP/opus-mt-en-hi




मैं पुस्तकों से नफरत है


In [26]:
!pip install streamlit pyngrok transformers sentencepiece langdetect -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
# SECURITY: never paste the ngrok authtoken into the notebook. It is saved with
# the file and leaks to anyone who can read or clone it. The token that used to
# be hard-coded in this cell must be treated as compromised: revoke it in the
# ngrok dashboard and issue a new one.
import os
from getpass import getpass

from pyngrok import ngrok

# Prefer the NGROK_AUTHTOKEN environment variable; otherwise prompt for the
# token without echoing it into the cell output.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

# Stores the token in ngrok's configuration file.
ngrok.set_auth_token(ngrok_authtoken)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


# 3. Launch Streamlit with ngrok

In [36]:
# Launching Streamlit App via Ngrok Tunnel for Google Colab
# Write the entire Streamlit app code to a temporary Python file
# This is required because Streamlit runs apps from .py scripts,
# and Colab doesn't support native Streamlit execution.
# We'll run this script using ngrok to serve the app publicly.


from pyngrok import ngrok
import os

# Write your Streamlit code to a temporary file
with open("main_app.py", "w") as f:
    f.write("""
import streamlit as st
from langdetect import detect
from transformers import MarianTokenizer, MarianMTModel

language_codes = {
    "English": "en",
    "Hindi": "hi",
    "French": "fr",
    "German": "de",
    "Spanish": "es"
}

available_pairs = {
    ("hi", "en"): "Helsinki-NLP/opus-mt-hi-en",
    ("en", "hi"): "Helsinki-NLP/opus-mt-en-hi",
    ("en", "fr"): "Helsinki-NLP/opus-mt-en-fr",
    ("fr", "en"): "Helsinki-NLP/opus-mt-fr-en",
    ("en", "de"): "Helsinki-NLP/opus-mt-en-de",
    ("de", "en"): "Helsinki-NLP/opus-mt-de-en",
    ("en", "es"): "Helsinki-NLP/opus-mt-en-es",
    ("es", "en"): "Helsinki-NLP/opus-mt-es-en"
}

def load_model(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

def run_translation(text, model_name):
    tokenizer, model = load_model(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def translate_text(text, src_lang_name, tgt_lang_name):
    src_lang = language_codes.get(src_lang_name)
    tgt_lang = language_codes.get(tgt_lang_name)

    if not src_lang or not tgt_lang:
        return "One or both selected languages are not supported."

    model_key = (src_lang, tgt_lang)

    if model_key in available_pairs:
        return run_translation(text, available_pairs[model_key])

    elif (src_lang != "en") and (tgt_lang != "en"):
        to_en_key = (src_lang, "en")
        from_en_key = ("en", tgt_lang)

        if to_en_key in available_pairs and from_en_key in available_pairs:
            english_text = run_translation(text, available_pairs[to_en_key])
            return run_translation(english_text, available_pairs[from_en_key])
        else:
            return "Two-step translation path not available."

    else:
        return "No model available for the selected translation direction."

# Streamlit UI
st.set_page_config(page_title="Translator", layout="centered")
st.title(" Multilingual Text Translator")

text = st.text_area("Enter text to translate:", height=100)
src_lang = st.selectbox("From Language", list(language_codes.keys()))
tgt_lang = st.selectbox("To Language", list(language_codes.keys()))

if st.button("Translate"):
    if text.strip() == "":
        st.warning("Please enter some text.")
    else:
        output = translate_text(text, src_lang, tgt_lang)
        st.success("Translated Text:")
        st.write(output)
""")

# Run the app via Streamlit and expose via ngrok
!streamlit run main_app.py &>/content/log.txt &  # Run in background
public_url = ngrok.connect(addr="8501")
print(" Streamlit app is live at:", public_url)


 Streamlit app is live at: NgrokTunnel: "https://494d3ad862e0.ngrok-free.app" -> "http://localhost:8501"


In [37]:
!pip install -q sentence-transformers


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m905.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m620.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# 4. Semantic Similarity

In [39]:
# Compare the pipeline's translation with a reference sentence by embedding
# both and measuring the cosine similarity of the embeddings.
from sentence_transformers import SentenceTransformer, util

# Load a sentence embedding model
# NOTE(review): this binds the notebook-level name `model`. run_translation
# assigns its own local `model`, so the translation helpers are unaffected.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Example translations
# NOTE(review): translate_text returns its error messages as ordinary strings,
# so a failed translation would still be embedded and scored below.
# `google_translation` is a hard-coded reference, presumably copied from
# Google Translate by hand -- confirm.

your_translation = translate_text("नमस्ते आप कैसे है", "Hindi", "English")
google_translation = "hello how are you"

# Get sentence embeddings (as tensors, the form passed to the similarity call below)
embedding1 = model.encode(your_translation, convert_to_tensor=True)
embedding2 = model.encode(google_translation, convert_to_tensor=True)

# Compute cosine similarity; .item() extracts the single score from the result
similarity_score = util.pytorch_cos_sim(embedding1, embedding2).item()

print(f"Semantic Similarity Score: {similarity_score:.4f}")


Using direct model: Helsinki-NLP/opus-mt-hi-en
Semantic Similarity Score: 0.8049
