Languages supported by each model:

1. facebook/hf-seamless-m4t-medium

The supported languages are listed here:

https://huggingface.co/facebook/hf-seamless-m4t-medium/blob/main/tokenizer_config.json#L1887-L2089

2. Helsinki-NLP/opus-mt-en-roa

The supported languages are listed here:

https://huggingface.co/Helsinki-NLP/opus-mt-en-roa

3. facebook/nllb-200-distilled-600M

The supported languages are listed here:

https://huggingface.co/facebook/nllb-200-distilled-600M

4. google/madlad400-10b-mt

The supported languages are listed here:

https://huggingface.co/google/madlad400-10b-mt

In [8]:
!pip install langid

Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941171 sha256=3b4eccc480bff8288f232db8882c4d1eedb5aa5a7aa083afd92c2bb2cf942dad
  Stored in directory: /root/.cache/pip/wheels/23/c8/c6/eed80894918490a175677414d40bd7c851413bbe03d4856c3c
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6


In [21]:
from transformers import SeamlessM4TModel, AutoProcessor
from transformers import AutoTokenizer, MarianMTModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import langid

def translate_text():
    """Interactively translate user-supplied text with a selectable model.

    Prompts on stdin for the text to translate, detects its language with
    ``langid``, prompts for a target language code and for one of four
    Hugging Face translation models, then prints the translation to stdout.

    The target language code is passed to the selected model as typed, so
    it must be written in the form that model expects.

    Raises:
        ValueError: If the model selection is not an integer, or if the
            target language code is not a known token of the NLLB tokenizer.
        IndexError: If the model selection is outside the listed range.
    """
    # Get input text from the user
    input_text = input("Enter the text you want to translate: ")

    # Detect the input language using langid
    input_language, _ = langid.classify(input_text)
    print(f"Detected input language: {input_language}")

    # Get the output language from the user
    output_language = input("Enter the language code you want to translate to (e.g., 'fra' for French): ")

    # List of available models
    model_choices = [
        "facebook/hf-seamless-m4t-medium",
        "Helsinki-NLP/opus-mt-en-roa",
        "facebook/nllb-200-distilled-600M",
        "google/madlad400-10b-mt",
    ]

    # Display the model options to the user
    print("\nAvailable models:")
    for number, name in enumerate(model_choices, 1):
        print(f"{number}. {name}")

    # Get the model selection from the user. The range is checked explicitly
    # because "0" or a negative number would otherwise become a negative
    # list index and silently pick a model from the end of the list.
    choice = int(input("Select the translation model by number: "))
    if not 1 <= choice <= len(model_choices):
        raise IndexError(
            f"Model number must be between 1 and {len(model_choices)}, got {choice}"
        )
    model_index = choice - 1

    model_name = model_choices[model_index]
    print(f"\nYou have selected the model: {model_name}")

    if model_index == 0:
        # Load the pre-trained SeamlessM4T model and processor
        model = SeamlessM4TModel.from_pretrained(model_name)
        processor = AutoProcessor.from_pretrained(model_name)

        # Run on the GPU when one is available, otherwise on the CPU
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Process the input text
        inputs = processor(text=input_text, src_lang=input_language, return_tensors="pt").to(device)

        # Perform text-to-text translation (speech synthesis disabled)
        with torch.no_grad():
            outputs = model.generate(**inputs, tgt_lang=output_language, generate_speech=False)

        # Extract the first sequence and decode it
        translated_text = processor.decode(outputs[0].tolist()[0], skip_special_tokens=True)

        print(f"\nTranslated text: {translated_text}")
    elif model_index == 1:
        # MarianTokenizer is not among the file-level imports, so bring it
        # into scope here; without this the branch fails with NameError.
        from transformers import MarianTokenizer

        # Multilingual Marian models select the target language through a
        # ">>code<<" prefix on the source text.
        src_text = [">>" + output_language + "<<" + input_text]
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        print(tokenizer.supported_language_codes)

        model = MarianMTModel.from_pretrained(model_name)
        translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
        translations = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        for translation in translations:
            print(f"Translated: {translation}\n")
    elif model_index == 2:
        # Load the tokenizer and the model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # NLLB selects the target language by forcing its language token as
        # the first generated token; a ">>code<<" text prefix has no such
        # effect for this model.
        target_token_id = tokenizer.convert_tokens_to_ids(output_language)
        if target_token_id == tokenizer.unk_token_id:
            raise ValueError(
                f"Unknown NLLB language code: {output_language!r} "
                "(expected a code such as 'fra_Latn')"
            )

        # NOTE(review): the source language is left at the tokenizer default
        # because langid's codes are not in NLLB's format -- confirm whether
        # a mapping to NLLB source codes is wanted here.
        inputs = tokenizer(input_text, return_tensors="pt")  # Batch size 1

        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_token_id,
            max_length=50,
            num_beams=4,
            early_stopping=True,
        )

        # Decode the translated tokens to get the translated text
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        print(translated_text)
    elif model_index == 3:
        model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
        tokenizer = T5Tokenizer.from_pretrained(model_name)

        # MADLAD-400 selects the target language through a "<2code>" prefix.
        text = "<2" + output_language + ">" + input_text
        input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
        outputs = model.generate(input_ids=input_ids)

        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(translated_text)


# Script entry point: start the interactive translation prompt when this
# code is executed directly rather than imported as a module.
if __name__ == "__main__":
    translate_text()


Enter the text you want to translate: It was a good project
Detected input language: en
Enter the language code you want to translate to (e.g., 'fra' for French): pes

Available models:
1. facebook/hf-seamless-m4t-medium
2. Helsinki-NLP/opus-mt-en-roa
3. facebook/nllb-200-distilled-600M
4. google/madlad400-10b-mt
Select the translation model by number: 1

You have selected the model: facebook/hf-seamless-m4t-medium


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Translated text: این یک پروژه خوب بود
