**Dependencies**

In [None]:
!git clone https://github.com/facebookresearch/seamless_communication.git && cd seamless_communication && pip install .

In [None]:
pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html


**Facebook m4t Setup**

In [1]:
import torch
from transformers import AutoProcessor, SeamlessM4Tv2Model

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

preprocessor_config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/211k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/9.91M [00:00<?, ?B/s]

SeamlessM4Tv2Model(
  (shared): Embedding(256102, 1024, padding_idx=0)
  (text_encoder): SeamlessM4Tv2Encoder(
    (embed_tokens): SeamlessM4Tv2ScaledWordEmbedding(256102, 1024, padding_idx=0)
    (embed_positions): SeamlessM4Tv2SinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x SeamlessM4Tv2EncoderLayer(
        (self_attn): SeamlessM4Tv2Attention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ffn): SeamlessM4Tv2FeedForwardNetwork(
          (fc1): Linear(in_features=1024, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_fea

In [3]:
#snd (sindhi) and pbt (pashto)
english_text = "I am going to islamabad tomorrow"

text_inputs = processor(text=english_text, src_lang="eng", return_tensors="pt").to(device)
output_tokens = model.generate(**text_inputs, tgt_lang="pbt", generate_speech=False)
translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
print("Translated Text:", translated_text)

Translated Text: زه سبا اسلام آباد ته ځم


**Translation Pipepline + Data Proceccsing**

In [6]:
import pandas as pd
import re
import ast
from tqdm import tqdm
import torch
from transformers import AutoProcessor, SeamlessM4Tv2Model

SRC_LANG = "eng"  # Source language code
TGT_LANG = "pbt"  # Target language code

print(f"Model is running on device: {device}")
def translate_m4t(processor, model, text, tgt_lang=TGT_LANG, src_lang=SRC_LANG):
    text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt").to(device)
    output_tokens = model.generate(**text_inputs, tgt_lang=tgt_lang, generate_speech=False)
    translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    return translated_text

def translate(processor, model, SQuAD, chunk_size=1000):
    total_rows = len(SQuAD)

    for chunk_start in tqdm(range(0, total_rows, chunk_size), desc="Processing chunks", ncols=100):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        subset = SQuAD.iloc[chunk_start:chunk_end]
        
        rows = []
        for j in tqdm(range(len(subset)), desc=f"Processing rows {chunk_start + 1} to {chunk_end}", ncols=100):
            data_num = subset.iloc[j]["data_num"]
            paragraph_num = subset.iloc[j]["paragraph_num"]
            id = subset.iloc[j]["id"]
            title = subset.iloc[j]["title"]
            context = ast.literal_eval(subset.iloc[j]["context"])
            question = subset.iloc[j]["question"]
            is_impossible = subset.iloc[j]["is_impossible"]
            
            title_ = translate_m4t(processor, model, title)
            question_ = translate_m4t(processor, model, question)

            context_ = []
            review = True
            for sentence in context:
                if "••" in sentence:
                    sentence = re.sub("••'", "\"", sentence)
                    sentence = re.sub("'••", "\"", sentence)
                    sentence = re.sub("••", "\"", sentence)
                sentence_ = translate_m4t(processor, model, sentence)
                if sentence_.count("\"") == 2:
                    sentence_ = re.sub("\"", "••", sentence_)
                    review = False
                context_.append(sentence_)

            context_ = " ".join(context_)
            row = (data_num, paragraph_num, id, title_, context_, question_, is_impossible, review)
            rows.append(row)

        df_translated = pd.DataFrame(rows, columns=["data_num", "paragraph_num", "id", "title", "context",
                                                    "question", "is_impossible", "review"])

        output_path = f"/kaggle/working/SQuAD_Translated_{chunk_start // chunk_size + 1}.csv"
        df_translated.to_csv(output_path, index=False)
        print(f"Saved translated data for chunk {chunk_start // chunk_size + 1} to {output_path}")

Model is running on device: cuda


In [9]:
SQuAD_train = pd.read_csv("/kaggle/input/squadsubsetenglish/SquadSubset.csv")
translate(processor, model, SQuAD_train, chunk_size=10)

Processing chunks:   0%|                                                   | 0/2000 [00:00<?, ?it/s]
Processing rows 1 to 10:   0%|                                               | 0/10 [00:00<?, ?it/s][A
Processing rows 1 to 10:  10%|███▉                                   | 1/10 [00:02<00:24,  2.75s/it][A
Processing rows 1 to 10:  20%|███████▊                               | 2/10 [00:13<01:00,  7.60s/it][A
Processing rows 1 to 10:  30%|███████████▋                           | 3/10 [00:24<01:03,  9.02s/it][A
Processing rows 1 to 10:  40%|███████████████▌                       | 4/10 [00:34<00:56,  9.45s/it][A
Processing rows 1 to 10:  50%|███████████████████▌                   | 5/10 [00:38<00:36,  7.29s/it][A
Processing rows 1 to 10:  60%|███████████████████████▍               | 6/10 [00:41<00:23,  5.96s/it][A
Processing rows 1 to 10:  70%|███████████████████████████▎           | 7/10 [00:44<00:15,  5.05s/it][A
Processing rows 1 to 10:  80%|███████████████████████████████▏     

Saved translated data for chunk 1 to /kaggle/working/SQuAD_Translated_1.csv



Processing rows 11 to 20:   0%|                                              | 0/10 [00:00<?, ?it/s][A
Processing rows 11 to 20:  10%|███▊                                  | 1/10 [00:04<00:41,  4.57s/it][A
Processing rows 11 to 20:  20%|███████▌                              | 2/10 [00:09<00:40,  5.05s/it][A
Processing rows 11 to 20:  30%|███████████▍                          | 3/10 [00:15<00:36,  5.17s/it][A
Processing rows 11 to 20:  40%|███████████████▏                      | 4/10 [00:20<00:30,  5.16s/it][A
Processing rows 11 to 20:  50%|███████████████████                   | 5/10 [00:25<00:26,  5.24s/it][A
Processing rows 11 to 20:  60%|██████████████████████▊               | 6/10 [00:31<00:20,  5.24s/it][A
Processing rows 11 to 20:  70%|██████████████████████████▌           | 7/10 [00:41<00:20,  6.82s/it][A
Processing rows 11 to 20:  80%|██████████████████████████████▍       | 8/10 [00:47<00:13,  6.75s/it][A
Processing rows 11 to 20:  90%|████████████████████████████████

Saved translated data for chunk 2 to /kaggle/working/SQuAD_Translated_2.csv



Processing rows 21 to 30:   0%|                                              | 0/10 [00:00<?, ?it/s][A
Processing rows 21 to 30:  10%|███▊                                  | 1/10 [00:06<00:56,  6.33s/it][A
Processing rows 21 to 30:  20%|███████▌                              | 2/10 [00:10<00:40,  5.01s/it][A
Processing rows 21 to 30:  30%|███████████▍                          | 3/10 [00:16<00:37,  5.34s/it][A
Processing rows 21 to 30:  40%|███████████████▏                      | 4/10 [00:21<00:32,  5.48s/it][A
Processing rows 21 to 30:  50%|███████████████████                   | 5/10 [00:27<00:27,  5.51s/it][A
Processing rows 21 to 30:  60%|██████████████████████▊               | 6/10 [00:51<00:34,  8.57s/it][A
Processing chunks:   0%|                                        | 2/2000 [03:03<50:48:19, 91.54s/it]


KeyboardInterrupt: 

In [12]:
import pandas as pd
import re
import ast
from tqdm import tqdm
import torch
from transformers import AutoProcessor, SeamlessM4Tv2Model

SRC_LANG = "eng"  # Source language code
TGT_LANG = "snd"  # Target language code

def translate_m4t(processor, model, text, tgt_lang=TGT_LANG, src_lang=SRC_LANG):
    text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt").to(device)
    output_tokens = model.generate(**text_inputs, tgt_lang=tgt_lang, generate_speech=False)
    translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    return translated_text

def translate_subset(processor, model, SQuAD, data, subset_size=5, output_dir="/kaggle/working"):
    for i in data:
        print(f"Translating subset of data: {(i + 1)}/{len(data)}")
        df = SQuAD[SQuAD["data_num"] == i].head(subset_size)  
        rows = []
        for j in tqdm(range(len(df))):
            data_num = df.iloc[j]["data_num"]
            paragraph_num = df.iloc[j]["paragraph_num"]
            id = df.iloc[j]["id"]
            title = df.iloc[j]["title"]
            context = ast.literal_eval(df.iloc[j]["context"])
            question = df.iloc[j]["question"]
            is_impossible = df.iloc[j]["is_impossible"]
            title_ = translate_m4t(processor, model, title)
            question_ = translate_m4t(processor, model, question)

            context_ = []
            review = True
            for sentence in context:
                if "••" in sentence:
                    sentence = re.sub("••'", "\"", sentence)
                    sentence = re.sub("'••", "\"", sentence)
                    sentence = re.sub("••", "\"", sentence)
                sentence_ = translate_m4t(processor, model, sentence)
                if sentence_.count("\"") == 2:
                    sentence_ = re.sub("\"", "••", sentence_)
                    review = False
                context_.append(sentence_)

            context_ = " ".join(context_)
            row = (data_num, paragraph_num, id, title_, context_, question_, is_impossible, review)
            rows.append(row)

        df_translated = pd.DataFrame(rows, columns=["data_num", "paragraph_num", "id", "title", "context",
                                                    "question", "is_impossible", "review"])
        output_path = f"{output_dir}/SQuAD_Translated_subset_{i + 1}.csv"
        df_translated.to_csv(output_path, index=False)
        print(f"Saved translated data for subset {i + 1} to {output_path}")

SQuAD_train = pd.read_csv("/kaggle/input/squadsubsetenglish/SquadSubset.csv")
translate_subset(processor, model, SQuAD_train, list(range(0, 1)), subset_size=5)  

Translating subset of data: 1/1


100%|██████████| 5/5 [00:27<00:00,  5.45s/it]

Saved translated data for subset 1 to /kaggle/working/SQuAD_Translated_subset_1.csv





**Google Translate API**

Testing

In [12]:
from google.cloud import translate_v2 as translate

def translate_text(target: str, text: str) -> dict:
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    """
    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    result = translate_client.translate(text, target_language=target)

    print("Text: {}".format(result["input"]))
    print("Translation: {}".format(result["translatedText"]))
    print("Detected source language: {}".format(result["detectedSourceLanguage"]))

    return result

import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "/"

target_language = "ps"  # Spanish
text_to_translate = "Hello, world!"

response = translate_text(target_language, text_to_translate)
print(response)


Text: "Beyoncé Giselle Knowles-Carter (/biː'jɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame ••in the late 1990s•• as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles Crazy in Love and Baby Boy."
Translation: &quot;بیونسي ګیزیل نولس-کارټر (/biː&#39;jɒnseɪ/ bee-YON-say) (د سپتمبر 4، 1981 زیږیدلی) یو امریکایی سندرغاړی، سندرغاړی، ریکارډ جوړونکی او لوبغاړی دی. په هوسټن، ټیکساس کې زیږیدلی او لوی شوی، هغې په مختلفو سندرو کې سندرې ترسره کړې. او د ماشوم په توګه د نڅا سیالۍ، او په وروستیو کې شهرت 