Dependencies

In [None]:
import torch
from transformers import AutoProcessor, SeamlessM4Tv2Model
import pandas as pd
import re
import ast
from tqdm import tqdm

In [None]:
# Clone the seamless_communication repository and pip-install it from the checkout.
# NOTE(review): none of the visible cells import seamless_communication (the model is
# loaded through transformers) - confirm this install is still required.
!git clone https://github.com/facebookresearch/seamless_communication.git && cd seamless_communication && pip install .


In [None]:
# Install the pinned CUDA 11.8 wheels of torch / torchvision / torchaudio from the
# PyTorch wheel index. The explicit %pip magic is used because the bare `pip ...`
# form only works while IPython automagic is enabled; %pip runs pip for the
# environment of the running kernel.
# NOTE(review): torch is imported by an earlier cell; if this changes the installed
# version, the kernel presumably has to be restarted before the pin takes effect.
%pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html


Load Model

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Processor and model are both loaded from the same pretrained checkpoint.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

# Move the model onto the selected device.
model.to(device)

Sample Testing

In [None]:
# Smoke test: translate a single English sentence (target code "pbt") and print it.
english_text = "I am going to islamabad tomorrow"

# Encode the sentence and move the resulting tensors onto the model's device.
text_inputs = processor(
    text=english_text,
    src_lang="eng",
    return_tensors="pt",
).to(device)

# Text-only generation: speech synthesis is switched off.
output_tokens = model.generate(
    **text_inputs,
    tgt_lang="pbt",
    generate_speech=False,
)

# Decode the first sequence of the first output element back into text.
translated_text = processor.decode(
    output_tokens[0].tolist()[0],
    skip_special_tokens=True,
)

print("Translated Text in Pashto:", translated_text)


SQuAD Translation Pipeline

In [None]:
SRC_LANG = "eng"  # default language code of the text being translated
TGT_LANG = "urd"  # default language code of the translation


def translate_m4t(processor, model, text, tgt_lang=TGT_LANG, src_lang=SRC_LANG):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang`` and return the result.

    Args:
        processor: Object used both to encode ``text`` and to decode the
            generated token ids.
        model: Object whose ``generate`` method produces the output tokens.
        text: Text to translate.
        tgt_lang: Target language code, defaults to ``TGT_LANG``.
        src_lang: Source language code, defaults to ``SRC_LANG``.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped.
    """
    # Encode the text; tensors are moved to the module-level ``device``.
    encoded = processor(text=text, src_lang=src_lang, return_tensors="pt")
    encoded = encoded.to(device)

    # Text-only generation (no speech output).
    generated = model.generate(**encoded, tgt_lang=tgt_lang, generate_speech=False)

    # First element of the output, first sequence in it.
    first_sequence = generated[0].tolist()[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

def translate(processor, model, SQuAD, data, output_dir="SQuAD-UR/train-v2.0"):
    """Translate SQuAD rows shard by shard and write one CSV file per shard.

    For every value ``i`` in ``data`` the rows of ``SQuAD`` whose ``data_num``
    equals ``i`` are translated with ``translate_m4t`` (title, question and
    every context sentence) and written to ``{output_dir}/{i}.csv``.

    Answer markers (``••``) inside a context sentence are swapped for double
    quotes before translation and restored afterwards when the translated
    sentence contains exactly two double quotes.

    Args:
        processor: Passed through to ``translate_m4t``.
        model: Passed through to ``translate_m4t``.
        SQuAD: DataFrame with the columns ``data_num``, ``paragraph_num``,
            ``id``, ``title``, ``context``, ``question`` and ``is_impossible``.
            ``context`` holds the string form of a Python list of sentences
            (it is parsed with ``ast.literal_eval``).
        data: Sized sequence of ``data_num`` values to translate.
        output_dir: Directory that receives the per-shard CSV files; it is
            created when missing. The default is the location that used to be
            hard-coded, so existing callers keep their behaviour. Pass a
            different directory for other splits to avoid overwriting files
            that share the same shard number.

    Returns:
        None. Results are written to disk. The ``review`` column of each
        written row is ``True`` unless the markers could be restored in at
        least one context sentence.
    """
    import os  # local import: the file's import cell does not provide os

    os.makedirs(output_dir, exist_ok=True)

    # Count positions instead of printing ``i + 1`` so the progress message is
    # also right when ``data`` does not start at 0 (e.g. a resumed run).
    for position, i in enumerate(data, start=1):
        print(f"Translating data: {position}/{len(data)}")
        df = SQuAD[SQuAD["data_num"] == i]
        rows = []
        for record in tqdm(df.to_dict("records")):
            title_ = translate_m4t(processor, model, record["title"])
            question_ = translate_m4t(processor, model, record["question"])

            translated_sentences = []
            review = True
            for sentence in ast.literal_eval(record["context"]):
                had_marker = "••" in sentence
                if had_marker:
                    # A marker next to an apostrophe collapses into one quote;
                    # the combined forms must be replaced before the bare one.
                    sentence = (
                        sentence.replace("••'", "\"")
                        .replace("'••", "\"")
                        .replace("••", "\"")
                    )
                sentence_ = translate_m4t(processor, model, sentence)
                # Only restore markers in sentences that carried them before
                # translation; otherwise two unrelated quotes in another
                # sentence would be turned into a false marker pair and would
                # wrongly clear the review flag.
                if had_marker and sentence_.count("\"") == 2:
                    sentence_ = sentence_.replace("\"", "••")
                    review = False
                translated_sentences.append(sentence_)

            rows.append((
                record["data_num"],
                record["paragraph_num"],
                record["id"],
                title_,
                " ".join(translated_sentences),
                question_,
                record["is_impossible"],
                review,
            ))

        df_translated = pd.DataFrame(rows, columns=["data_num", "paragraph_num", "id", "title", "context",
                                                    "question", "is_impossible", "review"])
        df_translated.to_csv(os.path.join(output_dir, f"{i}.csv"), index=False)



In [None]:
# Training split: translate the shards with data_num 0..441.
SQuAD_train = pd.read_csv("/SQuAD/train-v2.0-clean.csv")
translate(processor, model, SQuAD_train, data=list(range(442)))

# Development split: translate the shards with data_num 0..34.
# NOTE(review): called like this, translate() writes to SQuAD-UR/train-v2.0/{i}.csv,
# so this run overwrites the train shards 0-34 written by the call above.
SQuAD_dev = pd.read_csv("/SQuAD/dev-v2.0-clean.csv")
translate(processor, model, SQuAD_dev, data=list(range(35)))