In [None]:
# Fetch the xlm-t repository; the cells below read its data/sentiment directory.
!git clone https://github.com/cardiffnlp/xlm-t

Cloning into 'xlm-t'...
remote: Enumerating objects: 212, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 212 (delta 71), reused 43 (delta 43), pack-reused 135 (from 1)[K
Receiving objects: 100% (212/212), 6.46 MiB | 10.24 MiB/s, done.
Resolving deltas: 100% (120/120), done.


In [None]:
import os
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
from tqdm.notebook import tqdm
from google.colab import files

# Root of the sentiment data inside the cloned xlm-t checkout. process_language
# reads one sub-directory per language from here and writes its
# "<language>_processed.csv" output next to those sub-directories.
base_dir = "/content/xlm-t/data/sentiment"

def process_language(target_language):
    """Translate one language's sentiment data into English and save it as CSV.

    For every split ("train", "test", "val") the function reads
    ``<base_dir>/<target_language>/<split>_text.txt`` and the matching
    ``<split>_labels.txt``, then produces two English renderings of each line
    with the ``Helsinki-NLP/opus-mt-<code>-en`` Marian model:

    * ``Fluent_Translation``  - the whole line translated at once.
    * ``Literal_Translation`` - every whitespace-separated word translated on
      its own and re-joined with single spaces.

    Rows are written to ``<base_dir>/<target_language>_processed.csv`` with the
    columns ``Original``, ``Split``, ``Literal_Translation``,
    ``Fluent_Translation`` and ``Sentiment``. If that CSV already exists, the
    number of rows it holds for a split is skipped from the start of that
    split, so an interrupted run can be resumed. A checkpoint is written (and
    passed to ``google.colab.files.download``) every ``save_interval`` newly
    produced rows and once more at the end.

    Args:
        target_language: One of "arabic", "french", "german", "italian",
            "spanish" (the name of the data sub-directory).

    Raises:
        ValueError: If ``target_language`` has no known translation model.
    """
    splits = ["train", "test", "val"]
    lang_codes = {
        "arabic": "ar",
        "french": "fr",
        "german": "de",
        "italian": "it",
        "spanish": "es",
    }
    # Fail early with a readable message instead of a bare KeyError, and before
    # spending time downloading / loading a model.
    if target_language not in lang_codes:
        supported = ", ".join(sorted(lang_codes))
        raise ValueError(
            f"Unsupported language {target_language!r}; expected one of: {supported}"
        )

    model_name = f"Helsinki-NLP/opus-mt-{lang_codes[target_language]}-en"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    batch_size = 32       # lines translated per fluent-translation call
    word_batch_size = 64  # words translated per literal-translation call
    save_interval = 100   # checkpoint every N newly produced rows

    def helsinki_translate_batch(texts):
        """Translate a list of strings to English in one padded batch."""
        if not texts:
            # The tokenizer cannot encode an empty batch.
            return []
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)

    def word_by_word_translate(texts):
        """Translate each text word by word and re-join the words with spaces."""
        results = []
        for text in texts:
            words = text.split()
            translated_words = []
            # Each word is still its own input sequence, but the words of a
            # text share model calls instead of paying for one call per word.
            for start in range(0, len(words), word_batch_size):
                translated_words.extend(
                    helsinki_translate_batch(words[start:start + word_batch_size])
                )
            results.append(" ".join(translated_words))
        return results

    csv_path = os.path.join(base_dir, f"{target_language}_processed.csv")
    existing_df = pd.read_csv(csv_path, encoding="utf-8") if os.path.exists(csv_path) else pd.DataFrame()
    all_rows = []
    lang_path = os.path.join(base_dir, target_language)

    def save_checkpoint():
        """Write previously saved plus newly produced rows to the CSV and download it."""
        df = pd.concat([existing_df, pd.DataFrame(all_rows)], ignore_index=True)
        df.to_csv(csv_path, index=False, encoding="utf-8")
        files.download(csv_path)

    for split in splits:
        with open(os.path.join(lang_path, f"{split}_text.txt"), encoding="utf-8") as text_file, \
             open(os.path.join(lang_path, f"{split}_labels.txt"), encoding="utf-8") as label_file:
            texts = text_file.read().splitlines()
            labels = label_file.read().splitlines()

        # Resume support: skip as many leading lines as the existing CSV
        # already contains for this split.
        processed_count = 0
        if not existing_df.empty:
            processed_count = int((existing_df["Split"] == split).sum())
        texts = texts[processed_count:]
        labels = labels[processed_count:]

        for i in tqdm(range(0, len(texts), batch_size), desc=f"{target_language} - {split}"):
            batch = texts[i:i + batch_size]
            batch_labels = labels[i:i + batch_size]
            fluent = helsinki_translate_batch(batch)
            literal = word_by_word_translate(batch)

            for text, label, lit, flu in zip(batch, batch_labels, literal, fluent):
                all_rows.append({
                    "Original": text,
                    "Split": split,
                    "Literal_Translation": lit,
                    "Fluent_Translation": flu,
                    "Sentiment": label
                })

                if len(all_rows) % save_interval == 0:
                    save_checkpoint()

    save_checkpoint()
    print(f"✅ Done: {target_language}")

In [None]:
# Translate the Spanish sentiment data (output: spanish_processed.csv under base_dir).
process_language("spanish")

In [None]:
# Translate the French sentiment data (output: french_processed.csv under base_dir).
process_language("french")

In [None]:
# Translate the German sentiment data (output: german_processed.csv under base_dir).
process_language("german")

In [None]:
# Translate the Italian sentiment data (output: italian_processed.csv under base_dir).
process_language("italian")

In [None]:
# Translate the Arabic sentiment data. This only works when process_language
# has a language-code entry (and therefore a translation model) for 'arabic'.
process_language('arabic')