# Translating Tweets

In [1]:
!git clone https://github.com/cardiffnlp/xlm-t

Cloning into 'xlm-t'...
remote: Enumerating objects: 212, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 212 (delta 71), reused 43 (delta 43), pack-reused 135 (from 1)[K
Receiving objects: 100% (212/212), 6.46 MiB | 8.74 MiB/s, done.
Resolving deltas: 100% (120/120), done.


In [2]:
import os
import pandas as pd

def process_language(language):
    base_dir = "/content/xlm-t/data/sentiment"
    lang_path = os.path.join(base_dir, language)

    all_rows = []

    for split in ["train", "val", "test"]:
        text_file_path = os.path.join(lang_path, f"{split}_text.txt")
        label_file_path = os.path.join(lang_path, f"{split}_labels.txt")

        if not os.path.exists(text_file_path) or not os.path.exists(label_file_path):
            print(f"Missing files for split '{split}' in {language}, skipping.")
            continue

        with open(text_file_path, encoding="utf-8") as text_file, \
             open(label_file_path, encoding="utf-8") as label_file:
            texts = text_file.read().splitlines()
            labels = label_file.read().splitlines()

        for text, label in zip(texts, labels):
            all_rows.append({
                "original_text": text,
                "label": int(label),
                "Split": split
            })

    df = pd.DataFrame(all_rows)
    output_path = os.path.join(base_dir, f"{language}.csv")
    df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Saved combined dataset to: {output_path}")

In [None]:
process_language("hindi")
process_language("arabic")
process_language("french")
process_language("german")
process_language("italian")
process_language("portuguese")
process_language("spanish")

Saved combined dataset to: /content/xlm-t/data/sentiment/hindi.csv
Saved combined dataset to: /content/xlm-t/data/sentiment/arabic.csv
Saved combined dataset to: /content/xlm-t/data/sentiment/french.csv
Saved combined dataset to: /content/xlm-t/data/sentiment/german.csv
Saved combined dataset to: /content/xlm-t/data/sentiment/italian.csv
Saved combined dataset to: /content/xlm-t/data/sentiment/portuguese.csv
Saved combined dataset to: /content/xlm-t/data/sentiment/spanish.csv


In [None]:
import csv
import time
import random
import os
from openai import AzureOpenAI

# API setup
# Include own sandbox_api_key, sandbox_endpoint, and sandbox_api_version here!
os.environ["AI_SANDBOX_KEY"] = ""
sandbox_api_key = os.environ.get('AI_SANDBOX_KEY')

sandbox_endpoint = ""
sandbox_api_version = ""

client = AzureOpenAI(
    api_key=sandbox_api_key,
    azure_endpoint=sandbox_endpoint,
    api_version=sandbox_api_version
)

prompt = '''
Translate the text below to English. Keep the translation as close to the original in tone and style as you can.
'''

csv_dir = "/content/xlm-t/data/sentiment"
output_dir = os.path.join(csv_dir, "translated")
os.makedirs(output_dir, exist_ok=True)

def translate(language):
    input_path = os.path.join(csv_dir, f"{language}.csv")
    output_path = os.path.join(output_dir, f"{language}_gpt4o_translations.csv")

    with open(input_path, newline='', encoding="utf-8") as infile, \
         open(output_path, "w", newline='', encoding="utf-8") as outfile:

        reader = csv.DictReader(infile)
        fieldnames = ["row_index", "original_text", "translated_text", "label", "Split"]
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for idx, row in enumerate(reader):
            original_text = row["original_text"]
            label = row["label"]
            split = row.get("Split", "unspecified")  # default to "unspecified" if missing
            full_prompt = f"{prompt}\n\n{original_text}"

            try:
                response = client.chat.completions.create(
                    model="gpt-4o",
                    temperature=0,
                    max_tokens=500,
                    top_p=0.1,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": full_prompt},
                    ]
                )

                translated_text = response.choices[0].message.content.strip()

                writer.writerow({
                    "row_index": idx,
                    "original_text": original_text,
                    "translated_text": translated_text,
                    "label": label,
                    "Split": split
                })

            except Exception as e:
                print(f"Error on row {idx}: {e}")
                writer.writerow({
                    "row_index": idx,
                    "original_text": original_text,
                    "translated_text": f"ERROR: {e}",
                    "label": label,
                    "Split": split
                })

            time.sleep(random.uniform(0.3, 0.5))

if __name__ == "__main__":
    languages = ["french", "german", "italian", "spanish", "arabic"]
    for lang in languages:
        translate(lang)
        print(f"Done with {lang}")
    print("Done!")

# Finetuning Sentiment Analysis Model

In [5]:
!pip install datasets



In [6]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import shutil

In [18]:
def preprocess_data(df):
    df = df[['translated_text', 'label', 'Split']].rename(columns={'translated_text': 'text'})
    df = df.dropna()
    df = df[~df['text'].str.startswith("ERROR:")]  # skip translation errors
    df = df[df['label'].isin([0, 1, 2])]
    return df

def tokenize_function(example, tokenizer):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    report = classification_report(labels, preds, output_dict=True)
    return {
        'accuracy': acc,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }

def plot_confusion_matrix(cm, class_names):
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    return plt

In [19]:
def finetune_sentiment_model(language):
    print(f"\nProcessing language: {language}")
    base_dir = "/content/xlm-t/data/sentiment/translated"
    df = pd.read_csv(f"{base_dir}/{language}_gpt4o_translations.csv")

    model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    class_names = ["Negative", "Neutral", "Positive"]

    data = preprocess_data(df)

    train_df = data[data['Split'] == 'train']
    val_df = data[data['Split'] == 'val']
    test_df = data[data['Split'] == 'test']

    train_ds = Dataset.from_pandas(train_df[['text', 'label']])
    val_ds = Dataset.from_pandas(val_df[['text', 'label']])
    test_ds = Dataset.from_pandas(test_df[['text', 'label']])

    train_ds = train_ds.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    val_ds = val_ds.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    test_ds = test_ds.map(lambda x: tokenize_function(x, tokenizer), batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    training_args = TrainingArguments(
        output_dir=f"./results_{language}_gpt4o_temp",
        eval_strategy="epoch",
        save_strategy="no",
        learning_rate=3e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=15,
        weight_decay=0.1,
        logging_steps=10,
        logging_dir=f"./logs_{language}_gpt4o",
        report_to="none",
        metric_for_best_model="eval_accuracy",
        warmup_ratio=0.1,
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.001
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    trainer.train()

    print("Evaluating on test set...")
    test_metrics = trainer.evaluate(test_ds)

    predictions = trainer.predict(test_ds)
    preds = predictions.predictions.argmax(-1)
    labels = predictions.label_ids

    correct_predictions = (preds == labels).sum()
    total_samples = len(labels)

    cm = confusion_matrix(labels, preds)
    class_report = classification_report(labels, preds, target_names=class_names, output_dict=True)

    test_metrics.update({
        "detailed_metrics": class_report,
        "total_samples": total_samples,
        "correct_predictions": int(correct_predictions),
        "confusion_matrix": cm.tolist()
    })

    # Save confusion matrix
    plt = plot_confusion_matrix(cm, class_names)
    cm_path = f"{language}_gpt4o_confusion_matrix.png"
    plt.savefig(cm_path)
    plt.close()
    print(f"Saved confusion matrix to {cm_path}")

    # Save JSON results
    out_file = f"{language}_gpt4o_results.json"
    with open(out_file, "w") as f:
        json.dump(test_metrics, f, indent=2)
    print(f"Saved results to {out_file}")

    # Clean temp outputs
    temp_path = f"./results_{language}_gpt4o_temp"
    if os.path.exists(temp_path):
        shutil.rmtree(temp_path)
        print(f"Cleaned up {temp_path}")

In [20]:
finetune_sentiment_model("french")


Processing language: french


Map:   0%|          | 0/1820 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6988,0.738671,0.6875,0.682353,0.6875,0.671096
2,0.598,0.659557,0.746875,0.748138,0.746875,0.740395
3,0.3959,0.81976,0.734375,0.74913,0.734375,0.733655
4,0.3049,0.844349,0.734375,0.733836,0.734375,0.731709


Evaluating on test set...


Saved confusion matrix to french_gpt4o_confusion_matrix.png
Saved results to french_gpt4o_results.json
Cleaned up ./results_french_gpt4o_temp


In [21]:
finetune_sentiment_model("german")


Processing language: german


Map:   0%|          | 0/1768 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.644,0.577238,0.754045,0.755483,0.754045,0.750222
2,0.4584,0.604998,0.773463,0.775156,0.773463,0.772092
3,0.3828,0.6232,0.757282,0.756095,0.757282,0.75537
4,0.1874,0.80683,0.757282,0.760425,0.757282,0.758023


Evaluating on test set...


Saved confusion matrix to german_gpt4o_confusion_matrix.png
Saved results to german_gpt4o_results.json
Cleaned up ./results_german_gpt4o_temp


In [22]:
finetune_sentiment_model("spanish")


Processing language: spanish


Map:   0%|          | 0/1784 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7194,0.681095,0.672078,0.67184,0.672078,0.671714
2,0.588,0.717819,0.688312,0.682095,0.688312,0.674703
3,0.4531,0.859821,0.668831,0.670396,0.668831,0.669536
4,0.239,1.068933,0.672078,0.669815,0.672078,0.669528


Evaluating on test set...


Saved confusion matrix to spanish_gpt4o_confusion_matrix.png
Saved results to spanish_gpt4o_results.json
Cleaned up ./results_spanish_gpt4o_temp


In [23]:
finetune_sentiment_model("italian")


Processing language: italian


Map:   0%|          | 0/1805 [00:00<?, ? examples/s]

Map:   0%|          | 0/319 [00:00<?, ? examples/s]

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7269,0.69509,0.727273,0.730976,0.727273,0.72559
2,0.6053,0.685753,0.749216,0.754441,0.749216,0.747359
3,0.3722,0.743253,0.730408,0.742975,0.730408,0.727894
4,0.2732,0.931411,0.739812,0.740786,0.739812,0.738802


Evaluating on test set...


Saved confusion matrix to italian_gpt4o_confusion_matrix.png
Saved results to italian_gpt4o_results.json
Cleaned up ./results_italian_gpt4o_temp


In [24]:
finetune_sentiment_model("arabic")


Processing language: arabic


Map:   0%|          | 0/1661 [00:00<?, ? examples/s]

Map:   0%|          | 0/298 [00:00<?, ? examples/s]

Map:   0%|          | 0/832 [00:00<?, ? examples/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7448,0.649639,0.708054,0.704247,0.708054,0.700042
2,0.61,0.674116,0.731544,0.731241,0.731544,0.730122
3,0.3876,0.780583,0.704698,0.698673,0.704698,0.698226
4,0.2032,1.014329,0.677852,0.678004,0.677852,0.672081


Evaluating on test set...


Saved confusion matrix to arabic_gpt4o_confusion_matrix.png
Saved results to arabic_gpt4o_results.json
Cleaned up ./results_arabic_gpt4o_temp
