# T5 for text-translation

We use the XStance dataset in our implementation. Because XStance contains German and French comments, we first translate them to English using two pre-trained translation models, one per source language (the Helsinki-NLP OPUS-MT checkpoints loaded below).

In [None]:
# Install the Hugging Face Transformers library, which provides the `pipeline` API used below.
!pip install transformers
# NOTE(review): sentencepiece is presumably required by the tokenizers of the translation models — confirm.
!pip install sentencepiece



In [None]:
import json
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

In [None]:
def parse_jsonl_file(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to a UTF-8 encoded text file holding one JSON document per line.

    Returns:
        list: The decoded JSON values in file order. Blank and whitespace-only
        lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. an extra empty line at the end of the file);
        # json.loads would otherwise raise on them.
        return [json.loads(line) for line in file if line.strip()]

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset paths below resolve.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The dataset is read from Google Drive, so the file paths below must point to the location of the XStance files in your own Drive.

In [None]:
# Load the XStance training split (one JSON object per line) from the mounted Drive.
train_file_path = '/content/drive/MyDrive/CS533_Shared/project/dataset/XStance/train.jsonl'
train_data = parse_jsonl_file(train_file_path)

In [None]:
# Load the XStance validation split (one JSON object per line) from the mounted Drive.
valid_file_path = '/content/drive/MyDrive/CS533_Shared/project/dataset/XStance/valid.jsonl'
valid_data = parse_jsonl_file(valid_file_path)

In [None]:
# Load the XStance test split (one JSON object per line) from the mounted Drive.
test_file_path = '/content/drive/MyDrive/CS533_Shared/project/dataset/XStance/test.jsonl'
test_data = parse_jsonl_file(test_file_path)

In [None]:
# Per language, keep only training comments with fewer than 50 space-separated tokens,
# and reduce each record to the four fields used downstream.
train_data_trunc_fr = [
    {field: row[field] for field in ('language', 'comment', 'label', 'topic')}
    for row in train_data
    if row['language'] == "fr" and len(row['comment'].split(" ")) < 50
]
train_data_trunc_de = [
    {field: row[field] for field in ('language', 'comment', 'label', 'topic')}
    for row in train_data
    if row['language'] == "de" and len(row['comment'].split(" ")) < 50
]

In [None]:
# Report how many training comments per language passed the length filter.
print(
    f"train data fr length: {len(train_data_trunc_fr)}",
    f"train data de length: {len(train_data_trunc_de)}",
    sep="\n",
)

train data fr length: 9743
train data de length: 31012


In [None]:
# Per language, keep only validation comments with fewer than 50 space-separated tokens,
# and reduce each record to the four fields used downstream.
valid_data_trunc_fr = [
    {field: row[field] for field in ('language', 'comment', 'label', 'topic')}
    for row in valid_data
    if row['language'] == "fr" and len(row['comment'].split(" ")) < 50
]
valid_data_trunc_de = [
    {field: row[field] for field in ('language', 'comment', 'label', 'topic')}
    for row in valid_data
    if row['language'] == "de" and len(row['comment'].split(" ")) < 50
]

In [None]:
# Report how many validation comments per language passed the length filter.
print(
    f"valid data fr length: {len(valid_data_trunc_fr)}",
    f"valid data de length: {len(valid_data_trunc_de)}",
    sep="\n",
)

valid data fr length: 899
valid data de length: 2610


In [None]:
# Per language, keep only test comments with fewer than 50 space-separated tokens,
# and reduce each record to the four fields used downstream.
test_data_trunc_fr = [
    {field: row[field] for field in ('language', 'comment', 'label', 'topic')}
    for row in test_data
    if row['language'] == "fr" and len(row['comment'].split(" ")) < 50
]
test_data_trunc_de = [
    {field: row[field] for field in ('language', 'comment', 'label', 'topic')}
    for row in test_data
    if row['language'] == "de" and len(row['comment'].split(" ")) < 50
]

In [None]:
# Report how many test comments per language passed the length filter.
print(
    f"test data fr length: {len(test_data_trunc_fr)}",
    f"test data de length: {len(test_data_trunc_de)}",
    sep="\n",
)


test data fr length: 3672
test data de length: 11085


In [None]:
# Pool the filtered splits per language, preserving train -> valid -> test order.
all_fr_data = [*train_data_trunc_fr, *valid_data_trunc_fr, *test_data_trunc_fr]
all_de_data = [*train_data_trunc_de, *valid_data_trunc_de, *test_data_trunc_de]

In [None]:
# Report the pooled number of comments per language.
print(
    f"All fr length: {len(all_fr_data)}",
    f"All de length: {len(all_de_data)}",
    sep="\n",
)

All fr length: 14314
All de length: 44707


We keep 10,000 rows in total: the first 5,000 French comments and the first 5,000 German comments.

In [None]:
# Cap both languages at the same number of rows, keeping the leading rows of each pooled list.
rows_per_language = 5000
all_fr_data = all_fr_data[:rows_per_language]
all_de_data = all_de_data[:rows_per_language]

This pre-trained model (`Helsinki-NLP/opus-mt-fr-en`, an OPUS-MT/MarianMT checkpoint rather than a T5 model) translates French sentences to English.

In [None]:
fr_model_name = "Helsinki-NLP/opus-mt-fr-en"
# Build the French -> English translation pipeline. Use the first GPU when one is
# available (device=0); otherwise run on CPU (device=-1), as the bare call did.
fr_model = pipeline(
    "translation",
    model=fr_model_name,
    device=0 if torch.cuda.is_available() else -1,
)



In [None]:
# Display the first French record to check which fields each entry carries.
all_fr_data[0]

{'language': 'fr',
 'comment': "C'est un sujet délicat, tout dépend de l'état du patient, il faut que l'on respect sa dignité. La décision peut être prise par sa famille selon son état.",
 'label': 'FAVOR',
 'topic': 'Society'}

In [None]:
# Translate each French comment to English with one pipeline call per comment,
# carrying its stance label and topic over unchanged.
english_translations = []
for fr_data in all_fr_data:
    fr_output = fr_model(fr_data["comment"])
    english_translations.append({
        "topic": fr_data["topic"],
        # The pipeline returns a list with one dict per input; take its translated text.
        "comment": fr_output[0]["translation_text"],
        "label": fr_data["label"],
    })

This pre-trained model (`Helsinki-NLP/opus-mt-de-en`, an OPUS-MT/MarianMT checkpoint rather than a T5 model) translates German sentences to English.

In [None]:
de_model_name = "Helsinki-NLP/opus-mt-de-en"
# Build the German -> English translation pipeline. Use the first GPU when one is
# available (device=0); otherwise run on CPU (device=-1), as the bare call did.
de_model = pipeline(
    "translation",
    model=de_model_name,
    device=0 if torch.cuda.is_available() else -1,
)



In [None]:
# Translate each German comment to English with one pipeline call per comment,
# carrying its stance label and topic over unchanged.
de_translations = []
for de_data in all_de_data:
    de_output = de_model(de_data["comment"])
    de_translations.append({
        "topic": de_data["topic"],
        # The pipeline returns a list with one dict per input; take its translated text.
        "comment": de_output[0]["translation_text"],
        "label": de_data["label"],
    })

# The French cell fills the first len(all_fr_data) entries of english_translations.
# Replace everything after them instead of appending in place, so re-running this
# cell does not duplicate the German rows.
english_translations[len(all_fr_data):] = de_translations

Finally, we write the translated comments, together with their labels and topics, to a CSV file.


In [None]:
import csv

csv_file_path = 'output.csv'

# Write the translated rows to a CSV file with a header row.
# encoding='utf-8' is explicit so non-ASCII characters in the translations do not
# depend on the platform's default encoding; newline='' lets the csv module
# control line endings itself.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['comment', 'label', 'topic']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Header first, then one row per translated comment.
    writer.writeheader()
    writer.writerows(english_translations)