# Notebook for Dataset Translation

In [34]:
import requests
import csv
import re
from tqdm.notebook import tqdm

Run LibreTranslate locally.

Method 1 (Docker):

```bash
git clone https://github.com/LibreTranslate/LibreTranslate
./run.sh [args]
```

Method 2 (pip):

```bash
pip install libretranslate
libretranslate [args]
```

Note:

- For both methods use args: ```--load-only en,ar ```
- For Mac: **Disable AirPlay Receiver!** (it runs on port 5000)


In [27]:
# Test if server works

url = "http://127.0.0.1:5000/translate"

params = {
    "q": "Hello, how are you?",
    "source": "en",
    "target": "ar"
}

try:
    response = requests.post(url, json=params)
    if response.status_code == 200:
        translated_text = response.json()["translatedText"]
        print(translated_text)
    else:
        print(f"Request failed with status code {response.status_code}")
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")

مرحبا، كيف حالك؟?


In [38]:
# Functions to load, translate, and save datasets

def translate_text(text, src_language, trg_language):
    url = "http://127.0.0.1:5000/translate"
    headers = {'Content-Type': 'application/json'}
    data = {
        "q": text,
        "source": src_language,
        "target": trg_language,
        "format": "text",
        "api_key": ""
    }

    try:
        response = requests.post(url, json=data, headers=headers)
        if response.status_code == 200:
            return response.json()["translatedText"]
        else:
            print(f"HTTP error: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}")

def read_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():  # Ensure the line is not empty
                parts = line.strip().split('\t')
                data.append(parts)
    return data

def save_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for row in data:
            writer.writerow(row)

def process_and_translate_dataset(file_path, output_file):
    data = read_data(file_path)
    translated_data = []
    
    print(f"Translating {file_path} . . .")
            
    for row in tqdm(data, desc="Translating tweets"):
        tweet_id, tweet_text, off_label, hs_label, vulgar_label, violence_label = row
        # Remove all emojis from the tweet text
        tweet_text = emoji_pattern.sub(r'', tweet_text)
        # Replace <LF> with .
        tweet_text = tweet_text.replace("<LF>", ".")
        translated_text = translate_text(tweet_text, 'ar', 'en')
        translated_data.append([tweet_id, translated_text, off_label, hs_label, vulgar_label, violence_label])
        # print(f"Tweet {tweet_id} done.")
    
    print(f"File {file_path} done.")
    
    save_to_csv(translated_data, output_file)
    
def process_and_translate_dataset_test(file_path, output_file):
    data = read_data(file_path)
    translated_data = []
    
    print(f"Translating {file_path} . . .")
            
    for row in tqdm(data, desc="Translating tweets"):
        tweet_id, tweet_text = row
        # Remove all emojis from the tweet text
        tweet_text = emoji_pattern.sub(r'', tweet_text)
        # Replace <LF> with .
        tweet_text = tweet_text.replace("<LF>", ".")
        translated_text = translate_text(tweet_text, 'ar', 'en')
        translated_data.append([tweet_id, translated_text])
        # print(f"Tweet {tweet_id} done.")
    
    print(f"File {file_path} done.")
    
    save_to_csv(translated_data, output_file)



In [30]:
# Test translate function

msg_en = "Hello. How are 😂😂😝😉 you?"
msg_ar = "مرحباً كيف حالك؟?"

print(translate_text(msg_en, "en", "ar"))
print(translate_text(msg_ar, "ar", "en"))

# Batched
msgs = ["Hello, how are you?", "What is your name?"]

print(translate_text(msgs, "en", "ar"))

مرحباً كيف حالك؟?
Hi. How are you?
['مرحبا، كيف حالك؟?', 'ما اسمك؟?']


In [31]:
file_paths = [
    "arabic-data/OSACT2022-sharedTask-dev.txt",
    "arabic-data/OSACT2022-sharedTask-test-tweets.txt",
    "arabic-data/OSACT2022-sharedTask-train.txt"
]

output_files = [
    "translated-data/OSACT2022-sharedTask-dev-en.csv",
    "translated-data/OSACT2022-sharedTask-test-tweets-en.csv",
    "translated-data/OSACT2022-sharedTask-train-en.csv"
]

In [32]:
# Test process_and_translate_dataset function for single rows in dataset

# Regex pattern to match emojis
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F700-\U0001F77F"  # alchemical symbols
    u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    u"\U0001FA00-\U0001FA6F"  # Chess Symbols
    u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    u"\U00002702-\U000027B0"  # Dingbats
    u"\U000024C2-\U0001F251"  # Enclosed characters
    "]+", flags=re.UNICODE)

def process_and_translate_dataset_test(file_path, output_file, row_id):
    data = read_data(file_path)
    translated_data = []
    
    # for row in data:
    row = data[row_id]
    tweet_id, tweet_text, off_label, hs_label, vulgar_label, violence_label = row
    print(f"Before replacement: {tweet_text}")
    # Remove all emojis from the tweet text
    tweet_text = emoji_pattern.sub(r'', tweet_text)
    # Replace <LF> with .
    tweet_text = tweet_text.replace("<LF>", ".")
    print(f"After replacement: {tweet_text}")
    translated_text = translate_text(tweet_text, 'ar', 'en')
    translated_data.append([tweet_id, translated_text, off_label, hs_label, vulgar_label, violence_label])
    print(f"After transalation: {tweet_text}")
    save_to_csv(translated_data, output_file)

row_id = 2
process_and_translate_dataset_test('arabic-data/OSACT2022-sharedTask-dev.txt', f'translated-data/test{row_id}', row_id)

Before replacement: RT @USER ابديت السناب  الجديد ❌<LF>حاس الناس حوس ،أشوف مشاهير. تضيفني، مشاهير تتابع يومياتي ، ابديت كرهني بالسناب  كله 😤
After replacement: RT @USER ابديت السناب  الجديد .حاس الناس حوس ،أشوف مشاهير. تضيفني، مشاهير تتابع يومياتي ، ابديت كرهني بالسناب  كله 
After transalation: RT @USER ابديت السناب  الجديد .حاس الناس حوس ،أشوف مشاهير. تضيفني، مشاهير تتابع يومياتي ، ابديت كرهني بالسناب  كله 


In [36]:
process_and_translate_dataset(file_paths[0], output_files[0])

Translating arabic-data/OSACT2022-sharedTask-dev.txt . . .


Translating tweets:   0%|          | 0/1270 [00:00<?, ?it/s]

File arabic-data/OSACT2022-sharedTask-dev.txt done.
Translating arabic-data/OSACT2022-sharedTask-test-tweets.txt . . .


Translating tweets:   0%|          | 0/2541 [00:00<?, ?it/s]

ValueError: not enough values to unpack (expected 6, got 2)

In [40]:
process_and_translate_dataset(file_paths[2], output_files[2])
process_and_translate_dataset_test(file_paths[1], output_files[1])

Translating arabic-data/OSACT2022-sharedTask-train.txt . . .


Translating tweets:   0%|          | 0/8887 [00:00<?, ?it/s]

File arabic-data/OSACT2022-sharedTask-train.txt done.
Translating arabic-data/OSACT2022-sharedTask-test-tweets.txt . . .


Translating tweets:   0%|          | 0/2541 [00:00<?, ?it/s]

File arabic-data/OSACT2022-sharedTask-test-tweets.txt done.
