# IMPORT

In [2]:
import os
import re
import time
import json
import asyncio
import numpy as np
import pandas as pd

from tqdm import tqdm
from googletrans import Translator

# FUNCTION

In [3]:
async def gg_translate(sentence, from_lang="vi", to_lang="en"):
    """Translate ``sentence`` and return the raw translation result object.

    A new ``Translator`` is opened as an async context manager for this single
    call and is closed again when the call finishes.

    Args:
        sentence: Text handed unchanged to ``Translator.translate``.
        from_lang: Source language code, forwarded as ``src``.
        to_lang: Target language code, forwarded as ``dest``.

    Returns:
        The object that ``Translator.translate`` resolves to.
    """
    async with Translator() as client:
        return await client.translate(sentence, src=from_lang, dest=to_lang)

async def run_gg_translate(sentence, from_lang="vi", to_lang="en"):
    """Translate ``sentence`` and return only the translated text.

    Delegates to ``gg_translate`` with the same language arguments and reads
    the ``text`` attribute of the result it returns.

    Args:
        sentence: Text to translate.
        from_lang: Source language code.
        to_lang: Target language code.

    Returns:
        The ``text`` attribute of the translation result.
    """
    return (await gg_translate(sentence, from_lang=from_lang, to_lang=to_lang)).text

In [4]:
async def row_translation(row_sentences, from_lang="vi", to_lang="en"):
    """Translate every sentence of one row concurrently.

    Args:
        row_sentences: Iterable of sentences; one translation request is
            started per element.
        from_lang: Source language code, forwarded to ``run_gg_translate``.
        to_lang: Target language code, forwarded to ``run_gg_translate``.

    Returns:
        A list of translated texts in the same order as ``row_sentences``
        (an empty list when there are no sentences).

    Raises:
        Whatever the first failing translation raises; ``asyncio.gather``
        propagates it to the caller.
    """
    # The language arguments must be forwarded explicitly; otherwise every
    # request silently falls back to the "vi" -> "en" defaults.
    tasks = [
        run_gg_translate(sen, from_lang=from_lang, to_lang=to_lang)
        for sen in row_sentences
    ]
    return await asyncio.gather(*tasks)

# LOAD DATA

In [5]:
# Read the source CSV into `df`.
# NOTE(review): absolute local path; consider a configurable data directory.
df_path = r"C:\APAC\all_projects\finetuning-airflow-project\projects\data\gather_all_contents.csv"
df = pd.read_csv(filepath_or_buffer=df_path)

# TRANSLATE

In [6]:
# Keep only the columns whose name contains "relative_sen"; these are the
# columns selected for translation in the cells below.
columns = df.columns
need_translated_columns = list(
    filter(lambda col_name: "relative_sen" in col_name, columns)
)

In [7]:
# Sub-frame holding only the columns to translate, plus its raw value array.
need_translated_df = df.loc[:, need_translated_columns]
need_translated_values = need_translated_df.values

# Rebind to the sub-frame's column index, then derive one "translated_<name>"
# output column per input column.
need_translated_columns = need_translated_df.columns
translated_columns = [
    f"translated_{col_name}"
    for col_name in need_translated_columns
]
# Each output column starts with its own independent empty list.
translated_data = dict((col_name, []) for col_name in translated_columns)

In [8]:
# Quick inspection: only the last expression is rendered by the notebook, so
# the cell output is the "Unnamed: 0" column, which the translation loop reads
# as the per-row id.
df.head()
df.loc[:, "Unnamed: 0"]

0       9981
1       9980
2       9979
3       9978
4       9977
        ... 
9977       4
9978       3
9979       2
9980       1
9981       0
Name: Unnamed: 0, Length: 9982, dtype: int64

In [10]:
save_path = r"C:\APAC\all_projects\finetuning-airflow-project\projects\data\save_translation\{id}.npy"
attempts = 3

for i, rows in tqdm(df.iterrows()):
    row_id = rows["Unnamed: 0"]
    row_save_path = save_path.format(id=row_id)
    if os.path.isfile(row_save_path):
        continue
    rows_values = rows[need_translated_columns].values
    translated_data = {col_name: None for col_name in translated_columns}
    
    #-- Translation
    for attempt in range(attempts):
        try:
            translated_rows = await row_translation(rows_values)
            for i, col_name in enumerate(need_translated_columns):
                trans_col_name = f"translated_{col_name}"
                translated_data[trans_col_name] = translated_rows[i]
                np.save(row_save_path, translated_data)
            break
        except Exception as e:
            print(f"ID: {row_id} - Try: {attempt}")
            print(e)
            time.sleep(2)
        
        

3144it [00:02, 1366.98it/s]

ID: 6458 - Try: 0
Server disconnected without sending a response.


3250it [03:04,  2.58s/it]  

ID: 6374 - Try: 0



3435it [11:54,  3.00s/it]

ID: 6180 - Try: 0



3444it [12:33,  3.84s/it]

ID: 6165 - Try: 0



3448it [12:54,  4.14s/it]

ID: 6164 - Try: 0



3693it [23:25,  2.41s/it]

ID: 5886 - Try: 0



3715it [24:24,  2.47s/it]

ID: 5862 - Try: 0



4195it [44:55,  2.55s/it]

ID: 5328 - Try: 0



4217it [45:56,  2.48s/it]

ID: 5308 - Try: 0



4235it [46:49,  2.49s/it]

ID: 5286 - Try: 0



4378it [53:00,  2.47s/it]

ID: 5133 - Try: 0



4381it [53:15,  3.64s/it]

ID: 5130 - Try: 0



4424it [55:09,  2.52s/it]

ID: 5077 - Try: 0



4490it [57:50,  2.59s/it]

ID: 5001 - Try: 0



4535it [59:47,  2.31s/it]

ID: 4956 - Try: 0



4567it [1:01:12,  2.36s/it]

ID: 4924 - Try: 0
<StreamReset stream_id:1, error_code:ErrorCodes.REFUSED_STREAM, remote_reset:True>


4593it [1:02:22,  3.36s/it]

ID: 4888 - Try: 0



4690it [1:06:33,  2.60s/it]

ID: 4780 - Try: 0



4764it [1:09:36,  2.14s/it]

ID: 4707 - Try: 0



5016it [1:19:34,  2.29s/it]

ID: 4417 - Try: 0



5276it [1:29:52,  2.44s/it]

ID: 4131 - Try: 0



5347it [1:32:40,  1.04s/it]


CancelledError: 