In [13]:
import pandas as pd
import translators as ts
import logging
import time
from requests.exceptions import HTTPError
import random


In [14]:
# Notebook configuration: file locations and target languages.
# input_csv         - CSV that is loaded into `df` with pd.read_csv below.
# output_csv        - CSV that translate_and_log appends translated rows to.
# sample_output_csv - separate output path intended for small trial runs
#                     (see test_translation_and_export_to_csv).
# languages         - target language codes; translate_and_log cycles through
#                     this list row by row (only English here).
input_csv = '/gns_code/data/bert/translated_titles_continued.csv'
output_csv = '/gns_code/data/bert/Fine-Tuning_Dataset.csv'
sample_output_csv = '/gns_code/data/bert/translated_sample.csv'
languages = ['en']

In [15]:
# Load the input CSV into a DataFrame; the first line of the file is used as
# the column header row.
df = pd.read_csv(input_csv, header=0)

In [16]:
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,index,label,translated_title,original_title
0,25016,0,El fentanilo supera a la heroína como la droga...,Fentanyl Outpaces Heroin as the Deadliest Drug...
1,25017,0,Knifeman нападает на солдата в Парижском метро...,"Knifeman attacks soldier in Paris subway, terr..."
2,25018,0,O Hezbollah emerge um vencedor da turbulência ...,Hezbollah emerges a winner from Mideast turmoi...
3,25019,0,رئيس غامبيا ، في السلطة 22 عامًا ، يفقد الانتخ...,"Gambia’s President, in Power 22 Years, Loses E..."
4,25020,0,Buzz Aldrin: Retirez la station spatiale inter...,Buzz Aldrin: Retire the International Space St...


In [17]:
# Configure the root logger to write INFO-and-above messages to a log file,
# one "LEVEL: message" entry per record.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, so re-running this cell in a live kernel will not reconfigure
# logging. The log directory must already exist.
logging.basicConfig(filename='/gns_code/logs/trans logs/translation_from_X_to_ENG_log.txt', level=logging.INFO, format='%(levelname)s: %(message)s')

def clean_dataframe(df):
    """Return ``df`` without missing or repeated ``original_title`` values.

    Rows whose ``original_title`` is missing are dropped first; among the
    remaining rows only the first occurrence of each ``original_title`` is
    kept. The input frame is not modified and the surviving rows keep their
    original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``original_title`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    title_column = ['original_title']

    # Discard rows that have no title at all.
    with_titles = df.dropna(subset=title_column)

    # Keep only the first row for every distinct title.
    return with_titles.drop_duplicates(subset=title_column)

def translate_and_log(df, languages, output_csv):
    """Translate each row's ``translated_title`` and append the result to a CSV.

    The frame is first passed through ``clean_dataframe`` and re-indexed. For
    every remaining row, the text in ``translated_title`` is sent to the
    Google backend of ``translators`` (source language auto-detected) and
    translated into one of ``languages``, chosen round-robin by row position.
    Each successful translation is appended to ``output_csv`` immediately, so
    an interrupted run keeps everything translated so far.

    Output columns:
        index            - row label after ``reset_index()``, i.e. the row's
                           position in the cleaned frame.
        label            - the row's ``label`` value.
        translated_title - the text returned by the translator.
        original_title   - the text that was sent for translation (the
                           input's ``translated_title``).

    Error handling:
        * HTTP 429 (rate limit): sleep a random 30-90 seconds and retry the
          same row; there is no retry cap.
        * Any other ``HTTPError`` or an ``IndexError``: log it and move on to
          the next row.
        * Other exception types propagate to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label``, ``translated_title`` and ``original_title``.
    languages : list of str
        Target language codes; must not be empty.
    output_csv : str
        Path of the CSV file to append to. The header is written only when
        the file is empty.
    """
    output_columns = ['index', 'label', 'translated_title', 'original_title']

    # Clean the DataFrame
    df = clean_dataframe(df)
    df = df.reset_index()

    for i, (index, row) in enumerate(df.iterrows()):
        label = row['label']
        original_title = row['translated_title']
        text_to_translate = original_title

        # pandas represents empty CSV cells as NaN rather than None, so an
        # ``is None`` test would let missing titles through to the translator.
        # Blank strings are skipped as well: there is nothing to translate.
        if pd.isna(text_to_translate):
            continue
        if isinstance(text_to_translate, str) and not text_to_translate.strip():
            continue

        # Select the language based on the current index modulo the number of languages
        lang = languages[i % len(languages)]
        logging.info(f"Translating to {lang}:")

        while True:
            try:
                translated_text = ts.translate_text(text_to_translate, translator='google', from_language='auto', to_language=lang)

                # Check if the translated_text is None
                if translated_text is None:
                    logging.warning(f"Translation to {lang} failed. Text may be None. Skipping this iteration.")
                    break  # Skip this iteration and continue with the next one

                logging.info(f"Translation to {lang}: {translated_text}")

                # Build the single output row directly instead of concatenating
                # onto an empty frame and resetting it after every write.
                translated_row = pd.DataFrame(
                    {
                        'index': [index],
                        'label': [label],
                        'translated_title': [translated_text],
                        'original_title': [original_title],
                    },
                    columns=output_columns,
                )

                # Append the row to the output CSV. UTF-8 keeps non-Latin
                # titles writable regardless of the platform locale, and
                # newline='' lets the CSV writer control line endings.
                with open(output_csv, 'a', encoding='utf-8', newline='') as f:
                    translated_row.to_csv(f, header=f.tell() == 0, index=False)

                break  # Translation successful, break out of the retry loop

            except HTTPError as e:
                # An HTTPError can be raised without an attached response.
                status_code = getattr(e.response, 'status_code', None)
                if status_code == 429:
                    retry_time = random.randint(30, 90)  # Random retry time between 30 to 90 seconds
                    logging.warning(f"Rate limit exceeded. Waiting for {retry_time} seconds...")
                    time.sleep(retry_time)
                    continue  # Retry the translation
                else:
                    logging.error(f"Translation failed: {e}")
                    break  # Break the retry loop for other errors
            except IndexError as e:
                logging.error(f"IndexError occurred: {e}")
                break  # Skip this iteration and continue with the next one

        logging.info(f"Original Text: {text_to_translate}\n")

        
def test_translation_and_export_to_csv(input_csv, sample_output_csv, languages, sample_size=10):
    """Run the translation pipeline on a small random sample of the input CSV.

    Reads ``input_csv``, draws up to ``sample_size`` random rows and passes
    them to ``translate_and_log``, which appends the translated rows to
    ``sample_output_csv``.

    Parameters
    ----------
    input_csv : str
        Path of the CSV to sample from (first line is the header).
    sample_output_csv : str
        Path of the CSV the translated sample is appended to.
    languages : list of str
        Target language codes forwarded to ``translate_and_log``.
    sample_size : int, default 10
        Maximum number of rows to sample. If the file has fewer rows, all of
        them are used.
    """
    # Load your DataFrame
    df = pd.read_csv(input_csv, header=0)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the frame length.
    sample_df = df.sample(n=min(sample_size, len(df)))

    # Call the function to translate, log, and store the translated titles with their labels and original titles
    translate_and_log(sample_df, languages, sample_output_csv)


In [18]:
# Run the full translation over the loaded DataFrame; translated rows are
# appended to output_csv as they complete.
translate_and_log(df, languages, output_csv)
