In [1]:
import pandas as pd
import translators as ts
import logging
import time
from requests.exceptions import HTTPError
import random


Using region  server backend.



In [2]:
# Define the input CSV, output CSV, and languages
# input_csv: source dataset; it is read into `df` with pd.read_csv, and
# translate_and_log uses its 'title' and 'label' columns.
input_csv = '/gns_code/data/bert/WELFake_Dataset.csv'
# output_csv: file that translate_and_log(df, languages, output_csv) appends
# translated rows to.
output_csv = '/gns_code/data/bert/translated_titles.csv'
# sample_output_csv: separate destination for small trial runs -- presumably
# the value passed to test_translation_and_export_to_csv; no call is visible here.
sample_output_csv = '/gns_code/data/bert/translated_sample.csv'
# Target language codes; translate_and_log cycles through this list row by row
# (languages[i % len(languages)]).
languages = ['fr', 'es', 'ru', 'pt', 'ar']

In [3]:
# Load the full dataset from input_csv; header=0 takes the column names from
# the first line of the file.
df = pd.read_csv(input_csv, header=0)

In [4]:
# Configure logging to save messages to a log file.
# The file handler is created explicitly so the log is always written as UTF-8:
# the logged titles and translations include Russian and Arabic text, which a
# non-UTF-8 platform default encoding cannot represent.
# NOTE: the target directory must already exist; FileHandler opens the file
# immediately and does not create missing directories.
logging.basicConfig(
    handlers=[
        logging.FileHandler('/gns_code/logs/trans logs/translation_log.txt', encoding='utf-8')
    ],
    level=logging.INFO,
    format='%(levelname)s: %(message)s',
)

def clean_dataframe(df):
    """Return the rows of ``df`` that have a non-missing, not-yet-seen title.

    Rows whose 'title' is missing are discarded first; of the remaining rows,
    only the first one for each distinct title is kept.  The caller's frame is
    left untouched and the surviving rows keep their original index labels.
    """
    title_column = ['title']
    with_title = df.dropna(subset=title_column)
    return with_title.drop_duplicates(subset=title_column)

def translate_and_log(df, languages, output_csv):
    """Translate each unique title in ``df`` and append the results to a CSV file.

    The frame is cleaned with ``clean_dataframe`` and re-indexed, then every
    title is translated with the 'google' backend of ``translators`` into one
    language picked round-robin from ``languages`` by row position.  Each
    successful translation is appended to ``output_csv`` straight away, so the
    rows translated so far are already on disk if the run is interrupted.

    Args:
        df: DataFrame with at least 'title' and 'label' columns.
        languages: Non-empty sequence of target language codes.
        output_csv: Path of the CSV file to append to.  The header row is
            written only when the file is empty.

    Returns:
        None.  Results go to ``output_csv``; progress goes to ``logging``.

    Error handling:
        * HTTP 429 is retried, without an upper bound, after a random pause of
          30-90 seconds.
        * Any other ``HTTPError`` and ``IndexError`` are logged and the title
          is skipped.
        * Every other exception propagates to the caller.
    """
    output_columns = ['index', 'label', 'translated_title', 'original_title']

    # Clean the DataFrame
    df = clean_dataframe(df)
    # After reset_index() the frame has a fresh positional index, and the
    # previous index is kept as an ordinary column.
    # NOTE(review): `index` yielded by iterrows() below is therefore the row's
    # position in the cleaned frame, not its label in the input frame, and that
    # position is what ends up in the 'index' output column.
    df = df.reset_index()

    for i, (index, row) in enumerate(df.iterrows()):
        label = row['label']
        original_title = row['title']
        text_to_translate = original_title

        # pd.isna covers None as well as NaN, which is how pandas represents a
        # missing title; a plain `is None` test never matches NaN.
        if pd.isna(text_to_translate):
            continue

        # Select the language based on the current index modulo the number of languages
        lang = languages[i % len(languages)]
        logging.info(f"Translating to {lang}:")

        while True:
            try:
                translated_text = ts.translate_text(text_to_translate, translator='google', from_language='auto', to_language=lang)
                logging.info(f"Translation to {lang}: {translated_text}")

                # Build the single result row directly instead of concatenating
                # it onto an empty frame and resetting that frame afterwards.
                translated_row = pd.DataFrame(
                    {
                        'index': [index],
                        'label': [label],
                        'translated_title': [translated_text],
                        'original_title': [original_title],
                    },
                    columns=output_columns,
                )

                # Append the row to the output CSV file.  UTF-8 is set
                # explicitly because the translations contain non-ASCII text,
                # and newline='' is what pandas requires of a text handle so
                # that line endings are not translated a second time.
                with open(output_csv, 'a', encoding='utf-8', newline='') as f:
                    translated_row.to_csv(f, header=f.tell() == 0, index=False)

                break  # Translation successful, break out of the retry loop
            except HTTPError as e:
                # e.response can be None when the error was raised without a
                # response attached; treat that like any non-429 failure.
                status_code = getattr(e.response, 'status_code', None)
                if status_code == 429:
                    retry_time = random.randint(30, 90)  # Random retry time between 30 to 90 seconds
                    logging.warning(f"Rate limit exceeded. Waiting for {retry_time} seconds...")
                    time.sleep(retry_time)
                    continue  # Retry the translation
                else:
                    logging.error(f"Translation failed: {e}")
                    break  # Break the retry loop for other errors
            except IndexError as e:
                logging.error(f"IndexError occurred: {e}")
                break  # Skip this iteration and continue with the next one

        # Logged for every row, whether or not the translation succeeded.
        logging.info(f"Original Text: {text_to_translate}\n")

        
def test_translation_and_export_to_csv(input_csv, sample_output_csv, languages, sample_size=10, random_state=None):
    """Run the translation pipeline on a small random sample of the dataset.

    Args:
        input_csv: Path of the CSV file to sample from; its first line is used
            as the header.
        sample_output_csv: Path of the CSV file the translated sample is
            appended to.
        languages: Target language codes, forwarded to ``translate_and_log``.
        sample_size: Number of rows to draw.  If the file holds fewer rows,
            all of them are used instead of raising ``ValueError``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same rows can be drawn again; ``None`` keeps the draw random.

    Returns:
        None.  Output is produced by ``translate_and_log``.

    Note:
        The sample is drawn before ``translate_and_log`` cleans the data, so
        fewer than ``sample_size`` titles may be translated when sampled rows
        have missing or duplicate titles.
    """
    # Load the dataset
    df = pd.read_csv(input_csv, header=0)

    # Never ask for more rows than the file contains; DataFrame.sample raises
    # ValueError in that case when sampling without replacement.
    rows_to_draw = min(sample_size, len(df))
    sample_df = df.sample(n=rows_to_draw, random_state=random_state)

    # Call the function to translate, log, and store the translated titles with their labels and original titles
    translate_and_log(sample_df, languages, sample_output_csv)


In [5]:
# Translate the titles of the full dataset and write them to output_csv.
# The output file is opened in append mode, so re-running this cell adds the
# same rows to the file again rather than replacing them.
translate_and_log(df, languages, output_csv)
