In [2]:
from googletrans import Translator
import time
import os
import pandas as pd
from tqdm import tqdm

# Load the English headlines that are going to be translated.
df = pd.read_csv("data_final/us_news.csv")

# Work is split into a fixed number of batches. Integer division is used for
# the batch size, so len(df) need not be an exact multiple of n_batches.
n_batches = 10
batch_size = len(df) // n_batches

# Single googletrans client shared by every translation call.
translator = Translator()
# Enables `Series.progress_apply`, i.e. `apply` with a tqdm progress bar.
tqdm.pandas()

def safe_translate_to_zh(text, max_attempts=3, retry_delay=1.0):
    """Translate English ``text`` to Simplified Chinese without ever raising.

    A failing call (for example a read timeout) is retried a bounded number
    of times before the row is given up on, so one transient network error
    no longer ends up as a placeholder in the translated data.

    Args:
        text: English text to translate.
        max_attempts: Total number of tries before giving up. Values below 1
            are treated as 1.
        retry_delay: Base pause in seconds between tries; the pause grows
            linearly with the attempt number. Use 0 to retry immediately.

    Returns:
        The translated string, or the literal ``"Translation error"`` when
        every attempt failed.
    """
    attempts = max(1, int(max_attempts))
    for attempt in range(1, attempts + 1):
        try:
            return translator.translate(text, src='en', dest='zh-cn').text
        except Exception as e:
            # Deliberately broad: this is a best-effort helper used inside
            # `apply`, and a single bad row must not abort the whole batch.
            print(f"⚠️ Translation error: {e}")
            if attempt < attempts and retry_delay > 0:
                time.sleep(retry_delay * attempt)
    return "Translation error"

output_dir = 'us_batches'
os.makedirs(output_dir, exist_ok=True)

# The recorded run took roughly five minutes per batch, so by default a re-run
# only translates batches whose CSV is not on disk yet. Set to False (or delete
# the files) to force a full re-translation, e.g. after the input CSV changed.
resume = True

for i in range(n_batches):
    start = i * batch_size
    # The last batch runs to the end of the frame so that the rows left over
    # by the integer division are not dropped.
    end = (i + 1) * batch_size if i < n_batches - 1 else len(df)
    batch_path = os.path.join(output_dir, f'us_news_translated_batch_{i+1}.csv')

    if resume and os.path.exists(batch_path):
        print(f"⏭️ Skipping batch {i+1}/{n_batches} (already saved: {batch_path})")
        continue

    batch = df.iloc[start:end]

    print(f"\n🚀 Translating batch {i+1}/{n_batches} ({start} to {end})")

    translated = batch['Headlines'].progress_apply(safe_translate_to_zh)
    translated_df = pd.DataFrame({'Headlines_Chinese': translated})

    # The translation helper returns this placeholder instead of raising;
    # report how many rows carry it so failures are not silently saved.
    failed = int((translated == "Translation error").sum())
    if failed:
        print(f"⚠️ {failed} row(s) in batch {i+1} could not be translated")

    translated_df.to_csv(batch_path, index=False, encoding='utf-8-sig')
    print(f"✅ Saved batch {i+1}")

    # Short pause between batches; nothing follows the last one, so skip it there.
    if i < n_batches - 1:
        time.sleep(1)



🚀 Translating batch 1/10 (0 to 400)


100%|██████████| 400/400 [04:44<00:00,  1.41it/s]


✅ Saved batch 1

🚀 Translating batch 2/10 (400 to 800)


100%|██████████| 400/400 [05:16<00:00,  1.26it/s]


✅ Saved batch 2

🚀 Translating batch 3/10 (800 to 1200)


100%|██████████| 400/400 [05:27<00:00,  1.22it/s]


✅ Saved batch 3

🚀 Translating batch 4/10 (1200 to 1600)


 80%|███████▉  | 319/400 [04:15<03:00,  2.22s/it]

⚠️ Translation error: The read operation timed out


100%|██████████| 400/400 [05:22<00:00,  1.24it/s]


✅ Saved batch 4

🚀 Translating batch 5/10 (1600 to 2000)


100%|██████████| 400/400 [05:27<00:00,  1.22it/s]


✅ Saved batch 5

🚀 Translating batch 6/10 (2000 to 2400)


 53%|█████▎    | 211/400 [02:53<06:15,  1.99s/it]

⚠️ Translation error: The read operation timed out


100%|██████████| 400/400 [05:24<00:00,  1.23it/s]


✅ Saved batch 6

🚀 Translating batch 7/10 (2400 to 2800)


100%|██████████| 400/400 [05:14<00:00,  1.27it/s]


✅ Saved batch 7

🚀 Translating batch 8/10 (2800 to 3200)


100%|██████████| 400/400 [05:02<00:00,  1.32it/s]


✅ Saved batch 8

🚀 Translating batch 9/10 (3200 to 3600)


100%|██████████| 400/400 [05:15<00:00,  1.27it/s]


✅ Saved batch 9

🚀 Translating batch 10/10 (3600 to 4000)


 52%|█████▏    | 208/400 [02:50<06:58,  2.18s/it]

⚠️ Translation error: The read operation timed out


100%|██████████| 400/400 [05:25<00:00,  1.23it/s]


✅ Saved batch 10


In [5]:
import pandas as pd
import glob
import re

def natural_sort_key(s):
    """Build a sort key that orders embedded numbers numerically.

    The string is split into alternating non-digit and digit runs; digit runs
    become ``int`` and everything else is lower-cased, so ``"batch_2"`` sorts
    before ``"batch_10"``.

    Args:
        s: The string (e.g. a file path) to build a key for.

    Returns:
        A list of ``str`` and ``int`` chunks usable as a ``sorted`` key.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    chunks = re.split(r'(\d+)', s)
    # isdecimal() instead of isdigit(): isdigit() is also true for characters
    # such as superscript digits, which '\d' never isolates and int() rejects.
    return [int(chunk) if chunk.isdecimal() else chunk.lower() for chunk in chunks]

# Collect the per-batch CSVs in numeric batch order (batch_2 before batch_10).
batch_files = sorted(glob.glob('us_batches/us_news_translated_batch_*.csv'), key=natural_sort_key)
if not batch_files:
    raise ValueError("No translated batch files found under 'us_batches/'")

translated_batches = [pd.read_csv(f, encoding='utf-8-sig') for f in batch_files]
df_translated = pd.concat(translated_batches, ignore_index=True)

df_us = pd.read_csv("data_final/us_news.csv")

# Headlines are matched to dates purely by row position. A missing or short
# batch file would otherwise shift headlines onto the wrong dates without any
# error, so refuse to continue when the row counts disagree.
if len(df_translated) != len(df_us):
    raise ValueError(
        f"Translated rows ({len(df_translated)}) do not match source rows "
        f"({len(df_us)}); check that every batch file is present and complete"
    )

df_final = df_us[['Date']].copy()
df_final['Headlines'] = df_translated['Headlines_Chinese']

# "Translation error" is the placeholder the translation step stores for rows
# it could not translate; report how many made it into the final file.
n_failed = int((df_final['Headlines'] == "Translation error").sum())
if n_failed:
    print(f"⚠️ {n_failed} headline(s) still contain the 'Translation error' placeholder")

df_final.to_csv('data_final/us_news_to_cn.csv', index=False, encoding='utf-8-sig')
print("🎉 All done! Final file saved: us_news_to_cn.csv")


🎉 All done! Final file saved: us_news_to_cn.csv
