In [None]:
import pandas as pd
from googletrans import Translator
from tqdm import tqdm
import time
import os

# Read the complete set of Chinese headlines into memory
df = pd.read_csv('data_final/cn_news.csv')

# The work is split into ten slices; batch_size is the floored number of
# rows per slice
n_batches = 10
batch_size = df.shape[0] // n_batches

# Register tqdm's pandas integration (provides progress_apply) and create
# the translator client that safe_translate uses
tqdm.pandas()
translator = Translator()

def safe_translate(text, retries=3, backoff=2.0):
    """Translate one Chinese headline to English, retrying transient failures.

    Uses the notebook-level ``translator`` instance and the ``time`` module
    imported at the top of the notebook.

    Args:
        text: Headline to translate (source language ``zh-cn``).
        retries: Maximum number of attempts before giving up. Values below 1
            are treated as 1.
        backoff: Seconds to wait after the first failed attempt; the wait
            doubles after each further failure.

    Returns:
        The English translation, or the sentinel ``"[Translation failed]"``
        when ``text`` is not a string or every attempt raised.
    """
    # Non-string values (e.g. NaN for a missing cell) cannot be translated,
    # so retrying them would only waste time; report once and bail out.
    if not isinstance(text, str):
        print(f"⚠️ Translation error: expected str, got {type(text).__name__}")
        return "[Translation failed]"

    attempts = max(1, int(retries))
    for attempt in range(1, attempts + 1):
        try:
            return translator.translate(text, src='zh-cn', dest='en').text
        except Exception as e:
            # Best-effort by design: one bad headline must not abort a batch.
            print(f"⚠️ Translation error: {e}")
            if attempt < attempts:
                # Exponential backoff gives a throttled/timed-out service
                # time to recover before the next attempt.
                time.sleep(backoff * 2 ** (attempt - 1))
    return "[Translation failed]"

# Output folder to store temporary per-batch results
output_dir = "cn_batches"
os.makedirs(output_dir, exist_ok=True)

for i in range(n_batches):
    start_idx = i * batch_size
    # The last batch absorbs the remainder rows left over by floor division.
    end_idx = (i + 1) * batch_size if i < n_batches - 1 else len(df)
    batch = df.iloc[start_idx:end_idx]
    batch_output_path = os.path.join(output_dir, f'cn_news_translated_batch_{i+1}.csv')

    # Resume support: translating everything takes over an hour, so a batch
    # whose file already holds the expected number of rows is not redone.
    # NOTE: only the row count is checked. Delete the file (or the whole
    # folder) to force a fresh translation, e.g. after the input data changed
    # or to redo rows saved as "[Translation failed]".
    if os.path.exists(batch_output_path):
        try:
            already_done = len(pd.read_csv(batch_output_path)) == len(batch)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError, OSError):
            # Unreadable or truncated file: treat as not done and overwrite it.
            already_done = False
        if already_done:
            print(f"⏭️ Skipping batch {i+1}/{n_batches}: {batch_output_path} already complete")
            continue

    print(f"\n🚀 Translating batch {i+1}/{n_batches} ({start_idx} to {end_idx})")

    translated = batch['Headlines'].progress_apply(safe_translate)
    translated_df = pd.DataFrame({'Headlines_English': translated})

    # Persist immediately so an interruption loses at most the current batch.
    translated_df.to_csv(batch_output_path, index=False)

    print(f"✅ Saved batch {i+1} to {batch_output_path}")

    # Pause between batches to go easy on the translation service; there is
    # nothing to wait for after the final batch.
    if i < n_batches - 1:
        time.sleep(3)



🚀 Translating batch 1/10 (0 to 357)


100%|██████████| 357/357 [03:51<00:00,  1.54it/s]


✅ Saved batch 1 to data_final/batches/cn_news_translated_batch_1.csv

🚀 Translating batch 2/10 (357 to 714)


100%|██████████| 357/357 [04:21<00:00,  1.36it/s]


✅ Saved batch 2 to data_final/batches/cn_news_translated_batch_2.csv

🚀 Translating batch 3/10 (714 to 1071)


100%|██████████| 357/357 [07:08<00:00,  1.20s/it]


✅ Saved batch 3 to data_final/batches/cn_news_translated_batch_3.csv

🚀 Translating batch 4/10 (1071 to 1428)


100%|██████████| 357/357 [06:42<00:00,  1.13s/it]


✅ Saved batch 4 to data_final/batches/cn_news_translated_batch_4.csv

🚀 Translating batch 5/10 (1428 to 1785)


100%|██████████| 357/357 [07:21<00:00,  1.24s/it]


✅ Saved batch 5 to data_final/batches/cn_news_translated_batch_5.csv

🚀 Translating batch 6/10 (1785 to 2142)


 89%|████████▉ | 318/357 [07:23<07:11, 11.05s/it]

⚠️ Translation error: The read operation timed out


 99%|█████████▊| 352/357 [11:36<05:20, 64.15s/it]

⚠️ Translation error: The read operation timed out


100%|██████████| 357/357 [11:45<00:00,  1.98s/it]


✅ Saved batch 6 to data_final/batches/cn_news_translated_batch_6.csv

🚀 Translating batch 7/10 (2142 to 2499)


  3%|▎         | 12/357 [00:15<13:09,  2.29s/it]

⚠️ Translation error: The read operation timed out


100%|██████████| 357/357 [07:26<00:00,  1.25s/it]


✅ Saved batch 7 to data_final/batches/cn_news_translated_batch_7.csv

🚀 Translating batch 8/10 (2499 to 2856)


100%|██████████| 357/357 [07:36<00:00,  1.28s/it]


✅ Saved batch 8 to data_final/batches/cn_news_translated_batch_8.csv

🚀 Translating batch 9/10 (2856 to 3213)


100%|██████████| 357/357 [07:32<00:00,  1.27s/it]


✅ Saved batch 9 to data_final/batches/cn_news_translated_batch_9.csv

🚀 Translating batch 10/10 (3213 to 3578)


 83%|████████▎ | 303/365 [06:10<02:22,  2.29s/it]

⚠️ Translation error: The read operation timed out


100%|██████████| 365/365 [07:25<00:00,  1.22s/it]


✅ Saved batch 10 to data_final/batches/cn_news_translated_batch_10.csv


In [3]:
import glob
import re

def natural_sort_key(s):
    """Build a sort key that orders embedded numbers numerically.

    The string is split into alternating text and digit runs; digit runs
    become ``int`` and text runs are lower-cased, so ``batch_2`` sorts before
    ``batch_10``.

    Args:
        s: The string (here, a file path) to build a key for.

    Returns:
        A list mixing lower-cased ``str`` pieces and ``int`` values.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(r'(\d+)', s)]

# Collect the per-batch files in numeric order (batch_2 before batch_10).
input_files = sorted(glob.glob('cn_batches/cn_news_translated_batch_*.csv'), key=natural_sort_key)
if not input_files:
    raise ValueError("No batch files found matching cn_batches/cn_news_translated_batch_*.csv")

# Re-save every batch file with a UTF-8 BOM (utf-8-sig) — presumably so that
# spreadsheet tools detect the encoding; confirm with the notebook author.
# A dedicated name is used so the `df` loaded in the translation cell is
# not overwritten.
for file in input_files:
    try:
        batch_df = pd.read_csv(file, encoding='utf-8')
        batch_df.to_csv(file, index=False, encoding='utf-8-sig')
    except Exception as e:
        print(f"❌ Error with file {file}: {e}")

df_original = pd.read_csv('data_final/cn_news.csv')
df_translated = pd.concat([pd.read_csv(f, encoding='utf-8-sig') for f in input_files], ignore_index=True)

# Rows are matched purely by position, so a missing, partial or stale batch
# would silently shift or drop translations. Fail loudly instead.
if len(df_translated) != len(df_original):
    raise ValueError(
        f"Row count mismatch: {len(df_translated)} translated rows vs "
        f"{len(df_original)} original rows; re-run the missing or stale batches"
    )

df_final = df_original[['time']].copy()
# Positional assignment; the lengths were verified above.
df_final['Headlines'] = df_translated['Headlines_English'].to_numpy()

final_output_path = 'data_final/cn_news_to_en.csv'
df_final.to_csv(final_output_path, index=False, encoding='utf-8-sig')

print("✅ Merged batches saved in correct order to:", final_output_path)


✅ Merged batches saved in correct order to: data_final/cn_news_to_en.csv
