In [None]:
from googletrans import Translator
import pandas as pd
from tqdm import tqdm

def drop_value(filename):
    """Read a CSV file and discard placeholder rows.

    Args:
        filename: Path of the CSV file to load; it must have a ``content`` column.

    Returns:
        A DataFrame without the rows whose ``content`` is exactly
        ``"No content"``, re-indexed from 0.
    """
    raw = pd.read_csv(filename)
    has_real_content = raw['content'] != "No content"
    return raw.loc[has_real_content].reset_index(drop=True)

# Translate each row's title and content
def translate_all_rows(df):
    """Translate every row's ``title`` and ``content`` from Vietnamese to English.

    Rows are visited by position rather than by index label, so the frame may
    carry any index (for example one left over from filtering) and still be
    translated row by row. If translating either field of a row raises, the
    error is printed and BOTH English fields of that row are stored as empty
    strings.

    Args:
        df: DataFrame with ``title`` and ``content`` columns.

    Returns:
        The same DataFrame object, with ``title_en`` and ``content_en``
        columns added to it.

    Raises:
        KeyError: If ``df`` has no ``title`` or ``content`` column.
    """
    translator = Translator()
    titles_en = []
    contents_en = []

    # zip() has no len(), so tqdm needs an explicit total to draw the bar.
    rows = zip(df['title'], df['content'])
    for i, (title, content) in enumerate(tqdm(rows, total=len(df))):
        try:
            title_en = translator.translate(title, src='vi', dest='en').text
            content_en = translator.translate(content, src='vi', dest='en').text

        except Exception as e:
            # Best effort: keep going, but leave this row's translations empty.
            print(f"Error at row {i}: {e}")
            title_en, content_en = "", ""

        titles_en.append(title_en)
        contents_en.append(content_en)

    # List assignment is positional, so it lines up with the iteration order above.
    df['title_en'] = titles_en
    df['content_en'] = contents_en
    return df

# Testing: run the whole pipeline once, end to end.
# NOTE(review): the "...csv" paths look like placeholders — set the real input
# and output paths before running.
# Load the CSV and drop the rows whose content is "No content".
df = drop_value("...csv")
# Add the title_en / content_en columns; the same frame is returned.
df_translated = translate_all_rows(df)
# Save the result without writing the index as a column.
df_translated.to_csv("...csv", index=False)

In [None]:
#restructuring the df (dropping the vietnamese title and content columns)
import pandas as pd
# Reload the translated CSV, discard the Vietnamese title/content columns, and
# keep the English fields plus url/date in a fixed column order before saving.
df = (
    pd.read_csv("...csv")
    .drop(columns=["title", "content"])
    .loc[:, ["title_en", "url", "date", "content_en"]]
)
df.to_csv("...csv", index=False)

In [None]:
# Helsinki model for translation - FAILED due to low semantic quality and poor Vietnamese support
import pandas as pd
from transformers import pipeline
import re

# Only the first row of the CSV is kept (presumably to keep this experiment small).
df = pd.read_csv("example.csv").head(1)

# transformers translation pipeline using the Helsinki-NLP/opus-mt-vi-en checkpoint;
# translate_full_text below reads this module-level name.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-vi-en")

def translate_full_text(text, max_chars=500, translate_fn=None):
    """Translate ``text`` chunk by chunk and join the translated chunks.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    consecutive sentences are packed into chunks of at most ``max_chars``
    characters. A single sentence longer than ``max_chars`` is still sent as
    one chunk of its own. Each chunk is translated independently; a chunk
    whose translation raises is reported with ``print`` and left out of the
    result.

    Args:
        text: Text to translate. Non-string or blank input yields ``""``.
        max_chars: Character budget per chunk (characters, not model tokens).
        translate_fn: Optional callable that takes a string and returns a
            sequence whose first item is a dict with a ``'translation_text'``
            key. Defaults to the module-level ``translator``.

    Returns:
        The translated chunks joined by single spaces.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    if translate_fn is None:
        # Looked up at call time so the default keeps using the notebook's pipeline.
        translate_fn = translator

    chunks = []

    def flush(chunk, label):
        """Translate one chunk and record it; empty chunks are skipped."""
        chunk = chunk.strip()
        if not chunk:
            # Happens when the very first sentence already exceeds max_chars:
            # there is nothing accumulated yet, so do not call the model on "".
            return
        try:
            chunks.append(translate_fn(chunk)[0]['translation_text'])
        except Exception as e:
            print(f"Translation failed on {label}: {e}")

    sentences = re.split(r'(?<=[.!?])\s+', text.strip())  # split by sentence
    current_chunk = ""

    for sentence in sentences:
        # +1 accounts for the space appended after each sentence.
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            flush(current_chunk, "chunk")
            current_chunk = sentence + " "

    flush(current_chunk, "last chunk")

    return " ".join(chunks)

# Translate both text columns; max_chars=512 overrides the function's default of 500.
df["title_en"] = df["title"].apply(lambda x: translate_full_text(x, max_chars=512))
df["content_en"] = df["content"].apply(lambda x: translate_full_text(x, max_chars=512))
# NOTE(review): this writes to the same "example.csv" that was read (and cut to
# one row with .head(1)) earlier in this cell, so the file on disk is replaced
# by that single translated row — confirm this is intended.
df.to_csv("example.csv", index=False)
