In [6]:
# Dependencies
# !pip install transformers sentencepiece tqdm pandas gdown sacremoses -q -U googletrans
!pip install googletrans pandas gdown tqdm

# Download raw files
!gdown 1Cd1boZux0fZvcX_z2-9zUKeKWrf1WFkE # Train file
!gdown 1sE_TdtM3E2a19ZUqVS-p_5qFCZi0uQjW # Val file
!gdown 1vEXyamOlioPpFssxR4r8JY5acjRM3E0m # Test file
!ls

Downloading...
From: https://drive.google.com/uc?id=1Cd1boZux0fZvcX_z2-9zUKeKWrf1WFkE
To: /content/PHOENIX-2014-T.train.corpus.csv
100% 1.69M/1.69M [00:00<00:00, 121MB/s]
Downloading...
From: https://drive.google.com/uc?id=1sE_TdtM3E2a19ZUqVS-p_5qFCZi0uQjW
To: /content/PHOENIX-2014-T.dev.corpus.csv
100% 119k/119k [00:00<00:00, 98.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vEXyamOlioPpFssxR4r8JY5acjRM3E0m
To: /content/PHOENIX-2014-T.test.corpus.csv
100% 142k/142k [00:00<00:00, 94.7MB/s]
PHOENIX-2014-T.dev.corpus.csv	PHOENIX-2014-T.train.corpus.csv
PHOENIX-2014-T.test.corpus.csv	sample_data


In [7]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
# from transformers import MarianMTModel, MarianTokenizer
# import torch
import re

In [8]:
# @title
# Enhanced text cleaning functions based on PHOENIX-2014-T dataset analysis
def clean_vietnamese_text(text):
    """
    Clean Vietnamese text - remove symbols, normalize punctuation and whitespace
    """
    if not text or str(text).strip() == "":
        return ""

    text = str(text).strip()

    # Remove non-text symbols (musical notes, special symbols)
    text = re.sub(r'[♪♫…""`~@#$%^&*()_+=\[\]{}|\\:;"<>?/]', "", text)

    # Keep only letters, numbers, basic punctuation and Vietnamese characters
    text = re.sub(r"[^\w\s.,!?-]", "", text)

    # Fix punctuation spacing
    text = re.sub(r"\s+([.,!?])", r"\1", text)  # Remove space before punctuation
    text = re.sub(r"([.,!?])([^\s])", r"\1 \2", text)  # Add space after punctuation

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


def clean_german_text(text):
    """
    Clean German text - remove symbols, normalize punctuation and whitespace
    Enhanced for PHOENIX dataset specific issues
    """
    if not text or str(text).strip() == "":
        return ""

    text = str(text).strip()

    # Convert to lowercase for better translation
    text = text.lower()

    # Remove non-text symbols
    text = re.sub(r'[♪♫…""`~@#$%^&*()_+=\[\]{}|\\:;"<>?/]', "", text)

    # Keep only letters, numbers, basic punctuation and German characters
    text = re.sub(r"[^\w\s.,!?-äöüÄÖÜß]", "", text)

    # Fix common German text issues in PHOENIX dataset
    # Remove multiple consecutive punctuation marks
    text = re.sub(r"[.,!?]{2,}", ".", text)

    # Fix spacing around numbers and temperature expressions
    text = re.sub(r"(\d+)\s*grad", r"\1 grad", text)
    text = re.sub(r"minus\s+(\d+)", r"minus \1", text)
    text = re.sub(r"plus\s+(\d+)", r"plus \1", text)

    # Fix punctuation spacing
    text = re.sub(r"\s+([.,!?])", r"\1", text)
    text = re.sub(r"([.,!?])([^\s])", r"\1 \2", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


def clean_german_gloss(text):
    """
    Clean German sign language gloss - enhanced for PHOENIX dataset specific patterns
    """
    if not text or str(text).strip() == "":
        return ""

    text = str(text).strip().upper()

    # Remove all symbols except letters, numbers, spaces, hyphens
    text = re.sub(r"[^\w\s-]", "", text)

    # Handle specific PHOENIX dataset patterns
    # Fix hyphenated compounds and multi-word expressions
    text = re.sub(r"(\w+)-(\w+)", r"\1 \2", text)  # Split hyphenated words

    # Normalize special gloss notation (IX, HABEN2, etc.)
    text = re.sub(
        r"(\w+)(\d+)", r"\1", text
    )  # Remove numbers from end of words (HABEN2 -> HABEN)

    # Handle location markers (common in sign language)
    text = re.sub(r"\bIX\b", "PUNKT", text)  # IX often means pointing/location

    # Handle repeated words (sign language emphasis)
    text = re.sub(r"\b(\w+)\s+\1\b", r"\1", text)  # Remove immediate repetitions

    # Clean up multiple spaces and hyphens
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"-+", "-", text)

    # Remove leading/trailing hyphens and spaces
    text = re.sub(r"^[-\s]+|[-\s]+$", "", text)

    # Handle empty results
    if not text.strip():
        return ""

    return text

In [9]:
# @title
def validate_data_quality(df):
    """
    Check and report data quality issues
    """
    print("  📊 Data Quality Check:")

    # Check for empty/null values
    empty_gloss = df["orth"].isna().sum() + (df["orth"].str.strip() == "").sum()
    empty_translation = (
        df["translation"].isna().sum() + (df["translation"].str.strip() == "").sum()
    )

    print(f"    - Empty gloss entries: {empty_gloss}")
    print(f"    - Empty translation entries: {empty_translation}")

    # Check for very short entries (likely incomplete)
    short_gloss = (df["orth"].str.len() < 5).sum()
    short_translation = (df["translation"].str.len() < 10).sum()

    print(f"    - Very short gloss (<5 chars): {short_gloss}")
    print(f"    - Very short translation (<10 chars): {short_translation}")

    # Check for very long entries (likely errors)
    long_gloss = (df["orth"].str.len() > 200).sum()
    long_translation = (df["translation"].str.len() > 300).sum()

    print(f"    - Very long gloss (>200 chars): {long_gloss}")
    print(f"    - Very long translation (>300 chars): {long_translation}")

    return df

def filter_valid_entries(df):
    """
    Remove entries that are too short/long or have quality issues
    """
    initial_count = len(df)

    # Remove entries with very short content (likely incomplete)
    df = df[
        (df["orth"].str.len() >= 3)
        & (df["translation"].str.len() >= 5)
        & (df["orth"].str.strip() != "")
        & (df["translation"].str.strip() != "")
    ]

    # Remove entries that are too long (likely concatenated errors)
    df = df[(df["orth"].str.len() <= 200) & (df["translation"].str.len() <= 300)]

    filtered_count = len(df)
    removed_count = initial_count - filtered_count

    if removed_count > 0:
        print(
            f"  🧹 Filtered out {removed_count} low-quality entries ({removed_count/initial_count*100:.1f}%)"
        )

    return df

In [10]:
# Translation setup
# We will use googletrans for translation
# from transformers import MarianMTModel, MarianTokenizer
# import torch

# model_name = "Helsinki-NLP/opus-mt-de-vi"
# tokenizer = MarianTokenizer.from_pretrained(model_name)
# model = MarianMTModel.from_pretrained(model_name)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

from googletrans import Translator
import asyncio
import pandas as pd # Added pandas import
from pathlib import Path # Added Path import
from tqdm.auto import tqdm # Added tqdm import


translator = Translator()


async def translate_batch(texts):
    """
    Translate a list of texts using Google Translate in one batch with better error handling
    """
    # Handle empty or NaN values safely
    clean_texts = [
        "" if (pd.isna(t) or str(t).strip() == "") else str(t) for t in texts
    ]

    # Skip completely empty batches
    if all(text == "" for text in clean_texts):
        return [""] * len(clean_texts)

    try:
        # Translate as a batch
        translations = await translator.translate(clean_texts, src='de', dest='vi')
        return [t.text for t in translations]

    except Exception as e:
        print(f"    ⚠️ Translation error: {str(e)}")
        return [""] * len(clean_texts)

# The data processing and translation loop has been moved to cell da51082b which includes the fix for the asyncio error.

In [11]:
!pip install nest_asyncio -q
import nest_asyncio
nest_asyncio.apply()

I've added `nest_asyncio` to handle the event loop issue. Now, I will re-run the translation code using `asyncio.run()`.

In [12]:
# Translation setup
# We will use googletrans for translation
# from transformers import MarianMTModel, MarianTokenizer
# import torch

# model_name = "Helsinki-NLP/opus-mt-de-vi"
# tokenizer = MarianTokenizer.from_pretrained(model_name)
# model = MarianMTModel.from_pretrained(model_name)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

from googletrans import Translator
import asyncio

translator = Translator()


async def translate_batch(texts):
    """
    Translate a list of texts using Google Translate in one batch with better error handling
    """
    # Handle empty or NaN values safely
    clean_texts = [
        "" if (pd.isna(t) or str(t).strip() == "") else str(t) for t in texts
    ]

    # Skip completely empty batches
    if all(text == "" for text in clean_texts):
        return [""] * len(clean_texts)

    try:
        # Translate as a batch
        translations = await translator.translate(clean_texts, src='de', dest='vi')
        return [t.text for t in translations]

    except Exception as e:
        print(f"    ⚠️ Translation error: {str(e)}")
        return [""] * len(clean_texts)


files = [
    "PHOENIX-2014-T.dev.corpus.csv",
    "PHOENIX-2014-T.test.corpus.csv",
    "PHOENIX-2014-T.train.corpus.csv",
]

batch_size = 32  # Reduced for stability

for file in files:
    if not Path(file).is_file():
        print(f"File not found: {file}")
        continue

    print(f"Processing {file}...")
    df = pd.read_csv(file, delimiter="|", encoding="utf-8")
    initial_rows = len(df)
    print(f"  📈 Initial rows: {initial_rows}")


    # Keep only the columns we need
    if "orth" in df.columns and "translation" in df.columns:
        df = df[["orth", "translation"]]
    else:
        print(f"  ❌ Required columns not found in {file}")
        continue

    # Validate data quality
    validate_data_quality(df)

    # Filter out low-quality entries
    df = filter_valid_entries(df)
    after_filtering_rows = len(df)
    print(f"  📉 Rows after initial filtering: {after_filtering_rows} ({initial_rows - after_filtering_rows} removed)")


    if len(df) == 0:
        print(f"  ❌ No valid entries remaining in {file}")
        continue

    # Clean the original German data first
    print("  🧹 Cleaning German gloss...")
    df["orth"] = df["orth"].apply(clean_german_gloss)

    print("  🧹 Cleaning German sentences...")
    df["translation"] = df["translation"].apply(clean_german_text)

    # Remove entries that became empty after cleaning
    before_cleaning_empty_removal = len(df)
    df = df[(df["orth"].str.strip() != "") & (df["translation"].str.strip() != "")]
    after_cleaning_empty_removal = len(df)
    print(f"  📉 Rows after removing empty entries post-cleaning: {after_cleaning_empty_removal} ({before_cleaning_empty_removal - after_cleaning_empty_removal} removed)")


    if len(df) == 0:
        print(f"  ❌ No entries remaining after cleaning {file}")
        continue

    # Translate to Vietnamese
    print("  🔄 Translating to Vietnamese...")
    translations = []
    texts = df["translation"].tolist()

    # Process in batches with progress bar
    async def process_file():
        for i in tqdm(
            range(0, len(texts), batch_size),
            desc=f"Translating {file}",
            dynamic_ncols=True,
            leave=False,
        ):
            batch_texts = texts[i : i + batch_size]
            batch_translations = await translate_batch(batch_texts)
            translations.extend(batch_translations)

    asyncio.run(process_file())


    # Clean the Vietnamese translations
    print("  🧹 Cleaning Vietnamese translations...")
    cleaned_translations = [clean_vietnamese_text(trans) for trans in translations]

    df["viSentence"] = cleaned_translations

    # Final filtering - remove entries with empty translations
    before_final_filtering = len(df)
    df = df[df["viSentence"].str.strip() != ""]
    after_final_filtering = len(df)
    print(f"  📉 Rows after removing empty Vietnamese translations: {after_final_filtering} ({before_final_filtering - after_final_filtering} removed)")


    final_df = df.rename(columns={"orth": "geGloss", "translation": "geSentence"})

    # Save cleaned dataset
    name = ".".join(file.split(".")[:2])
    final_df.to_csv(f"{name}.csv", sep="|", index=False, encoding="utf-8")
    final_df.to_json(f"{name}.jsonl", orient="records", lines=True, force_ascii=False)

    print(f"  ✅ Saved clean dataset: {name}.csv")
    print(f"  📊 Final records: {len(final_df)}")

    # Show example of cleaned data
    if len(final_df) > 0:
        print(f"  📝 Example:")
        print(f"    German Gloss: {final_df['geGloss'].iloc[0]}")
        print(f"    German Text:  {final_df['geSentence'].iloc[0]}")
        print(f"    Vietnamese:   {final_df['viSentence'].iloc[0]}")
    print("-" * 60)

Processing PHOENIX-2014-T.dev.corpus.csv...
  📈 Initial rows: 519
  📊 Data Quality Check:
    - Empty gloss entries: 0
    - Empty translation entries: 0
    - Very short gloss (<5 chars): 0
    - Very short translation (<10 chars): 0
    - Very long gloss (>200 chars): 0
    - Very long translation (>300 chars): 0
  📉 Rows after initial filtering: 519 (0 removed)
  🧹 Cleaning German gloss...
  🧹 Cleaning German sentences...
  📉 Rows after removing empty entries post-cleaning: 519 (0 removed)
  🔄 Translating to Vietnamese...


Translating PHOENIX-2014-T.dev.corpus.csv:   0%|          | 0/17 [00:00<?, ?it/s]

  🧹 Cleaning Vietnamese translations...
  📉 Rows after removing empty Vietnamese translations: 519 (0 removed)
  ✅ Saved clean dataset: PHOENIX-2014-T.dev.csv
  📊 Final records: 519
  📝 Example:
    German Gloss: DRUCK TIEF KOMMEN
    German Text:  tiefer luftdruck bestimmt in den nächsten tagen unser wetter
    Vietnamese:   Áp suất không khí sâu sẽ xác định thời tiết của chúng ta trong vài ngày tới
------------------------------------------------------------
Processing PHOENIX-2014-T.test.corpus.csv...
  📈 Initial rows: 642
  📊 Data Quality Check:
    - Empty gloss entries: 0
    - Empty translation entries: 0
    - Very short gloss (<5 chars): 0
    - Very short translation (<10 chars): 0
    - Very long gloss (>200 chars): 0
    - Very long translation (>300 chars): 0
  📉 Rows after initial filtering: 642 (0 removed)
  🧹 Cleaning German gloss...
  🧹 Cleaning German sentences...
  📉 Rows after removing empty entries post-cleaning: 642 (0 removed)
  🔄 Translating to Vietnamese...


Translating PHOENIX-2014-T.test.corpus.csv:   0%|          | 0/21 [00:00<?, ?it/s]

  🧹 Cleaning Vietnamese translations...
  📉 Rows after removing empty Vietnamese translations: 642 (0 removed)
  ✅ Saved clean dataset: PHOENIX-2014-T.test.csv
  📊 Final records: 642
  📝 Example:
    German Gloss: REGEN SCHNEE REGION VERSCHWINDEN NORD REGEN KOENNEN REGION STERN KOENNEN SEHEN
    German Text:  regen und schnee lassen an den alpen in der nacht nach im norden und nordosten fallen hier und da schauer sonst ist das klar
    Vietnamese:   Mưa và tuyết buông dãy Alps trong đêm ở phía bắc và đông bắc rơi ở đây và đó.
------------------------------------------------------------
Processing PHOENIX-2014-T.train.corpus.csv...
  📈 Initial rows: 7096
  📊 Data Quality Check:
    - Empty gloss entries: 0
    - Empty translation entries: 0
    - Very short gloss (<5 chars): 1
    - Very short translation (<10 chars): 1
    - Very long gloss (>200 chars): 1
    - Very long translation (>300 chars): 0
  🧹 Filtered out 1 low-quality entries (0.0%)
  📉 Rows after initial filtering: 7095 (1

Translating PHOENIX-2014-T.train.corpus.csv:   0%|          | 0/222 [00:00<?, ?it/s]

    ⚠️ Translation error: 
  🧹 Cleaning Vietnamese translations...
  📉 Rows after removing empty Vietnamese translations: 7063 (32 removed)
  ✅ Saved clean dataset: PHOENIX-2014-T.train.csv
  📊 Final records: 7063
  📝 Example:
    German Gloss: JETZT WETTER MORGEN DONNERSTAG ZWOELF FEBRUAR
    German Text:  und nun die wettervorhersage für morgen donnerstag den zwölften august
    Vietnamese:   Và bây giờ dự báo thời tiết cho ngày mai thứ năm
------------------------------------------------------------


In [13]:
from google.colab import files
import os

files_to_download = [
    "PHOENIX-2014-T.dev.csv",
    "PHOENIX-2014-T.dev.jsonl",
    "PHOENIX-2014-T.test.csv",
    "PHOENIX-2014-T.test.jsonl",
    "PHOENIX-2014-T.train.csv",
    "PHOENIX-2014-T.train.jsonl",
]

for file in files_to_download:
    if os.path.exists(file):
        files.download(file)
    else:
        print(f"File not found: {file}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>