# TRANSLATE USING GTRANSLATE

In [None]:
pip install googletrans

Collecting googletrans
  Downloading googletrans-4.0.2-py3-none-any.whl.metadata (10 kB)
Downloading googletrans-4.0.2-py3-none-any.whl (18 kB)
Installing collected packages: googletrans
Successfully installed googletrans-4.0.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!/usr/bin/env python3
"""
Bulk-translate a CSV column that contains Python-list-of-dicts conversations
into Ga (googletrans language code "gaa") – resumable, concurrent, using the googletrans async API.
"""
import asyncio
import ast
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

import pandas as pd
from googletrans import Translator

# ------------------------------------------------------------------ CONFIG
# Tunables and file locations for the translation run (all paths live on the mounted Drive).
MAX_CONCURRENT = 20           # max rows translated at once (size of the semaphore in process_chunk)
BATCH_SIZE     = 20          # texts sent per translator.translate() call
CHUNK_SIZE     = 500         # rows processed between disk writes (CSV + progress file)
DEST_LANG      = "gaa"        # target language code passed to googletrans; "gaa" is Ga (not Twi/Akan)
RETRY_DELAY    = 2           # base delay in seconds; multiplied by the attempt number before each retry
MAX_RETRIES    = 3           # max retries per translate_texts_with_retry() call
PROGRESS_FILE  = Path("/content/drive/MyDrive/Collab/Boafo/translation_progress.json")
INPUT_FILE     = Path("/content/drive/MyDrive/Collab/Boafo/code-dataset-output_eng-placeholders.csv")
# NOTE(review): the file name says "twi" although DEST_LANG is "gaa"; later cells read this exact path.
OUTPUT_FILE    = Path("/content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi.csv")

# ------------------------------------------------------------------ UTILS
def load_progress() -> Dict[str, Any]:
    """Return the saved progress state, or a fresh empty state.

    A fresh state is returned when PROGRESS_FILE does not exist or when it
    cannot be read/parsed (in which case a warning is printed).
    """
    fresh_state = {"processed": [], "last_idx": -1, "errors": []}
    if not PROGRESS_FILE.exists():
        return fresh_state

    try:
        raw_json = PROGRESS_FILE.read_text(encoding="utf8")
        return json.loads(raw_json)
    except Exception as e:
        # Unreadable or malformed file: warn and fall back to the empty state.
        print(f"⚠️  Corrupt progress file ({e}) – starting fresh.")
        return fresh_state

def save_progress(progress: Dict[str, Any]) -> None:
    """Persist *progress* to PROGRESS_FILE as pretty-printed UTF-8 JSON.

    The JSON is written to a sibling ``*.tmp`` file first and then renamed
    over PROGRESS_FILE, so an interruption mid-write cannot leave a truncated
    progress file (which load_progress would discard as corrupt, losing the
    record of every row already translated).
    """
    payload = json.dumps(progress, indent=2, ensure_ascii=False)
    tmp_path = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf8")
    # Path.replace overwrites the destination if it already exists.
    tmp_path.replace(PROGRESS_FILE)

# ------------------------------------------------------------------ CONVERSATION HELPERS
def extract_texts(conv: List[Dict[str, Any]]) -> Tuple[List[str], List[int]]:
    """Collect the translatable strings of a conversation.

    Returns ``(texts, idx_map)`` where ``texts[k]`` is the stripped string form
    of the ``"value"`` of message ``conv[idx_map[k]]``. Non-dict messages,
    messages with a missing/falsy ``"value"``, and values that are blank after
    stripping are skipped.
    """
    texts: List[str] = []
    idx_map: List[int] = []

    for position, message in enumerate(conv):
        if not isinstance(message, dict):
            continue
        raw_value = message.get("value")
        if not raw_value:
            continue
        stripped = str(raw_value).strip()
        if not stripped:
            continue
        texts.append(stripped)
        idx_map.append(position)

    return texts, idx_map

def rebuild(conv: List[Dict[str, Any]], idx_map: List[int], translations: List[str]) -> List[Dict[str, Any]]:
    """Return a copy of *conv* with translated text written back into place.

    ``translations[j]`` replaces the ``"value"`` of message ``idx_map[j]``.
    If there are fewer translations than indices, the remaining messages keep
    their original value. The input conversation is not mutated: dict messages
    are shallow-copied.

    Non-dict entries are passed through unchanged instead of being fed to
    ``dict()`` (which raises for strings/None), matching extract_texts, which
    skips such entries rather than failing on them.
    """
    out = [dict(m) if isinstance(m, dict) else m for m in conv]
    # zip stops at the shorter sequence, so surplus indices are left untouched.
    for idx, translated in zip(idx_map, translations):
        out[idx]["value"] = translated
    return out

# ------------------------------------------------------------------ TRANSLATION WITH RETRY
async def translate_texts_with_retry(translator: Translator, texts: List[str], max_retries: int = MAX_RETRIES) -> List[str]:
    """Translate *texts* into DEST_LANG in batches of BATCH_SIZE, retrying on failure.

    Args:
        translator: googletrans translator whose ``translate`` coroutine is awaited.
        texts: strings to translate.
        max_retries: number of retries after the first failed attempt.

    Returns:
        The translated strings in input order. If every attempt fails, the
        ORIGINAL untranslated texts are returned (best-effort fallback), so the
        result always has one entry per input text.

    Batches that already succeeded are kept across retries: a retry resumes at
    the batch that failed instead of re-sending everything to the service.
    """
    if not texts:
        return []

    results: List[str] = []
    next_start = 0  # index of the first text not yet translated; survives retries

    for attempt in range(max_retries + 1):
        try:
            while next_start < len(texts):
                batch = texts[next_start:next_start + BATCH_SIZE]

                translated = await translator.translate(batch, dest=DEST_LANG)

                # translate() may hand back a single result or a list of results.
                if isinstance(translated, list):
                    results.extend([t.text for t in translated])
                else:
                    results.append(translated.text)
                next_start += len(batch)

                # Small pause between batches to be gentle on the service.
                if next_start < len(texts):
                    await asyncio.sleep(0.1)

            return results

        except Exception as e:
            if attempt < max_retries:
                # Linear backoff: the wait grows by RETRY_DELAY with every failed attempt.
                delay = RETRY_DELAY * (attempt + 1)
                print(f"  ⚠️  Translation attempt {attempt + 1} failed: {e}. Retrying in {delay}s...")
                await asyncio.sleep(delay)
            else:
                print(f"  ❌  Translation failed after {max_retries + 1} attempts: {e}")
                return texts  # return original texts if all retries fail

    # Only reachable when max_retries is negative (no attempt was made).
    return texts

# ------------------------------------------------------------------ SINGLE ROW WORKER
async def translate_one_row(idx: int, row: pd.Series, translator: Translator) -> Tuple[int, str, str | None]:
    """Translate the ``conversations`` cell of one row.

    Returns ``(idx, conversations_string, error)``. On success the string is
    ``str()`` of the rebuilt conversation and ``error`` is None. On any
    exception the original cell (as a string) is returned together with an
    error message, which is also printed.
    """
    try:
        conversation = ast.literal_eval(str(row["conversations"]))
        texts, positions = extract_texts(conversation)

        # Nothing translatable: hand the parsed conversation back unchanged.
        if not texts:
            return idx, str(conversation), None

        translated = await translate_texts_with_retry(translator, texts)
        return idx, str(rebuild(conversation, positions, translated)), None

    except Exception as exc:
        error_msg = f"Row {idx}: {str(exc)}"
        print(f"  ❌  {error_msg}")
        return idx, str(row["conversations"]), error_msg

# ------------------------------------------------------------------ CHUNK PROCESSOR
async def process_chunk(
    chunk_rows: List[int], df: pd.DataFrame, translator: Translator, progress: Dict[str, Any]
) -> None:
    """Translate the rows in *chunk_rows*, then write the CSV and progress file once.

    At most MAX_CONCURRENT rows are in flight at a time. Every row of the chunk
    is appended to ``progress["processed"]`` (including rows that reported an
    error, which are additionally recorded in ``progress["errors"]``), and the
    result string is stored in the ``conversations_twi`` column of *df*.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENT)

    async def _bounded(row_idx: int):
        # Hold a semaphore slot for the duration of one row's translation.
        async with limiter:
            return await translate_one_row(row_idx, df.iloc[row_idx], translator)

    outcomes = await asyncio.gather(*(_bounded(row_idx) for row_idx in chunk_rows))

    # Fold the results into the in-memory dataframe and the progress record.
    failed = 0
    for row_idx, converted, problem in outcomes:
        df.at[row_idx, "conversations_twi"] = converted
        progress["processed"].append(row_idx)
        if problem:
            failed += 1
            progress["errors"].append({"row": row_idx, "error": problem})

    highest_row = max(chunk_rows)
    progress["last_idx"] = max(progress["last_idx"], highest_row)

    # One disk write per chunk: the full dataframe, then the progress file.
    df.to_csv(OUTPUT_FILE, index=False)
    save_progress(progress)

    succeeded = len(chunk_rows) - failed
    print(f"  ✅  Processed chunk: {succeeded} success, {failed} errors (total done: {len(progress['processed'])})")

# ------------------------------------------------------------------ MAIN
async def main_async() -> None:
    """Run (or resume) the full translation of the CSV.

    Steps: optionally reset progress when ``--reset`` is in ``sys.argv``; load
    the progress record; load the dataframe; translate all not-yet-processed
    rows in chunks of CHUNK_SIZE via process_chunk; print a summary.

    Resume behaviour: when the progress record lists finished rows, the
    dataframe is loaded from OUTPUT_FILE rather than INPUT_FILE. process_chunk
    rewrites the whole dataframe on every chunk, so starting again from the
    input file would overwrite the translations of earlier runs with blanks.
    If the progress record exists but the output CSV does not, those earlier
    translations are gone and the run starts from scratch.
    """
    if "--reset" in sys.argv:
        if PROGRESS_FILE.exists():
            PROGRESS_FILE.unlink()
        print("🗑️  Progress reset – starting fresh.")

    progress = load_progress()
    done_rows = set(progress["processed"])

    if done_rows and not OUTPUT_FILE.exists():
        # Progress without its output cannot be trusted: skipped rows would stay empty.
        print("⚠️  Progress file lists finished rows but the output CSV is missing – starting fresh.")
        progress = {"processed": [], "last_idx": -1, "errors": []}
        done_rows = set()

    source_file = OUTPUT_FILE if done_rows else INPUT_FILE
    print(f"📖  Loading CSV from: {source_file}")
    df = pd.read_csv(source_file)

    if "conversations_twi" not in df.columns:
        df["conversations_twi"] = None
    else:
        # Keep the column as object dtype so string results can be assigned by cell.
        df["conversations_twi"] = df["conversations_twi"].astype(object)

    all_rows = set(range(len(df)))
    todo_rows = sorted(all_rows - done_rows)

    if not todo_rows:
        print("🎉  All rows already translated!")
        return

    print(f"📊  Rows total: {len(df)}  |  Done: {len(done_rows)}  |  Todo: {len(todo_rows)}")
    print(f"🔧  Config: {MAX_CONCURRENT} concurrent, {CHUNK_SIZE} chunk size, '{DEST_LANG}' target")
    print(f"💾  Output: {OUTPUT_FILE}")

    start_time = time.time()
    total_chunks = (len(todo_rows) + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division

    # One translator (and its underlying client) is shared by all chunks.
    async with Translator() as translator:
        for i in range(0, len(todo_rows), CHUNK_SIZE):
            chunk = todo_rows[i : i + CHUNK_SIZE]
            chunk_num = i // CHUNK_SIZE + 1

            print(f"\n🔄  Processing chunk {chunk_num}/{total_chunks} (rows {chunk[0]}-{chunk[-1]})...")
            await process_chunk(chunk, df, translator, progress)

            # Progress estimate based on rows completed during THIS run only.
            elapsed = time.time() - start_time
            completed = len(progress["processed"]) - len(done_rows)
            if completed > 0:
                rate = completed / elapsed
                remaining = len(todo_rows) - completed
                eta = remaining / rate if rate > 0 else 0
                print(f"  📈  Rate: {rate:.1f} rows/sec, ETA: {eta/60:.1f} minutes")

    errors = progress.get("errors", [])
    print(f"\n✅  Translation complete!")
    print(f"📊  Total processed: {len(progress['processed'])}")
    print(f"❌  Errors: {len(errors)}")
    print(f"💾  Output saved to: {OUTPUT_FILE}")

    if errors:
        print(f"\n⚠️  Errors occurred in {len(errors)} rows:")
        for err in errors[-5:]:  # show last 5 errors
            print(f"    Row {err['row']}: {err['error']}")

# ------------------------------------------------------------------ JUPYTER-COMPATIBLE MAIN
def main():
    """Run main_async() both from a plain Python process and from Jupyter/IPython.

    Without a running event loop, ``asyncio.run`` is used and nest_asyncio is
    never imported, so it is not required for standalone runs. With a running
    loop (Jupyter/IPython), nest_asyncio is applied so the loop can be
    re-entered; the package is pip-installed on the fly if it is missing.

    Only the loop detection is wrapped in ``except RuntimeError``, so a
    RuntimeError raised by the pipeline itself propagates to the caller instead
    of triggering a second run through ``asyncio.run``.
    """
    import asyncio

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises RuntimeError when no loop is running.
        return asyncio.run(main_async())

    print("🔄  Detected running event loop (Jupyter/IPython), using nest_asyncio...")

    try:
        import nest_asyncio
    except ImportError:
        print("📦  Installing nest_asyncio...")
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "nest_asyncio"])
        import nest_asyncio

    # Patch the loop so run_until_complete may be called while it is already running.
    nest_asyncio.apply()
    return loop.run_until_complete(main_async())

# ------------------------------------------------------------------ ENTRY
# The pipeline runs whether this code is executed as a script or imported:
# both paths used exactly the same guarded call, so it is written once.
try:
    main()
except KeyboardInterrupt:
    print("\n⏹️  Interrupted by user – progress saved.")
except Exception as e:
    # Last-resort boundary: report the failure with its traceback instead of raising.
    print(f"❌  Fatal error: {e}")
    import traceback
    traceback.print_exc()

🔄  Detected running event loop (Jupyter/IPython), using nest_asyncio...
📖  Loading CSV from: /content/drive/MyDrive/Collab/Boafo/code-dataset-output_eng-placeholders.csv
📊  Rows total: 176999  |  Done: 0  |  Todo: 176999
🔧  Config: 20 concurrent, 500 chunk size, 'gaa' target
💾  Output: /content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi.csv

🔄  Processing chunk 1/354 (rows 0-499)...
  ✅  Processed chunk: 500 success, 0 errors (total done: 500)
  📈  Rate: 20.7 rows/sec, ETA: 142.0 minutes

🔄  Processing chunk 2/354 (rows 500-999)...
  ✅  Processed chunk: 500 success, 0 errors (total done: 1000)
  📈  Rate: 19.6 rows/sec, ETA: 149.4 minutes

🔄  Processing chunk 3/354 (rows 1000-1499)...
  ✅  Processed chunk: 500 success, 0 errors (total done: 1500)
  📈  Rate: 20.4 rows/sec, ETA: 143.4 minutes

🔄  Processing chunk 4/354 (rows 1500-1999)...
  ✅  Processed chunk: 500 success, 0 errors (total done: 2000)
  📈  Rate: 20.2 rows/sec, ETA: 144.2 minutes

🔄  Processing chunk 5/354 (rows 200

# CLEAN LLM DATA

In [None]:
import pandas as pd
import re

# === Step 1: Locate the translated CSV and the cleaned output ===
input_file = "/content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi.csv"
output_file = "/content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi_cleaned.csv"

# === Steps 2-3: Load, drop the first column (by position), then rename
#     conversations_twi to conversations ===
df = (
    pd.read_csv(input_file)
    .iloc[:, 1:]
    .rename(columns={"conversations_twi": "conversations"})
)

# === Step 4: Save updated CSV ===
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"✅ Cleaned CSV saved to {output_file}")
print(f"📊 Total rows: {len(df)}")

✅ Cleaned CSV saved to /content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi_cleaned.csv
📊 Total rows: 176999


# RESTORE PLACEHOLDERS

In [None]:
#!/usr/bin/env python3
"""
restore.py – restores original text from [N*@@] placeholders
"""
import csv
import re
from pathlib import Path

# Input: CSV whose 'conversations' column still contains placeholders.
PLACEHOLDER_CSV = Path("/content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi_cleaned.csv")
# Lookup table with 'id' and 'original_text' columns (placeholder number -> text to restore).
MAPPING_CSV     = Path("/content/drive/MyDrive/Collab/Boafo/mapping.csv")
# Output: same rows with placeholders expanded.
RESTORED_CSV    = Path("/content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi-final.csv")

def main():
    """Expand ``[<number>*@...]`` placeholders in the 'conversations' column.

    Reads MAPPING_CSV into an ``{int id: original_text}`` dict, then streams
    PLACEHOLDER_CSV row by row, substitutes every placeholder whose number is
    in the mapping (unknown numbers are left as-is) and writes the rows to
    RESTORED_CSV. An input without data rows (or without a header) is handled
    without error and reports 0 rows.
    """
    # 1. Build mapping {id -> original_text}; ids are stored as ints for numeric lookup.
    mapping = {}
    with MAPPING_CSV.open(newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            mapping[int(row['id'])] = row['original_text']

    # 2. Placeholder pattern: '[' digits '*' one-or-more '@' ']'; group 1 is the number.
    placeholder_pat = re.compile(r'\[(\d+)\*@+\]')

    def expand(match):
        num = int(match.group(1))
        return mapping.get(num, match.group(0))  # replace if found, else keep

    # NOTE(review): the substitution is done on the raw cell text. If that cell
    # is the str() of a Python list (as the translation step writes it) and
    # original_text contains quotes, backslashes or newlines, the restored cell
    # may no longer be parseable with ast.literal_eval — confirm how mapping.csv
    # was produced before relying on downstream parsing.

    # 3. Stream and process the large CSV.
    n = 0  # row counter; stays 0 when the input has no data rows
    with PLACEHOLDER_CSV.open(newline='', encoding='utf-8') as infile, \
         RESTORED_CSV.open('w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        # fieldnames is None for a completely empty file; DictWriter needs a sequence.
        fieldnames = reader.fieldnames or []
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        if fieldnames:
            writer.writeheader()

        for n, row in enumerate(reader, 1):
            if row.get('conversations'):
                row['conversations'] = placeholder_pat.sub(expand, row['conversations'])
            writer.writerow(row)

            if n % 10_000 == 0:
                print(f"  {n:,} rows restored", end='\r')

        print(f"\n✅  {n:,} rows restored -> {RESTORED_CSV}")

# Run the restoration only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()


✅  176,999 rows restored -> /content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi-final.csv


# PUSH TO HF

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import os
from huggingface_hub import login
import json
import ast
import re

def clean_text(text):
    """Return *text* without characters that cannot be encoded as UTF-8.

    Strings are round-tripped through UTF-8 with ``errors='ignore'``, which
    drops unencodable characters such as lone surrogates (U+D800–U+DFFF).
    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    utf8_bytes = text.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8', errors='ignore')

def clean_conversation(conv):
    """Apply clean_text to every value of every dict turn in *conv*.

    A non-list *conv* is returned unchanged. Within a list, dict turns are
    rebuilt with cleaned values (keys untouched); non-dict turns are kept as-is.
    """
    if not isinstance(conv, list):
        return conv

    return [
        {key: clean_text(value) for key, value in turn.items()}
        if isinstance(turn, dict)
        else turn
        for turn in conv
    ]

def parse_conversation_string(conv_str):
    """Parse a CSV cell into a cleaned, non-empty list of conversation turns.

    The cell is tried first as a Python literal (``ast.literal_eval``) and then
    as JSON. The parsed value is accepted only if it is a non-empty list, in
    which case it is passed through clean_conversation.

    Args:
        conv_str: the raw cell value; may be a string, None/NaN, or an
            already-parsed list.

    Returns:
        The cleaned list, or None when the cell is missing, blank, ``'nan'``,
        unparseable, or does not parse to a non-empty list.
    """
    if conv_str is None:
        return None

    # An already-parsed list must be handled before pd.isna: on a list, pd.isna
    # returns an array whose truth value is ambiguous.
    if isinstance(conv_str, list):
        return clean_conversation(conv_str) if conv_str else None

    if pd.isna(conv_str):
        return None

    conv_str = str(conv_str).strip()
    if not conv_str or conv_str == 'nan':
        return None

    # Each parser is paired with the errors it raises for malformed input, so
    # KeyboardInterrupt/SystemExit and genuine bugs are no longer swallowed.
    parsers = (
        (ast.literal_eval, (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)),
        (json.loads, (ValueError, RecursionError)),  # JSONDecodeError is a ValueError
    )
    for parse, expected_errors in parsers:
        try:
            result = parse(conv_str)
        except expected_errors:
            continue
        if isinstance(result, list) and len(result) > 0:
            return clean_conversation(result)

    return None

def upload_dataset_to_hf(csv_file_path, repo_id, token):
    """
    Parse the 'conversations' column of a CSV and push it to the Hugging Face Hub.

    Steps: log in with *token*; read the CSV; parse every 'conversations' cell
    with parse_conversation_string; drop rows that did not parse to a non-empty
    list; build a dataset with a single 'conversations' column and push it as
    the "train" split of *repo_id* (public). If the push raises, the parsed
    conversations are written to a local JSONL backup instead.

    Args:
        csv_file_path: path of the CSV to read; must contain a 'conversations' column.
        repo_id: target dataset repository, e.g. "user/name".
        token: Hugging Face access token used for login and for the push.

    Returns:
        None. Returns early (after printing an error) when the column is
        missing or no row could be parsed.
    """

    # Login to Hugging Face
    login(token=token)

    print("Reading CSV file (single column format)...")

    # Read with specific settings for single column with embedded newlines
    df = pd.read_csv(
        csv_file_path,
        header=0,
        quoting=1,  # QUOTE_ALL
        doublequote=True,
        escapechar=None,
        encoding='utf-8',
        on_bad_lines='skip',  # malformed lines are dropped rather than raising
        engine='python'  # Python engine handles complex cases better
    )

    print(f"Loaded dataframe shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Check if we have the conversations column
    if 'conversations' not in df.columns:
        print(f"❌ Expected 'conversations' column but found: {list(df.columns)}")
        return

    print(f"\nOriginal rows: {len(df)}")

    # Show sample of raw data
    print("\n📝 Sample raw data (first 3 rows):")
    for i in range(min(3, len(df))):
        raw = str(df['conversations'].iloc[i])
        print(f"  Row {i}: {raw[:150]}...")

    # Parse the conversations column (cells that cannot be parsed become None)
    print("\n🔄 Parsing conversations...")
    df['conversations'] = df['conversations'].apply(parse_conversation_string)

    # Rows whose cell parsed to None are counted as failures and removed below
    valid_mask = df['conversations'].notna()
    failed_count = (~valid_mask).sum()

    print(f"\n📊 Parsing results:")
    print(f"  Total rows: {len(df)}")
    print(f"  Failed: {failed_count}")
    print(f"  Success: {valid_mask.sum()}")

    # Filter to valid rows only
    df = df[valid_mask].copy()

    # Additional validation: keep only non-empty lists
    df = df[df['conversations'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

    print(f"  Final valid rows: {len(df)}")

    if len(df) == 0:
        print("\n❌ ERROR: No valid conversations could be parsed!")
        return

    # Show parsed sample (first turn of the first conversation, truncated to 400 chars)
    print(f"\n✅ Sample parsed data:")
    sample = df['conversations'].iloc[0]
    print(f"  Type: {type(sample)}")
    print(f"  Number of turns: {len(sample)}")
    print(f"  First turn:")
    print(json.dumps(sample[0], ensure_ascii=False, indent=4)[:400])

    # Reset index
    df = df.reset_index(drop=True)

    # Convert to Hugging Face dataset
    print("\n📦 Converting to Hugging Face dataset...")

    # Create a clean dataset dict for conversion
    clean_data = {'conversations': df['conversations'].tolist()}

    # Convert using from_dict instead of from_pandas to avoid encoding issues
    dataset = Dataset.from_dict(clean_data)

    # Create dataset dictionary with a single "train" split
    dataset_dict = DatasetDict({
        "train": dataset
    })

    # Push to Hub
    print(f"\n☁️  Uploading to {repo_id}...")
    try:
        dataset_dict.push_to_hub(
            repo_id,
            token=token,
            commit_message="Upload Code-170k-ga - Ga language coding conversations dataset",
            private=False
        )

        print("\n✅ SUCCESS! Dataset uploaded!")
        print(f"\n📊 Dataset Info:")
        print(f"   🔗 URL: https://huggingface.co/datasets/{repo_id}")
        print(f"   📝 Samples: {len(dataset):,}")
        print(f"   📋 Columns: {', '.join(dataset.column_names)}")

    except Exception as e:
        print(f"\n❌ Upload failed!")
        print(f"Error: {e}")

        # Try to save locally as backup: one JSON object per line
        try:
            backup_path = "/content/dataset_backup.jsonl"
            print(f"\n💾 Saving backup to {backup_path}...")
            with open(backup_path, 'w', encoding='utf-8') as f:
                for conv in clean_data['conversations']:
                    f.write(json.dumps({'conversations': conv}, ensure_ascii=False) + '\n')
            print(f"✅ Backup saved successfully!")
        except Exception as backup_error:
            print(f"❌ Backup also failed: {backup_error}")

def main():
    """Upload the final restored CSV to the Hub repo ``USERNAME/REPO_NAME``.

    Reads the module-level globals USERNAME, REPO_NAME and HF_TOKEN. Prints an
    error and returns without uploading when the CSV file does not exist.
    """
    csv_path = "/content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi-final.csv"
    target_repo = f"{USERNAME}/{REPO_NAME}"

    if os.path.exists(csv_path):
        print(f"✓ Found file: {csv_path}")
        print(f"✓ Target repo: {target_repo}\n")
        print("="*60)

        upload_dataset_to_hf(csv_path, target_repo, HF_TOKEN)
    else:
        print(f"❌ File not found: {csv_path}")

if __name__ == "__main__":
    # SECURITY: never commit or share a real token in this cell. If a real token
    # was ever saved here, revoke it and generate a new one.
    # main() reads these three names as module globals; they are only defined
    # when this code runs as the main module.
    HF_TOKEN = "INSERTT-HUGGING-FACE-TOKEN-HERE"
    USERNAME = "INSERT-HUGGING-FACE-USERNAME-HERE"
    REPO_NAME = "REPLACE*WITH*NAME*OF*MODEL"

    main()

✓ Found file: /content/drive/MyDrive/Collab/Boafo/code-dataset-output_twi-final.csv
✓ Target repo: michsethowusu/Code-170k-ga

Reading CSV file (single column format)...
Loaded dataframe shape: (176999, 1)
Columns: ['conversations']

Original rows: 176999

📝 Sample raw data (first 3 rows):
  Row 0: [{'from': 'human', 'value': 'Te abaafee tɛŋŋ afee algorithm ko koni ekɛtsake wiemɔi ni yɔɔ wiemɔ kuku mli lɛ?'}, {'from': 'gpt', 'value': 'Gbɛ kome ni...
  Row 1: [{'from': 'human', 'value': 'Ani mɔ ko baanyɛ aye abua mi ni mayɔse programming wiemɔ ni akɛtsuɔ nii yɛ code snippet nɛɛ mli?\n\nKod Snippet:\n```\nde...
  Row 2: [{'from': 'human', 'value': 'Ani abaanyɛ akɛ shishinumɔ ni aŋmala ashwie shi lɛ atsu nii ni akɛfee wiemɔi ashishitsɔɔmɔ wolo yɛ Python mli? Akɛ nɔkwɛm...

🔄 Parsing conversations...

📊 Parsing results:
  Total rows: 176999
  Failed: 46282
  Success: 130717
  Final valid rows: 130717

✅ Sample parsed data:
  Type: <class 'list'>
  Number of turns: 2
  First turn:
{
    "fr

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/131 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   2%|1         | 2.10MB /  112MB            


✅ SUCCESS! Dataset uploaded!

📊 Dataset Info:
   🔗 URL: https://huggingface.co/datasets/michsethowusu/Code-170k-ga
   📝 Samples: 130,717
   📋 Columns: conversations
