In [None]:

!pip install -q nltk pandas transformers tqdm colorama psutil

import os
import csv
import pandas as pd
import nltk
import random
import time
import json
import torch
import datetime
import psutil
from tqdm.notebook import tqdm
from nltk.corpus import wordnet as wn
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from google.colab import drive
from colorama import init, Fore, Style

# Switch on colorama so the Fore/Style codes used below are rendered
init()

# Fetch the NLTK resources this script downloads up front, in a fixed order
for _nltk_resource in ('wordnet', 'words', 'averaged_perceptron_tagger', 'punkt', 'tagsets'):
    nltk.download(_nltk_resource)

# Smoke-test the POS tagger; on a missing-resource error pull the extra packages
try:
    from nltk.tag import pos_tag
    pos_tag(['test'])
except LookupError:
    print(f"{Fore.YELLOW}⚠️ NLTK tagger error - downloading additional resources{Style.RESET_ALL}")
    for _nltk_resource in ('averaged_perceptron_tagger_eng', 'universal_tagset'):
        nltk.download(_nltk_resource, quiet=True)
except Exception as e:
    print(f"{Fore.RED}⚠️ NLTK error: {str(e)}{Style.RESET_ALL}")
else:
    print(f"{Fore.GREEN}✅ NLTK tagger is working{Style.RESET_ALL}")

# Mount Google Drive for checkpoints; fall back to a local folder if that fails.
# `checkpoint_dir` is read later by the extraction/translation steps.
try:
    drive.mount('/content/drive')
    checkpoint_dir = "/content/drive/MyDrive/language_processing_checkpoints"
    os.makedirs(checkpoint_dir, exist_ok=True)
    print(f"{Fore.GREEN}✅ Google Drive mounted. Checkpoints will be saved to Drive.{Style.RESET_ALL}")
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
    # SystemExit still stop the cell instead of silently switching to local storage.
    checkpoint_dir = "./checkpoints"
    os.makedirs(checkpoint_dir, exist_ok=True)
    print(f"{Fore.YELLOW}⚠️ Google Drive not mounted. Checkpoints will be saved locally.{Style.RESET_ALL}")

# Create the working folders (idempotent): raw data, POS tables, final output,
# and the interim-results folder used for per-chunk snapshots
for _work_dir in ("data", "data/pos", "output", "interim_results"):
    os.makedirs(_work_dir, exist_ok=True)

# Timestamp helper shared by the logging functions
def get_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    current = datetime.datetime.now()
    return current.strftime("%Y-%m-%d %H:%M:%S")

# Path of the step-completion log; create it empty on first run
completed_log = 'data/log_completed.txt'
if not os.path.exists(completed_log):
    with open(completed_log, 'w'):
        pass  # opening in write mode is enough to create the empty file
    print(f"{Fore.GREEN}✅ Created log file for tracking completed steps{Style.RESET_ALL}")

def log_completed_step(step_name):
    """Append ``step_name,timestamp`` to the completion log and echo it in green."""
    record = f"{step_name},{get_timestamp()}\n"
    with open(completed_log, 'a') as log_file:
        log_file.write(record)

    announcement = f"{Fore.GREEN}✅ [{get_timestamp()}] Completed step: {step_name}{Style.RESET_ALL}"
    print(announcement)

def is_step_completed(step_name):
    """Return True when ``step_name`` is the first comma-separated field of any log line.

    Returns False when the completion log does not exist.
    """
    if not os.path.exists(completed_log):
        return False

    with open(completed_log, 'r') as log_file:
        # Each record is "<step>,<timestamp>"; only the step field is compared.
        return any(entry.strip().split(',')[0] == step_name for entry in log_file)

# File that accumulates every progress message
progress_log = 'data/progress_log.txt'
def log_progress(message, level="INFO"):
    """Append a timestamped ``[time] LEVEL: message`` line to the progress log and print it.

    The console copy is green for "INFO", yellow for "WARNING" and red for
    any other level.
    """
    timestamp = get_timestamp()

    if level == "INFO":
        color = Fore.GREEN
    elif level == "WARNING":
        color = Fore.YELLOW
    else:
        color = Fore.RED

    with open(progress_log, 'a') as log_file:
        log_file.write(f"[{timestamp}] {level}: {message}\n")

    print(f"{color}[{timestamp}] {level}: {message}{Style.RESET_ALL}")

# System info display
def display_system_info():
    """Print RAM, CPU and (when CUDA is available) GPU memory usage.

    Best-effort diagnostic: any ordinary error while collecting or printing the
    figures is reported as a one-line warning instead of being raised.
    """
    try:
        # Get memory usage (bytes -> GiB)
        memory = psutil.virtual_memory()
        memory_used_gb = memory.used / (1024 ** 3)
        memory_total_gb = memory.total / (1024 ** 3)
        memory_percent = memory.percent

        # Get CPU usage; interval=1 samples over one second, so this call blocks that long
        cpu_percent = psutil.cpu_percent(interval=1)

        # Get GPU info if available
        gpu_info = ""
        if torch.cuda.is_available():
            gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3)
            gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3)
            gpu_info = f"GPU Memory: {gpu_memory_allocated:.2f}GB allocated / {gpu_memory_reserved:.2f}GB reserved"

        print(f"{Fore.CYAN}--- System Info ---{Style.RESET_ALL}")
        print(f"Memory: {memory_used_gb:.2f}GB / {memory_total_gb:.2f}GB ({memory_percent}%)")
        print(f"CPU Usage: {cpu_percent}%")
        if gpu_info:
            print(gpu_info)
        print(f"{Fore.CYAN}------------------{Style.RESET_ALL}")
    except Exception:
        # `Exception` rather than a bare `except:` so an interrupt issued during
        # the one-second CPU sample still propagates.
        print(f"{Fore.YELLOW}⚠️ Couldn't display system info{Style.RESET_ALL}")

# Configuration
MAX_WORDS_PER_POS = 50_000   # cap on collected words for each POS category
BATCH_SIZE = 100             # number of words sent per translation batch
TARGET_POS = [
    'NOUN', 'VERB', 'ADJ', 'ADV', 'DET',
    'PRON', 'ADP', 'CONJ', 'NUM', 'INTJ',
]
INTERIM_SAVE_EVERY = 1_000   # write an interim snapshot after this many words

# Interim snapshot writer (one CSV per chunk)
def save_interim_results(data, pos, chunk_number):
    """Write one chunk of rows to ``interim_results/`` and emit its aligned sample.

    ``data`` is turned into a DataFrame and saved as
    ``interim_<pos>_chunk_<chunk_number>.csv``; the same rows are then passed to
    ``generate_interim_pos_aligned_sample``.
    """
    chunk_path = f"interim_results/interim_{pos}_chunk_{chunk_number}.csv"
    pd.DataFrame(data).to_csv(chunk_path, index=False)

    word_count = len(data)
    log_progress(f"Saved interim results for {pos}: {word_count} words (chunk {chunk_number})")

    # Companion file: the POS-aligned sample for the same chunk
    generate_interim_pos_aligned_sample(data, pos, chunk_number)

def generate_interim_pos_aligned_sample(data, pos, chunk_number):
    """Write a small POS-aligned sample CSV for the current chunk.

    Layout: the first row names each POS found in ``data`` (spanning four
    columns), the second row repeats ``ENG, FR, BAM, WOL`` per POS, and up to
    200 data rows follow, padded with empty strings where a POS has fewer words.

    Args:
        data: list of row dicts. Translated rows carry ``english``, ``french``,
            ``bambara``, ``wolof`` and ``pos``. Rows from the extraction phase
            carry only ``word`` and ``pos``; for those, ``word`` fills the
            English column and the other language columns are left blank.
        pos: label used in the output file name.
        chunk_number: chunk index used in the output file name.
    """
    filename = f"interim_results/interim_aligned_{pos}_chunk_{chunk_number}.csv"
    languages = ("english", "french", "bambara", "wolof")

    # Convert data to DataFrame
    pos_df = pd.DataFrame(data)

    def column_values(frame, column):
        """Return the column as a list, tolerating rows that were never translated."""
        if column in frame.columns:
            return frame[column].tolist()
        if column == "english" and "word" in frame.columns:
            # extraction-phase rows store the English word under "word"
            return frame["word"].tolist()
        return [""] * len(frame)

    # Group by POS (an empty chunk has no "pos" column and yields no groups)
    pos_dict = {}
    if "pos" in pos_df.columns:
        for p in pos_df["pos"].unique():
            pos_words = pos_df[pos_df["pos"] == p]
            if len(pos_words) > 0:
                pos_dict[p] = {lang: column_values(pos_words, lang) for lang in languages}

    # Make sure the target folder exists even if the module-level setup was skipped
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Generate a small sample
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        # First header row: each POS category spans 4 columns (one per language)
        pos_header = []
        for p in pos_dict:
            pos_header.extend([p, "", "", ""])
        writer.writerow(pos_header)

        # Second header row: language codes, repeated once per POS
        writer.writerow(["ENG", "FR", "BAM", "WOL"] * len(pos_dict))

        # Cap the sample at 200 rows; default=0 keeps an empty chunk from raising
        longest = max((len(words["english"]) for words in pos_dict.values()), default=0)
        max_chunk_words = min(200, longest)

        # Write data rows
        for i in range(max_chunk_words):
            row = []
            for words in pos_dict.values():
                # Add words, or empty strings once this POS has run out
                row.extend(words[lang][i] if i < len(words[lang]) else "" for lang in languages)
            writer.writerow(row)

    log_progress(f"Generated interim aligned sample for {pos} (chunk {chunk_number})")

# WordNet POS constant -> POS label used in this script's tables
WN_POS_MAP = dict(
    zip(
        (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV),
        ('NOUN', 'VERB', 'ADJ', 'ADV'),
    )
)

# Tagger tag -> POS label (for words not in WordNet); grouped by target label,
# listed in the same order the flat mapping would be written in
TAG_POS_MAP = {
    tag: label
    for label, tags in (
        ('NOUN', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('VERB', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('ADJ', ('JJ', 'JJR', 'JJS')),
        ('ADV', ('RB', 'RBR', 'RBS')),
        ('DET', ('DT', 'PDT', 'WDT')),
        ('PRON', ('PRP', 'PRP$', 'WP', 'WP$')),
        ('ADP', ('IN',)),
        ('CONJ', ('CC',)),
        ('NUM', ('CD',)),
        ('INTJ', ('UH',)),
    )
    for tag in tags
}

# PHASE 1: EXTRACT LARGE-SCALE WORD LISTS WITH POS TAGS

def extract_large_word_lists():
    """Extract large-scale English word lists with POS tags using WordNet and NLTK.

    Words are gathered in four passes:

    1. Single-token alphabetic WordNet lemmas for NOUN, VERB, ADJ and ADV,
       stopping per category at MAX_WORDS_PER_POS.
    2. Hand-written lists for DET, PRON, ADP, CONJ, NUM and INTJ.
    3. Up to 100000 shuffled entries of NLTK's ``words`` corpus, tagged with
       ``nltk.pos_tag`` and mapped through TAG_POS_MAP, again capped per category.
    4. Synthetic placeholders (``noun1``, ``verb2``, ...) that top every
       category up to MAX_WORDS_PER_POS.

    The table is checkpointed to ``checkpoint_dir`` and mirrored to the output
    CSV along the way; an existing checkpoint is loaded and extended. If the
    step is already logged as completed and the output file exists, that file
    is loaded and returned without further work.

    Returns:
        pandas.DataFrame with the columns ``word`` and ``pos``.
    """
    output_file = "data/pos/english_words_large.csv"
    checkpoint_file = f"{checkpoint_dir}/pos_extraction_large_checkpoint.csv"
    total_capacity = MAX_WORDS_PER_POS * len(TARGET_POS)

    def count_for(frame, tag):
        """Number of rows of *frame* whose POS is *tag*."""
        return len(frame[frame['pos'] == tag])

    def save_checkpoint(frame, pending_rows):
        """Append *pending_rows* to *frame*, persist the result and return it."""
        merged = pd.concat([frame, pd.DataFrame(pending_rows)], ignore_index=True)
        merged.to_csv(checkpoint_file, index=False)
        merged.to_csv(output_file, index=False)  # Also save to output file for viewing
        log_progress(f"Checkpoint saved with {len(merged)} entries")
        return merged

    # Skip if already completed
    if is_step_completed("pos_extraction_large") and os.path.exists(output_file):
        log_progress("Skipping large-scale POS extraction - already completed")
        # keep_default_na=False: genuine words such as "nan" or "null" must not
        # be turned into missing values when the table is read back.
        return pd.read_csv(output_file, keep_default_na=False)

    # Check if resuming from checkpoint
    if os.path.exists(checkpoint_file):
        words_df = pd.read_csv(checkpoint_file, keep_default_na=False)
        log_progress(f"Resuming from checkpoint with {len(words_df)} entries")
    else:
        # Initialize DataFrame with columns
        words_df = pd.DataFrame(columns=["word", "pos"])

    log_progress("Starting large-scale POS extraction...")

    # Track (word, pos) pairs we've already added
    existing_words = set(zip(words_df['word'], words_df['pos']))
    words_per_pos = {}

    for tag in TARGET_POS:
        words_per_pos[tag] = count_for(words_df, tag)
        log_progress(f"Current count for {tag}: {words_per_pos[tag]}/{MAX_WORDS_PER_POS} words")

    # Step 1: Extract words from WordNet (covers NOUN, VERB, ADJ, ADV)
    if len(existing_words) < total_capacity:
        log_progress("Extracting words from WordNet...")
        display_system_info()

        for wn_pos, target_pos in WN_POS_MAP.items():
            if words_per_pos.get(target_pos, 0) >= MAX_WORDS_PER_POS:
                log_progress(f"Skipping {target_pos} - already have {MAX_WORDS_PER_POS} words")
                continue

            log_progress(f"Processing {target_pos} from WordNet")
            synsets = list(wn.all_synsets(wn_pos))
            random.shuffle(synsets)  # Randomize to get a diverse set

            # Create progress bar
            pbar = tqdm(synsets, desc=f"Processing {target_pos}")

            chunk_count = words_per_pos.get(target_pos, 0) // INTERIM_SAVE_EVERY
            chunk_words = []
            new_rows = []  # rows collected for this POS since its last checkpoint

            for synset in pbar:
                for lemma in synset.lemma_names():
                    # Skip multi-word expressions and non-alphabetic words
                    if '_' in lemma or not lemma.isalpha():
                        continue

                    lemma = lemma.lower()
                    if (lemma, target_pos) not in existing_words:
                        new_row = {"word": lemma, "pos": target_pos}
                        new_rows.append(new_row)
                        chunk_words.append(new_row)
                        existing_words.add((lemma, target_pos))

                        # Update progress bar description
                        if len(new_rows) % 100 == 0:
                            pbar.set_description(f"Processing {target_pos}: {len(new_rows) + words_per_pos.get(target_pos, 0)} words")

                        # Save interim results for this chunk
                        if len(chunk_words) >= INTERIM_SAVE_EVERY:
                            save_interim_results(chunk_words, target_pos, chunk_count)
                            chunk_count += 1
                            chunk_words = []

                        # Check if we have enough words for this POS
                        if len(new_rows) + words_per_pos.get(target_pos, 0) >= MAX_WORDS_PER_POS:
                            break

                # Check if we have enough words for this POS
                if len(new_rows) + words_per_pos.get(target_pos, 0) >= MAX_WORDS_PER_POS:
                    break

            # Save any remaining words in the current chunk
            if chunk_words:
                save_interim_results(chunk_words, target_pos, chunk_count)

            # Save checkpoint after each POS
            if new_rows:
                words_df = save_checkpoint(words_df, new_rows)
                new_rows = []

                # Update the count for the POS just processed (must be keyed and
                # filtered by target_pos, not by a leftover loop variable)
                words_per_pos[target_pos] = count_for(words_df, target_pos)
                log_progress(f"Updated count for {target_pos}: {words_per_pos[target_pos]}/{MAX_WORDS_PER_POS} words")

                # Display system info
                display_system_info()

    # Step 2: Add common words for categories not well-covered by WordNet
    # (DET, PRON, ADP, CONJ, NUM, INTJ)

    # Common words for these categories
    common_words = {
        'DET': ['the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her', 'our', 'their', 'its',
                'each', 'every', 'some', 'any', 'no', 'all', 'both', 'either', 'neither', 'many', 'much', 'most', 'few',
                'which', 'what', 'whose', 'whichever', 'whatever'],
        'PRON': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'who', 'whom', 'which',
                'what', 'whose', 'whoever', 'whomever', 'whatever', 'myself', 'yourself', 'himself', 'herself', 'itself',
                'ourselves', 'yourselves', 'themselves', 'each', 'other', 'all', 'another', 'some', 'any', 'somebody',
                'anyone', 'everybody', 'nobody', 'something', 'anything', 'nothing', 'everything'],
        'ADP': ['in', 'on', 'at', 'by', 'with', 'from', 'to', 'for', 'of', 'about', 'against', 'between', 'through',
                'during', 'before', 'after', 'above', 'below', 'under', 'over', 'beside', 'behind', 'across', 'into',
                'towards', 'onto', 'beyond', 'along', 'amid', 'among', 'around', 'concerning', 'considering', 'despite',
                'except', 'inside', 'like', 'near', 'off', 'out', 'outside', 'past', 'regarding', 'round', 'since',
                'throughout', 'till', 'until', 'upon', 'within', 'without'],
        'CONJ': ['and', 'or', 'but', 'if', 'when', 'than', 'because', 'although', 'since', 'unless', 'while',
                'as', 'that', 'whether', 'before', 'after', 'though', 'so', 'till', 'until', 'whereas', 'for',
                'nor', 'yet', 'once', 'provided', 'supposing', 'considering', 'even', 'otherwise', 'however'],
        'NUM': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'first', 'second',
                'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'once', 'twice', 'thrice',
                'dozen', 'hundred', 'thousand', 'million', 'billion', 'trillion', 'zero', 'half', 'quarter', 'double',
                'triple', 'quadruple', 'many', 'few', 'several', 'numerous', 'countless'],
        'INTJ': ['oh', 'wow', 'hey', 'hello', 'hi', 'yes', 'no', 'okay', 'well', 'ah', 'ouch', 'ugh', 'oops',
                'thanks', 'sorry', 'please', 'goodbye', 'bye', 'eh', 'hmm', 'er', 'um', 'alas', 'hurray', 'hooray',
                'bravo', 'congratulations', 'cheers', 'yay', 'yikes', 'phew', 'whew', 'huh', 'damn', 'darn', 'gosh',
                'goodness', 'jeez', 'bingo', 'voila', 'encore', 'oops', 'psst', 'shh', 'whoa', 'ha', 'haha']
    }

    new_rows = []
    for pos, words in common_words.items():
        log_progress(f"Adding common words for {pos}")

        chunk_count = words_per_pos.get(pos, 0) // INTERIM_SAVE_EVERY
        chunk_words = []

        for word in words:
            if (word, pos) not in existing_words:
                new_row = {"word": word, "pos": pos}
                new_rows.append(new_row)
                chunk_words.append(new_row)
                existing_words.add((word, pos))

                # Save interim results for this chunk
                if len(chunk_words) >= INTERIM_SAVE_EVERY:
                    save_interim_results(chunk_words, pos, chunk_count)
                    chunk_count += 1
                    chunk_words = []

        # Save any remaining words in the current chunk
        if chunk_words:
            save_interim_results(chunk_words, pos, chunk_count)

    # Step 3: Use NLTK's list of English words to fill remaining categories.
    # existing_words holds every pair collected so far, so its size is the
    # up-to-date total (the count taken before Step 1 would be stale here).
    if len(existing_words) < total_capacity:
        log_progress("Adding words from NLTK's word list...")

        from nltk.corpus import words
        english_words = words.words()
        random.shuffle(english_words)  # Randomize the list

        # Use NLTK's POS tagger to get categories
        try:
            # Use NLTK's POS tagger for the words
            log_progress("Tagging words with NLTK (this may take a while)...")
            tagged_words = nltk.pos_tag(english_words[:100000])  # Limit to avoid memory issues

            # Track chunks for each POS
            chunk_counts = {tag: words_per_pos.get(tag, 0) // INTERIM_SAVE_EVERY for tag in TARGET_POS}
            chunk_words = {tag: [] for tag in TARGET_POS}

            # Rows per POS sitting in new_rows but not yet in words_per_pos; kept
            # as running counters instead of rescanning new_rows for every word.
            pending_per_pos = {tag: 0 for tag in TARGET_POS}
            for row in new_rows:
                if row['pos'] in pending_per_pos:
                    pending_per_pos[row['pos']] += 1

            # Create progress bar
            pbar = tqdm(tagged_words, desc="Processing NLTK words")

            for word, tag in pbar:
                # Skip non-alphabetic and very short entries
                if not word.isalpha() or len(word) <= 2:
                    continue

                word = word.lower()
                pos = TAG_POS_MAP.get(tag)

                if pos in TARGET_POS and words_per_pos.get(pos, 0) + pending_per_pos[pos] < MAX_WORDS_PER_POS:
                    if (word, pos) not in existing_words:
                        new_row = {"word": word, "pos": pos}
                        new_rows.append(new_row)
                        chunk_words[pos].append(new_row)
                        existing_words.add((word, pos))
                        pending_per_pos[pos] += 1

                        # Update progress bar description every 100 words
                        if len(new_rows) % 100 == 0:
                            pbar.set_description(f"Processing NLTK words: {len(new_rows)} new words")

                        # Only the POS just appended to can have completed a chunk
                        if len(chunk_words[pos]) >= INTERIM_SAVE_EVERY:
                            save_interim_results(chunk_words[pos], pos, chunk_counts[pos])
                            chunk_counts[pos] += 1
                            chunk_words[pos] = []

                # Save checkpoint every 1000 words
                if len(new_rows) >= 1000:
                    words_df = save_checkpoint(words_df, new_rows)
                    new_rows = []
                    pending_per_pos = {tag_name: 0 for tag_name in TARGET_POS}

                    # Update counts
                    for tag_name in TARGET_POS:
                        words_per_pos[tag_name] = count_for(words_df, tag_name)
                        log_progress(f"Updated count for {tag_name}: {words_per_pos[tag_name]}/{MAX_WORDS_PER_POS} words")

                    # Check if we have enough words
                    if all(words_per_pos.get(tag_name, 0) >= MAX_WORDS_PER_POS for tag_name in TARGET_POS):
                        break

                    # Display system info
                    display_system_info()

            # Save any remaining chunks
            for tag_name in TARGET_POS:
                if chunk_words[tag_name]:
                    save_interim_results(chunk_words[tag_name], tag_name, chunk_counts[tag_name])

        except Exception as e:
            log_progress(f"Error processing NLTK words: {str(e)}", "ERROR")
            log_progress("Continuing with WordNet words only", "WARNING")

    # Add any remaining rows
    if new_rows:
        temp_df = pd.DataFrame(new_rows)
        words_df = pd.concat([words_df, temp_df], ignore_index=True)

    # Fill remaining categories with synthetic data if needed
    log_progress("Checking for categories that need more words...")
    display_system_info()

    for pos in TARGET_POS:
        count = count_for(words_df, pos)
        if count >= MAX_WORDS_PER_POS:
            continue

        log_progress(f"Only {count}/{MAX_WORDS_PER_POS} words for {pos} - adding synthetic words", "WARNING")

        # Generate synthetic words for this category
        needed = MAX_WORDS_PER_POS - count
        log_progress(f"Generating {needed} synthetic words for {pos}")

        new_rows = []
        taken_words = set(words_df[words_df['pos'] == pos]['word'].tolist())

        # Synthetic names are "<pos in lower case><n>", e.g. noun1, verb2
        prefix = pos.lower()
        suffix = 0

        chunk_count = count // INTERIM_SAVE_EVERY
        chunk_words = []

        # Create progress bar
        pbar = tqdm(range(needed), desc=f"Generating synthetic {pos} words")

        for i in pbar:
            # Advance to the next unused name so that a resumed run, which
            # already contains some synthetic names, still adds `needed` words.
            suffix += 1
            while f"{prefix}{suffix}" in taken_words:
                suffix += 1
            word = f"{prefix}{suffix}"

            new_row = {"word": word, "pos": pos}
            new_rows.append(new_row)
            chunk_words.append(new_row)
            taken_words.add(word)

            # Update progress bar every 100 words
            if (i+1) % 100 == 0:
                pbar.set_description(f"Generated {i+1}/{needed} synthetic {pos} words")

            # Save interim results for this chunk
            if len(chunk_words) >= INTERIM_SAVE_EVERY:
                save_interim_results(chunk_words, pos, chunk_count)
                chunk_count += 1
                chunk_words = []

            # Save checkpoint every 10000 words
            if len(new_rows) >= 10000:
                words_df = save_checkpoint(words_df, new_rows)
                new_rows = []

                # Display system info
                display_system_info()

        # Save any remaining words in the current chunk
        if chunk_words:
            save_interim_results(chunk_words, pos, chunk_count)

        # Add remaining rows
        if new_rows:
            temp_df = pd.DataFrame(new_rows)
            words_df = pd.concat([words_df, temp_df], ignore_index=True)

        # Report the full number generated, including rows already checkpointed
        log_progress(f"Added {needed} synthetic words for {pos}")

    # Final counts
    for pos in TARGET_POS:
        count = count_for(words_df, pos)
        log_progress(f"Final count for {pos}: {count}/{MAX_WORDS_PER_POS} words")

    # Save to file
    words_df.to_csv(output_file, index=False)

    log_progress("Extracted large-scale word lists with POS tags")
    display_system_info()

    log_completed_step("pos_extraction_large")

    return words_df

# PHASE 2: TRANSLATE WORDS EFFICIENTLY

def translate_word_batch(word_batch, target_lang, translator):
    """Translate a batch of English words to ``target_lang``.

    Args:
        word_batch: iterable of words; each is converted with ``str()``,
            lower-cased and stripped before translation.
        target_lang: target language code passed to the translator as
            ``tgt_lang`` (the source is always ``"eng_Latn"``).
        translator: callable accepting a string or a list of strings plus
            ``src_lang``/``tgt_lang``/``max_length`` keywords and returning a
            list of dicts with a ``'translation_text'`` entry.

    Returns:
        list of cleaned translations. An empty batch returns ``[]`` without
        calling the translator. If every attempt fails, words are translated
        one at a time and a word that still fails becomes ``"[ERROR] <word>"``.
    """
    max_retries = 3
    batch_capable_langs = ('fra_Latn', 'wol_Latn', 'bam_Latn')

    # Normalize input words and ensure all are strings
    normalized_batch = [str(word).lower().strip() for word in word_batch]
    if not normalized_batch:
        # Nothing to translate: avoid a pointless (and possibly failing) model call
        return []

    def translate_one(word):
        """Translate a single word and return the translated text."""
        result = translator(word, src_lang="eng_Latn", tgt_lang=target_lang, max_length=200)
        return result[0]['translation_text']

    translations = []
    for retry in range(max_retries):
        try:
            if target_lang in batch_capable_langs:
                # Use a single call for the whole batch when possible
                results = translator(normalized_batch, src_lang="eng_Latn", tgt_lang=target_lang, max_length=200)
                translations = [result['translation_text'] for result in results]
            else:
                # Fallback to individual translation
                translations = [translate_one(word) for word in normalized_batch]
            break  # Success, exit retry loop
        except Exception as e:
            log_progress(f"Error translating batch (attempt {retry+1}/{max_retries}): {str(e)}", "WARNING")
            if retry == max_retries - 1:  # Last retry failed
                # Last resort: process words individually, pausing between calls
                translations = []
                for word in normalized_batch:
                    try:
                        time.sleep(0.5)
                        translations.append(translate_one(word))
                    except Exception:
                        # `Exception`, not a bare except, so an interrupt can
                        # still abort a long per-word fallback
                        translations.append(f"[ERROR] {word}")
            else:
                # Wait before retrying; the delay grows linearly (2s, then 4s)
                time.sleep(2 * (retry + 1))

    # Post-process translations
    clean_translations = []
    for trans in translations:
        # Remove surrounding whitespace
        clean_trans = trans.strip()
        # Unwrap output that is entirely enclosed in square brackets
        if clean_trans.startswith("[") and clean_trans.endswith("]"):
            clean_trans = clean_trans[1:-1].strip()
        clean_translations.append(clean_trans)

    return clean_translations

def _load_translation_checkpoint(checkpoint_file, languages):
    """Load the per-language translation cache, tolerating a missing or damaged file.

    Returns a dict with one ``{english_word: translation}`` dict per entry in
    *languages*. Missing or malformed entries are replaced with empty dicts so
    callers can index every language without a KeyError.
    """
    cache = {}
    resumed = False
    if os.path.exists(checkpoint_file):
        try:
            with open(checkpoint_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; a truncated checkpoint must
            # not abort the whole run, it only costs us the cached work.
            log_progress(f"Could not read checkpoint ({str(e)}); starting with an empty cache", "WARNING")
        else:
            if isinstance(loaded, dict):
                cache = loaded
                resumed = True

    for lang in languages:
        if not isinstance(cache.get(lang), dict):
            cache[lang] = {}

    if resumed:
        log_progress(f"Resuming from checkpoint with {len(cache['french'])} translated words")
    return cache


def _save_translation_checkpoint(translations, checkpoint_file):
    """Write the translation cache atomically (temp file + rename)."""
    tmp_file = f"{checkpoint_file}.tmp"
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(translations, f)
    # os.replace swaps the file in one step, so an interruption during the
    # write above can never leave a half-written checkpoint behind.
    os.replace(tmp_file, checkpoint_file)


def translate_large_word_lists(words_df, use_test_mode=False):
    """Translate the large word lists to French, Bambara and Wolof.

    Args:
        words_df: DataFrame with at least a ``word`` column (English word) and
            a ``pos`` column; only rows whose ``pos`` is in ``TARGET_POS`` are used.
        use_test_mode: When True, no model is loaded and every translation is a
            placeholder of the form ``fr_<word>`` / ``bam_<word>`` / ``wol_<word>``.

    Returns:
        DataFrame with columns ``english``, ``french``, ``bambara``, ``wolof``
        and ``pos``. The same data is written to
        ``data/pos/quadrilingual_words_large.csv``.

    Side effects:
        Writes interim chunks via ``save_interim_results``, per-POS CSV files
        under ``interim_results/``, a JSON translation cache in
        ``checkpoint_dir`` and marks the ``word_translation_large`` step as
        completed.
    """
    output_file = "data/pos/quadrilingual_words_large.csv"
    checkpoint_file = f"{checkpoint_dir}/word_translation_large_checkpoint.json"
    columns = ['english', 'french', 'bambara', 'wolof', 'pos']
    languages = ('french', 'bambara', 'wolof')
    csv_save_every = 10000  # rewrite the progress CSV after this many new entries

    # Skip if already completed
    if is_step_completed("word_translation_large") and os.path.exists(output_file):
        log_progress("Skipping large-scale word translation - already completed")
        # keep_default_na=False: otherwise words such as "null" or "nan" (and
        # empty translations) come back as float NaN instead of text.
        return pd.read_csv(output_file, keep_default_na=False)

    # Check if we need test mode
    if use_test_mode:
        log_progress("Using test mode with placeholder translations")

        test_data = []
        last_csv_save = 0

        # Process each POS category with placeholder translations
        for pos in TARGET_POS:
            pos_words = words_df[words_df['pos'] == pos]
            log_progress(f"Creating test translations for {len(pos_words)} {pos} words")

            chunk_count = 0
            chunk_data = []

            pbar = tqdm(pos_words['word'].tolist(), desc=f"Processing {pos}")

            for i, english in enumerate(pbar):
                item = {
                    'english': english,
                    'french': f"fr_{english}",
                    'bambara': f"bam_{english}",
                    'wolof': f"wol_{english}",
                    'pos': pos
                }

                test_data.append(item)
                chunk_data.append(item)

                # Update progress
                if (i+1) % 100 == 0:
                    pbar.set_description(f"Processed {i+1}/{len(pos_words)} {pos} words")

                # Save interim results every INTERIM_SAVE_EVERY words
                if len(chunk_data) >= INTERIM_SAVE_EVERY:
                    save_interim_results(chunk_data, pos, chunk_count)
                    chunk_count += 1
                    chunk_data = []

                # Also save progress to CSV periodically. A threshold on the
                # number of new entries is used because an exact-multiple test
                # tied to the per-POS chunk boundary almost never fires.
                if len(test_data) - last_csv_save >= csv_save_every:
                    pd.DataFrame(test_data).to_csv(output_file, index=False)
                    last_csv_save = len(test_data)
                    log_progress(f"Saved progress with {len(test_data)} test translations")

                    # Display system info
                    display_system_info()

            # Save any remaining words in the chunk
            if chunk_data:
                save_interim_results(chunk_data, pos, chunk_count)

        # Create DataFrame and save (explicit columns keep the schema even when empty)
        test_df = pd.DataFrame(test_data, columns=columns)
        test_df.to_csv(output_file, index=False)

        log_progress(f"Created test translations with {len(test_df)} entries")
        log_completed_step("word_translation_large")

        return test_df

    # Real translation mode
    log_progress("Translating large word lists to all target languages")
    display_system_info()

    # Resume from checkpoint if one exists; always yields all three language caches
    translations = _load_translation_checkpoint(checkpoint_file, languages)

    # Initialize translation model
    try:
        # Set up model with lower precision for memory efficiency
        log_progress("Loading translation model...")
        model_name = "facebook/nllb-200-distilled-600M"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )

        translator = pipeline(
            "translation",
            model=model,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )
        log_progress("Loaded NLLB translation model with memory optimizations")
    except Exception as e:
        # Deliberate best-effort: without a model the run continues with placeholders
        log_progress(f"Error loading translation model: {str(e)}", "ERROR")
        log_progress("Will use placeholder translations", "WARNING")
        translator = None

    # High-quality translations for common words
    common_words = {
        'DET': {
            'the': {'french': 'le/la', 'bambara': 'o', 'wolof': 'bi/gi'},
            'a': {'french': 'un/une', 'bambara': 'dɔ', 'wolof': 'ab'},
            'an': {'french': 'un/une', 'bambara': 'dɔ', 'wolof': 'ab'},
            'this': {'french': 'ce/cette', 'bambara': 'nin', 'wolof': 'bii'},
            'that': {'french': 'ce/cette', 'bambara': 'o', 'wolof': 'boobu'},
            'these': {'french': 'ces', 'bambara': 'ninnu', 'wolof': 'yii'},
            'those': {'french': 'ceux/celles', 'bambara': 'olu', 'wolof': 'yooyu'},
            'my': {'french': 'mon/ma/mes', 'bambara': 'n ka', 'wolof': 'sama'},
            'your': {'french': 'ton/ta/tes', 'bambara': 'i ka', 'wolof': 'sa'},
            'his': {'french': 'son/sa/ses', 'bambara': 'a ka', 'wolof': 'am'},
            'her': {'french': 'son/sa/ses', 'bambara': 'a ka', 'wolof': 'am'},
        },
        'PRON': {
            'i': {'french': 'je', 'bambara': 'n', 'wolof': 'man'},
            'you': {'french': 'tu/vous', 'bambara': 'i', 'wolof': 'yaw'},
            'he': {'french': 'il', 'bambara': 'a', 'wolof': 'moom'},
            'she': {'french': 'elle', 'bambara': 'a', 'wolof': 'moom'},
            'it': {'french': 'il/elle', 'bambara': 'a', 'wolof': 'moom'},
            'we': {'french': 'nous', 'bambara': 'an', 'wolof': 'nun'},
            'they': {'french': 'ils/elles', 'bambara': 'u', 'wolof': 'ñoom'},
            'me': {'french': 'me/moi', 'bambara': 'n', 'wolof': 'ma'},
            'him': {'french': 'lui', 'bambara': 'a', 'wolof': 'ko'},
            'her': {'french': 'elle/lui', 'bambara': 'a', 'wolof': 'ko'},
        },
        'NOUN': {
            'man': {'french': 'homme', 'bambara': 'cɛ', 'wolof': 'góor'},
            'woman': {'french': 'femme', 'bambara': 'muso', 'wolof': 'jigéen'},
            'child': {'french': 'enfant', 'bambara': 'den', 'wolof': 'xale'},
            'house': {'french': 'maison', 'bambara': 'so', 'wolof': 'kër'},
            'water': {'french': 'eau', 'bambara': 'ji', 'wolof': 'ndox'},
        },
        'VERB': {
            'go': {'french': 'aller', 'bambara': 'taa', 'wolof': 'dem'},
            'come': {'french': 'venir', 'bambara': 'na', 'wolof': 'ñëw'},
            'eat': {'french': 'manger', 'bambara': 'dumu', 'wolof': 'lekk'},
            'drink': {'french': 'boire', 'bambara': 'min', 'wolof': 'naan'},
            'see': {'french': 'voir', 'bambara': 'ye', 'wolof': 'gis'},
        }
    }

    # (cache key, NLLB language code, name used in log messages)
    target_languages = (
        ('french', 'fra_Latn', 'French'),
        ('bambara', 'bam_Latn', 'Bambara'),
        ('wolof', 'wol_Latn', 'Wolof'),
    )

    # Process words by POS category
    all_translations = []
    last_csv_save = 0

    for pos in TARGET_POS:
        pos_words = words_df[words_df['pos'] == pos]
        log_progress(f"Translating {len(pos_words)} {pos} words")
        display_system_info()

        pos_common = common_words.get(pos, {})
        pos_translations = []
        chunk_count = 0
        chunk_translations = []

        # Process in batches
        pbar = tqdm(range(0, len(pos_words), BATCH_SIZE), desc=f"Translating {pos}")

        for i in pbar:
            batch_words = pos_words['word'].iloc[i:i+BATCH_SIZE].tolist()

            # Only translate words that are neither hand-translated nor fully
            # cached; a word missing from any one language cache is redone.
            to_translate = [
                word for word in batch_words
                if word not in pos_common
                and any(word not in translations[lang] for lang in languages)
            ]

            # Translate if needed
            if to_translate and translator:
                for lang, lang_code, lang_name in target_languages:
                    log_progress(f"Translating {len(to_translate)} new words to {lang_name}")
                    lang_results = translate_word_batch(to_translate, lang_code, translator)
                    for word, trans in zip(to_translate, lang_results):
                        translations[lang][word] = trans

                # Free up memory
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                # Save checkpoint only when the cache actually changed, so
                # resumed runs do not rewrite the whole file for cached batches.
                _save_translation_checkpoint(translations, checkpoint_file)

            # Add translations for this batch
            batch_translations = []
            for word in batch_words:
                if word in pos_common:
                    # Hand-written entries take priority (highest quality)
                    entry = pos_common[word]
                    french = entry['french']
                    bambara = entry['bambara']
                    wolof = entry['wolof']
                elif translator:
                    # Use cached translations
                    french = translations['french'].get(word, f"[FR] {word}")
                    bambara = translations['bambara'].get(word, f"[BAM] {word}")
                    wolof = translations['wolof'].get(word, f"[WOL] {word}")
                else:
                    # Use placeholder translations
                    french = f"[FR] {word}"
                    bambara = f"[BAM] {word}"
                    wolof = f"[WOL] {word}"

                batch_translations.append({
                    'english': word,
                    'french': french,
                    'bambara': bambara,
                    'wolof': wolof,
                    'pos': pos
                })

            # Add to the running lists
            chunk_translations.extend(batch_translations)
            pos_translations.extend(batch_translations)
            all_translations.extend(batch_translations)

            # Update progress bar
            pbar.set_description(f"Translated {len(all_translations)} words total")

            # Save chunk after every INTERIM_SAVE_EVERY words
            if len(chunk_translations) >= INTERIM_SAVE_EVERY:
                save_interim_results(chunk_translations, pos, chunk_count)
                chunk_count += 1
                chunk_translations = []

            # Also save progress to CSV periodically. The count grows by a whole
            # batch at a time, so compare against a threshold rather than
            # testing for an exact multiple (which would rarely be hit).
            if len(all_translations) - last_csv_save >= csv_save_every:
                pd.DataFrame(all_translations).to_csv(output_file, index=False)
                last_csv_save = len(all_translations)
                log_progress(f"Saved progress with {len(all_translations)} translated words")

                # Display system info
                display_system_info()

        # Save any remaining words in the chunk
        if chunk_translations:
            save_interim_results(chunk_translations, pos, chunk_count)

        # Save complete POS translations
        pos_df = pd.DataFrame(pos_translations)
        pos_df.to_csv(f"interim_results/complete_{pos}_translations.csv", index=False)
        log_progress(f"Saved complete translations for {pos}: {len(pos_df)} words")

    # Create final DataFrame (explicit columns keep the schema even when empty)
    quad_df = pd.DataFrame(all_translations, columns=columns)

    # Save to file
    quad_df.to_csv(output_file, index=False)

    log_progress(f"Translated {len(quad_df)} words to all languages")
    display_system_info()

    log_completed_step("word_translation_large")

    return quad_df

# PHASE 3: GENERATE POS-ALIGNED CSV

def generate_large_pos_aligned_csv(quad_df):
    """Generate the final large-scale POS-aligned CSV file.

    The output has one group of four columns (ENG, FR, BAM, WOL) per POS
    category that has at least one word. Row ``i`` holds the i-th word of every
    category; categories with fewer words are padded with empty cells.

    Args:
        quad_df: DataFrame with ``english``, ``french``, ``bambara``, ``wolof``
            and ``pos`` columns.

    Raises:
        ValueError: if *quad_df* contains no rows for any ``TARGET_POS``
            category, raised before the output file is created.

    Side effects:
        Writes ``output/pos_aligned_quadrilingual_large.csv``, one
        ``interim_results/aligned_chunk_<n>.csv`` per 1000 rows, and marks the
        ``csv_generation_large`` step as completed.
    """
    output_file = "output/pos_aligned_quadrilingual_large.csv"
    languages = ("english", "french", "bambara", "wolof")

    # Skip if already completed
    if is_step_completed("csv_generation_large") and os.path.exists(output_file):
        log_progress("Skipping large-scale CSV generation - already completed")
        return

    log_progress("Generating large-scale POS-aligned CSV file")
    display_system_info()

    # Group words by POS category
    pos_dict = {}
    if "pos" in quad_df.columns:
        for pos in TARGET_POS:
            pos_words = quad_df[quad_df["pos"] == pos]
            if len(pos_words) > 0:
                # Missing cells (NaN, e.g. after a CSV round trip) become empty
                # strings; csv.writer would otherwise emit the text "nan".
                pos_dict[pos] = {
                    lang: ["" if pd.isna(value) else value for value in pos_words[lang].tolist()]
                    for lang in languages
                }

    # Fail with a clear message instead of the opaque KeyError or
    # "max() arg is an empty sequence" the empty case used to raise.
    if not pos_dict:
        raise ValueError("No translated words found for any target POS category; nothing to write")

    # Header rows: POS name spanning four columns, then the language codes
    pos_header = []
    lang_header = []
    for pos in pos_dict:
        pos_header.extend([pos, "", "", ""])
        lang_header.extend(["ENG", "FR", "BAM", "WOL"])

    # The longest category determines the number of data rows
    max_words = max(len(words["english"]) for words in pos_dict.values())
    empty_cells = [""] * len(languages)

    # Create CSV file with the specified format - process in chunks to handle large size
    log_progress("Writing CSV file (this may take some time for large datasets)")

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(pos_header)
        writer.writerow(lang_header)

        log_progress(f"Preparing to write {max_words} rows of data")

        # Write data rows in batches
        batch_size = 1000
        pbar = tqdm(range(0, max_words, batch_size), desc="Writing CSV")

        for start_idx in pbar:
            end_idx = min(start_idx + batch_size, max_words)

            chunk_rows = []
            for i in range(start_idx, end_idx):
                row = []
                for words in pos_dict.values():
                    # All four lists of a category come from the same rows and
                    # therefore share one length; one bounds check covers them.
                    if i < len(words["english"]):
                        row.extend(words[lang][i] for lang in languages)
                    else:
                        row.extend(empty_cells)
                chunk_rows.append(row)

            writer.writerows(chunk_rows)

            # Save an interim version of this chunk (with its own headers)
            chunk_number = start_idx // batch_size
            interim_file = f"interim_results/aligned_chunk_{chunk_number}.csv"

            with open(interim_file, "w", newline="", encoding="utf-8") as chunk_f:
                chunk_writer = csv.writer(chunk_f)
                chunk_writer.writerow(pos_header)
                chunk_writer.writerow(lang_header)
                chunk_writer.writerows(chunk_rows)

            log_progress(f"Saved interim aligned CSV chunk {chunk_number} ({start_idx}-{end_idx-1})")

            # Update progress
            progress_pct = min(100, 100 * end_idx / max_words)
            pbar.set_description(f"Writing CSV: {progress_pct:.1f}% complete")

            # Display system info occasionally
            if chunk_number % 10 == 0:
                display_system_info()

    log_progress(f"Generated large-scale POS-aligned CSV with {max_words} rows")
    display_system_info()

    log_completed_step("csv_generation_large")

# MAIN EXECUTION

def main():
    """Run the three pipeline phases in order: extract, translate, write the aligned CSV."""
    separator = "=" * 50

    # Opening banner
    for line in (separator, "STARTING LARGE-SCALE POS-ALIGNED CSV GENERATION PIPELINE", separator):
        log_progress(line)

    # System snapshot before any work starts
    display_system_info()

    # Phase 1: build the English word lists with their POS tags
    log_progress("PHASE 1: EXTRACT LARGE-SCALE WORD LISTS WITH POS TAGS")
    extracted_words = extract_large_word_lists()

    # Phase 2: translate every word; use_test_mode=False requests real
    # translations instead of placeholders
    log_progress("PHASE 2: TRANSLATE WORDS TO ALL TARGET LANGUAGES")
    translated_words = translate_large_word_lists(extracted_words, use_test_mode=False)

    # Phase 3: lay the translations out as the POS-aligned CSV
    log_progress("PHASE 3: GENERATE LARGE-SCALE POS-ALIGNED CSV")
    generate_large_pos_aligned_csv(translated_words)

    # Closing banner
    for line in (separator, "🎉 LARGE-SCALE PIPELINE COMPLETED SUCCESSFULLY!", separator):
        log_progress(line)

    # System snapshot after the run
    display_system_info()

# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()