In [None]:
import os
import csv
import pandas as pd
import nltk
import random
import time
import json
import torch
import datetime
import psutil
import multiprocessing
import threading
import concurrent.futures
from tqdm.auto import tqdm
from colorama import init, Fore, Style
import re
import gc
import hashlib
import shutil
import ssl

# Initialize colorama for colored terminal output.
# Runs once, as a side effect of loading this module.
init()

# ====================
# CONFIGURATION
# ====================

# Maximum words per POS category (the extraction step fills every category up to this cap)
MAX_WORDS_PER_POS = 10000

# Translation batch size - passed as `batch_size` when the translation pipeline is built
BATCH_SIZE = 500

# Target POS categories; extraction reports and tops up the word count for each of these
TARGET_POS = ['NOUN', 'VERB', 'ADJ', 'ADV', 'DET', 'PRON', 'ADP', 'CONJ', 'NUM', 'INTJ']

# Save interim results every N words
# NOTE(review): not referenced in the visible part of this file - presumably
# consumed by the translation stage; confirm before changing.
INTERIM_SAVE_EVERY = 1000

# Worker-pool sizing: twice the CPU count, capped at 16.
# MAX_WORKERS sizes the thread pools used for WordNet extraction and POS tagging.
CPU_COUNT = multiprocessing.cpu_count()
MAX_WORKERS = min(CPU_COUNT * 2, 16)  # Scale workers based on available CPUs
# NOTE(review): MAX_TRANSLATION_WORKERS is not referenced in the visible part of this file.
MAX_TRANSLATION_WORKERS = 8  # Dedicated workers for translation tasks

# Translation models - both ids are NLLB-200 distilled checkpoints
PRIMARY_MODEL = "facebook/nllb-200-distilled-1.3B"  # Using larger model for better quality
FALLBACK_MODEL = "facebook/nllb-200-distilled-600M"  # Fallback to smaller model if memory issues

# Translation language codes: language name -> "<language>_<script>" identifier.
# NOTE(review): presumably handed to the NLLB tokenizer/pipeline as source/target
# language; the call site is outside the visible part of this file.
LANGUAGE_CODES = {
    "english": "eng_Latn", 
    "french": "fra_Latn", 
    "bambara": "bam_Latn", 
    "wolof": "wol_Latn"
}

# Custom NLTK data directory: created eagerly at import time and appended to
# NLTK's resource search path.
NLTK_DATA_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
nltk.data.path.append(NLTK_DATA_DIR)

# POS tag mapping: Penn Treebank tag (as produced by the NLTK tagger) -> coarse
# POS category. Tags that do not appear here are ignored by the extraction code,
# which looks tags up with TAG_POS_MAP.get(tag).
TAG_POS_MAP = {
    'NN': 'NOUN',
    'NNS': 'NOUN',
    'NNP': 'NOUN',
    'NNPS': 'NOUN',
    'VB': 'VERB',
    'VBD': 'VERB',
    'VBG': 'VERB',
    'VBN': 'VERB',
    'VBP': 'VERB',
    'VBZ': 'VERB',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    'RB': 'ADV',
    'RBR': 'ADV',
    'RBS': 'ADV',
    'DT': 'DET',
    'PDT': 'DET',
    'WDT': 'DET',
    'PRP': 'PRON',
    # The tagger emits the possessive tags with a literal '$'; without these
    # keys possessive pronouns were silently dropped by the lookup.
    'PRP$': 'PRON',
    'PRPS': 'PRON',  # '$'-free alias, kept for backward compatibility
    'WP': 'PRON',
    'WP$': 'PRON',
    'WPS': 'PRON',   # '$'-free alias, kept for backward compatibility
    'IN': 'ADP',
    'CC': 'CONJ',
    'CD': 'NUM',
    'UH': 'INTJ'
}

# Placeholder for a WordNet POS mapping. It starts empty because the WordNet
# constants are only available once the corpus has been downloaded.
# NOTE(review): the extraction function visible in this file builds its own
# local mapping and does not fill this one - confirm whether it is still used.
WN_POS_MAP = {}

# ====================
# DIRECTORY SETUP
# ====================

def setup_drive_directories():
    """Create the project's folder tree and report where it lives.

    Google Drive is mounted when the code runs inside Google Colab; if the
    Colab module cannot be imported or anything in the mount step fails, a
    local ``./multilingual_dataset`` folder is used instead.

    Returns:
        tuple: ``(base_dir, directories)`` where ``directories`` maps a short
        name ("data", "pos_data", "output", ...) to the created path.
    """
    try:
        # This import only succeeds inside Google Colab
        from google.colab import drive

        drive.mount('/content/drive')

        base_dir = "/content/drive/MyDrive/multilingual_dataset"
        os.makedirs(base_dir, exist_ok=True)

        print(f"{Fore.GREEN}✅ Google Drive mounted successfully{Style.RESET_ALL}")
    except Exception as e:
        print(f"{Fore.YELLOW}⚠️ Not running in Google Colab or drive access error: {str(e)}{Style.RESET_ALL}")
        print(f"{Fore.YELLOW}⚠️ Using local directories instead{Style.RESET_ALL}")

        # Fall back to a folder next to the working directory
        base_dir = "./multilingual_dataset"
        os.makedirs(base_dir, exist_ok=True)

    # Short name -> location relative to the base directory
    layout = (
        ("data", "data"),
        ("pos_data", "data/pos"),
        ("output", "output"),
        ("checkpoints", "checkpoints"),
        ("logs", "logs"),
        ("interim", "interim_results"),
        ("cache", "cache"),
        ("samples", "samples"),
        ("translations", "translations"),
        ("final", "final_results"),
    )
    directories = {name: os.path.join(base_dir, relative) for name, relative in layout}

    for folder in directories.values():
        os.makedirs(folder, exist_ok=True)

    print(f"{Fore.GREEN}✅ Created all project directories{Style.RESET_ALL}")
    return base_dir, directories

# ====================
# LOGGING & MONITORING
# ====================

def get_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    now = datetime.datetime.now()
    return f"{now:%Y-%m-%d %H:%M:%S}"

class Logger:
    """Enhanced logger with structured logging and progress tracking.

    Four UTF-8 text files are kept inside ``log_dir``:

    * ``main.log`` - every message passed to :meth:`log`
    * ``errors.log`` - copy of the messages logged with level ``"ERROR"``
    * ``progress.log`` - lines written by :meth:`track_progress`
    * ``completed_steps.log`` - one ``step_name,timestamp`` line per finished step

    The completed-steps file is read back on construction so that a restarted
    run can skip finished work via :meth:`is_completed`. Each file has its own
    lock so concurrent threads do not interleave partial lines.
    """
    def __init__(self, log_dir):
        """Create the log files under ``log_dir`` (which must already exist)."""
        self.log_dir = log_dir
        self.main_log = os.path.join(log_dir, "main.log")
        self.progress_log = os.path.join(log_dir, "progress.log")
        self.completed_log = os.path.join(log_dir, "completed_steps.log")
        self.error_log = os.path.join(log_dir, "errors.log")

        # Create log files if they don't exist. The encoding is explicit
        # because messages contain non-ASCII characters (check marks, emoji)
        # that the platform default encoding may not be able to represent.
        for log_file in [self.main_log, self.progress_log, self.completed_log, self.error_log]:
            if not os.path.exists(log_file):
                with open(log_file, 'w', encoding='utf-8') as f:
                    f.write(f"Log started at {get_timestamp()}\n")

        # Track completed steps
        self.completed_steps = self._load_completed_steps()

        # Set up log-specific locks to prevent race conditions in multi-threading
        self.main_log_lock = threading.Lock()
        self.progress_log_lock = threading.Lock()
        self.completed_log_lock = threading.Lock()
        self.error_log_lock = threading.Lock()

    @staticmethod
    def _append_line(path, lock, text):
        """Append ``text`` plus a newline to ``path`` while holding ``lock``."""
        with lock:
            with open(path, 'a', encoding='utf-8') as f:
                f.write(f"{text}\n")

    def _load_completed_steps(self):
        """Load the set of completed step names from the completed-steps file.

        Lines without a comma (such as the "Log started at ..." header) are
        ignored. The timestamp is the part after the LAST comma, so step
        names that themselves contain commas are read back intact.
        """
        completed = set()
        if os.path.exists(self.completed_log):
            # errors='replace' keeps a log written under another encoding readable
            with open(self.completed_log, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    if ',' in line:
                        step = line.strip().rsplit(',', 1)[0]
                        completed.add(step)
        return completed

    def log(self, message, level="INFO"):
        """Print ``message`` in color and append it to the main log.

        ``"INFO"`` prints green, ``"WARNING"`` yellow, any other level red.
        Messages with level ``"ERROR"`` are additionally appended to the
        error log.
        """
        timestamp = get_timestamp()
        if level == "INFO":
            color = Fore.GREEN
        elif level == "WARNING":
            color = Fore.YELLOW
        else:
            color = Fore.RED
        log_message = f"[{timestamp}] {level}: {message}"

        # Print to console
        print(f"{color}{log_message}{Style.RESET_ALL}")

        # Write to log file
        self._append_line(self.main_log, self.main_log_lock, log_message)

        # Write to error log if error
        if level == "ERROR":
            self._append_line(self.error_log, self.error_log_lock, log_message)

    def track_progress(self, phase, current, total, additional_info=""):
        """Append a ``current/total (percent)`` line for ``phase`` to the progress log.

        A ``total`` of zero or less is reported as 0% instead of dividing by zero.
        """
        timestamp = get_timestamp()
        percentage = (current / total) * 100 if total > 0 else 0
        progress_message = f"[{timestamp}] {phase}: {current}/{total} ({percentage:.2f}%) {additional_info}"

        # Write to progress log
        self._append_line(self.progress_log, self.progress_log_lock, progress_message)

    def complete_step(self, step_name):
        """Record ``step_name`` as completed, on disk and in memory, then log it."""
        timestamp = get_timestamp()

        with self.completed_log_lock:
            with open(self.completed_log, 'a', encoding='utf-8') as f:
                f.write(f"{step_name},{timestamp}\n")
            # Updated under the same lock so file and in-memory set stay in step
            self.completed_steps.add(step_name)

        self.log(f"✅ Completed step: {step_name}")

    def is_completed(self, step_name):
        """Return True if ``step_name`` has been recorded as completed."""
        return step_name in self.completed_steps

    def summary(self):
        """Log an alphabetically sorted summary of the completed steps."""
        self.log("=" * 50)
        self.log("SUMMARY OF COMPLETED STEPS")
        self.log("=" * 50)

        for step in sorted(self.completed_steps):
            self.log(f"✓ {step}")

        self.log("=" * 50)

# System monitoring
def display_system_info():
    """Print a snapshot of CPU, RAM, swap, disk and (when available) GPU usage.

    Every figure is collected before anything is printed, so a failure while
    collecting yields only the single warning line rather than a partial
    report. Sampling the CPU load blocks for about one second
    (``cpu_percent(interval=1)``). Nothing is returned.
    """
    def to_gb(num_bytes):
        # Bytes -> gigabytes (1024-based)
        return num_bytes / (1024 ** 3)

    try:
        ram = psutil.virtual_memory()
        swap = psutil.swap_memory()

        cpu_load = psutil.cpu_percent(interval=1)
        freq = psutil.cpu_freq()
        if freq:
            freq_text = f"{freq.current:.2f} MHz"
        else:
            freq_text = "N/A"

        disk = psutil.disk_usage('/')

        # GPU section stays empty when CUDA is not available
        gpu_text = ""
        if torch.cuda.is_available():
            device_total = torch.cuda.device_count()
            device_name = "N/A" if device_total <= 0 else torch.cuda.get_device_name(0)
            allocated_gb = to_gb(torch.cuda.memory_allocated())
            reserved_gb = to_gb(torch.cuda.memory_reserved())
            gpu_text = (
                f"GPU: {device_name} ({device_total} devices)\n"
                f"GPU Memory: {allocated_gb:.2f}GB allocated / {reserved_gb:.2f}GB reserved"
            )

        banner = f"{Fore.CYAN}{'=' * 50}{Style.RESET_ALL}"
        report = [
            banner,
            f"{Fore.CYAN}SYSTEM INFORMATION{Style.RESET_ALL}",
            banner,
            f"CPU Usage: {cpu_load}% (Freq: {freq_text})",
            f"RAM: {to_gb(ram.used):.2f}GB / {to_gb(ram.total):.2f}GB ({ram.percent}%)",
            f"Swap: {to_gb(swap.used):.2f}GB / {to_gb(swap.total):.2f}GB ({swap.percent}%)",
            f"Disk: {to_gb(disk.used):.2f}GB / {to_gb(disk.total):.2f}GB ({disk.percent}%)",
        ]
        if gpu_text:
            report.append(gpu_text)
        report.append(banner)

        for line in report:
            print(line)
    except Exception as e:
        print(f"{Fore.YELLOW}⚠️ Couldn't display system info: {str(e)}{Style.RESET_ALL}")

# Cache manager for expensive operations
class CacheManager:
    """Manage caching of expensive operations as JSON files on disk.

    Every entry lives at ``<cache_dir>/<category>/<md5 of str(key)>.json``.
    Files are read and written as UTF-8 so that non-ASCII values (stored
    unescaped via ``ensure_ascii=False``) round-trip regardless of the
    platform's default encoding.
    """
    def __init__(self, cache_dir):
        """Create ``cache_dir`` if needed and set up the write lock."""
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self.cache_lock = threading.Lock()

    def get_cache_path(self, key, category):
        """Return the file path for ``key`` in ``category``.

        The category directory is created as a side effect. MD5 is used only
        to derive a filesystem-safe file name from the key.
        """
        key_hash = hashlib.md5(str(key).encode()).hexdigest()
        category_dir = os.path.join(self.cache_dir, category)
        os.makedirs(category_dir, exist_ok=True)
        return os.path.join(category_dir, f"{key_hash}.json")

    def get(self, key, category):
        """Return the cached value for ``key``, or None on a miss.

        A missing, unreadable or corrupt cache file counts as a miss. Only
        I/O and decoding errors are treated that way, so interrupts and
        programming errors are no longer swallowed.
        """
        cache_path = self.get_cache_path(key, category)
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError):
            # OSError: file missing/unreadable.
            # ValueError: invalid JSON (JSONDecodeError) or invalid UTF-8.
            return None

    def set(self, key, value, category):
        """Store ``value`` (must be JSON-serializable) under ``key`` in ``category``."""
        cache_path = self.get_cache_path(key, category)
        with self.cache_lock:
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump(value, f, ensure_ascii=False)

    def clear_category(self, category):
        """Delete all cached items in ``category``, leaving an empty directory."""
        category_dir = os.path.join(self.cache_dir, category)
        if os.path.exists(category_dir):
            shutil.rmtree(category_dir)
            os.makedirs(category_dir, exist_ok=True)

# ====================
# DATA EXTRACTION
# ====================

def download_nltk_data():
    """Download the NLTK resources this pipeline needs and verify them.

    Each resource is fetched with ``nltk.download`` and its outcome printed.
    Afterwards WordNet, the words corpus and the POS tagger are each
    exercised once to confirm they really work.

    Returns:
        dict: ``{"wordnet": bool, "words": bool, "tagger": bool}`` - True
        when the component loaded and returned data.
    """
    print(f"{Fore.CYAN}Downloading NLTK resources...{Style.RESET_ALL}")

    # Fix SSL issues that might prevent downloads.
    # NOTE(security): this turns off HTTPS certificate verification for the
    # whole process (not just NLTK) from this point on. Kept as the existing
    # deliberate workaround; review before using in a sensitive environment.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    # Resources to download
    resources = [
        'wordnet',
        'omw-1.4',  # Open Multilingual WordNet
        'words',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',  # tagger model name looked up by NLTK >= 3.9
        'punkt',
        'punkt_tab',  # tokenizer tables looked up by NLTK >= 3.9
        'tagsets',
        'universal_tagset'
    ]

    # Download each resource with explicit console output
    for resource in resources:
        try:
            print(f"Downloading {resource}... ", end="", flush=True)
            download_result = nltk.download(resource)
            if download_result:
                print(f"{Fore.GREEN}Success{Style.RESET_ALL}")
            else:
                print(f"{Fore.RED}Failed{Style.RESET_ALL}")
        except Exception as e:
            print(f"{Fore.RED}Error: {str(e)}{Style.RESET_ALL}")

    # Verification phase
    print(f"\n{Fore.CYAN}Verifying critical NLTK resources:{Style.RESET_ALL}")

    # The imports live inside the probes so that they run after the downloads
    # and so that an import failure is reported like any other failure.
    def probe_wordnet():
        from nltk.corpus import wordnet as wn
        return wn.synsets('test')

    def probe_words():
        from nltk.corpus import words
        return words.words()

    def probe_tagger():
        from nltk.tag import pos_tag
        return pos_tag(['test', 'this'])

    # (result key, display name, probe, extra detail appended on success)
    checks = (
        ("wordnet", "WordNet", probe_wordnet, lambda data: ""),
        ("words", "Words corpus", probe_words, lambda data: f" ({len(data)} words)"),
        ("tagger", "POS tagger", probe_tagger, lambda data: ""),
    )

    status = {}
    for key, label, probe, describe in checks:
        ok = False
        try:
            data = probe()
            if data:
                print(f"{Fore.GREEN}✓ {label} is working{describe(data)}{Style.RESET_ALL}")
                ok = True
            else:
                print(f"{Fore.YELLOW}? {label} available but returned no data{Style.RESET_ALL}")
        except Exception as e:
            print(f"{Fore.RED}✗ {label} failed: {str(e)}{Style.RESET_ALL}")
        status[key] = ok

    # Return status of critical components
    return status

def process_wordnet_pos(wn_pos, target_pos, existing_words, words_per_pos, max_words):
    """Collect new single-word lemmas for one WordNet POS (worker for parallel use).

    Args:
        wn_pos: WordNet POS constant passed to ``wn.all_synsets``.
        target_pos: Category label stored with each collected word.
        existing_words: Set of ``(word, pos)`` pairs already known; it is
            extended in place with every pair collected here.
        words_per_pos: Mapping of category label to current word count.
        max_words: Cap on the number of words for the category.

    Returns:
        list: ``{"word": ..., "pos": ...}`` dicts, at most as many as are
        still missing for the category. Empty when the category is already
        full or WordNet cannot be read.
    """
    already_have = words_per_pos.get(target_pos, 0)
    if already_have >= max_words:
        return []

    try:
        from nltk.corpus import wordnet as wn
        synsets = list(wn.all_synsets(wn_pos))
        random.shuffle(synsets)  # random order gives a varied selection
    except Exception as e:
        print(f"{Fore.RED}Error accessing WordNet: {str(e)}{Style.RESET_ALL}")
        # Nothing can be collected without WordNet
        return []

    quota = max_words - already_have
    collected = []

    for synset in synsets:
        for raw_lemma in synset.lemma_names():
            # Keep only purely alphabetic single words longer than two letters
            usable = '_' not in raw_lemma and raw_lemma.isalpha() and len(raw_lemma) > 2
            if not usable:
                continue

            entry = (raw_lemma.lower(), target_pos)
            if entry in existing_words:
                continue

            collected.append({"word": entry[0], "pos": target_pos})
            existing_words.add(entry)

            # Stop as soon as the category's quota is filled
            if len(collected) >= quota:
                return collected

    return collected

def extract_lexicon_from_nltk_words(words_df, existing_words, words_per_pos, max_words, logger, num_workers=MAX_WORKERS):
    """Top up the lexicon with words from NLTK's ``words`` corpus.

    Up to 100000 randomly ordered corpus words are POS-tagged in batches on a
    thread pool. Every tagged word whose category still needs entries is
    collected until each category in ``TARGET_POS`` reaches ``max_words``.

    Args:
        words_df: DataFrame with ``word`` and ``pos`` columns to extend.
        existing_words: Set of ``(word, pos)`` pairs already present; it is
            extended in place with every pair collected here.
        words_per_pos: Mapping of category to current word count. It is only
            read, not updated, by this function.
        max_words: Target number of words per category.
        logger: Object providing ``log(message, level)`` and
            ``track_progress(phase, current, total, info)``.
        num_workers: Size of the tagging thread pool.

    Returns:
        tuple: ``(words_df, total_added)``. When the corpus or the tagger
        cannot be loaded, the input frame is returned with a count of 0.
    """
    corpus_limit = 100000  # upper bound on the number of corpus words tagged
    chunk_size = 5000      # words handed to the tagger per task

    try:
        from nltk.corpus import words

        english_words = words.words()
        random.shuffle(english_words)  # Randomize the list

        # Number of words that will actually be scheduled for tagging
        scheduled_total = min(corpus_limit, len(english_words))
        logger.log(f"Processing {scheduled_total} words from NLTK corpus")
    except Exception as e:
        logger.log(f"Error loading NLTK words corpus: {str(e)}", "ERROR")
        logger.log("Skipping NLTK word extraction", "WARNING")
        return words_df, 0

    # Dictionary to track words needed by POS
    needed_by_pos = {pos: max(0, max_words - words_per_pos.get(pos, 0)) for pos in TARGET_POS}
    total_needed = sum(needed_by_pos.values())

    if total_needed <= 0:
        logger.log("No additional words needed from NLTK corpus")
        return words_df, 0

    logger.log(f"Need {total_needed} more words from NLTK corpus")

    try:
        # Import NLTK POS tagger here to ensure it's available
        from nltk.tag import pos_tag

        word_batches = [
            english_words[start:start + chunk_size]
            for start in range(0, scheduled_total, chunk_size)
        ]

        # New words by POS
        new_words_by_pos = {pos: [] for pos in TARGET_POS}
        tagged_count = 0

        # Process batches in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            pending = [executor.submit(pos_tag, batch) for batch in word_batches]

            for future in tqdm(concurrent.futures.as_completed(pending),
                               total=len(pending),
                               desc="Tagging word batches"):
                try:
                    tagged_words = future.result()
                    tagged_count += len(tagged_words)

                    # Process tagged words
                    for word, tag in tagged_words:
                        if not word.isalpha() or len(word) <= 2:
                            continue

                        word = word.lower()
                        pos = TAG_POS_MAP.get(tag)

                        # .get() yields 0 for unmapped tags and for
                        # categories outside TARGET_POS
                        if needed_by_pos.get(pos, 0) > 0 and (word, pos) not in existing_words:
                            new_words_by_pos[pos].append({"word": word, "pos": pos})
                            existing_words.add((word, pos))
                            needed_by_pos[pos] -= 1

                    # Check if we have all needed words
                    total_remaining = sum(needed_by_pos.values())
                    logger.track_progress("NLTK words processed", tagged_count, scheduled_total,
                                          f"Words remaining: {total_remaining}")

                    if total_remaining <= 0:
                        logger.log("Collected all needed words from NLTK corpus")
                        # Drop batches that have not started yet; otherwise the
                        # executor would still tag them while shutting down
                        # even though their results are discarded.
                        for queued in pending:
                            queued.cancel()
                        break

                except Exception as e:
                    logger.log(f"Error processing batch: {str(e)}", "ERROR")
    except Exception as e:
        logger.log(f"Error in NLTK POS tagging: {str(e)}", "ERROR")
        logger.log("Will continue with other word sources", "WARNING")
        return words_df, 0

    # Add new words to dataframe (one concat, categories in TARGET_POS order)
    total_added = 0
    new_frames = []
    for pos in TARGET_POS:
        rows = new_words_by_pos[pos]
        if rows:
            new_frames.append(pd.DataFrame(rows))
            total_added += len(rows)
            logger.log(f"Added {len(rows)} {pos} words from NLTK corpus")

    if new_frames:
        words_df = pd.concat([words_df] + new_frames, ignore_index=True)

    return words_df, total_added

def _read_word_table(csv_path):
    """Load a word/POS CSV as plain strings.

    With pandas' default NA inference, legitimate vocabulary entries such as
    "nan" or "null" are turned into missing values when a checkpoint is read
    back; disabling it keeps every word intact.
    """
    return pd.read_csv(csv_path, dtype=str, keep_default_na=False)


def extract_words_with_pos_tags(directories, logger, cache_manager):
    """Build the English word list with POS tags from several sources.

    Sources are applied in order, each filling categories up to
    ``MAX_WORDS_PER_POS``:

    1. WordNet lemmas (noun, verb, adjective, adverb), one thread per category
    2. hand-written lists of common function words
    3. POS-tagged words from NLTK's ``words`` corpus
    4. synthetic placeholder words (``<pos><5-digit number>``) for whatever
       is still missing

    Progress is checkpointed to CSV so an interrupted run can resume, and the
    step is skipped entirely when it was already completed.

    Args:
        directories: Mapping with at least the ``"pos_data"`` and
            ``"checkpoints"`` paths.
        logger: Logger used for messages and completed-step bookkeeping.
        cache_manager: Accepted for interface compatibility; not used here.

    Returns:
        pandas.DataFrame: Frame with ``word`` and ``pos`` columns.
    """
    output_file = os.path.join(directories["pos_data"], "english_words.csv")
    checkpoint_file = os.path.join(directories["checkpoints"], "words_extraction_checkpoint.csv")

    # Skip if already completed
    if logger.is_completed("words_extraction") and os.path.exists(output_file):
        logger.log("Skipping word extraction - already completed")
        return _read_word_table(output_file)

    # Check if resuming from checkpoint
    if os.path.exists(checkpoint_file):
        words_df = _read_word_table(checkpoint_file)
        logger.log(f"Resuming from checkpoint with {len(words_df)} entries")
    else:
        # Initialize DataFrame
        words_df = pd.DataFrame(columns=["word", "pos"])

    # Track words we've already added
    existing_words = set(zip(words_df['word'], words_df['pos']))
    words_per_pos = {}

    for pos in TARGET_POS:
        pos_words = words_df[words_df['pos'] == pos]
        words_per_pos[pos] = len(pos_words)
        logger.log(f"Current count for {pos}: {words_per_pos[pos]}/{MAX_WORDS_PER_POS} words")

    # Display system info
    display_system_info()

    # Import WordNet inside the function to ensure it's loaded after download
    try:
        from nltk.corpus import wordnet as wn
        # Define WordNet POS mapping inside function to use the loaded module
        wn_pos_map = {
            wn.NOUN: 'NOUN',
            wn.VERB: 'VERB',
            wn.ADJ: 'ADJ',
            wn.ADV: 'ADV'
        }
        logger.log("Successfully imported WordNet")
    except Exception as e:
        logger.log(f"Error importing WordNet: {str(e)}", "ERROR")
        # Use simplified extraction without WordNet if it fails
        wn_pos_map = {}
        logger.log("Will use alternative methods for word extraction", "WARNING")

    # PHASE 1: Extract from WordNet (parallel processing)
    if wn_pos_map:  # Only if WordNet loaded successfully
        logger.log("Starting WordNet extraction with parallel processing...")

        # Filter out POS categories that already have enough words
        pos_to_process = {wn_pos: target_pos for wn_pos, target_pos in wn_pos_map.items()
                          if words_per_pos.get(target_pos, 0) < MAX_WORDS_PER_POS}

        if pos_to_process:
            logger.log(f"Processing {len(pos_to_process)} POS categories from WordNet")

            with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                # Submit tasks for each POS category
                future_to_pos = {
                    executor.submit(
                        process_wordnet_pos,
                        wn_pos,
                        target_pos,
                        existing_words,
                        words_per_pos,
                        MAX_WORDS_PER_POS
                    ): target_pos
                    for wn_pos, target_pos in pos_to_process.items()
                }

                # Process results as they come in
                for future in concurrent.futures.as_completed(future_to_pos):
                    target_pos = future_to_pos[future]
                    try:
                        new_rows = future.result()
                        if new_rows:
                            # Create DataFrame for new words
                            new_df = pd.DataFrame(new_rows)

                            # Save checkpoint for this POS category
                            pos_checkpoint = os.path.join(directories["checkpoints"], f"{target_pos}_wordnet.csv")
                            new_df.to_csv(pos_checkpoint, index=False)

                            # Update master DataFrame
                            words_df = pd.concat([words_df, new_df], ignore_index=True)

                            # Save intermediate DataFrame to the checkpoint file
                            words_df.to_csv(checkpoint_file, index=False)

                            # Update counts
                            words_per_pos[target_pos] = len(words_df[words_df['pos'] == target_pos])

                            logger.log(f"Added {len(new_rows)} {target_pos} words from WordNet")
                            logger.log(f"Current count for {target_pos}: {words_per_pos[target_pos]}/{MAX_WORDS_PER_POS}")
                    except Exception as e:
                        logger.log(f"Error processing {target_pos} from WordNet: {str(e)}", "ERROR")
    else:
        logger.log("Skipping WordNet extraction due to import error", "WARNING")

    # PHASE 2: Add common, high-quality words for certain categories
    logger.log("Adding high-quality common words for specific categories...")

    # Hand-written lists of common English words for the categories WordNet
    # does not cover. A word may appear under several categories; repeats
    # within one list are filtered out by the existing_words check below.
    common_words = {
        'DET': ['the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her', 'our', 'their', 'its',
                'each', 'every', 'some', 'any', 'no', 'all', 'both', 'either', 'neither', 'many', 'much', 'most', 'few',
                'which', 'what', 'whose', 'whichever', 'whatever'],
        'PRON': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'who', 'whom', 'which',
                'what', 'whose', 'whoever', 'whomever', 'whatever', 'myself', 'yourself', 'himself', 'herself', 'itself',
                'ourselves', 'yourselves', 'themselves', 'each', 'other', 'all', 'another', 'some', 'any', 'somebody',
                'anyone', 'everybody', 'nobody', 'something', 'anything', 'nothing', 'everything'],
        'ADP': ['in', 'on', 'at', 'by', 'with', 'from', 'to', 'for', 'of', 'about', 'against', 'between', 'through',
                'during', 'before', 'after', 'above', 'below', 'under', 'over', 'beside', 'behind', 'across', 'into',
                'towards', 'onto', 'beyond', 'along', 'amid', 'among', 'around', 'concerning', 'considering', 'despite',
                'except', 'inside', 'like', 'near', 'off', 'out', 'outside', 'past', 'regarding', 'round', 'since',
                'throughout', 'till', 'until', 'upon', 'within', 'without'],
        'CONJ': ['and', 'or', 'but', 'if', 'when', 'than', 'because', 'although', 'since', 'unless', 'while',
                'as', 'that', 'whether', 'before', 'after', 'though', 'so', 'till', 'until', 'whereas', 'for',
                'nor', 'yet', 'once', 'provided', 'supposing', 'considering', 'even', 'otherwise', 'however'],
        'NUM': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'first', 'second',
                'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'once', 'twice', 'thrice',
                'dozen', 'hundred', 'thousand', 'million', 'billion', 'trillion', 'zero', 'half', 'quarter', 'double',
                'triple', 'quadruple', 'many', 'few', 'several', 'numerous', 'countless'],
        'INTJ': ['oh', 'wow', 'hey', 'hello', 'hi', 'yes', 'no', 'okay', 'well', 'ah', 'ouch', 'ugh', 'oops',
                'thanks', 'sorry', 'please', 'goodbye', 'bye', 'eh', 'hmm', 'er', 'um', 'alas', 'hurray', 'hooray',
                'bravo', 'congratulations', 'cheers', 'yay', 'yikes', 'phew', 'whew', 'huh', 'damn', 'darn', 'gosh',
                'goodness', 'jeez', 'bingo', 'voila', 'encore', 'oops', 'psst', 'shh', 'whoa', 'ha', 'haha']
    }

    for pos, words_list in common_words.items():
        # Gather the missing words first, then extend the frame once per
        # category instead of re-copying it for every single word.
        fresh_rows = []
        for word in words_list:
            if (word, pos) not in existing_words:
                fresh_rows.append({"word": word, "pos": pos})
                existing_words.add((word, pos))

        added_count = len(fresh_rows)
        if added_count > 0:
            words_df = pd.concat([words_df, pd.DataFrame(fresh_rows)], ignore_index=True)
            logger.log(f"Added {added_count} common words for {pos}")
            words_per_pos[pos] = len(words_df[words_df['pos'] == pos])

    # Save checkpoint
    words_df.to_csv(checkpoint_file, index=False)

    # PHASE 3: Use NLTK word list for additional words
    logger.log("Extracting additional words from NLTK corpus...")

    words_df, nltk_words_added = extract_lexicon_from_nltk_words(
        words_df, existing_words, words_per_pos, MAX_WORDS_PER_POS, logger, MAX_WORKERS
    )

    if nltk_words_added > 0:
        logger.log(f"Added {nltk_words_added} words from NLTK corpus")
        words_df.to_csv(checkpoint_file, index=False)

    # PHASE 4: Fill remaining categories with synthetic words if needed
    logger.log("Checking if synthetic words are needed...")

    def generate_synthetic_words(pos, needed_count):
        """Generate placeholder entries such as ``noun00001`` for a POS category."""
        return [
            # Leading zeros keep the generated names sorted consistently
            {"word": f"{pos.lower()}{i:05d}", "pos": pos}
            for i in range(1, needed_count + 1)
        ]

    # Check each POS category and add synthetic words if needed.
    # Counts are recomputed from the frame because words_per_pos is not
    # refreshed after the NLTK corpus phase.
    for pos in TARGET_POS:
        current_count = len(words_df[words_df['pos'] == pos])
        if current_count < MAX_WORDS_PER_POS:
            needed = MAX_WORDS_PER_POS - current_count
            logger.log(f"Generating {needed} synthetic words for {pos}")

            synthetic_df = pd.DataFrame(generate_synthetic_words(pos, needed))
            words_df = pd.concat([words_df, synthetic_df], ignore_index=True)

            logger.log(f"Added {needed} synthetic words for {pos}")

    # Final counts
    for pos in TARGET_POS:
        count = len(words_df[words_df['pos'] == pos])
        logger.log(f"Final count for {pos}: {count}/{MAX_WORDS_PER_POS} words")

    # Save final output
    words_df.to_csv(output_file, index=False)
    logger.log(f"Saved {len(words_df)} words with POS tags to {output_file}")

    # Mark step as completed
    logger.complete_step("words_extraction")

    return words_df

# ====================
# TRANSLATION
# ====================

def _load_translator(model_name, batch_size, cache_dir, logger):
    """Load `model_name` and wrap it in a Hugging Face translation pipeline.

    Args:
        model_name: Hub identifier of the seq2seq checkpoint to load.
        batch_size: Batch size handed to the pipeline.
        cache_dir: Directory used to cache downloaded tokenizer/model files.
        logger: Object exposing ``log(message)``.

    Returns:
        The translation pipeline.

    Raises:
        Exception: Anything raised while importing transformers or while
            downloading/loading the tokenizer and model; the caller decides
            how to recover.
    """
    # Imported here so that a missing or broken transformers install is
    # reported through the caller's error handling instead of at module import.
    from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

    use_gpu = torch.cuda.is_available()

    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

    model_kwargs = {
        "low_cpu_mem_usage": True,  # Reduce peak CPU memory while loading
        "cache_dir": cache_dir,     # Cache model files
    }
    if use_gpu:
        logger.log("Using GPU for translation with mixed precision")
        model_kwargs["torch_dtype"] = torch.float16  # Half precision on GPU
    else:
        logger.log("Using CPU for translation")

    # NOTE: the model is deliberately loaded WITHOUT `device_map`. The pipeline
    # below places it via `device`, and the two options must not be combined.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **model_kwargs)

    return pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        device=0 if use_gpu else -1,
        batch_size=batch_size,
    )


def setup_translation_model(logger, cache_dir):
    """Set up the translation pipeline, falling back to a smaller model.

    Tries PRIMARY_MODEL with BATCH_SIZE first; if anything goes wrong, tries
    FALLBACK_MODEL with half the batch size.

    Args:
        logger: Object exposing ``log(message, level)``.
        cache_dir: Directory used to cache downloaded model files.

    Returns:
        A translation pipeline, or None when neither model could be loaded
        (callers then fall back to placeholder translations).
    """
    try:
        logger.log(f"Loading translation model: {PRIMARY_MODEL}")
        translator = _load_translator(PRIMARY_MODEL, BATCH_SIZE, cache_dir, logger)
        logger.log(f"Successfully loaded translation model: {PRIMARY_MODEL}")
        return translator
    except Exception as e:
        logger.log(f"Error loading translation model: {str(e)}", "ERROR")

    # Release whatever the failed attempt allocated before trying again.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        logger.log(f"Attempting to load fallback model: {FALLBACK_MODEL}")
        translator = _load_translator(
            FALLBACK_MODEL,
            BATCH_SIZE // 2,  # Smaller batch size for fallback
            cache_dir,
            logger,
        )
        logger.log(f"Successfully loaded fallback model: {FALLBACK_MODEL}")
        return translator
    except Exception as e:
        logger.log(f"Error loading fallback model: {str(e)}", "ERROR")
        logger.log("Will use placeholder translations instead", "WARNING")
        return None

def translate_batch(word_batch, source_lang, target_lang, translator, max_retries=3):
    """Translate a batch of words, retrying and degrading gracefully on failure.

    The whole batch is attempted up to `max_retries` times with exponential
    back-off (1s, 2s, 4s, ...). If the last attempt also fails, each word is
    translated individually and words that still fail get a placeholder of the
    form ``[<target_lang>_<word>]``.

    Args:
        word_batch: Iterable of words; each is stringified, lower-cased and
            stripped before translation.
        source_lang: Source language code passed to the translator as `src_lang`.
        target_lang: Target language code passed to the translator as `tgt_lang`.
        translator: Callable translation pipeline, or None when no model is
            available (placeholders are returned immediately in that case).
        max_retries: Number of whole-batch attempts.

    Returns:
        A ``(translations, success)`` tuple. `translations` has one entry per
        input word. `success` is True only when every entry is a real
        translation, so callers can safely cache the result.
    """
    # Normalize and clean input words
    normalized_batch = [str(word).lower().strip() for word in word_batch]
    placeholders = [f"[{target_lang}_{word}]" for word in normalized_batch]

    # No model loaded: skip the retry/back-off loop, it cannot succeed.
    if translator is None:
        return placeholders, False

    # Nothing to translate.
    if not normalized_batch:
        return [], True

    for retry in range(max_retries):
        try:
            results = translator(
                normalized_batch,
                src_lang=source_lang,
                tgt_lang=target_lang,
                max_length=128,
            )
            return [result['translation_text'].strip() for result in results], True
        except Exception:
            if retry < max_retries - 1:
                # Wait before retrying (with exponential backoff)
                time.sleep(2 ** retry)
                continue

            # Final fallback: translate one by one so a single bad word does
            # not cost the whole batch.
            translations = []
            all_translated = True
            for word, placeholder in zip(normalized_batch, placeholders):
                try:
                    result = translator(
                        word,
                        src_lang=source_lang,
                        tgt_lang=target_lang,
                        max_length=128,
                    )
                    translations.append(result[0]['translation_text'].strip())
                except Exception:
                    all_translated = False
                    translations.append(placeholder)

            return translations, all_translated

    # Only reached when max_retries <= 0: nothing was attempted.
    return placeholders, False

def translate_pos_category(pos, words_df, translator, logger, cache_manager, translations):
    """Translate every not-yet-translated word of one POS category, in batches.

    Args:
        pos: POS label selecting rows of `words_df` (matched against the
            'pos' column).
        words_df: DataFrame with at least 'word' and 'pos' columns.
        translator: Translation pipeline handed to `translate_batch`.
        logger: Object exposing ``log(message, level)``.
        cache_manager: Object exposing ``get(key, namespace)`` and
            ``set(key, value, namespace)``.
        translations: ``{language_name: {word: translation}}``; updated in
            place and also returned.

    Returns:
        The updated `translations` dictionary.
    """
    # Get words for this POS
    pos_words = words_df[words_df['pos'] == pos]
    word_list = pos_words['word'].tolist()

    # Every configured language except the source one is a translation target.
    target_langs = [name for name in LANGUAGE_CODES if name != 'english']

    # Check which words still need translation
    to_translate = [
        word for word in word_list
        if not all(word in translations.get(lang, {}) for lang in target_langs)
    ]

    if not to_translate:
        logger.log(f"All {pos} words already translated")
        return translations

    logger.log(f"Translating {len(to_translate)} {pos} words")

    # Process in batches
    batch_size = BATCH_SIZE
    batches = [to_translate[i:i+batch_size] for i in range(0, len(to_translate), batch_size)]

    for i, batch in enumerate(tqdm(batches, desc=f"Translating {pos} batches")):
        batch_translations = {}

        # Key the cache on the batch CONTENT, not its position: which words end
        # up in batch i depends on what was already translated, so an
        # index-based key could return translations belonging to other words.
        batch_digest = hashlib.sha256(
            "\n".join(str(word) for word in batch).encode("utf-8")
        ).hexdigest()[:16]

        # Translate to each target language
        for lang_name in target_langs:
            lang_code = LANGUAGE_CODES[lang_name]

            # Check cache first
            batch_key = f"{pos}_batch_{batch_digest}_{lang_name}"
            cached_results = cache_manager.get(batch_key, "translations")

            # Only trust a cache entry that lines up one-to-one with the batch.
            if isinstance(cached_results, (list, tuple)) and len(cached_results) == len(batch):
                batch_translations[lang_name] = cached_results
                logger.log(f"Used cached translations for batch {i+1}/{len(batches)} to {lang_name}")
                continue

            # Translate batch
            try:
                results, success = translate_batch(
                    batch,
                    LANGUAGE_CODES['english'],
                    lang_code,
                    translator
                )

                # Store results
                batch_translations[lang_name] = results

                # Cache successful translations
                if success:
                    cache_manager.set(batch_key, results, "translations")

                # Log progress
                logger.log(f"Translated batch {i+1}/{len(batches)} to {lang_name}")

            except Exception as e:
                logger.log(f"Error translating batch to {lang_name}: {str(e)}", "ERROR")
                # Use placeholder if translation fails
                batch_translations[lang_name] = [f"[{lang_name}_{word}]" for word in batch]

        # Update translations dictionary
        for lang, lang_results in batch_translations.items():
            lang_map = translations.setdefault(lang, {})
            for word, translated in zip(batch, lang_results):
                lang_map[word] = translated

        # Free up memory
        if i % 5 == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

    return translations

def _write_json_atomic(path, payload, **dump_kwargs):
    """Write `payload` as UTF-8 JSON to `path` without leaving a half-written file."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, **dump_kwargs)
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, path)


def _translation_rows(df, translations):
    """Build one english/french/bambara/wolof/pos record per row of `df`."""
    french = translations.get('french', {})
    bambara = translations.get('bambara', {})
    wolof = translations.get('wolof', {})

    return [
        {
            'english': word,
            'french': french.get(word, f"fr_{word}"),
            'bambara': bambara.get(word, f"bam_{word}"),
            'wolof': wolof.get(word, f"wol_{word}"),
            'pos': pos
        }
        for word, pos in zip(df['word'], df['pos'])
    ]


def translate_words(words_df, directories, logger, cache_manager):
    """Translate words to the target languages with checkpointing and caching.

    POS categories are processed one after another (to keep memory bounded);
    after each one a JSON checkpoint and an interim per-POS CSV are written.

    Args:
        words_df: DataFrame with 'word' and 'pos' columns.
        directories: Mapping with at least the "translations", "checkpoints",
            "data", "cache" and "interim" directory paths.
        logger: Object exposing ``log``, ``is_completed`` and ``complete_step``.
        cache_manager: Cache object forwarded to `translate_pos_category`.

    Returns:
        DataFrame with columns english, french, bambara, wolof, pos. When the
        step was already completed, it is read back from the saved CSV.
    """
    output_file = os.path.join(directories["translations"], "quadrilingual_words.csv")
    checkpoint_file = os.path.join(directories["checkpoints"], "translations_checkpoint.json")

    # Skip if already completed
    if logger.is_completed("word_translation") and os.path.exists(output_file):
        logger.log("Skipping word translation - already completed")
        return pd.read_csv(output_file)

    # Load high-quality translations for common words
    common_words_file = os.path.join(directories["data"], "common_words_translations.json")

    # Dictionary for curated high-quality translations
    curated_translations = {
        'DET': {
            'the': {'french': 'le/la', 'bambara': 'o', 'wolof': 'bi/gi'},
            'a': {'french': 'un/une', 'bambara': 'dɔ', 'wolof': 'ab'},
            'an': {'french': 'un/une', 'bambara': 'dɔ', 'wolof': 'ab'},
            'this': {'french': 'ce/cette', 'bambara': 'nin', 'wolof': 'bii'},
            'that': {'french': 'ce/cette', 'bambara': 'o', 'wolof': 'boobu'},
            'these': {'french': 'ces', 'bambara': 'ninnu', 'wolof': 'yii'},
            'those': {'french': 'ceux/celles', 'bambara': 'olu', 'wolof': 'yooyu'},
            'my': {'french': 'mon/ma/mes', 'bambara': 'n ka', 'wolof': 'sama'},
            'your': {'french': 'ton/ta/tes', 'bambara': 'i ka', 'wolof': 'sa'},
            'his': {'french': 'son/sa/ses', 'bambara': 'a ka', 'wolof': 'am'},
            'her': {'french': 'son/sa/ses', 'bambara': 'a ka', 'wolof': 'am'},
        },
        'PRON': {
            'i': {'french': 'je', 'bambara': 'n', 'wolof': 'man'},
            'you': {'french': 'tu/vous', 'bambara': 'i', 'wolof': 'yaw'},
            'he': {'french': 'il', 'bambara': 'a', 'wolof': 'moom'},
            'she': {'french': 'elle', 'bambara': 'a', 'wolof': 'moom'},
            'it': {'french': 'il/elle', 'bambara': 'a', 'wolof': 'moom'},
            'we': {'french': 'nous', 'bambara': 'an', 'wolof': 'nun'},
            'they': {'french': 'ils/elles', 'bambara': 'u', 'wolof': 'ñoom'},
            'me': {'french': 'me/moi', 'bambara': 'n', 'wolof': 'ma'},
            'him': {'french': 'lui', 'bambara': 'a', 'wolof': 'ko'},
            'her': {'french': 'elle/lui', 'bambara': 'a', 'wolof': 'ko'},
        },
        'NOUN': {
            'man': {'french': 'homme', 'bambara': 'cɛ', 'wolof': 'góor'},
            'woman': {'french': 'femme', 'bambara': 'muso', 'wolof': 'jigéen'},
            'child': {'french': 'enfant', 'bambara': 'den', 'wolof': 'xale'},
            'house': {'french': 'maison', 'bambara': 'so', 'wolof': 'kër'},
            'water': {'french': 'eau', 'bambara': 'ji', 'wolof': 'ndox'},
            'food': {'french': 'nourriture', 'bambara': 'dumuni', 'wolof': 'ñam'},
            'day': {'french': 'jour', 'bambara': 'don', 'wolof': 'bés'},
            'night': {'french': 'nuit', 'bambara': 'su', 'wolof': 'guddi'},
            'sun': {'french': 'soleil', 'bambara': 'tere', 'wolof': 'jant'},
            'moon': {'french': 'lune', 'bambara': 'kalo', 'wolof': 'weer'},
        },
        'VERB': {
            'go': {'french': 'aller', 'bambara': 'taa', 'wolof': 'dem'},
            'come': {'french': 'venir', 'bambara': 'na', 'wolof': 'ñëw'},
            'eat': {'french': 'manger', 'bambara': 'dumu', 'wolof': 'lekk'},
            'drink': {'french': 'boire', 'bambara': 'min', 'wolof': 'naan'},
            'see': {'french': 'voir', 'bambara': 'ye', 'wolof': 'gis'},
            'hear': {'french': 'entendre', 'bambara': 'mɛn', 'wolof': 'dégg'},
            'speak': {'french': 'parler', 'bambara': 'kuma', 'wolof': 'wax'},
            'walk': {'french': 'marcher', 'bambara': 'taama', 'wolof': 'dox'},
            'sleep': {'french': 'dormir', 'bambara': 'sunɔgɔ', 'wolof': 'nelaw'},
            'work': {'french': 'travailler', 'bambara': 'baara', 'wolof': 'liggéey'},
        },
        'ADJ': {
            'good': {'french': 'bon', 'bambara': 'ɲuman', 'wolof': 'baax'},
            'bad': {'french': 'mauvais', 'bambara': 'juguman', 'wolof': 'bon'},
            'big': {'french': 'grand', 'bambara': 'bon', 'wolof': 'mag'},
            'small': {'french': 'petit', 'bambara': 'fitini', 'wolof': 'ndaw'},
            'hot': {'french': 'chaud', 'bambara': 'goni', 'wolof': 'tàng'},
            'cold': {'french': 'froid', 'bambara': 'suma', 'wolof': 'sedd'},
            'new': {'french': 'nouveau', 'bambara': 'kura', 'wolof': 'bees'},
            'old': {'french': 'vieux', 'bambara': 'kɔrɔ', 'wolof': 'màggat'},
        }
    }

    # Save curated translations for reference. The file holds non-ASCII text,
    # so the encoding is pinned instead of relying on the platform default.
    with open(common_words_file, 'w', encoding='utf-8') as f:
        json.dump(curated_translations, f, ensure_ascii=False, indent=2)

    # Check for checkpoint
    translations = {'french': {}, 'bambara': {}, 'wolof': {}}
    if os.path.exists(checkpoint_file):
        try:
            with open(checkpoint_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError("checkpoint does not contain a JSON object")
            translations = loaded
            logger.log(f"Loaded translations for {len(translations.get('french', {}))} words from checkpoint")
        except Exception as e:
            logger.log(f"Error loading checkpoint: {str(e)}", "ERROR")
            translations = {'french': {}, 'bambara': {}, 'wolof': {}}

    # Add curated translations to translation dictionary.
    # NOTE: translations are keyed by word only, so a word curated under two
    # POS categories (e.g. 'her') keeps the value of the later category.
    for words in curated_translations.values():
        for word, trans in words.items():
            for lang, value in trans.items():
                translations.setdefault(lang, {})[word] = value

    # Setup translation model
    translator = setup_translation_model(logger, directories["cache"])
    if not translator:
        logger.log("Using placeholder translations due to model loading failure", "WARNING")

    # Translate one POS category at a time to avoid memory issues
    for pos in TARGET_POS:
        try:
            translations = translate_pos_category(
                pos,
                words_df,
                translator,
                logger,
                cache_manager,
                translations
            )
            logger.log(f"Completed translation for {pos}")

            # Save checkpoint after each POS
            _write_json_atomic(checkpoint_file, translations)

            # Generate interim CSV for this POS
            interim_file = os.path.join(directories["interim"], f"{pos}_translations.csv")
            pos_words = words_df[words_df['pos'] == pos]
            pos_df = pd.DataFrame(_translation_rows(pos_words, translations))
            pos_df.to_csv(interim_file, index=False)

        except Exception as e:
            logger.log(f"Error translating {pos}: {str(e)}", "ERROR")

        # Clean up memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # Create final quadrilingual DataFrame
    logger.log("Creating final quadrilingual DataFrame")

    quad_df = pd.DataFrame(_translation_rows(words_df, translations))
    quad_df.to_csv(output_file, index=False)

    logger.log(f"Saved {len(quad_df)} translations to {output_file}")
    logger.complete_step("word_translation")

    return quad_df

# ====================
# POS-ALIGNED CSV GENERATION
# ====================

def generate_pos_aligned_csv(quad_df, directories, logger):
    """Write the POS-aligned CSV: four language columns per POS category.

    The file starts with two header rows (POS names, then ENG/FR/BAM/WOL per
    POS). Data row ``i`` holds the i-th word of every POS category, padded
    with empty strings for categories that have fewer words. Rows are written
    in index order, chunk by chunk, and every tenth chunk also gets a
    100-row sample file in the "samples" directory.

    Args:
        quad_df: DataFrame with columns english, french, bambara, wolof, pos.
        directories: Mapping with at least the "final" and "samples" paths.
        logger: Object exposing ``log``, ``is_completed``, ``complete_step``
            and ``track_progress``.

    Returns:
        None. The step is marked complete only if every chunk was written.
    """
    output_file = os.path.join(directories["final"], "pos_aligned_quadrilingual.csv")

    # Skip if already completed
    if logger.is_completed("pos_aligned_csv") and os.path.exists(output_file):
        logger.log("Skipping POS-aligned CSV generation - already completed")
        return

    logger.log("Generating POS-aligned CSV file")
    display_system_info()

    languages = ("english", "french", "bambara", "wolof")

    # Group words by POS category (categories without rows are left out)
    pos_dict = {}
    for pos in TARGET_POS:
        pos_words = quad_df[quad_df["pos"] == pos]
        if len(pos_words) > 0:
            pos_dict[pos] = {lang: pos_words[lang].tolist() for lang in languages}

    # Each POS category spans 4 columns (one per language)
    pos_header = []
    lang_header = []
    for pos in pos_dict:
        pos_header.extend([pos, "", "", ""])
        lang_header.extend(["ENG", "FR", "BAM", "WOL"])

    # Find maximum number of words in any category (0 when there is no data)
    max_words = max((len(words["english"]) for words in pos_dict.values()), default=0)
    if max_words == 0:
        logger.log("No words found for any target POS - writing headers only", "WARNING")
    logger.log(f"Writing {max_words} rows of data to CSV")

    # Process in chunks
    chunk_size = 5000

    def process_chunk(start_idx, end_idx):
        """Build the CSV rows for indices [start_idx, end_idx)."""
        rows = []
        for i in range(start_idx, end_idx):
            row = []
            for words in pos_dict.values():
                for lang in languages:
                    column = words[lang]
                    # Pad with an empty cell once this category runs out of words
                    row.append(column[i] if i < len(column) else "")
            rows.append(row)
        return rows

    chunk_starts = range(0, max_words, chunk_size)
    total_chunks = len(chunk_starts)
    failed_chunks = 0

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        # Write headers
        writer.writerow(pos_header)
        writer.writerow(lang_header)

        # Chunks are built and written strictly in order so that the row order
        # of the file is deterministic. Row building is pure-Python work, so
        # a thread pool would not speed it up anyway.
        for i, start_idx in enumerate(tqdm(chunk_starts,
                                           total=total_chunks,
                                           desc="Writing CSV chunks")):
            end_idx = min(start_idx + chunk_size, max_words)
            try:
                rows = process_chunk(start_idx, end_idx)
                writer.writerows(rows)

                # Log progress (end_idx never exceeds max_words)
                logger.track_progress("CSV generation", end_idx, max_words,
                                      f"Chunk {i + 1}/{total_chunks}")

                # Save sample of this chunk
                if i % 10 == 0:
                    sample_file = os.path.join(directories["samples"], f"chunk_{i}_sample.csv")
                    with open(sample_file, "w", newline="", encoding="utf-8") as sample_f:
                        sample_writer = csv.writer(sample_f)
                        sample_writer.writerow(pos_header)
                        sample_writer.writerow(lang_header)
                        sample_writer.writerows(rows[:100])  # First 100 rows as sample
            except Exception as e:
                failed_chunks += 1
                logger.log(f"Error processing chunk {i}: {str(e)}", "ERROR")

    if failed_chunks:
        # Leave the step unmarked so a rerun regenerates the incomplete file.
        logger.log(f"POS-aligned CSV is incomplete: {failed_chunks} chunk(s) failed", "ERROR")
        return

    logger.log(f"Generated POS-aligned CSV with {max_words} rows")
    logger.complete_step("pos_aligned_csv")

# ====================
# MAIN PIPELINE
# ====================

def run_pipeline():
    """Execute every pipeline stage in order and report the outcome.

    Stages: NLTK resource download, transformers availability check, directory
    / logger / cache setup, WordNet configuration (sets the module-level
    WN_POS_MAP), word extraction, translation and POS-aligned CSV generation.
    Returns early (None) when transformers cannot be imported; failures in the
    three main steps are logged and a traceback is printed.
    """
    banner = f"{Fore.CYAN}{'=' * 70}{Style.RESET_ALL}"

    print(banner)
    print(f"{Fore.CYAN}STARTING MULTILINGUAL DATASET GENERATION PIPELINE{Style.RESET_ALL}")
    print(banner)

    started_at = time.time()

    # Step 0: fetch and verify the NLTK resources before anything else
    print(f"{Fore.CYAN}STEP 0: DOWNLOADING AND VERIFYING NLTK RESOURCES{Style.RESET_ALL}")
    nltk_status = download_nltk_data()

    # transformers is mandatory: bail out right away if it cannot be imported
    try:
        from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
        print(f"{Fore.GREEN}✓ Successfully imported transformers library{Style.RESET_ALL}")
    except Exception as e:
        print(f"{Fore.RED}✗ Error importing transformers: {str(e)}{Style.RESET_ALL}")
        print(f"{Fore.RED}This is critical - cannot continue without transformers library{Style.RESET_ALL}")
        return

    # Project directories, logger and cache
    print(f"{Fore.CYAN}SETTING UP PROJECT DIRECTORIES{Style.RESET_ALL}")
    base_dir, directories = setup_drive_directories()

    logger = Logger(directories["logs"])
    logger.log(f"Pipeline started using base directory: {base_dir}")

    cache_manager = CacheManager(directories["cache"])

    # Publish the WordNet POS mapping (empty when WordNet is unusable)
    global WN_POS_MAP
    if not nltk_status.get("wordnet", False):
        logger.log("WordNet not available, will use alternative word sources", "WARNING")
        WN_POS_MAP = {}
    else:
        try:
            from nltk.corpus import wordnet as wn
            WN_POS_MAP = {
                wn.NOUN: 'NOUN',
                wn.VERB: 'VERB',
                wn.ADJ: 'ADJ',
                wn.ADV: 'ADV'
            }
            logger.log("Successfully configured WordNet")
        except Exception as e:
            logger.log(f"Error configuring WordNet: {str(e)}", "ERROR")
            WN_POS_MAP = {}

    try:
        # Step 1: word extraction
        logger.log("STEP 1: EXTRACTING WORDS WITH POS TAGS")
        words_df = extract_words_with_pos_tags(directories, logger, cache_manager)
        display_system_info()

        # Step 2: translation
        logger.log("STEP 2: TRANSLATING WORDS")
        quad_df = translate_words(words_df, directories, logger, cache_manager)
        display_system_info()

        # Step 3: POS-aligned CSV
        logger.log("STEP 3: GENERATING POS-ALIGNED CSV")
        generate_pos_aligned_csv(quad_df, directories, logger)
        display_system_info()

        # Break the elapsed wall-clock time into h / m / s for the final log line
        elapsed = time.time() - started_at
        hours, leftover = divmod(elapsed, 3600)
        minutes, seconds = divmod(leftover, 60)

        logger.log(f"PIPELINE COMPLETED SUCCESSFULLY in {int(hours)}h {int(minutes)}m {int(seconds)}s")
        logger.summary()

    except Exception as e:
        logger.log(f"Pipeline failed: {str(e)}", "ERROR")
        import traceback
        traceback.print_exc()
        logger.log("See error log for details", "ERROR")

    print(banner)
    print(f"{Fore.CYAN}PIPELINE EXECUTION COMPLETED{Style.RESET_ALL}")
    print(banner)

# Entry point: run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_pipeline()
