# Second attempt: using the small model 'facebook/nllb-200-distilled-600M'

In [14]:
import pandas as pd
import torch
import re
import time
import os
import sys
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from IPython.display import display, HTML, clear_output
from tqdm.notebook import tqdm
import ipywidgets as widgets


In [17]:
"""
Sinhala/Tamil to English Translation Program - Jupyter Notebook Version

This notebook translates Sinhala or Tamil text to English using the facebook/nllb-200-distilled-600M model.
It processes CSV files containing Sinhala/Tamil comments and saves the translations to a new CSV file.

Requirements:
    !pip install transformers pandas torch sentencepiece sacremoses huggingface_hub tqdm ipywidgets
"""

import pandas as pd
import torch
import re
import time
import os
import sys
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from IPython.display import display, HTML, clear_output
from tqdm.notebook import tqdm
import ipywidgets as widgets


# Cell 1: Load and explore CSV function
def load_and_explore_csv(file_path):
    """
    Read a CSV file and show a quick overview of its contents.

    Prints the shape and column names, then displays the first rows and the
    per-column count of missing values.

    Args:
        file_path (str): Path to the CSV file containing text to translate

    Returns:
        DataFrame: The loaded data, or None if anything went wrong (the error
        is printed rather than raised)
    """
    try:
        # Adjust the read_csv parameters here if your file uses another layout
        frame = pd.read_csv(file_path, encoding='utf-8')

        print("CSV file loaded successfully.")
        print(f"Shape of the data: {frame.shape}")
        print(f"Column names: {frame.columns.tolist()}")

        # Each preview is a heading plus a lazily built summary, so the summary
        # is only computed right before it is shown.
        previews = (
            ("<h3>First few rows of the data:</h3>", frame.head),
            ("<h3>Missing values in each column:</h3>", lambda: frame.isnull().sum()),
        )
        for heading, build_summary in previews:
            display(HTML(heading))
            display(build_summary())
    except Exception as exc:
        print(f"Error loading CSV file: {exc}")
        return None

    return frame


# Cell 2: Setup translation pipeline
def setup_translation_pipeline():
    """
    Set up a translation pipeline with the NLLB model.

    The pipeline is first created directly from the model name. If that
    fails, the model file is downloaded explicitly, tokenizer and model are
    loaded separately, and the pipeline is assembled from those components.
    Progress and errors are shown through notebook widgets.

    Device selection:
        * Apple Silicon (MPS) detected: the pipeline is created with
          ``device=-1`` (CPU) in float32, because the pipeline is not given
          the MPS device in this notebook.
        * CUDA available: CUDA device in float16.
        * Otherwise: CPU in float32.

    Returns:
        tuple: (translator, device) - the loaded pipeline and the torch device
        it was created for, or (None, None) if setup failed
    """
    # The widgets are created inside the try block. Start from None so the
    # outer error handler can still report a failure that happens before they
    # exist instead of raising UnboundLocalError itself.
    progress_text = None
    progress_bar = None

    try:
        # Define model name
        model_name = "facebook/nllb-200-distilled-600M"

        # Progress indicator
        progress_text = widgets.HTML("Loading model and tokenizer...")
        progress_bar = widgets.IntProgress(min=0, max=4, value=0)
        display(widgets.VBox([progress_text, progress_bar]))

        # Configure device.
        # `device` is where the pipeline really runs (and what is reported and
        # returned); `pipeline_device` is the value handed to pipeline().
        if torch.backends.mps.is_available():
            # The pipeline is kept off MPS (device=-1), so report CPU rather
            # than claiming MPS acceleration that is not actually used.
            device = torch.device("cpu")
            pipeline_device = -1
            torch_dtype = torch.float32
            progress_text.value = "Apple Silicon (MPS) detected; running the pipeline on CPU"
        elif torch.cuda.is_available():
            device = torch.device("cuda")
            pipeline_device = device
            torch_dtype = torch.float16
            progress_text.value = "Using CUDA acceleration"
        else:
            device = torch.device("cpu")
            pipeline_device = device
            torch_dtype = torch.float32
            progress_text.value = "Using CPU (no GPU acceleration available)"

        progress_bar.value += 1

        # Configure environment for better downloads
        # NOTE(review): huggingface_hub may read these variables at import
        # time, in which case setting them here has no effect - confirm.
        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
        os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

        try:
            # Try creating a translation pipeline directly
            progress_text.value = "Creating translation pipeline..."
            progress_bar.value += 1

            translator = pipeline(
                "translation",
                model=model_name,
                device=pipeline_device,
                torch_dtype=torch_dtype
            )

            progress_bar.value += 1
            progress_text.value = "Translation pipeline created successfully!"
            progress_bar.value = 4

        except Exception as pipeline_error:
            progress_text.value = (
                f"Pipeline creation failed: {pipeline_error}<br>"
                "Attempting manual download and setup..."
            )

            try:
                # Download model files manually if needed
                hf_hub_download(
                    repo_id=model_name,
                    filename="model.safetensors",
                    resume_download=True
                )

                progress_bar.value += 1
                progress_text.value = "Loading tokenizer..."

                # Load tokenizer and model separately
                tokenizer = AutoTokenizer.from_pretrained(model_name)

                progress_bar.value += 1
                progress_text.value = "Loading model..."

                model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype
                )

                # Move the model to the device the pipeline will run on
                model = model.to(device)

                progress_bar.value += 1
                progress_text.value = "Creating pipeline with loaded components..."

                # Create pipeline with loaded components
                translator = pipeline(
                    "translation",
                    model=model,
                    tokenizer=tokenizer,
                    device=pipeline_device
                )

                progress_bar.value = 4
                progress_text.value = "Translation pipeline successfully created!"

            except Exception as fallback_error:
                progress_bar.value = 4
                # NOTE(review): the previous tip 2 suggested
                # "facebook/nllb-200-distilled-300M", which does not appear to
                # be a published checkpoint - confirm against the Hub.
                progress_text.value = (
                    f"<span style='color:red'>Failed to create translation pipeline: {fallback_error}</span><br>"
                    "Troubleshooting tips:<br>"
                    "1. Check internet connection<br>"
                    "2. Make sure there is enough free disk space and memory for the model<br>"
                    "3. Manually download from: https://huggingface.co/facebook/nllb-200-distilled-600M"
                )
                return None, None

        display(HTML("<div style='background:#d4edda;padding:10px;border-radius:5px'>" +
                     f"Translation pipeline successfully created and running on {device}</div>"))
        return translator, device

    except Exception as setup_error:
        if progress_text is not None:
            progress_text.value = (
                f"<span style='color:red'>Error setting up translation pipeline: {setup_error}</span>"
            )
        else:
            # The failure happened before the progress widgets were created
            print(f"Error setting up translation pipeline: {setup_error}")
        if progress_bar is not None:
            progress_bar.value = 4
        return None, None


# Cell 3: Language detection function
# Character classes used by detect_language, compiled once at import time.
_ENGLISH_LETTER_RE = re.compile(r'[a-zA-Z]')
_SINHALA_CHAR_RE = re.compile(r'[\u0D80-\u0DFF]')  # Sinhala Unicode block
_TAMIL_CHAR_RE = re.compile(r'[\u0B80-\u0BFF]')    # Tamil Unicode block


def detect_language(text):
    """
    Detect if the text is Sinhala, Tamil, or primarily English.

    Only ASCII letters and characters from the Sinhala and Tamil Unicode
    blocks are counted; digits, punctuation, emoji and other scripts are
    ignored. Text counts as English when more than 70% of the counted
    characters are ASCII letters. Otherwise the script with more characters
    wins, with ties going to Tamil.

    Args:
        text (str): The text to check

    Returns:
        str: 'sin' for Sinhala, 'tam' for Tamil, 'eng' for English, or None
        for empty/invalid input (None, NaN, non-string values, or text with
        no counted characters)
    """
    # Anything that is not a non-empty string (None, NaN, numbers, ...) is
    # "invalid" per the contract above; the regexes below would raise on it.
    if not isinstance(text, str) or not text:
        return None

    english_chars = len(_ENGLISH_LETTER_RE.findall(text))
    sinhala_chars = len(_SINHALA_CHAR_RE.findall(text))
    tamil_chars = len(_TAMIL_CHAR_RE.findall(text))

    # Total of the characters that are relevant for the decision
    total_relevant_chars = english_chars + sinhala_chars + tamil_chars
    if total_relevant_chars == 0:
        return None

    if english_chars / total_relevant_chars > 0.7:
        return 'eng'

    # Not English-dominant, so at least one Sinhala or Tamil character is
    # present. Comparing raw counts is equivalent to comparing percentages.
    return 'sin' if sinhala_chars > tamil_chars else 'tam'


# Cell 4: Translation function
def translate_text(text, translator, src_lang=None):
    """
    Translate a single piece of text to English with the NLLB pipeline.

    Args:
        text (str): The text to translate
        translator: The translation pipeline (called with the text plus
            src_lang, tgt_lang and max_length keyword arguments)
        src_lang (str, optional): Source language code ('sin', 'tam' or 'eng').
            If None, it will be auto-detected

    Returns:
        str: The English text. Empty string for empty/missing input or when no
        language can be detected; the input itself when it is already English;
        a string starting with "ERROR: " when translation raised an exception
    """
    try:
        # Nothing to do for empty or missing input
        if not text or pd.isna(text):
            return ""

        # Fall back to auto-detection only when the caller gave no language
        language = detect_language(text) if src_lang is None else src_lang
        if language is None:
            return ""  # Empty or invalid text

        # English needs no translation
        if language == 'eng':
            return text

        # Our short codes -> NLLB language codes
        nllb_codes = {
            'sin': 'sin_Sinh',  # Sinhala
            'tam': 'tam_Taml',  # Tamil
            'eng': 'eng_Latn'   # English
        }
        source_code = nllb_codes.get(language)
        if not source_code:
            print(f"Warning: Unknown language code '{language}'")
            source_code = 'sin_Sinh'  # Default to Sinhala

        # Allow at least 1000, or twice the input length for long inputs,
        # to avoid truncated output
        output = translator(
            text,
            src_lang=source_code,
            tgt_lang="eng_Latn",   # English
            max_length=max(1000, len(text) * 2)
        )

        # The pipeline normally returns [{'translation_text': ...}];
        # anything else is stringified as-is
        if not (isinstance(output, list) and len(output) > 0):
            return str(output)

        first = output[0]
        if isinstance(first, dict) and 'translation_text' in first:
            return first['translation_text']
        return str(first)

    except Exception as exc:
        print(f"Error translating text: {exc}")
        print(f"Problematic text: {text}")
        return f"ERROR: {str(exc)}"


# Cell 5: Smart translation function with interactive progress bar
def smart_translate_dataframe(df, text_column, translator):
    """
    Translate every text in one column of a DataFrame, with live progress.

    For each row the language is detected with detect_language. English text
    is copied unchanged, Sinhala/Tamil text is translated with translate_text,
    and rows with no detectable language (including missing values) get an
    empty translation. A progress bar, running statistics and occasional
    sample translations are displayed while the loop runs.

    Rows are processed by position, so the DataFrame may have any index
    (filtered, non-integer, ...).

    Args:
        df (DataFrame): The DataFrame containing the text to translate
        text_column (str): The name of the column containing text
        translator: The translation pipeline

    Returns:
        DataFrame: A copy of df with two extra columns: 'translated_text' and
        'detected_language' ('sin', 'tam', 'eng' or 'unknown')
    """
    # Work on a copy so the caller's DataFrame is not modified
    result_df = df.copy()
    total_rows = len(df)

    # Create widgets for progress tracking
    progress_bar = tqdm(total=total_rows)
    status_output = widgets.Output()
    display(status_output)

    # Initialize stats panel
    stats_html = widgets.HTML(
        "<h4>Statistics</h4>" +
        "<p>Languages: Sinhala: 0, Tamil: 0, English: 0, Unknown: 0</p>" +
        "<p>Speed: 0 rows/sec, Elapsed: 0 min, ETA: 0 min</p>"
    )
    display(stats_html)

    # Sample panel to show examples of translations
    sample_output = widgets.Output()
    display(HTML("<h4>Sample Translations</h4>"))
    display(sample_output)

    # Track time
    start_time = time.time()
    last_update_time = start_time
    last_update_rows = 0
    batch_size = 5  # Update stats every 5 translations

    # Counters for statistics
    language_counts = {'sin': 0, 'tam': 0, 'eng': 0, 'unknown': 0}

    # Results are collected in row order and attached as columns at the end,
    # which keeps the loop independent of the index labels.
    translations = []
    detected_languages = []

    texts = df[text_column]
    for position, (value, is_missing) in enumerate(zip(texts, texts.isna())):
        # str(NaN) would be the text "nan", which looks like English;
        # treat missing values as empty text instead.
        text = "" if is_missing else str(value)

        detected_lang = detect_language(text)
        language_label = detected_lang if detected_lang else 'unknown'
        language_counts[language_label] += 1
        detected_languages.append(language_label)

        if detected_lang == 'eng':
            # Already English, no need to translate
            translated = text
        elif detected_lang:
            # Translate from detected language to English
            translated = translate_text(text, translator, detected_lang)
        else:
            # Cannot translate
            translated = ""
        translations.append(translated)

        rows_done = position + 1

        # Show a sample of translations occasionally
        if position % 10 == 0 or rows_done == total_rows:
            with sample_output:
                clear_output(wait=True)
                print(f"Row {rows_done}/{total_rows}:")
                print(f"Original ({detected_lang}): {text[:100]}{'...' if len(text) > 100 else ''}")
                print(f"Translated: {translated[:100]}{'...' if len(translated) > 100 else ''}")
                print("-" * 80)

        # Update progress bar
        progress_bar.update(1)

        # Show stats every batch_size rows and after the last row
        if rows_done % batch_size == 0 or rows_done == total_rows:
            current_time = time.time()
            elapsed = current_time - start_time
            batch_elapsed = current_time - last_update_time
            last_update_time = current_time

            # The final batch can be smaller than batch_size, so use the
            # number of rows actually processed since the last update.
            rows_in_batch = rows_done - last_update_rows
            last_update_rows = rows_done
            rows_per_sec = rows_in_batch / batch_elapsed if batch_elapsed > 0 else 0

            # Estimate remaining time
            remaining_rows = total_rows - rows_done
            eta_seconds = remaining_rows / rows_per_sec if rows_per_sec > 0 else 0
            eta_min = eta_seconds / 60

            # Update statistics display
            stats_html.value = (
                "<h4>Statistics</h4>" +
                f"<p>Languages: Sinhala: {language_counts['sin']}, " +
                f"Tamil: {language_counts['tam']}, " +
                f"English: {language_counts['eng']}, " +
                f"Unknown: {language_counts['unknown']}</p>" +
                f"<p>Speed: {rows_per_sec:.2f} rows/sec, " +
                f"Elapsed: {elapsed/60:.2f} min, " +
                f"ETA: {eta_min:.2f} min</p>"
            )

    result_df['translated_text'] = translations
    result_df['detected_language'] = detected_languages

    # Final statistics update
    with status_output:
        clear_output(wait=True)
        elapsed_min = (time.time() - start_time) / 60
        print(f"Translation completed in {elapsed_min:.2f} minutes")

    progress_bar.close()

    return result_df


# Cell 6: Validate translations
def _cell_text(value):
    """Return a DataFrame cell as text, mapping missing values (None/NaN/NA) to ''."""
    if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def _translation_issue(original, translated):
    """Return a short description of what looks wrong with a translation, or None."""
    # Empty translation of a non-empty original
    if original and not translated:
        return 'Empty translation'
    # A very short translation of a long original could indicate truncation
    # or an incomplete translation
    if len(original) > 50 and len(translated) < 10:
        return 'Suspiciously short translation'
    # translate_text reports failures as "ERROR: ..." strings
    if "ERROR:" in translated:
        return 'Contains error message'
    return None


def validate_translations(df, original_column, translated_column):
    """
    Check translations for obvious problems and collect them for review.

    Rows whose 'detected_language' is 'eng' are skipped (their translation is
    the original text). Every other row is flagged when its translation is
    empty, suspiciously short for a long original, or contains an error
    message. Missing cells are treated as empty text rather than as the
    string "nan".

    Args:
        df (DataFrame): The DataFrame containing original and translated text
        original_column (str): The name of the column containing original text
        translated_column (str): The name of the column containing translated text

    Returns:
        DataFrame: One row per problematic translation with the columns
        'row_index' (index label in df), 'original', 'translated' and 'issue';
        empty when nothing was flagged
    """
    display(HTML("<h3>Validating Translations</h3>"))

    # List to collect problematic rows
    issues = []

    # Progress bar for validation
    progress_bar = tqdm(total=len(df))

    for i, row in df.iterrows():
        # .get() keeps validation usable on frames without a language column
        if row.get('detected_language') != 'eng':
            original = _cell_text(row[original_column])
            translated = _cell_text(row[translated_column])

            issue = _translation_issue(original, translated)
            if issue is not None:
                issues.append({
                    'row_index': i,
                    'original': original,
                    'translated': translated,
                    'issue': issue
                })

        progress_bar.update(1)

    progress_bar.close()

    if issues:
        validation_df = pd.DataFrame(issues)
        display(HTML(f"<div style='background:#fff3cd;padding:10px;border-radius:5px'>Found {len(issues)} potentially problematic translations</div>"))
        display(validation_df)
    else:
        # Keep the expected columns so callers can rely on the layout
        validation_df = pd.DataFrame(columns=['row_index', 'original', 'translated', 'issue'])
        display(HTML("<div style='background:#d4edda;padding:10px;border-radius:5px'>No translation issues found</div>"))

    return validation_df


# Cell 7: Retry failed translations
def _with_min_max_length(translator, min_max_length):
    """Wrap translator so every call uses a max_length of at least min_max_length."""
    def call(text, **kwargs):
        requested = kwargs.get('max_length') or 0
        kwargs['max_length'] = max(requested, min_max_length)
        return translator(text, **kwargs)
    return call


def retry_failed_translations(df, validation_df, text_column, translator):
    """
    Retry the translations that validation flagged as problematic.

    Each flagged row is translated again through translate_text, this time
    with a more generous max_length (at least 3000, or three times the input
    length). A row is only updated when the retry produced a result that is
    not an error message.

    Args:
        df (DataFrame): The original DataFrame with translations
        validation_df (DataFrame): The DataFrame containing failed translations
            (columns 'row_index' and 'original' are used)
        text_column (str): The name of the column containing original text
            (currently unused; the text comes from validation_df)
        translator: The translation pipeline

    Returns:
        DataFrame: A copy of df with the successfully retried translations
    """
    # Create a copy to avoid modifying the original
    result_df = df.copy()

    if validation_df.empty:
        display(HTML("<div style='background:#d4edda;padding:10px;border-radius:5px'>No failed translations to retry</div>"))
        return result_df

    display(HTML(f"<h3>Retrying {len(validation_df)} Failed Translations</h3>"))

    # Progress bar for retries
    progress_bar = tqdm(total=len(validation_df))
    retry_output = widgets.Output()
    display(retry_output)

    for _, row in validation_df.iterrows():
        row_index = row['row_index']
        original_text = row['original']

        # Get the detected language from the original dataframe
        detected_lang = df.at[row_index, 'detected_language']

        try:
            # More generous output budget for retries. translate_text chooses
            # its own max_length, so the larger value is enforced by wrapping
            # the pipeline it calls.
            retry_max_length = max(3000, len(original_text) * 3)
            generous_translator = _with_min_max_length(translator, retry_max_length)

            translated = translate_text(original_text, generous_translator, detected_lang)

            # translate_text reports failures as "ERROR: ..." strings instead
            # of raising, so a repeated failure has to be detected here.
            if isinstance(translated, str) and translated.startswith("ERROR:"):
                with retry_output:
                    clear_output(wait=True)
                    print(f"Still failed to translate row {row_index}: {translated}")
            else:
                result_df.at[row_index, 'translated_text'] = translated

                with retry_output:
                    clear_output(wait=True)
                    print(f"Successfully retried translation for row {row_index}")
                    print(f"Original: {original_text[:100]}{'...' if len(original_text) > 100 else ''}")
                    print(f"New translation: {translated[:100]}{'...' if len(translated) > 100 else ''}")

        except Exception as e:
            with retry_output:
                clear_output(wait=True)
                print(f"Still failed to translate row {row_index}: {e}")

        progress_bar.update(1)

    progress_bar.close()

    return result_df


# Cell 8: Save translated CSV
def save_translated_csv(df, output_path, include_original=True, text_column=None):
    """
    Save the DataFrame with translations to a CSV file.

    The file is written as UTF-8 with a BOM ('utf-8-sig') so that Excel
    detects the encoding, and without the index column.

    Args:
        df (DataFrame): The DataFrame containing the original text and translations
        output_path (str): Path to save the output CSV file
        include_original (bool): Whether to include the original text in the output
        text_column (str, optional): Name of the column holding the original
            text. Only used when include_original is False, in which case that
            column is left out of the saved file. Without it the function
            cannot know which column to drop and saves all columns unchanged.

    Returns:
        bool: True if saved successfully, False otherwise
    """
    try:
        # Create a copy to avoid modifying the original
        output_df = df.copy()

        drop_original = not include_original and text_column is not None
        if (drop_original
                and text_column != 'translated_text'
                and text_column in output_df.columns):
            output_df = output_df.drop(columns=[text_column])

        if include_original or drop_original:
            # Put the translation in the last column for readability
            cols = [col for col in output_df.columns if col != 'translated_text'] + ['translated_text']
            output_df = output_df[cols]

        # utf-8-sig includes a BOM for Excel compatibility
        output_df.to_csv(output_path, index=False, encoding='utf-8-sig')

        display(HTML("<div style='background:#d4edda;padding:10px;border-radius:5px'>" +
                     f"Translated data saved to {output_path}<br>" +
                     f"Total rows saved: {len(output_df)}</div>"))

        return True

    except Exception as e:
        display(HTML("<div style='background:#f8d7da;padding:10px;border-radius:5px'>" +
                     f"Error saving translated CSV: {e}</div>"))
        return False


# Cell 9: Example section
def run_example_in_notebook():
    """
    Run the whole workflow on three built-in sample comments.

    Writes the samples to 'sample_comments.csv', sets up the pipeline,
    translates, validates, retries flagged rows, saves the result to
    'translated_sample_comments.csv' and displays it. Stops early if the
    pipeline cannot be created.
    """
    # Built-in examples: Sinhala script, romanized Sinhala mixed with English,
    # and Tamil script
    example_comments = [
        'අසරන මනුස්සයෙක්ගෙ දවසම කාලා නිලිය sooting යන්නෙත් pikme weel එකක ඒකට දෙන්න මුංට සල්ලිත් නැ සල්ලි නැත්තම් atm එකකින් බැහැල කාඩ් එකෙන් සල්ලි අරන් දෙන්න තිබ්බනෙ මෙයා රටෙ මිනිස්සුන්ගෙන් මදිවට වීල් වල යන අයගෙනුත් කුණු බැනුම් අහගන්නවා',
        'PickMe this is a serious nonsense aththatama gaman cancel karana ekai gewanna wei kiala nathnm complaint ekak dammoth ape numbers walata call aran banina ekai lata kisima safety ekak naha complaint ekak daaddi ape address ekath ekkalu yanne U all have to update ur system and choose professional quality drivers n give them strict warnings about the service u all providing',
        'நெருக்கடியான சூழ்நிலையில். .. .. .. .. .. .. .. .. .. .. .. .. .. நெருக்கடியான சூழ்நிலையில். .. .. .. நெருக்கடியான சூழ்நிலையில். .. .. .. . நெருக்கடியான சூழ்நிலையில். .. .. .. .. .. .. .. .. .. .. . நெருக்கடிகள். .. .. .. .. .. .. .. .. .. .. . நெருக்கடியான சூழ்நிலையில். .. .. .. நெருக்கடியான சூழ்நிலையில். .. .. .. .. .. .. .. .. .. நெருக்கடியான சூழ்நிலையில். .. .. .. .. .. .. .. .. .. .',
    ]
    examples_df = pd.DataFrame({'comment': example_comments})

    # Keep a CSV copy of the samples next to the notebook
    examples_csv = 'sample_comments.csv'
    examples_df.to_csv(examples_csv, index=False)
    display(HTML("<div style='background:#d4edda;padding:10px;border-radius:5px'>Sample CSV created with provided examples</div>"))

    display(HTML("<h3>Sample Data:</h3>"))
    display(examples_df)

    # Pipeline setup; give up if it could not be created
    display(HTML("<h3>Setting up Translation Pipeline:</h3>"))
    translator, _unused_device = setup_translation_pipeline()
    if translator is None:
        display(HTML("<div style='background:#f8d7da;padding:10px;border-radius:5px'>Failed to set up translation pipeline</div>"))
        return

    display(HTML("<h3>Translating Sample Data:</h3>"))
    translated = smart_translate_dataframe(examples_df, 'comment', translator)

    display(HTML("<h3>Validating Translations:</h3>"))
    flagged = validate_translations(translated, 'comment', 'translated_text')

    # Only retry when validation flagged something
    if not flagged.empty:
        translated = retry_failed_translations(translated, flagged, 'comment', translator)

    save_translated_csv(translated, 'translated_sample_comments.csv')

    display(HTML("<h3>Final Translations:</h3>"))
    display(translated)


# Cell 10: Main function for interactive use
def translate_csv_interactive():
    """
    Interactive function for translating CSVs in a Jupyter notebook.

    Shows two tabs for choosing the input (file upload or file path). Once a
    CSV is loaded, a column selector, an output file name field and a start
    button are added below the input controls of that tab. Starting the
    translation runs pipeline setup, translation, validation, retry and save.
    A separate button runs the built-in example.
    """
    display(HTML("<h2>Sinhala/Tamil to English Translation Tool</h2>"))

    # Create tabs for different input methods
    tab = widgets.Tab()
    upload_tab = widgets.VBox()
    path_tab = widgets.VBox()
    tab.children = [upload_tab, path_tab]
    tab.set_title(0, 'Upload File')
    tab.set_title(1, 'Specify Path')
    display(tab)

    # === COMMON PROCESSING FUNCTION ===
    def process_loaded_df(df, source_path, container, input_controls):
        """Show the translation controls for df below input_controls in container."""
        if df is None:
            return

        # Create column selector
        column_selector = widgets.Dropdown(
            options=df.columns.tolist(),
            description='Text column:',
            disabled=False
        )

        # Create output file name input (basename works for any path separator)
        default_output = f"translated_{os.path.basename(source_path)}"
        output_name = widgets.Text(
            value=default_output,
            placeholder='output.csv',
            description='Output file:',
            disabled=False
        )

        # Create button to start translation
        translate_button = widgets.Button(
            description='Start Translation',
            button_style='primary',
            tooltip='Click to start translating'
        )

        def on_translate_button_click(b):
            # Disable the button to prevent multiple clicks
            translate_button.disabled = True
            # NOTE(review): in some front ends, output produced inside widget
            # callbacks is not shown in the cell unless it is captured by a
            # widgets.Output - confirm in the target environment.
            try:
                # Set up the translation pipeline
                display(HTML("<h3>Setting up Translation Pipeline:</h3>"))
                translator, _ = setup_translation_pipeline()
                if translator is None:
                    return

                # Translate the dataframe
                display(HTML("<h3>Translating Data:</h3>"))
                translated_df = smart_translate_dataframe(df, column_selector.value, translator)

                # Validate translations
                display(HTML("<h3>Validating Translations:</h3>"))
                validation_results = validate_translations(translated_df, column_selector.value, 'translated_text')

                # Retry failed translations if needed
                if not validation_results.empty:
                    translated_df = retry_failed_translations(
                        translated_df, validation_results, column_selector.value, translator
                    )

                # Save the results
                save_translated_csv(translated_df, output_name.value)

                # Show final translations
                display(HTML("<h3>Final Translations:</h3>"))
                display(translated_df)
            finally:
                # Re-enable the button even if a step above raised
                translate_button.disabled = False

        # Connect the click event
        translate_button.on_click(on_translate_button_click)

        # Rebuild the tab from its permanent input controls plus the fresh
        # translation controls; this also drops controls left over from a
        # previously loaded file.
        container.children = tuple(input_controls) + (column_selector, output_name, translate_button)

    # === TAB 1: UPLOAD FILE ===
    file_upload = widgets.FileUpload(
        description='Upload CSV:',
        accept='.csv',
        multiple=False
    )
    upload_inputs = (file_upload,)
    upload_tab.children = upload_inputs

    def on_file_upload(change):
        uploaded = change.new
        if not uploaded:
            return

        # ipywidgets 7 reports uploads as a dict keyed by file name,
        # ipywidgets 8 as a tuple of per-file dicts.
        if isinstance(uploaded, dict):
            first_file = next(iter(uploaded.values()))
        else:
            first_file = uploaded[0]

        # Save to a temporary file
        with open('temp_uploaded.csv', 'wb') as f:
            f.write(first_file['content'])

        # Load the CSV
        df = load_and_explore_csv('temp_uploaded.csv')
        process_loaded_df(df, 'temp_uploaded.csv', upload_tab, upload_inputs)

    file_upload.observe(on_file_upload, names='value')

    # === TAB 2: SPECIFY PATH ===
    path_description = widgets.HTML(
        value="<p style='margin-bottom:10px'>Enter the full or relative path to your CSV file:</p>"
    )

    file_path = widgets.Text(
        value='',
        placeholder='Enter path to your CSV file (e.g., data/comments.csv)',
        description='File path:',
        disabled=False,
        style={'description_width': 'initial'}
    )

    load_path_button = widgets.Button(
        description='Load CSV',
        button_style='info',
        tooltip='Click to load the CSV file from the specified path'
    )

    path_error_output = widgets.Output()

    path_inputs = (path_description, file_path, load_path_button, path_error_output)
    path_tab.children = path_inputs

    def on_load_path_button_click(b):
        if not file_path.value:
            with path_error_output:
                clear_output(wait=True)
                display(HTML("<div style='color:red'>Please specify a file path</div>"))
            return

        # Disable the button to prevent multiple clicks
        load_path_button.disabled = True

        try:
            # Load the CSV
            df = load_and_explore_csv(file_path.value)
            if df is not None:
                process_loaded_df(df, file_path.value, path_tab, path_inputs)
            else:
                with path_error_output:
                    clear_output(wait=True)
                    display(HTML(f"<div style='color:red'>Error loading file from path: {file_path.value}</div>"))
        except Exception as e:
            with path_error_output:
                clear_output(wait=True)
                display(HTML(f"<div style='color:red'>Error: {str(e)}</div>"))
        finally:
            # Re-enable the button
            load_path_button.disabled = False

    load_path_button.on_click(on_load_path_button_click)

    # Add option to run the example instead
    example_button = widgets.Button(
        description='Run Example',
        button_style='info',
        tooltip='Click to run the built-in example'
    )
    display(example_button)

    def on_example_button_click(b):
        # Disable the button to prevent multiple clicks
        example_button.disabled = True
        try:
            run_example_in_notebook()
        finally:
            example_button.disabled = False

    example_button.on_click(on_example_button_click)


# How to use this notebook:
# 1. Run this cell to define all functions
# 2. Execute translate_csv_interactive() to launch the interactive interface with file upload
# 3. Or use the direct method to translate a specific file:

# DIRECT USAGE EXAMPLE:

# Load CSV file directly
input_file = "/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv"  # <-- Put your CSV file path here
df = load_and_explore_csv(input_file)

# Set up the translation pipeline
translator, _ = setup_translation_pipeline()

# Choose which column to translate
text_column = "Comment"  # <-- Replace with your column name

# Both helpers report their own errors and return None on failure, so only
# continue when there is data to translate and a pipeline to translate it with.
if df is None or translator is None:
    print("Skipping direct translation: the CSV file or the translation pipeline could not be loaded.")
elif text_column not in df.columns:
    print(f"Skipping direct translation: column '{text_column}' not found. Available columns: {df.columns.tolist()}")
else:
    # Translate the data
    translated_df = smart_translate_dataframe(df, text_column, translator)

    # Validate and fix any issues
    validation_results = validate_translations(translated_df, text_column, 'translated_text')
    if not validation_results.empty:
        translated_df = retry_failed_translations(translated_df, validation_results, text_column, translator)

    # Save the results
    save_translated_csv(translated_df, "translated_output.csv")


# Run the interactive tool with:
translate_csv_interactive()

# Or run the example with:
# run_example_in_notebook()

CSV file loaded successfully.
Shape of the data: (807, 1)
Column names: ['Comment']


Unnamed: 0,Comment
0,මොනා උනත් පොත්ත සුදු කෑල්ලක් දැකලා දෙකට නැවුනෙ...
1,බැරිනං PickMe එකෙන් අයින් වෙලා නිකං හයර් දුවපං...
2,ට් ‍ රිප් එක මැදදී කෑශ් හයර් එක කාඩ් හයර් එකට ...
3,me දුවපු කොල්ලෙක්ට විතරයි seen එක තේරෙන්නෙඋදේම...
4,මාත් හයර් දුවන්නෙ කස්ටමර්ගෙ පැත්තෙන් බලනකොට සා...


Comment    0
dtype: int64

VBox(children=(HTML(value='Loading model and tokenizer...'), IntProgress(value=0, max=4)))

Device set to use cpu


  0%|          | 0/807 [00:00<?, ?it/s]

Output()

HTML(value='<h4>Statistics</h4><p>Languages: Sinhala: 0, Tamil: 0, English: 0, Unknown: 0</p><p>Speed: 0 rows/…

Output()

  0%|          | 0/807 [00:00<?, ?it/s]

Tab(children=(VBox(), VBox()), _titles={'0': 'Upload File', '1': 'Specify Path'})

Button(button_style='info', description='Run Example', style=ButtonStyle(), tooltip='Click to run the built-in…