In [42]:
import pandas as pd
from transformers import pipeline
from googletrans import Translator
from tqdm.notebook import tqdm
import time
import random
import logging
import numpy as np
from multiprocessing import Pool, cpu_count
import re

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
tqdm.pandas()

In [43]:
df_01 = pd.read_csv('/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv')

df_01.head()

Unnamed: 0,Comment
0,මොනා උනත් පොත්ත සුදු කෑල්ලක් දැකලා දෙකට නැවුනෙ...
1,බැරිනං PickMe එකෙන් අයින් වෙලා නිකං හයර් දුවපං...
2,ට් ‍ රිප් එක මැදදී කෑශ් හයර් එක කාඩ් හයර් එකට ...
3,me දුවපු කොල්ලෙක්ට විතරයි seen එක තේරෙන්නෙඋදේම...
4,මාත් හයර් දුවන්නෙ කස්ටමර්ගෙ පැත්තෙන් බලනකොට සා...


In [44]:
def clean_and_preprocess_text(text):
    """
    Comprehensive text cleaning and preprocessing for Sinhala-English mixed content
    Handles URLs, special chars, mixed language tokens, and whitespace
    """
    
    if not isinstance(text, str):
        return ""
    
    
    # Reular expression patterns
    sinhala_pattern = re.compile(r'[\u0D80-\u0DFF\u0DE6-\u0DEF0-9]+')
    english_pattern = re.compile(r'[a-zA-Z]+')
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    special_char_pattern = re.compile(r'[[^\u0D80-\u0DFF\s.,!?\'\"]]')
    special_whitespace = re.compile(r'[\u200B-\u200D\uFEFF\u00A0]')
    
    # Remove URLs
    text = url_pattern.sub('', text)
    text = special_whitespace.sub(' ', text)
    
    # Split into tokens(words)
    tokens = text.split()
    processed_tokens = []
    
    for token in tokens:
        # Check langauge composition of each token
        has_sinhala = bool(sinhala_pattern.search(token))
        has_english = bool(english_pattern.search(token))
        
        # Pure Sinhala - process special chars only
        if has_sinhala and not has_english:
            cleaned_token = special_char_pattern.sub('', token)
            if cleaned_token:
                processed_tokens.append(cleaned_token)
                
        
        # Mixed Sinhala-English - extract only Sinhala part
        elif has_sinhala and has_english:
            sinhala_parts = ''.join(sinhala_pattern.findall(token))
            if sinhala_parts:
                processed_tokens.append(sinhala_parts)
                
                
        # Pure English - remove completely (change if you want to keep)
        else:
            cleaned_eng = special_char_pattern.sub('', token)
            if cleaned_eng:
                processed_tokens.append(f'[ENG: {cleaned_eng}]')
                
                
    # Reconstruct the text
    processed_text = ' '.join(processed_tokens)
    
    # Final cleaning and whitespace normalization 
    processed_text = special_char_pattern.sub('', processed_text)
    processed_text = ' '.join(processed_text.split())
    
    return processed_text.strip()

In [45]:
# load the pipeline

df = pd.read_csv('/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv')

# check the structure
print(df.head())
print(f'Total comments: {len(df)}')

# Apply the combined cleaning function
df['cleaned_comment'] = df['Comment'].apply(clean_and_preprocess_text)

# Remove empty comments after cleaning
df = df[df['cleaned_comment'].str.len() > 0]
print(f'Comments after cleaning: {len(df)}')


# Show some sample
for idx, row in df.head(5).iterrows():
    print(f'\nOriginal: {row['Comment']}')
    print(f'Cleaned: {row['cleaned_comment']}')

                                             Comment
0  මොනා උනත් පොත්ත සුදු කෑල්ලක් දැකලා දෙකට නැවුනෙ...
1  බැරිනං PickMe එකෙන් අයින් වෙලා නිකං හයර් දුවපං...
2  ට් ‍ රිප් එක මැදදී කෑශ් හයර් එක කාඩ් හයර් එකට ...
3  me දුවපු කොල්ලෙක්ට විතරයි seen එක තේරෙන්නෙඋදේම...
4  මාත් හයර් දුවන්නෙ කස්ටමර්ගෙ පැත්තෙන් බලනකොට සා...
Total comments: 807
Comments after cleaning: 807

Original: මොනා උනත් පොත්ත සුදු කෑල්ලක් දැකලා දෙකට නැවුනෙ නැති එකට respect කොල්ලො සමහර උන් නිකනුත් දාන් යන්න try කරනවනේ
Cleaned: මොනා උනත් පොත්ත සුදු කෑල්ලක් දැකලා දෙකට නැවුනෙ නැති එකට [ENG: respect] කොල්ලො සමහර උන් නිකනුත් දාන් යන්න [ENG: try] කරනවනේ

Original: බැරිනං PickMe එකෙන් අයින් වෙලා නිකං හයර් දුවපං මේකේ ඉන්න වියළි කලාපයේ උන් සල්ලි උස්සං හැම තැනම ගියාට කොළඹ ඉන්න එවුන් හැමතැනම ගෙනියන්නේ card කියලා නොදැන මෙතල වැළලෙන්ඩ හදනවා කෙල්ලගේ හමටයි රස්සාවටයි බැන බැන පිස්සු කෙලිනවා මෙයා කවුද කියලා මම දන්නෑ නමුත් මේ ප් ‍ රශ්නේ දැන් PickMe එකේ මල වදයක් වෙලා තියෙන්නේ එක්කෝ මගදි කියනවා trip එක cancel කරන්ඩ කියලා උන්ට companyයට ගෙවන්ඩ 

In [46]:
def translate_helsinki(text):
    try:
        if not text or len(text) == 0:
            return ""
        # Initialize model (will cache after first load)
        translator = pipeline(
            "translation", 
            model="Helsinki-NLP/opus-mt-si-en",
            device=0 if torch.cuda.is_available() else -1
        )
        result = translator(text, max_length=400)[0]['translation_text']
        return result
    except Exception as e:
        logger.error(f"Error translating with Helsinki: {str(e)}")
        return ""

In [47]:
def translate_google(text):
    try:
        if not text or len(text) == 0:
            return ""
        translator = Translator()
        # Random delay to avoid rate limiting
        time.sleep(random.uniform(0.5, 1.5))
        result = translator.translate(text, src='si', dest='en').text
        return result
    except Exception as e:
        logger.error(f"Error translating with Google: {str(e)}")
        return ""

# Hybrid Approach

In [48]:
def translate_hybrid(text, max_retries = 3):
    if not text or len(text) == 0:
        return ""
    
    # Try Helsinki first
    for attempt in range(max_retries):
        try:
            result = translate_helsinki(text)
            if result and len(result) > 0:
                return result
        except:
            time.sleep(1)
            
            
    # Fallback to Google Translate
    for attempt in range(max_retries):
        try:
            result = translate_google(text)
            if result and len(result) > 0:
                return result
        except:
            time.sleep(1)
            
    return "" # Return empty string if all attempts fail

# Parallel Translation Function

In [49]:
def parallel_translate(df_chunk):
    return df_chunk['cleaned_comment'].progress_apply(translate_hybrid)

def translate_in_parallel(df, n_workers = None):
    if n_workers is None:
        n_workers = max(1, cpu_count() - 1)
        
        
    df_split = np.array_split(df, n_workers)
    
    with Pool(n_workers) as pool:
        results = pool.map(parallel_translate, df_split)
        
    # Combine results
    return pd.concat(results)

# Execute Translation

In [41]:
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm.notebook import tqdm
import time
import random
from transformers import pipeline
import torch
from googletrans import Translator
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Translation functions
def translate_helsinki(text):
    try:
        if not text or len(text) == 0:
            return ""
        translator = pipeline(
            "translation", 
            model="Helsinki-NLP/opus-mt-si-en",
            device=0 if torch.cuda.is_available() else -1
        )
        result = translator(text, max_length=400)[0]['translation_text']
        return result
    except Exception as e:
        logger.error(f"Helsinki error: {str(e)}")
        return ""

def translate_google(text):
    try:
        if not text or len(text) == 0:
            return ""
        translator = Translator()
        time.sleep(random.uniform(0.5, 1.5))
        result = translator.translate(text, src='si', dest='en').text
        return result
    except Exception as e:
        logger.error(f"Google error: {str(e)}")
        return ""

def translate_hybrid(text, max_retries=3):
    if not text or len(text) == 0:
        return ""
    
    for attempt in range(max_retries):
        try:
            result = translate_helsinki(text)
            if result: return result
        except:
            time.sleep(1)
    
    for attempt in range(max_retries):
        try:
            result = translate_google(text)
            if result: return result
        except:
            time.sleep(1)
    
    return ""

# Parallel processing
def process_chunk(chunk, func):
    return chunk.apply(func)

def parallel_translate(df, column=None, n_workers=None):
    """Auto-detects text column if not specified"""
    if column is None:
        possible_cols = ['cleaned_comment', 'comment', 'text', 'content', 'sentence']
        for col in possible_cols + df.columns.tolist():
            if col in df.columns:
                column = col
                break
    
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame. Available columns: {df.columns.tolist()}")
    
    if n_workers is None:
        n_workers = max(1, cpu_count() - 1)
    
    df_split = np.array_split(df, n_workers)
    func = partial(process_chunk, func=translate_hybrid)
    
    with Pool(n_workers) as pool:
        results = pool.map(func, [chunk[column] for chunk in df_split])
    
    return pd.concat(results)

# Main execution
if __name__ == '__main__':
    # Load data
    df = pd.read_csv('/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv')
    
    # Check columns
    print("Available columns:", df.columns.tolist())
    
    # Sample data
    sample_df = df.sample(10, random_state=42) if len(df) > 100 else df.copy()
    
    # Translate
    print("Starting translation...")
    try:
        sample_df['translated_text'] = parallel_translate(sample_df)
        
        # Show results
        for idx, row in sample_df.head().iterrows():
            print(f"\nOriginal: {row.get('cleaned_comment', row.iloc[0])}")  # Gets first column if 'cleaned_comment' doesn't exist
            print(f"Translated: {row['translated_text']}")
    except Exception as e:
        print(f"Error during translation: {str(e)}")

Available columns: ['Comment']
Starting translation...


  return bound(*args, **kwds)
Process SpawnPoolWorker-52:
Process SpawnPoolWorker-56:
Process SpawnPoolWorker-54:
Process SpawnPoolWorker-50:
Process SpawnPoolWorker-53:
Process SpawnPoolWorker-55:
Process SpawnPoolWorker-51:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/lib/python3.12/multiprocessing/queues.py", line 389, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import time
import random
from transformers import pipeline
import torch
from googletrans import Translator
import logging
import concurrent.futures
from tqdm.auto import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define translation functions
def translate_helsinki(text):
    try:
        if not text or len(text) == 0:
            return ""
        # Initialize model (will cache after first load)
        translator = pipeline(
            "translation", 
            model="Helsinki-NLP/opus-mt-si-en",
            device=0 if torch.cuda.is_available() else -1
        )
        result = translator(text, max_length=400)[0]['translation_text']
        return result
    except Exception as e:
        logger.error(f"Helsinki error: {str(e)}")
        return ""

def translate_google(text):
    try:
        if not text or len(text) == 0:
            return ""
        translator = Translator()
        time.sleep(random.uniform(0.5, 1.5))
        result = translator.translate(text, src='si', dest='en').text
        return result
    except Exception as e:
        logger.error(f"Google error: {str(e)}")
        return ""

def translate_hybrid(text, max_retries=3):
    if not text or len(text) == 0:
        return ""
    
    # Try Helsinki first
    for attempt in range(max_retries):
        try:
            result = translate_helsinki(text)
            if result and len(result) > 0:
                return result
        except Exception as e:
            logger.error(f"Helsinki attempt {attempt+1} failed: {str(e)}")
            time.sleep(1)
    
    # Fallback to Google
    for attempt in range(max_retries):
        try:
            result = translate_google(text)
            if result and len(result) > 0:
                return result
        except Exception as e:
            logger.error(f"Google attempt {attempt+1} failed: {str(e)}")
            time.sleep(1)
    
    return ""  # Return empty if all attempts fail

# Avoid multiprocessing in Jupyter notebooks - use sequential processing instead
def translate_all(texts):
    """Translate a list of texts sequentially with progress bar"""
    results = []
    for text in tqdm(texts, desc="Translating"):
        results.append(translate_hybrid(text))
    return results

# Find the appropriate text column in the dataframe
def get_text_column(df):
    possible_cols = ['cleaned_comment', 'comment', 'text', 'content', 'sentence', 'Comment']
    for col in possible_cols:
        if col in df.columns:
            return col
    # Default to first column if none of the expected columns are found
    return df.columns[0] if len(df.columns) > 0 else None

# Version using ThreadPoolExecutor which works better in Jupyter notebooks
def parallel_translate_threaded(df, column=None, n_workers=None):
    """Translate texts using ThreadPoolExecutor which works better in Jupyter"""
    if column is None:
        column = get_text_column(df)
    
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame. Available columns: {df.columns.tolist()}")
    
    if n_workers is None:
        import os
        n_workers = max(1, os.cpu_count() - 1)
    
    # Convert the column to a list
    texts = df[column].tolist()
    results = []
    
    # Use ThreadPoolExecutor instead of ProcessPoolExecutor to avoid pickling issues
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(translate_hybrid, text) for text in texts]
        
        # Process results as they complete with a progress bar
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Translating"):
            results.append(future.result())
    
    # Reorder results to match input order
    ordered_results = [None] * len(texts)
    for i, future in enumerate(futures):
        try:
            ordered_results[i] = future.result()
        except Exception as e:
            logger.error(f"Error in thread {i}: {str(e)}")
            ordered_results[i] = ""
    
    return ordered_results

# Main execution in Jupyter-friendly way
# Load data
try:
    file_path = '/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv'
    df = pd.read_csv(file_path)
    
    # Check columns
    print("Available columns:", df.columns.tolist())
    
    # Sample data for testing
    sample_size = min(10, len(df))
    sample_df = df.sample(sample_size, random_state=42)
    
    print("Starting translation...")
    
    # Identify the text column
    text_column = get_text_column(sample_df)
    print(f"Using text column: {text_column}")
    
    # Translate using ThreadPoolExecutor approach
    translated_texts = parallel_translate_threaded(sample_df, column=text_column)
    
    # Add translations to the dataframe
    sample_df['translated_text'] = translated_texts
    
    # Show results
    for idx, row in sample_df.iterrows():
        print(f"\nOriginal: {row[text_column]}")
        print(f"Translated: {row['translated_text']}")
        
except Exception as e:
    print(f"Error during execution: {str(e)}")
    import traceback
    traceback.print_exc()

Available columns: ['Comment']
Starting translation...
Using text column: Comment


Translating:   0%|          | 0/10 [00:00<?, ?it/s]

ERROR:__main__:Helsinki error: Helsinki-NLP/opus-mt-si-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
ERROR:__main__:Helsinki error: Helsinki-NLP/opus-mt-si-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
ERROR:__main__:Helsinki error: Helsinki-NLP/opus-mt-si-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
E


Original: days all the pickme drivers are bad
Translated: days all the pickme drivers are bad

Original: hayar yanna ai baya mekai siddiya me hayar eka yaddi kalin giya hayar valata ganak PickMe ekata gewanna thiyanawa meka giyoth adu wela tmi labenne godak kattiya hayar ganne nethhe okai habai hamoma mehema ne e nisa okkotama dos kiyana eka weradi meka e riyadurage weraddak
Translated: Hayar Yanna Ai Siddiya ME Hayar Eka kalin giya giya giyawa gika gika ganna giyanawa genna ganna goakaHamoma Mehema NEI OKKOTAMA DOS KIYANA EKA WERADI MEKA E RIYADURAGE WERADDAK

Original: හයර් එක එයා approve කරන්න ඇත්තෙ cash ද card ද කියල බලල මෙයා මගදි තමයි card payment එකකට මාරු වෙලා තියෙන්නෙ ඒක අසාධාරණයි මෙයා කලින්ම card payment කියල තිබුනනම් මේ හයර් එක ත් ‍ රීවීල් එකේ කෙනා ගන්නෙ නෑ eken ගෙව්වවම ඒක යන්නෙ pick me එකට ඒ සල්ලි එයාට ලැබෙන්නෙ පස්සෙඅවුරුදු 6 ක් pick me app එකෙන් ගියානම් දන්නෙ නැද්ද කලින්ම cash ද card ද කියල දාන්න
Translated: Here is the one who appses the Cash and Card, who is moving to a 

In [55]:
import pandas as pd
import numpy as np
import time
import random
import re
import os
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from googletrans import Translator
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global variables to avoid reloading models
helsinki_translator = None
google_translator = None

# -------------------- Text Processing Functions --------------------

def clean_sinhala_text(text):
    """Special preprocessing for Sinhala text"""
    if not text or len(text) == 0:
        return ""
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Handle mixed English-Sinhala text
    # Extract English words/terms that should be preserved
    english_terms = re.findall(r'[a-zA-Z]+', text)
    
    # Replace numbers and special characters with spaces
    text = re.sub(r'[^\u0D80-\u0DFFa-zA-Z\s.,!?]', ' ', text)
    
    # Clean up extra spaces again
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

def post_process_translation(text):
    """Clean up translation results"""
    if not text or len(text) == 0:
        return ""
    
    # Fix common translation artifacts
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r' \.', '.', text)
    text = re.sub(r' ,', ',', text)
    
    # Remove repetitive phrases (common in poor translations)
    sentences = text.split('. ')
    unique_sentences = []
    for s in sentences:
        if s and s not in unique_sentences:
            unique_sentences.append(s)
    
    # Rejoin and capitalize
    text = '. '.join(s.capitalize() for s in unique_sentences if s)
    
    return text.strip()

def chunk_text_advanced(text, max_length=100):
    """Break text into smaller chunks with better boundaries"""
    if not text or len(text) <= max_length:
        return [text]
    
    # Try to split on punctuation first
    chunks = []
    pattern = r'([።,.!?])'
    sentences = re.split(pattern, text)
    
    current_chunk = ""
    for i in range(0, len(sentences), 2):
        sentence = sentences[i]
        punctuation = sentences[i+1] if i+1 < len(sentences) else ""
        
        if len(current_chunk) + len(sentence) + len(punctuation) <= max_length:
            current_chunk += sentence + punctuation
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence + punctuation
    
    if current_chunk:
        chunks.append(current_chunk)
    
    # If no good sentence boundaries, try word boundaries
    if not chunks or len(chunks) == 1 and len(chunks[0]) > max_length:
        chunks = []
        words = text.split()
        current_chunk = ""
        
        for word in words:
            if len(current_chunk) + len(word) + 1 <= max_length:
                current_chunk += " " + word if current_chunk else word
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = word
        
        if current_chunk:
            chunks.append(current_chunk)
    
    # Last resort: character-based chunking
    if not chunks:
        chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
    
    return chunks

# -------------------- Translation Functions --------------------

def get_helsinki_translator():
    """Initialize Helsinki translator only once"""
    global helsinki_translator
    if helsinki_translator is None:
        try:
            logger.info("Loading Helsinki-NLP model...")
            # Use specific tokenizer and model to ensure quality
            tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-si-en")
            model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-si-en")
            device = 0 if torch.cuda.is_available() else -1
            
            helsinki_translator = pipeline(
                "translation", 
                model=model,
                tokenizer=tokenizer,
                device=device
            )
            logger.info("Helsinki-NLP model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading Helsinki model: {str(e)}")
            # Fall back to simpler initialization if the above fails
            try:
                helsinki_translator = pipeline(
                    "translation", 
                    model="Helsinki-NLP/opus-mt-si-en",
                    device=0 if torch.cuda.is_available() else -1
                )
            except Exception as e2:
                logger.error(f"Error in fallback Helsinki initialization: {str(e2)}")
                return None
    
    return helsinki_translator

def translate_helsinki_improved(text):
    """Enhanced Helsinki translation with better preprocessing"""
    try:
        if not text or len(text) == 0:
            return ""
        
        # Clean text specifically for Sinhala
        text = clean_sinhala_text(text)
        
        # Get or initialize translator
        translator = get_helsinki_translator()
        if not translator:
            logger.error("Failed to initialize Helsinki translator")
            return ""
        
        # Break into smaller chunks for better quality
        chunks = chunk_text_advanced(text, max_length=80)  # Smaller chunks for better quality
        translated_chunks = []
        
        for chunk in chunks:
            try:
                # Use temperature for slightly more fluent translations
                result = translator(chunk, max_length=512, do_sample=True, temperature=0.7)[0]['translation_text']
                translated_chunks.append(result)
                # Small delay to avoid resource issues
                time.sleep(0.2)
            except Exception as chunk_error:
                logger.error(f"Error translating chunk: {str(chunk_error)}")
                # Try again with a simpler approach
                try:
                    result = translator(chunk, max_length=400)[0]['translation_text']
                    translated_chunks.append(result)
                except:
                    # If all else fails, append an empty string to maintain chunk alignment
                    translated_chunks.append("")
        
        # Join and post-process
        full_translation = ' '.join(chunk for chunk in translated_chunks if chunk)
        return post_process_translation(full_translation)
    
    except Exception as e:
        logger.error(f"Helsinki improved translation error: {str(e)}")
        return ""

def get_google_translator():
    """Initialize Google translator only once"""
    global google_translator
    if google_translator is None:
        try:
            google_translator = Translator()
        except Exception as e:
            logger.error(f"Error initializing Google translator: {str(e)}")
            return None
    
    return google_translator

def translate_google_improved(text):
    """Enhanced Google translation with better preprocessing"""
    try:
        if not text or len(text) == 0:
            return ""
        
        # Clean text specifically for Sinhala
        text = clean_sinhala_text(text)
        
        # Get or initialize translator
        translator = get_google_translator()
        if not translator:
            logger.error("Failed to initialize Google translator")
            return ""
        
        # Break into smaller chunks for better quality
        chunks = chunk_text_advanced(text, max_length=100)
        translated_chunks = []
        
        for chunk in chunks:
            try:
                # Random delay to avoid rate limiting
                time.sleep(random.uniform(0.7, 2.0))
                result = translator.translate(chunk, src='si', dest='en').text
                translated_chunks.append(result)
            except Exception as chunk_error:
                logger.error(f"Error translating chunk with Google: {str(chunk_error)}")
                # Try again after a longer delay
                try:
                    time.sleep(3.0)
                    result = translator.translate(chunk, src='si', dest='en').text
                    translated_chunks.append(result)
                except:
                    # If all else fails, append an empty string to maintain chunk alignment
                    translated_chunks.append("")
        
        # Join and post-process
        full_translation = ' '.join(chunk for chunk in translated_chunks if chunk)
        return post_process_translation(full_translation)
    
    except Exception as e:
        logger.error(f"Google improved translation error: {str(e)}")
        return ""

def translate_improved(text, max_retries=3):
    """Try multiple translation approaches with better quality settings"""
    if not text or len(text) == 0:
        return ""
    
    # Keep track of all successful translations
    translations = []
    
    # Try Helsinki with improved settings
    for attempt in range(max_retries):
        try:
            result = translate_helsinki_improved(text)
            if result and len(result) > len(text)/4:  # Ensure meaningful translation
                translations.append(result)
                break
        except Exception as e:
            logger.error(f"Helsinki attempt {attempt+1} failed: {str(e)}")
            time.sleep(1.5)
    
    # Try Google with improved settings
    for attempt in range(max_retries):
        try:
            result = translate_google_improved(text)
            if result and len(result) > len(text)/4:
                translations.append(result)
                break
        except Exception as e:
            logger.error(f"Google attempt {attempt+1} failed: {str(e)}")
            time.sleep(2)
    
    # If at least one translation succeeded
    if translations:
        # Choose based on quality heuristics (currently using length as a proxy for completeness)
        # This could be improved with more sophisticated metrics
        translations.sort(key=len, reverse=True)
        return translations[0]
    
    # Last resort: try direct Helsinki translation without chunking
    try:
        translator = get_helsinki_translator()
        if translator:
            result = translator(clean_sinhala_text(text), max_length=512)[0]['translation_text']
            return post_process_translation(result)
    except:
        pass
    
    return ""  # Return empty if all attempts fail

# -------------------- Main Processing Function --------------------

def translate_all_sequential(texts):
    """Process all texts sequentially for maximum reliability"""
    results = []
    for text in tqdm(texts, desc="Translating"):
        results.append(translate_improved(text))
    return results

# -------------------- Main Execution --------------------

# Main execution section - for Jupyter notebook use
try:
    # Load data
    file_path = '/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv'
    df = pd.read_csv(file_path)
    
    # Check columns
    print("Available columns:", df.columns.tolist())
    
    # Sample data for testing - smaller sample to focus on quality
    sample_size = min(5, len(df))
    sample_df = df.sample(sample_size, random_state=42)
    
    print("Starting translation with focus on quality...")
    
    # Identify the text column
    text_column = 'Comment' if 'Comment' in df.columns else df.columns[0]
    print(f"Using text column: {text_column}")
    
    # Use sequential processing for maximum reliability
    translated_texts = translate_all_sequential(sample_df[text_column].tolist())
    
    # Add translations to the dataframe
    sample_df['translated_text'] = translated_texts
    
    # Show results
    for idx, row in sample_df.iterrows():
        print(f"\nOriginal: {row[text_column]}")
        print(f"Translated: {row['translated_text']}")
    
    # Optional: Save results to CSV
    # sample_df.to_csv('/Volumes/KODAK/folder 02/language_translation/Language_translator/data/translated_sample_improved.csv', index=False)
    
except Exception as e:
    print(f"Error during execution: {str(e)}")
    import traceback
    traceback.print_exc()

Available columns: ['Comment']
Starting translation with focus on quality...
Using text column: Comment


Translating:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:__main__:Loading Helsinki-NLP model...
ERROR:__main__:Error loading Helsinki model: Helsinki-NLP/opus-mt-si-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
ERROR:__main__:Error in fallback Helsinki initialization: Helsinki-NLP/opus-mt-si-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
ERROR:__main__:Failed to initialize Helsinki translator
INFO:__main__:Loading Helsinki-NLP model...
ERROR:__main__:Error loading Helsinki model: Helsinki-NLP/opus-mt-si-en is not a local folder and is not a valid model identifier listed on 'https://


Original: days all the pickme drivers are bad
Translated: Days all the pickme drivers are bad

Original: hayar yanna ai baya mekai siddiya me hayar eka yaddi kalin giya hayar valata ganak PickMe ekata gewanna thiyanawa meka giyoth adu wela tmi labenne godak kattiya hayar ganne nethhe okai habai hamoma mehema ne e nisa okkotama dos kiyana eka weradi meka e riyadurage weraddak
Translated: Hayar yanna ai siddiya me hayar eka kalin kalin giya giya gyaar ganak pickme ekata gewanna tiyanawa meka deiyoh a adu wella tmi labenne godne nethhe okai habai habai hamoma mehema nei okkotama dos kiyana eka weradi meka e riyadurage weraddak

Original: හයර් එක එයා approve කරන්න ඇත්තෙ cash ද card ද කියල බලල මෙයා මගදි තමයි card payment එකකට මාරු වෙලා තියෙන්නෙ ඒක අසාධාරණයි මෙයා කලින්ම card payment කියල තිබුනනම් මේ හයර් එක ත් ‍ රීවීල් එකේ කෙනා ගන්නෙ නෑ eken ගෙව්වවම ඒක යන්නෙ pick me එකට ඒ සල්ලි එයාට ලැබෙන්නෙ පස්සෙඅවුරුදු 6 ක් pick me app එකෙන් ගියානම් දන්නෙ නැද්ද කලින්ම cash ද card ද කියල දාන්න
Translated: 

In [56]:
import pandas as pd
import numpy as np
import time
import random
import re
import os
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from googletrans import Translator
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global variables to avoid reloading models
helsinki_translator = None
google_translator = None

# -------------------- Text Processing Functions --------------------

def clean_sinhala_text(text):
    """Special preprocessing for Sinhala text"""
    if not text or len(text) == 0:
        return ""
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Handle mixed English-Sinhala text
    # Extract English words/terms that should be preserved
    english_terms = re.findall(r'[a-zA-Z]+', text)
    
    # Replace numbers and special characters with spaces
    text = re.sub(r'[^\u0D80-\u0DFFa-zA-Z\s.,!?]', ' ', text)
    
    # Clean up extra spaces again
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

def post_process_translation(text):
    """Clean up translation results"""
    if not text or len(text) == 0:
        return ""
    
    # Fix common translation artifacts
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r' \.', '.', text)
    text = re.sub(r' ,', ',', text)
    
    # Remove repetitive phrases (common in poor translations)
    sentences = text.split('. ')
    unique_sentences = []
    for s in sentences:
        if s and s not in unique_sentences:
            unique_sentences.append(s)
    
    # Rejoin and capitalize
    text = '. '.join(s.capitalize() for s in unique_sentences if s)
    
    return text.strip()

def chunk_text_advanced(text, max_length=100):
    """Break text into smaller chunks with better boundaries"""
    if not text or len(text) <= max_length:
        return [text]
    
    # Try to split on punctuation first
    chunks = []
    pattern = r'([።,.!?])'
    sentences = re.split(pattern, text)
    
    current_chunk = ""
    for i in range(0, len(sentences), 2):
        sentence = sentences[i]
        punctuation = sentences[i+1] if i+1 < len(sentences) else ""
        
        if len(current_chunk) + len(sentence) + len(punctuation) <= max_length:
            current_chunk += sentence + punctuation
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence + punctuation
    
    if current_chunk:
        chunks.append(current_chunk)
    
    # If no good sentence boundaries, try word boundaries
    if not chunks or len(chunks) == 1 and len(chunks[0]) > max_length:
        chunks = []
        words = text.split()
        current_chunk = ""
        
        for word in words:
            if len(current_chunk) + len(word) + 1 <= max_length:
                current_chunk += " " + word if current_chunk else word
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = word
        
        if current_chunk:
            chunks.append(current_chunk)
    
    # Last resort: character-based chunking
    if not chunks:
        chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
    
    return chunks

# -------------------- Translation Functions --------------------

def get_helsinki_translator():
    """Initialize Helsinki translator only once"""
    global helsinki_translator
    if helsinki_translator is None:
        try:
            logger.info("Loading Helsinki-NLP model...")
            # Use the multilingual model instead of the Sinhala-specific one
            model_name = "Helsinki-NLP/opus-mt-mul-en"  # Multi-language to English model
            
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            device = 0 if torch.cuda.is_available() else -1
            
            helsinki_translator = pipeline(
                "translation", 
                model=model,
                tokenizer=tokenizer,
                device=device
            )
            logger.info("Helsinki-NLP model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading Helsinki model: {str(e)}")
            try:
                # Try with direct pipeline creation
                helsinki_translator = pipeline(
                    "translation", 
                    model="Helsinki-NLP/opus-mt-mul-en",
                    device=0 if torch.cuda.is_available() else -1
                )
                logger.info("Helsinki-NLP fallback model loaded successfully")
            except Exception as e2:
                logger.error(f"Error in fallback Helsinki initialization: {str(e2)}")
                logger.info("Will use Google Translate as fallback")
                helsinki_translator = None
    
    return helsinki_translator

def translate_helsinki_improved(text):
    """Enhanced Helsinki translation with better preprocessing"""
    try:
        if not text or len(text) == 0:
            return ""
        
        # Clean text specifically for Sinhala
        text = clean_sinhala_text(text)
        
        # Get or initialize translator
        translator = get_helsinki_translator()
        if not translator:
            logger.error("Helsinki translator not available")
            return ""
        
        # Break into smaller chunks for better quality
        chunks = chunk_text_advanced(text, max_length=80)  # Smaller chunks for better quality
        translated_chunks = []
        
        for chunk in chunks:
            try:
                # Use temperature for slightly more fluent translations
                result = translator(chunk, max_length=512, do_sample=True, temperature=0.7)[0]['translation_text']
                translated_chunks.append(result)
                # Small delay to avoid resource issues
                time.sleep(0.2)
            except Exception as chunk_error:
                logger.error(f"Error translating chunk: {str(chunk_error)}")
                # Try again with a simpler approach
                try:
                    result = translator(chunk, max_length=400)[0]['translation_text']
                    translated_chunks.append(result)
                except:
                    # If all else fails, append an empty string to maintain chunk alignment
                    translated_chunks.append("")
        
        # Join and post-process
        full_translation = ' '.join(chunk for chunk in translated_chunks if chunk)
        return post_process_translation(full_translation)
    
    except Exception as e:
        logger.error(f"Helsinki improved translation error: {str(e)}")
        return ""

def get_google_translator():
    """Initialize Google translator only once"""
    global google_translator
    if google_translator is None:
        try:
            google_translator = Translator()
            logger.info("Google translator initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing Google translator: {str(e)}")
            return None
    
    return google_translator

def translate_google_improved(text):
    """Enhanced Google translation with better preprocessing"""
    try:
        if not text or len(text) == 0:
            return ""
        
        # Clean text specifically for Sinhala
        text = clean_sinhala_text(text)
        
        # Get or initialize translator
        translator = get_google_translator()
        if not translator:
            logger.error("Failed to initialize Google translator")
            return ""
        
        # Break into smaller chunks for better quality
        chunks = chunk_text_advanced(text, max_length=100)
        translated_chunks = []
        
        for chunk in chunks:
            try:
                # Random delay to avoid rate limiting
                time.sleep(random.uniform(0.7, 2.0))
                result = translator.translate(chunk, src='si', dest='en').text
                translated_chunks.append(result)
            except Exception as chunk_error:
                logger.error(f"Error translating chunk with Google: {str(chunk_error)}")
                # Try again after a longer delay
                try:
                    time.sleep(3.0)
                    result = translator.translate(chunk, src='si', dest='en').text
                    translated_chunks.append(result)
                except:
                    # If all else fails, append an empty string to maintain chunk alignment
                    translated_chunks.append("")
        
        # Join and post-process
        full_translation = ' '.join(chunk for chunk in translated_chunks if chunk)
        return post_process_translation(full_translation)
    
    except Exception as e:
        logger.error(f"Google improved translation error: {str(e)}")
        return ""

def translate_improved(text, max_retries=3):
    """Try multiple translation approaches with better quality settings"""
    if not text or len(text) == 0:
        return ""
    
    # Keep track of all successful translations
    translations = []
    
    # Try Helsinki with improved settings if available
    translator = get_helsinki_translator()
    if translator:  # Only try if translator was successfully initialized
        for attempt in range(max_retries):
            try:
                result = translate_helsinki_improved(text)
                if result and len(result) > len(text)/4:  # Ensure meaningful translation
                    translations.append(result)
                    break
            except Exception as e:
                logger.error(f"Helsinki attempt {attempt+1} failed: {str(e)}")
                time.sleep(1.5)
    
    # Try Google with improved settings
    for attempt in range(max_retries):
        try:
            result = translate_google_improved(text)
            if result and len(result) > len(text)/4:
                translations.append(result)
                break
        except Exception as e:
            logger.error(f"Google attempt {attempt+1} failed: {str(e)}")
            time.sleep(2)
    
    # If at least one translation succeeded
    if translations:
        # Choose based on quality heuristics (currently using length as a proxy for completeness)
        translations.sort(key=len, reverse=True)
        return translations[0]
    
    return ""  # Return empty if all attempts fail

# -------------------- Main Processing Function --------------------

def translate_all_sequential(texts):
    """Process all texts sequentially for maximum reliability"""
    results = []
    for text in tqdm(texts, desc="Translating"):
        results.append(translate_improved(text))
    return results

# -------------------- Main Execution --------------------

# Main execution section - for Jupyter notebook use
if __name__ == "__main__":
    try:
        # Load data
        file_path = '/Volumes/KODAK/folder 02/language_translation/Language_translator/data/filtered_dataset.csv'
        df = pd.read_csv(file_path)
        
        # Check columns
        print("Available columns:", df.columns.tolist())
        
        # Sample data for testing - smaller sample to focus on quality
        sample_size = min(5, len(df))
        sample_df = df.sample(sample_size, random_state=42)
        
        print("Starting translation with focus on quality...")
        
        # Identify the text column
        text_column = 'Comment' if 'Comment' in df.columns else df.columns[0]
        print(f"Using text column: {text_column}")
        
        # Use sequential processing for maximum reliability
        translated_texts = translate_all_sequential(sample_df[text_column].tolist())
        
        # Add translations to the dataframe
        sample_df['translated_text'] = translated_texts
        
        # Show results
        for idx, row in sample_df.iterrows():
            print(f"\nOriginal: {row[text_column]}")
            print(f"Translated: {row['translated_text']}")
        
        # Optional: Save results to CSV
        # sample_df.to_csv('/Volumes/KODAK/folder 02/language_translation/Language_translator/data/translated_sample_improved.csv', index=False)
        
    except Exception as e:
        print(f"Error during execution: {str(e)}")
        import traceback
        traceback.print_exc()

Available columns: ['Comment']
Starting translation with focus on quality...
Using text column: Comment


Translating:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:__main__:Loading Helsinki-NLP model...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/Helsinki-NLP/opus-mt-mul-en/33ff438ec37160a105f0700819a5b78a07918e1913fc2f249184b1f46a248e4e?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1745313271&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NTMxMzI3MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1tdWwtZW4vMzNmZjQzOGVjMzcxNjBhMTA1ZjA3MDA4MTlhNWI3OGEwNzkxOGUxOTEzZmMyZjI0OTE4NGIxZjQ2YTI0OGU0ZT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=E952aYE-vZ%7Eco1oqA40RLdBRst38MDngaLGP9u1RrdfHOd6--hBtd6UvohB6oQRXbX1LhADBZzZewHCGB5rvQuTYeS%7EnBkCUhiVAs00UCEcQ6gxVMEpfFSK-qzvmNq2g39LhK6xuMd6akIDFVahsBSLU5R5dM97Qezg0J5%7E3WbuRL534qfUdmANRXF4w55WTQbUyiDFkZMGsc3vjAQPvVTr2gNFDZuyGWt0wB-sNiHFGMnFilaYS2owHDtViGsYO-8Muefc0Z0tiOHz2FhTt4e6G%7E%7EnjGHU6TVOS75m

pytorch_model.bin:  61%|######    | 189M/310M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/Helsinki-NLP/opus-mt-mul-en/33ff438ec37160a105f0700819a5b78a07918e1913fc2f249184b1f46a248e4e?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1745313271&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NTMxMzI3MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1tdWwtZW4vMzNmZjQzOGVjMzcxNjBhMTA1ZjA3MDA4MTlhNWI3OGEwNzkxOGUxOTEzZmMyZjI0OTE4NGIxZjQ2YTI0OGU0ZT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=E952aYE-vZ%7Eco1oqA40RLdBRst38MDngaLGP9u1RrdfHOd6--hBtd6UvohB6oQRXbX1LhADBZzZewHCGB5rvQuTYeS%7EnBkCUhiVAs00UCEcQ6gxVMEpfFSK-qzvmNq2g39LhK6xuMd6akIDFVahsBSLU5R5dM97Qezg0J5%7E3WbuRL534qfUdmANRXF4w55WTQbUyiDFkZMGsc3vjAQPvVTr2gNFDZuyGWt0wB-sNiHFGMnFilaYS2owHDtViGsYO-8Muefc0Z0tiOHz2FhTt4e6G%7E%7EnjGHU6TVOS75m

pytorch_model.bin:  61%|######    | 189M/310M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/Helsinki-NLP/opus-mt-mul-en/33ff438ec37160a105f0700819a5b78a07918e1913fc2f249184b1f46a248e4e?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1745313271&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NTMxMzI3MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1tdWwtZW4vMzNmZjQzOGVjMzcxNjBhMTA1ZjA3MDA4MTlhNWI3OGEwNzkxOGUxOTEzZmMyZjI0OTE4NGIxZjQ2YTI0OGU0ZT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=E952aYE-vZ%7Eco1oqA40RLdBRst38MDngaLGP9u1RrdfHOd6--hBtd6UvohB6oQRXbX1LhADBZzZewHCGB5rvQuTYeS%7EnBkCUhiVAs00UCEcQ6gxVMEpfFSK-qzvmNq2g39LhK6xuMd6akIDFVahsBSLU5R5dM97Qezg0J5%7E3WbuRL534qfUdmANRXF4w55WTQbUyiDFkZMGsc3vjAQPvVTr2gNFDZuyGWt0wB-sNiHFGMnFilaYS2owHDtViGsYO-8Muefc0Z0tiOHz2FhTt4e6G%7E%7EnjGHU6TVOS75m

pytorch_model.bin:  61%|######    | 189M/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Device set to use cpu
INFO:__main__:Helsinki-NLP model loaded successfully
INFO:__main__:Google translator initialized successfully


model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]


Original: days all the pickme drivers are bad
Translated: Days all the pickme drivers are bad

Original: hayar yanna ai baya mekai siddiya me hayar eka yaddi kalin giya hayar valata ganak PickMe ekata gewanna thiyanawa meka giyoth adu wela tmi labenne godak kattiya hayar ganne nethhe okai habai hamoma mehema ne e nisa okkotama dos kiyana eka weradi meka e riyadurage weraddak
Translated: Position after which the pointer must stay as though the pointer were pushing the mouse pointer over the clock. Pickme and i have a lot of fun and i don't know what it's like. I don't know what's going on here. Like a dream of a dream, we're going to have a dream.

Original: හයර් එක එයා approve කරන්න ඇත්තෙ cash ද card ද කියල බලල මෙයා මගදි තමයි card payment එකකට මාරු වෙලා තියෙන්නෙ ඒක අසාධාරණයි මෙයා කලින්ම card payment කියල තිබුනනම් මේ හයර් එක ත් ‍ රීවීල් එකේ කෙනා ගන්නෙ නෑ eken ගෙව්වවම ඒක යන්නෙ pick me එකට ඒ සල්ලි එයාට ලැබෙන්නෙ පස්සෙඅවුරුදු 6 ක් pick me app එකෙන් ගියානම් දන්නෙ නැද්ද කලින්ම cash ද card ද 