In [1]:
# # Install required packages
# %pip install langdetect pycld2 lingua-language-detector fasttext

In [1]:
# Import necessary libraries
from dataclasses import dataclass
from typing import Optional, Dict, List
import langdetect
import pycld2
from lingua import Language, LanguageDetectorBuilder
import fasttext
import os
import json
from datetime import datetime

In [2]:
# Define LanguageDetectionResult dataclass
@dataclass
class LanguageDetectionResult:
    """Result of language detection operation."""
    language_code: str
    confidence: float
    source: str  # 'langdetect', 'pycld2', 'lingua', 'whisper', etc.
    
    def __post_init__(self):
        """Validate the result after initialization."""
        if not isinstance(self.language_code, str) or len(self.language_code) < 2:
            raise ValueError("language_code must be a valid string")
        if not (0.0 <= self.confidence <= 1.0):
            raise ValueError("confidence must be between 0.0 and 1.0")
        if not isinstance(self.source, str):
            raise ValueError("source must be a string")
    
    def to_dict(self) -> Dict:
        """Convert result to dictionary."""
        return {
            'language_code': self.language_code,
            'confidence': self.confidence,
            'source': self.source
        }
    
    @classmethod
    def from_dict(cls, data: Dict) -> 'LanguageDetectionResult':
        """Create result from dictionary."""
        return cls(
            language_code=data['language_code'],
            confidence=data['confidence'],
            source=data['source']
        )

In [3]:
# Define language code normalization function
def normalize_language_code(raw_code: str) -> str:
    """Normalize language codes to standard ISO 639-1 format.
    
    Args:
        raw_code: Raw language code from various providers
        
    Returns:
        Normalized ISO 639-1 language code (e.g., 'en', 'es', 'fr')
    """
    # Common mappings from various providers to ISO 639-1
    normalization_map = {
        # langdetect mappings
        'en': 'en', 'english': 'en',
        'es': 'es', 'spanish': 'es', 
        'fr': 'fr', 'french': 'fr',
        'de': 'de', 'german': 'de',
        'it': 'it', 'italian': 'it',
        'pt': 'pt', 'portuguese': 'pt',
        'ru': 'ru', 'russian': 'ru',
        'ja': 'ja', 'japanese': 'ja',
        'ko': 'ko', 'korean': 'ko',
        'zh': 'zh', 'chinese': 'zh',
        'ar': 'ar', 'arabic': 'ar',
        'hi': 'hi', 'hindi': 'hi',
        
        # pycld2 mappings (often returns full names)
        'ENGLISH': 'en',
        'SPANISH': 'es',
        'FRENCH': 'fr',
        'GERMAN': 'de',
        'ITALIAN': 'it',
        'PORTUGUESE': 'pt',
        'RUSSIAN': 'ru',
        'JAPANESE': 'ja',
        'KOREAN': 'ko',
        'CHINESE': 'zh',
        'ARABIC': 'ar',
        'HINDI': 'hi',
        
        # Lingua mappings (Language enum values)
        'Language.ENGLISH': 'en',
        'Language.SPANISH': 'es',
        'Language.FRENCH': 'fr',
        'Language.GERMAN': 'de',
        'Language.ITALIAN': 'it',
        'Language.PORTUGUESE': 'pt',
        'Language.RUSSIAN': 'ru',
        'Language.JAPANESE': 'ja',
        'Language.KOREAN': 'ko',
        'Language.CHINESE': 'zh',
        'Language.ARABIC': 'ar',
        'Language.HINDI': 'hi',
        
        # Whisper mappings (ISO 639-3 to ISO 639-1)
        'eng': 'en',
        'spa': 'es',
        'fra': 'fr',
        'deu': 'de',
        'ita': 'it',
        'por': 'pt',
        'rus': 'ru',
        'jpn': 'ja',
        'kor': 'ko',
        'zho': 'zh',
        'ara': 'ar',
        'hin': 'hi',
    }
    
    # Clean and normalize the input
    cleaned_code = raw_code.strip().upper()
    
    # Direct mapping
    if cleaned_code in normalization_map:
        return normalization_map[cleaned_code]
    
    # Try lowercase version
    lower_code = cleaned_code.lower()
    if lower_code in normalization_map:
        return normalization_map[lower_code]
    
    # Try first two characters for ISO codes
    if len(raw_code) >= 2:
        prefix = raw_code[:2].lower()
        if prefix in normalization_map:
            return normalization_map[prefix]
    
    # Return original if no mapping found
    return raw_code.lower()

In [4]:
# Define LanguageDetector class
class LanguageDetector:
    """Multi-provider language detection with fallback support."""
    
    def __init__(self):
        """Initialize language detectors."""
        self._detectors = {}
        self._initialize_detectors()
    
    def _initialize_detectors(self):
        """Initialize available language detection providers."""
        try:
            # langdetect
            self._detectors['langdetect'] = True
        except Exception as e:
            print(f"Warning: langdetect not available: {e}")
        
        try:
            # pycld2
            self._detectors['pycld2'] = True
        except Exception as e:
            print(f"Warning: pycld2 not available: {e}")
        
        try:
            # Lingua
            languages = [Language.ENGLISH, Language.SPANISH, Language.FRENCH, 
                        Language.GERMAN, Language.ITALIAN, Language.PORTUGUESE,
                        Language.RUSSIAN, Language.JAPANESE, Language.KOREAN,
                        Language.CHINESE, Language.ARABIC, Language.HINDI]
            self._lingua_detector = LanguageDetectorBuilder.from_languages(*languages).build()
            self._detectors['lingua'] = True
        except Exception as e:
            print(f"Warning: Lingua not available: {e}")
    
    def detect_from_text(self, text: str) -> LanguageDetectionResult:
        """Detect language from text using multiple providers with fallback.
        
        Args:
            text: Input text to detect language for
            
        Returns:
            LanguageDetectionResult with detected language, confidence, and source
        """
        if not text or not text.strip():
            return LanguageDetectionResult('unknown', 0.0, 'empty_input')
        
        results = []
        
        # Try langdetect first
        if 'langdetect' in self._detectors:
            try:
                lang = langdetect.detect(text)
                confidence = langdetect.detect_langs(text)[0].prob
                results.append(LanguageDetectionResult(
                    normalize_language_code(lang), 
                    confidence, 
                    'langdetect'
                ))
            except Exception as e:
                print(f"langdetect failed: {e}")
        
        # Try pycld2
        if 'pycld2' in self._detectors:
            try:
                is_reliable, text_bytes_found, details = pycld2.detect(text)
                if is_reliable and details:
                    lang_code = details[0][1]
                    confidence = details[0][2] / 100.0  # Convert to 0-1 scale
                    results.append(LanguageDetectionResult(
                        normalize_language_code(lang_code),
                        confidence,
                        'pycld2'
                    ))
            except Exception as e:
                print(f"pycld2 failed: {e}")
        
        # Try Lingua
        if 'lingua' in self._detectors:
            try:
                confidence_values = self._lingua_detector.compute_language_confidence_values(text)
                if confidence_values:
                    best_match = confidence_values[0]
                    lang_code = str(best_match.language).split('.')[-1].lower()
                    results.append(LanguageDetectionResult(
                        normalize_language_code(lang_code),
                        best_match.value,
                        'lingua'
                    ))
            except Exception as e:
                print(f"Lingua failed: {e}")
        
        # Return best result or fallback
        if results:
            # Sort by confidence and return highest
            results.sort(key=lambda x: x.confidence, reverse=True)
            return results[0]
        else:
            return LanguageDetectionResult('unknown', 0.0, 'no_providers')
    
    def detect_from_audio(self, audio_bytes: bytes) -> Optional[LanguageDetectionResult]:
        """Detect language from audio bytes using Whisper.
        
        Note: This requires the STT module to be available.
        For now, returns None as we need to integrate with STT.
        
        Args:
            audio_bytes: Audio data in bytes
            
        Returns:
            LanguageDetectionResult or None if detection fails
        """
        # This would integrate with the STT module's language detection
        # For now, return None to indicate audio detection not implemented
        # In a full implementation, this would call the STT module
        return None
    
    def get_available_providers(self) -> List[str]:
        """Get list of available language detection providers."""
        return list(self._detectors.keys())

In [5]:
# Define LanguagePreferenceManager class
class LanguagePreferenceManager:
    """Manages language preferences per session/user."""
    
    def __init__(self, storage_file: str = "language_preferences.json"):
        """Initialize preference manager.
        
        Args:
            storage_file: File to store preferences (JSON format)
        """
        self.storage_file = storage_file
        self.preferences = self._load_preferences()
    
    def _load_preferences(self) -> Dict:
        """Load preferences from storage file."""
        if os.path.exists(self.storage_file):
            try:
                with open(self.storage_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                print(f"Warning: Could not load preferences: {e}")
                return {}
        return {}
    
    def _save_preferences(self):
        """Save preferences to storage file."""
        try:
            with open(self.storage_file, 'w', encoding='utf-8') as f:
                json.dump(self.preferences, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Warning: Could not save preferences: {e}")
    
    def set_user_preference(self, user_id: str, language_code: str, confidence: float = 1.0):
        """Set language preference for a user.
        
        Args:
            user_id: Unique identifier for the user/session
            language_code: Preferred language code
            confidence: Confidence in this preference (0.0-1.0)
        """
        if user_id not in self.preferences:
            self.preferences[user_id] = {}
        
        self.preferences[user_id][language_code] = {
            'confidence': confidence,
            'timestamp': datetime.now().isoformat(),
            'count': self.preferences[user_id].get(language_code, {}).get('count', 0) + 1
        }
        
        self._save_preferences()
    
    def get_user_preference(self, user_id: str) -> Optional[str]:
        """Get the most preferred language for a user.
        
        Args:
            user_id: Unique identifier for the user/session
            
        Returns:
            Most preferred language code or None if no preferences
        """
        if user_id not in self.preferences:
            return None
        
        user_prefs = self.preferences[user_id]
        if not user_prefs:
            return None
        
        # Find language with highest combined score (confidence * count)
        best_lang = max(user_prefs.items(), 
                       key=lambda x: x[1]['confidence'] * x[1]['count'])
        return best_lang[0]
    
    def update_from_detection(self, user_id: str, detection_result: LanguageDetectionResult):
        """Update user preferences based on language detection result.
        
        Args:
            user_id: Unique identifier for the user/session
            detection_result: Result from language detection
        """
        self.set_user_preference(user_id, detection_result.language_code, 
                               detection_result.confidence)
    
    def get_user_history(self, user_id: str) -> Dict:
        """Get complete language preference history for a user.
        
        Args:
            user_id: Unique identifier for the user/session
            
        Returns:
            Dictionary of language preferences with metadata
        """
        return self.preferences.get(user_id, {}).copy()
    
    def clear_user_preferences(self, user_id: str):
        """Clear all preferences for a user.
        
        Args:
            user_id: Unique identifier for the user/session
        """
        if user_id in self.preferences:
            del self.preferences[user_id]
            self._save_preferences()
    
    def get_all_users(self) -> List[str]:
        """Get list of all users with preferences."""
        return list(self.preferences.keys())

In [6]:
# Test LanguageDetectionResult
try:
    result = LanguageDetectionResult('en', 0.95, 'langdetect')
    print("LanguageDetectionResult created successfully:")
    print(f"Language: {result.language_code}")
    print(f"Confidence: {result.confidence}")
    print(f"Source: {result.source}")
    print(f"As dict: {result.to_dict()}")
except Exception as e:
    print(f"Error creating LanguageDetectionResult: {e}")

LanguageDetectionResult created successfully:
Language: en
Confidence: 0.95
Source: langdetect
As dict: {'language_code': 'en', 'confidence': 0.95, 'source': 'langdetect'}


In [7]:
# Test normalize_language_code function
test_codes = ['en', 'ENGLISH', 'eng', 'Language.ENGLISH', 'spa', 'SPANISH', 'fra', 'zh', 'unknown']
print("Testing language code normalization:")
for code in test_codes:
    normalized = normalize_language_code(code)
    print(f"'{code}' -> '{normalized}'")

Testing language code normalization:
'en' -> 'en'
'ENGLISH' -> 'en'
'eng' -> 'en'
'Language.ENGLISH' -> 'language.english'
'spa' -> 'es'
'SPANISH' -> 'es'
'fra' -> 'fr'
'zh' -> 'zh'
'unknown' -> 'unknown'


In [8]:
# Test LanguageDetector class instantiation
try:
    detector = LanguageDetector()
    print("LanguageDetector initialized successfully.")
    print(f"Available providers: {detector.get_available_providers()}")
except Exception as e:
    print(f"Error initializing LanguageDetector: {e}")

LanguageDetector initialized successfully.
Available providers: ['langdetect', 'pycld2', 'lingua']


In [9]:
# Test text language detection
test_texts = [
    "Hello, how are you today?",
    "Hola, ¿cómo estás hoy?",
    "Bonjour, comment allez-vous?",
    "Hallo, wie geht es dir?",
    "こんにちは、今日はどうですか？",
    "안녕하세요, 오늘 어떻게 지내세요?"
]

print("Testing language detection from text:")
for text in test_texts:
    try:
        result = detector.detect_from_text(text)
        print(f"Text: '{text[:30]}...' -> {result.language_code} ({result.confidence:.2f}) via {result.source}")
    except Exception as e:
        print(f"Error detecting language for '{text[:30]}...': {e}")

Testing language detection from text:
Text: 'Hello, how are you today?...' -> en (0.96) via pycld2
Text: 'Hola, ¿cómo estás hoy?...' -> es (0.94) via lingua
Text: 'Bonjour, comment allez-vous?...' -> fr (1.00) via langdetect
Text: 'Hallo, wie geht es dir?...' -> de (0.95) via pycld2
Text: 'こんにちは、今日はどうですか？...' -> ja (1.00) via lingua
Text: '안녕하세요, 오늘 어떻게 지내세요?...' -> ko (1.00) via lingua


In [10]:
# Test LanguagePreferenceManager
try:
    pref_manager = LanguagePreferenceManager()
    print("LanguagePreferenceManager initialized successfully.")
    
    # Test setting and getting preferences
    user_id = "test_user"
    pref_manager.set_user_preference(user_id, "en", 0.9)
    pref_manager.set_user_preference(user_id, "es", 0.8)
    pref_manager.set_user_preference(user_id, "en", 0.95)  # Update English preference
    
    preference = pref_manager.get_user_preference(user_id)
    print(f"User preference: {preference}")
    
    history = pref_manager.get_user_history(user_id)
    print(f"User history: {history}")
    
except Exception as e:
    print(f"Error testing LanguagePreferenceManager: {e}")

LanguagePreferenceManager initialized successfully.
User preference: en
User history: {'en': {'confidence': 0.95, 'timestamp': '2025-11-21T22:00:51.339325', 'count': 4}, 'es': {'confidence': 0.8, 'timestamp': '2025-11-21T22:00:51.338250', 'count': 2}}


In [11]:
# Test integration with detection results
try:
    # Detect language and update preferences
    test_text = "This is a test sentence in English."
    result = detector.detect_from_text(test_text)
    print(f"Detection result: {result.language_code} ({result.confidence:.2f})")
    
    # Update preferences
    pref_manager.update_from_detection("integration_test", result)
    pref = pref_manager.get_user_preference("integration_test")
    print(f"Updated preference: {pref}")
    
except Exception as e:
    print(f"Error in integration test: {e}")

Detection result: en (1.00)
Updated preference: en
