# Initial Setup and Dependencies

This cell sets up the required environment for a language classification system:

- Imports essential libraries:
  - NLP: NLTK for text processing
  - Data processing: NumPy, Pandas
  - Machine Learning: Scikit-learn components
  - Utilities: tqdm, warnings

The cell also:
- Downloads required NLTK resources: udhr, punkt, stopwords
- Verifies NLTK functionality with a test sentence
- Suppresses warnings for cleaner output

In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import udhr, gutenberg, brown, reuters
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords 
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import string
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
print("Downloading required NLTK resources...")
# 'gutenberg' and 'brown' are read by MultilingualTextCollector, so they have to
# be fetched together with 'udhr'. 'punkt_tab' is the tokenizer data newer NLTK
# releases look up for word_tokenize; older releases do not know that package,
# so a failed download is reported but not treated as fatal.
resources = ['udhr', 'gutenberg', 'brown', 'punkt', 'punkt_tab', 'stopwords']
failed_resources = []
for resource in resources:
    # nltk.download returns False when the package could not be fetched.
    if not nltk.download(resource):
        failed_resources.append(resource)

if failed_resources:
    print(f"Warning: could not download: {', '.join(failed_resources)}")

print("\nVerifying NLTK data...")
test_text = "This is a test sentence."
# word_tokenize raises LookupError when the tokenizer data is missing, so
# reaching the check below with a non-empty token list confirms it is usable.
tokens = word_tokenize(test_text)
if tokens:
    print("NLTK resources verified successfully!")
else:
    print("Warning: tokenizer returned no tokens for the test sentence.")

# Enhanced Language Feature Extractor

Custom scikit-learn transformer that extracts linguistic features from text:

## Key Features
- TF-IDF vectorization with n-gram support
- Text length metrics
- Character-level statistics:
  - Punctuation ratios
  - Case ratios (upper/lowercase)
  - Digit ratios
  - Whitespace analysis
  - Special character distribution

## Methods
- `get_advanced_features()`: Extracts detailed linguistic features
- `fit()`: Trains the feature extractor
- `transform()`: Converts texts into feature matrices

Designed for integration with scikit-learn pipelines and comprehensive text analysis.

In [None]:
class EnhancedLanguageFeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that combines character-level text statistics with TF-IDF.

    Each text becomes one row made of the hand-crafted statistics from
    ``get_advanced_features`` followed by the TF-IDF n-gram weights.

    Parameters:
        n_gram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer`` as
            ``ngram_range``.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Attributes:
        stopwords: Set of NLTK English stop words (requires the NLTK
            'stopwords' resource at construction time).
        tfidf: The underlying ``TfidfVectorizer``; rebuilt on every ``fit``.
        feature_names_: Column names of the output matrix, set by ``fit``.
    """

    def __init__(self, n_gram_range=(1, 3), max_features=1000):
        self.n_gram_range = n_gram_range
        self.max_features = max_features
        self.stopwords = set(stopwords.words('english'))
        self.feature_names_ = None
        self.tfidf = self._build_tfidf()

    def _build_tfidf(self):
        """Create a TF-IDF vectorizer from the current hyper-parameters."""
        # NOTE(review): stop_words='english' drops the most frequent English
        # words, which are a strong signal for an English/non-English task;
        # kept as in the original design, but worth confirming.
        return TfidfVectorizer(
            max_features=self.max_features,
            ngram_range=self.n_gram_range,
            stop_words='english'
        )

    def get_advanced_features(self, text):
        """Extract character-level statistics from a single text.

        Parameters:
            text: The string to analyse. May be empty.

        Returns:
            dict mapping feature name to a number. The set and order of keys
            does not depend on ``text``; for an empty string every ratio and
            the average word length are 0.
        """
        features = {}

        length = len(text)
        # Ratios are relative to the character count. An empty text uses a
        # denominator of 1 so the ratios are 0 instead of a ZeroDivisionError.
        denominator = length or 1
        words = text.split()

        # Text length features
        features['total_length'] = length
        # np.mean of an empty list is NaN, which classifiers reject downstream.
        features['avg_word_length'] = np.mean([len(w) for w in words]) if words else 0.0

        # Punctuation features
        for punct in string.punctuation:
            features[f'punct_ratio_{punct}'] = text.count(punct) / denominator

        # Case features
        features['uppercase_ratio'] = sum(1 for c in text if c.isupper()) / denominator
        features['lowercase_ratio'] = sum(1 for c in text if c.islower()) / denominator

        # Digit features
        features['digit_ratio'] = sum(1 for c in text if c.isdigit()) / denominator

        # Whitespace features
        features['space_ratio'] = text.count(' ') / denominator
        features['newline_ratio'] = text.count('\n') / denominator

        # Language-specific features
        features['english_char_ratio'] = sum(1 for c in text if c in string.ascii_letters) / denominator
        features['special_char_ratio'] = sum(
            1 for c in text if not c.isalnum() and c not in string.whitespace
        ) / denominator

        return features

    def fit(self, X, y=None):
        """Fit the TF-IDF vocabulary and record the output feature names.

        Parameters:
            X: Iterable of text strings.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        # Rebuild the vectorizer so parameters changed after construction
        # (e.g. through set_params) are actually used.
        self.tfidf = self._build_tfidf()
        self.tfidf.fit(X)

        # The statistic names do not depend on the text, so an empty string is
        # enough to enumerate them; this avoids indexing X[0], which fails for
        # empty input and for containers without a positional index 0.
        advanced_names = list(self.get_advanced_features("").keys())
        self.feature_names_ = advanced_names + self.tfidf.get_feature_names_out().tolist()

        return self

    def transform(self, X):
        """Convert texts into a dense feature matrix.

        Parameters:
            X: Iterable of text strings.

        Returns:
            2-D numpy array with one row per text: the statistics from
            ``get_advanced_features`` followed by the TF-IDF columns.
        """
        # Get TF-IDF features
        tfidf_features = self.tfidf.transform(X).toarray()

        # Get advanced features
        rows = [
            list(self.get_advanced_features(text).values())
            for text in tqdm(X, desc="Extracting features")
        ]

        # Explicit 2-D shape so an empty batch still stacks with the TF-IDF block.
        n_advanced = len(self.get_advanced_features(""))
        advanced_features = np.array(rows, dtype=float).reshape(len(rows), n_advanced)

        # Combine features
        return np.hstack([advanced_features, tfidf_features])

# Multilingual Text Collector

Text collection system that gathers and processes multilingual data from various sources:

## Features
- Collects English texts from:
  - Project Gutenberg corpus
  - Brown corpus
- Gathers non-English texts from:
  - Universal Declaration of Human Rights (UDHR) corpus

## Key Functions
- `collect_english_texts()`: Extracts English text samples
- `collect_non_english_texts()`: Gathers non-English samples
- `chunk_text()`: Splits texts into manageable chunks
- `collect_dataset()`: Creates balanced dataset with specified sample limits

Includes progress tracking and error handling for robust data collection.

In [None]:
class MultilingualTextCollector:
    """Collect and preprocess multilingual text data from NLTK corpora.

    English chunks come from the Gutenberg and Brown corpora; non-English
    chunks come from the Latin-1 files of the UDHR corpus.

    Parameters:
        min_text_length: Minimum chunk size in characters (each word counts
            as its length plus one separator).
    """

    def __init__(self, min_text_length=1000):
        self.min_text_length = min_text_length

    def _collect_from_corpus(self, corpus, fileids, item_label, error_label):
        """Chunk every file of ``corpus`` listed in ``fileids``.

        Parameters:
            corpus: Corpus reader exposing ``words(fileid)``.
            fileids: Iterable of file ids to read.
            item_label: Heading printed before each sample ("File", "Language").
            error_label: Noun used in the error message for a failing file.

        Returns:
            list of text chunks from all files that could be processed.
        """
        collected = []
        for fileid in fileids:
            # Best effort per file: a file that cannot be read or decoded is
            # reported and skipped so it does not abort the whole collection.
            try:
                text = ' '.join(corpus.words(fileid))
                chunks = self.chunk_text(text)
                if chunks:
                    collected.extend(chunks)
                    print(f"\n{item_label}: {fileid}")
                    print("Sample text:")
                    print(chunks[0][:200] + "...\n")
            except Exception as e:
                print(f"Error processing {error_label} {fileid}: {str(e)}")
        return collected

    def collect_english_texts(self):
        """Collect English text chunks from the Gutenberg and Brown corpora.

        Returns:
            list of chunks, Gutenberg chunks first, then Brown chunks.
        """
        english_texts = []

        print("\nCollecting English Texts:")
        print("-----------------------")

        # Collect from Gutenberg
        print("\nSamples from Gutenberg:")
        english_texts.extend(
            self._collect_from_corpus(gutenberg, gutenberg.fileids(), "File", "Gutenberg file")
        )

        # Collect from Brown corpus
        print("\nSamples from Brown corpus:")
        english_texts.extend(
            self._collect_from_corpus(brown, brown.fileids(), "File", "Brown file")
        )

        return english_texts

    def collect_non_english_texts(self):
        """Collect non-English text chunks from the UDHR corpus.

        Only file ids containing 'Latin1' are used, excluding 'English-Latin1'.

        Returns:
            list of chunks across all selected UDHR files.
        """
        print("\nCollecting Non-English Texts:")
        print("---------------------------")

        available_languages = [fid for fid in udhr.fileids()
                               if fid != 'English-Latin1' and 'Latin1' in fid]

        return self._collect_from_corpus(
            udhr,
            tqdm(available_languages, desc="Collecting non-English texts"),
            "Language",
            "language",
        )

    def chunk_text(self, text):
        """Split text into whitespace-joined chunks of at least ``min_text_length``.

        Words are accumulated until their combined length (each word plus one
        separator) reaches ``min_text_length``. A trailing remainder is kept
        only if it reaches half of ``min_text_length``.

        Parameters:
            text: The text to split.

        Returns:
            list of chunk strings; empty for empty input.
        """
        if not text:
            return []

        chunks = []
        current_chunk = []
        current_length = 0

        for word in text.split():
            current_chunk.append(word)
            current_length += len(word) + 1

            if current_length >= self.min_text_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

        if current_chunk and current_length >= self.min_text_length / 2:
            chunks.append(' '.join(current_chunk))

        return chunks

    @staticmethod
    def _sample(texts, count, rng):
        """Return ``count`` items of ``texts`` chosen without replacement."""
        chosen = rng.permutation(len(texts))[:count]
        return [texts[i] for i in chosen]

    def collect_dataset(self, max_samples_per_class=1000, random_state=42):
        """Collect both classes and return a balanced, labelled dataset.

        Parameters:
            max_samples_per_class: Upper bound on the samples kept per class.
            random_state: Seed for the per-class random sample; ``None`` gives
                a different sample on every call.

        Returns:
            ``(texts, labels)``: English samples first, then non-English
            samples, with labels 'english' / 'non-english' in matching order.
        """
        print("Collecting English texts...")
        english_texts = self.collect_english_texts()

        print(f"\nTotal English texts collected: {len(english_texts)}")
        if english_texts:
            print(f"Average length of English texts: {np.mean([len(text) for text in english_texts]):.0f} characters")

        print("\nCollecting non-English texts...")
        non_english_texts = self.collect_non_english_texts()

        print(f"\nTotal non-English texts collected: {len(non_english_texts)}")
        if non_english_texts:
            print(f"Average length of non-English texts: {np.mean([len(text) for text in non_english_texts]):.0f} characters")

        # Balance the dataset. The chunks are ordered by source file, so taking
        # the first N would draw each class from the first few files only;
        # a seeded random sample keeps every source represented.
        min_samples = min(len(english_texts), len(non_english_texts), max_samples_per_class)
        if min_samples == 0:
            print("Warning: at least one class has no samples; the dataset is empty.")
        rng = np.random.RandomState(random_state)
        english_texts = self._sample(english_texts, min_samples, rng)
        non_english_texts = self._sample(non_english_texts, min_samples, rng)

        print("\nFinal Dataset Statistics:")
        print(f"Number of English samples: {len(english_texts)}")
        print(f"Number of non-English samples: {len(non_english_texts)}")

        # Create labels
        texts = english_texts + non_english_texts
        labels = ['english'] * len(english_texts) + ['non-english'] * len(non_english_texts)

        return texts, labels

# Complete Language Classification System

Comprehensive system for language classification with multiple models and evaluation:

## Components
1. **Classifiers**
   - Random Forest
   - SVM
   - Neural Network (MLP)

2. **Pipeline Features**
   - Text preparation
   - Feature extraction
   - Model evaluation
   - Cross-validation

## Main Functions
- `train_and_evaluate()`: Trains models and selects best performer
- `predict()`: Makes predictions with confidence scores
- `main()`: Demonstrates system with diverse test cases

## Testing Suite
Includes various test cases:
- Short texts
- Technical language
- Mixed language indicators
- Special characters and numbers

In [None]:
class LanguageClassificationSystem:
    """Train, compare and apply English / non-English text classifiers.

    Three classifiers (random forest, SVM, MLP) are each wrapped in a pipeline
    behind an ``EnhancedLanguageFeatureExtractor``. ``train_and_evaluate``
    fits them and keeps the best one in ``best_pipeline``.

    Parameters:
        min_text_length: Chunk size used for training data, and the length
            every input text is padded or truncated to before prediction.
        max_samples_per_class: Cap on training samples per class.
    """

    def __init__(self, min_text_length=100, max_samples_per_class=1000):
        self.min_text_length = min_text_length
        self.max_samples_per_class = max_samples_per_class

        # Initialize classifiers with better parameters
        self.classifiers = {
            'random_forest': RandomForestClassifier(
                n_estimators=200,
                max_depth=20,
                min_samples_leaf=5,
                class_weight='balanced',
                random_state=42
            ),
            'svm': SVC(
                kernel='rbf',
                probability=True,
                class_weight='balanced',
                random_state=42
            ),
            'neural_net': MLPClassifier(
                hidden_layer_sizes=(200, 100, 50),
                max_iter=500,
                early_stopping=True,
                validation_fraction=0.2,
                random_state=42
            )
        }

        self.feature_extractor = EnhancedLanguageFeatureExtractor(
            max_features=2000
        )
        self.pipelines = {}
        self.best_pipeline = None

    def prepare_text(self, text):
        """Pad or truncate ``text`` to exactly ``min_text_length`` characters.

        Short texts are repeated (separated by a space) until long enough,
        then the result is cut to ``min_text_length``.

        Parameters:
            text: The raw input text.

        Returns:
            The prepared string.

        Raises:
            ValueError: If ``text`` is empty or contains only whitespace;
                there is nothing to repeat or classify in that case.
        """
        if not text or not text.strip():
            raise ValueError("Cannot classify empty or whitespace-only text.")
        if len(text) < self.min_text_length:
            repetitions = (self.min_text_length // len(text)) + 1
            text = (text + " ") * repetitions
        return text[:self.min_text_length]

    def build_pipelines(self):
        """Create one feature-extraction + classifier pipeline per classifier."""
        # NOTE: every pipeline shares the same feature-extractor instance, so
        # fitting one pipeline refits the extractor seen by the others. That
        # is harmless here only because all pipelines are fit on the same
        # training data.
        for name, classifier in self.classifiers.items():
            self.pipelines[name] = Pipeline([
                ('features', self.feature_extractor),
                ('classifier', classifier)
            ])

    def train_and_evaluate(self):
        """Collect data, train every pipeline and keep the best one.

        Each pipeline is cross-validated (5 folds) on the training split,
        refit on the whole training split and scored on the held-out test
        split. Progress and reports are printed.

        Returns:
            The selected pipeline (also stored in ``best_pipeline``).
        """
        # Prepare data
        print("Preparing dataset...")
        collector = MultilingualTextCollector(min_text_length=self.min_text_length)
        texts, labels = collector.collect_dataset(max_samples_per_class=self.max_samples_per_class)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, stratify=labels, random_state=42
        )

        # Build and train pipelines
        print("\nTraining models...")
        self.build_pipelines()

        # Train and evaluate each model
        results = {}
        for name, pipeline in self.pipelines.items():
            print(f"\nEvaluating {name}...")

            # Cross-validation
            cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
            print(f"Cross-validation scores: {cv_scores}")
            print(f"Average CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

            # Train on full training set
            pipeline.fit(X_train, y_train)

            # Evaluate on test set. Predict once and derive the accuracy from
            # those predictions rather than running feature extraction twice.
            y_pred = pipeline.predict(X_test)
            test_score = float(np.mean(np.asarray(y_pred) == np.asarray(y_test)))
            print(f"Test score: {test_score:.4f}")

            # Detailed classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))

            # Save results
            results[name] = {
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'test_score': test_score,
                'pipeline': pipeline
            }

        # Select best model based on both CV and test performance
        # NOTE(review): using the test score for selection makes the reported
        # test score of the winner optimistic; consider selecting on cv_mean
        # alone if an unbiased estimate is needed.
        best_model = max(results.items(),
                         key=lambda x: (x[1]['cv_mean'] + x[1]['test_score']) / 2)
        self.best_pipeline = best_model[1]['pipeline']
        print(f"\nBest model: {best_model[0]}")
        print(f"CV Score: {best_model[1]['cv_mean']:.4f}")
        print(f"Test Score: {best_model[1]['test_score']:.4f}")

        return self.best_pipeline

    def predict(self, text, get_probabilities=False):
        """Classify ``text`` with the best trained pipeline.

        Parameters:
            text: The raw text to classify.
            get_probabilities: If true, return class probabilities instead of
                the predicted label.

        Returns:
            The predicted label, or an array of per-class probabilities in
            the order of the pipeline's ``classes_``. For texts shorter than
            ``min_text_length`` the probabilities are pulled toward uniform.

        Raises:
            ValueError: If no model has been trained yet, or ``text`` is
                empty or whitespace-only.
        """
        if self.best_pipeline is None:
            raise ValueError("Model not trained. Call train_and_evaluate first.")

        # Prepare text
        prepared_text = self.prepare_text(text)

        if get_probabilities:
            probs = self.best_pipeline.predict_proba([prepared_text])[0]
            # Add confidence adjustment for very short texts
            if len(text) < self.min_text_length:
                # Reduce confidence for short texts: add one pseudo-count per
                # class and renormalise so the result still sums to 1
                # (equals (probs + 1) / 3 for the two-class case).
                probs = (probs + 1) / (len(probs) + 1)
            return probs
        return self.best_pipeline.predict([prepared_text])[0]

def main():
    """Train the classification system and run it on diverse sample texts.

    Prints the prediction and per-class confidence for each sample. Any
    exception raised during training or prediction is reported on stdout
    rather than propagated.

    Returns:
        The ``LanguageClassificationSystem`` instance, so the trained
        pipeline stays reachable (e.g. for saving it). Its ``best_pipeline``
        is ``None`` if training failed.
    """
    print("Initializing Language Classification System...")
    system = LanguageClassificationSystem(
        min_text_length=100,
        max_samples_per_class=2000
    )
    
    try:
        system.train_and_evaluate()
        
        # Test with various lengths and styles
        test_texts = [
            # Very short
            "This is English.",
            
            # Short with numbers and punctuation
            "Testing 123! Is this working properly?",
            
            # Medium with variety
            """This text includes numbers (123), punctuation marks (!?.,), 
            and some UPPERCASE words. It should test various features.""",
            
            # Non-English looking text
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            
            # Mixed language indicators
            "English with émbelishments and école words.",
            
            # Technical English
            "The API endpoint returns JSON data with UTF-8 encoding.",
        ]
        
        # predict_proba columns follow the fitted estimator's class order, so
        # take the labels from the pipeline instead of hard-coding them.
        class_labels = system.best_pipeline.classes_
        
        print("\nTesting with diverse examples:")
        for i, test_text in enumerate(test_texts, 1):
            # Both calls are kept: for some estimators the predicted label is
            # not guaranteed to be the arg-max of predict_proba.
            prediction = system.predict(test_text)
            probabilities = system.predict(test_text, get_probabilities=True)
            
            print(f"\nTest Example {i}:")
            print(f"Length: {len(test_text)} characters")
            print("Text:", test_text)
            print(f"Classification: {prediction}")
            print("Confidence scores:")
            for label, prob in zip(class_labels, probabilities):
                print(f"  {label}: {prob:.4f}")
                
            # Add warning for very short texts
            if len(test_text) < system.min_text_length:
                print("Warning: Text is shorter than recommended length. Results may be less reliable.")
    
    except Exception as e:
        # Top-level boundary of the demo: report the failure instead of
        # raising out of the entry point.
        print(f"Error during execution: {str(e)}")
    
    return system

if __name__ == "__main__":
    main()

# Model Persistence

Optional code for model serialization:
- Saves trained model using pickle
- Provides loading functionality for future use
- Is commented out by default; uncomment only the parts you need

Note: Uncomment and use as needed for model deployment.

In [None]:
# import pickle
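# NOTE: `system` is the LanguageClassificationSystem instance created inside main(),
# not an importable module; make it available in this scope before running the code below.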

# # Save model
# with open('language_classifier.pkl', 'wb') as f:
#     pickle.dump(system.best_pipeline, f)

# # Load model
# with open('language_classifier.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)