# Arabic Text Classification - Active Learning Pipelines

This notebook demonstrates two active learning approaches for Arabic text classification:
1. **Uncertainty Sampling Pipeline**: Uses SBERT embeddings with a single classifier
2. **Committee-Based Sampling Pipeline**: Uses SBERT embeddings with multiple classifiers

## Setup and Imports

In [None]:
import copy
import os
import sys
from typing import Dict, Any, List

import numpy as np
import pandas as pd
import torch

# Resolve the project root so the local `utils` and `pipelines` packages import cleanly.
# Step up one level only when the kernel was started inside a directory literally
# named "notebooks". A substring test on the whole path would also fire for an
# unrelated ancestor (e.g. ~/notebooks_archive/project) and pick the wrong directory.
_cwd = os.getcwd()
project_root = os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd
# Re-running this cell must not keep appending the same entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.data_loader import DataLoader
from utils.embeddings import SBERTEmbedder
from pipelines.uncertainty_sampling import UncertaintySamplingPipeline
from pipelines.committee_sampling import CommitteeSamplingPipeline

# Report whether PyTorch can see a CUDA device, and which root is used for local imports.
device_label = 'CUDA' if torch.cuda.is_available() else 'CPU'
print(f"Using device: {device_label}")
print(f"Project root: {project_root}")

## Create Sample Data

Let's create sample Arabic text data for demonstration purposes:

In [None]:
def create_sample_data(
    output_path: str = "data/sample_arabic_data.csv",
    n_samples: int = 500,
    random_state=None,
) -> str:
    """Write a synthetic, balanced Arabic text-classification CSV and return its path.

    Rows alternate between a security-related template (label 1, even row index)
    and a non-security template (label 0, odd row index). Each template is drawn
    at random and suffixed with a running sample number, so every text is unique.

    Args:
        output_path: Destination CSV file. Missing parent directories are created;
            a bare file name is written to the current working directory.
        n_samples: Number of rows to generate. Zero (or a negative value) yields
            a header-only CSV.
        random_state: Optional seed. When given, templates are drawn from a
            private ``np.random.default_rng(random_state)`` so the generated file
            is reproducible. When ``None`` (the default) the global ``np.random``
            state is used.

    Returns:
        The ``output_path`` that was written.
    """
    # os.path.dirname() returns "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there actually is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    security_texts = [
        "تسريب البيانات الشخصية للمستخدمين",
        "هجمات القرصنة الإلكترونية على البنوك",
        "اختراق أنظمة الحماية الرقمية",
        "برمجيات خبيثة تهدد الشبكات",
        "فيروسات الحاسوب والبرمجيات الضارة",
        "سرقة كلمات المرور والحسابات",
        "أمان المعلومات والحماية الرقمية",
        "تشفير البيانات الحساسة",
        "حماية الخصوصية على الإنترنت",
        "أمان الشبكات والخوادم"
    ]

    non_security_texts = [
        "الطقس اليوم مشمس وجميل",
        "وصفة الطبخ التقليدية العربية",
        "أخبار الرياضة والمباريات",
        "السياحة والسفر إلى البلدان العربية",
        "التعليم والثقافة في المجتمع",
        "الفنون والموسيقى العربية",
        "الأدب والشعر الكلاسيكي",
        "التجارة والاقتصاد المحلي",
        "الصحة والطب التقليدي",
        "التكنولوجيا والابتكار"
    ]

    # Both the np.random module and a Generator expose .choice(), so one name serves
    # the seeded and the unseeded path.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    records = []
    for i in range(n_samples):
        if i % 2 == 0:
            text = rng.choice(security_texts)
            label = 1  # Security-related
        else:
            text = rng.choice(non_security_texts)
            label = 0  # Non-security

        records.append({
            'text': f"{text} - عينة رقم {i+1}",
            'label': label
        })

    # Explicit columns keep the schema (and the 'label' lookup below) valid even
    # when no rows were generated.
    df = pd.DataFrame(records, columns=['text', 'label'])
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Sample data created at: {output_path}")
    print(f"Data shape: {df.shape}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    return output_path

# Generate the demo CSV under <project_root>/data and keep the path it was written to.
data_path = create_sample_data(
    output_path=f"{project_root}/data/sample_arabic_data.csv",
)

## Load Data and Initialize Components

In [None]:
# Shared components: a loader for the CSV at `data_path` and the sentence embedder.
data_loader = DataLoader(data_path)
embedder = SBERTEmbedder()

# Initial split requested with 50 labeled examples; everything returned as
# "unlabeled" (texts plus their held-back labels) forms the pool to sample from.
(
    labeled_texts,
    labeled_labels,
    unlabeled_texts,
    unlabeled_labels,
) = data_loader.get_initial_split(initial_labeled_size=50, random_state=42)

print(f"Initial labeled pool size: {len(labeled_texts)}")
print(f"Initial unlabeled pool size: {len(unlabeled_texts)}")
label_distribution = np.unique(labeled_labels, return_counts=True)
print(f"Label distribution in labeled set: {label_distribution}")

## Pipeline 1: Uncertainty Sampling

This pipeline uses a single classifier with uncertainty sampling strategies.

In [None]:
# Initialize Uncertainty Sampling Pipeline
uncertainty_pipeline = UncertaintySamplingPipeline(
    embedder=embedder,
    classifier_type='logistic',  # Can be 'logistic' or 'svm'
    random_state=42
)

# Initialize the pools with shallow copies of the split. The same split objects are
# handed to the committee pipeline further down, so this pipeline must not be able
# to grow or shrink them in place while it moves samples between its pools.
# NOTE(review): whether initialize_pools stores or copies its arguments is not
# visible from this notebook; copying is safe either way.
uncertainty_pipeline.initialize_pools(
    copy.copy(labeled_texts),
    copy.copy(labeled_labels),
    copy.copy(unlabeled_texts),
)

print("Uncertainty Sampling Pipeline initialized successfully!")

In [None]:
# Run Active Learning Iterations - Uncertainty Sampling
print("Running Uncertainty Sampling Pipeline...\n")

# Keywords used by the simulated annotator. Every security template emitted by
# create_sample_data contains at least one of them and no non-security template
# does, so simulated labels agree with the generated ground truth. The last four
# entries cover the hacking-attack, malware, password-theft and encryption
# templates, which a shorter list would silently label as 0.
SECURITY_KEYWORDS = (
    'أمان', 'حماية', 'تسريب', 'اختراق', 'فيروس',
    'القرصنة', 'خبيثة', 'سرقة', 'تشفير',
)

uncertainty_results = []

for iteration in range(2):  # Run 2 iterations
    print(f"--- Iteration {iteration + 1} ---")

    # Run one active learning iteration
    results = uncertainty_pipeline.run_active_learning_iteration(
        n_samples=10,
        strategy='least_confident'  # Options: 'least_confident', 'margin', 'entropy'
    )

    print(f"Pool sizes: Labeled={results['pool_sizes']['labeled']}, Unlabeled={results['pool_sizes']['unlabeled']}")
    print(f"Selected {len(results['selected_texts'])} samples with {results['strategy']} strategy")

    # Show top 3 most uncertain samples.
    # NOTE(review): this prints the *last* three selected samples, which are the
    # most uncertain only if the pipeline returns selections in ascending
    # uncertainty order — confirm against the pipeline implementation.
    print("\nTop 3 most uncertain samples:")
    for i, (text, uncertainty) in enumerate(zip(results['selected_texts'][-3:], results['uncertainties'][-3:])):
        print(f"{i+1}. Uncertainty: {uncertainty:.4f}")
        print(f"   Text: {text[:100]}...\n")

    # Simulate labeling process (in real scenario, human would label these):
    # a text is labeled 1 when it contains any security keyword, else 0.
    simulated_labels = [
        1 if any(keyword in text for keyword in SECURITY_KEYWORDS) else 0
        for text in results['selected_texts']
    ]

    # Add newly labeled samples to the pipeline
    uncertainty_pipeline.add_labeled_samples(
        results['selected_texts'],
        simulated_labels,
        results['selected_indices']
    )

    uncertainty_results.append(results)
    print("\n" + "="*50)

print("Uncertainty Sampling Pipeline completed!")

## Pipeline 2: Committee-Based Sampling

This pipeline uses multiple classifiers and measures their disagreement.

In [None]:
# Initialize Committee Sampling Pipeline
committee_pipeline = CommitteeSamplingPipeline(
    embedder=embedder,
    use_gpu_lightgbm=True,  # Use GPU-accelerated LightGBM if available
    random_state=42
)

# Initialize the pools using fresh shallow copies, so this pipeline never shares
# mutable pool containers with anything else that was given the same split.
# NOTE(review): whether initialize_pools stores or copies its arguments is not
# visible from this notebook; copying is safe either way.
committee_pipeline.initialize_pools(
    copy.copy(labeled_texts),
    copy.copy(labeled_labels),
    copy.copy(unlabeled_texts),
)

print("Committee-Based Sampling Pipeline initialized successfully!")

In [None]:
# Run Active Learning Iterations - Committee Sampling
print("Running Committee-Based Sampling Pipeline...\n")

# Keywords used by the simulated annotator (defined here so this cell is
# self-contained). Every security template emitted by create_sample_data contains
# at least one of them and no non-security template does, so simulated labels
# agree with the generated ground truth. The last four entries cover the
# hacking-attack, malware, password-theft and encryption templates, which a
# shorter list would silently label as 0.
SECURITY_KEYWORDS = (
    'أمان', 'حماية', 'تسريب', 'اختراق', 'فيروس',
    'القرصنة', 'خبيثة', 'سرقة', 'تشفير',
)

committee_results = []

for iteration in range(2):  # Run 2 iterations
    print(f"--- Iteration {iteration + 1} ---")

    # Run one active learning iteration
    results = committee_pipeline.run_active_learning_iteration(
        n_samples=10,
        strategy='vote_entropy'  # Options: 'vote_entropy', 'disagreement'
    )

    print(f"Pool sizes: Labeled={results['pool_sizes']['labeled']}, Unlabeled={results['pool_sizes']['unlabeled']}")

    # Show committee performance
    print("\nCommittee Member Accuracies:")
    for classifier_name, accuracy in results['committee_accuracies'].items():
        print(f"  {classifier_name}: {accuracy:.4f}")

    print(f"\nSelected {len(results['selected_texts'])} samples with {results['strategy']} strategy")

    # Show top 3 most disagreed samples.
    # NOTE(review): this prints the *last* three selected samples, which are the
    # most disagreed-upon only if the pipeline returns selections in ascending
    # score order — confirm against the pipeline implementation.
    print("\nTop 3 most disagreed samples:")
    for i, (text, score) in enumerate(zip(results['selected_texts'][-3:], results['scores'][-3:])):
        print(f"{i+1}. {results['strategy'].title()}: {score:.4f}")
        print(f"   Text: {text[:100]}...\n")

    # Simulate labeling process: a text is labeled 1 when it contains any
    # security keyword, else 0.
    simulated_labels = [
        1 if any(keyword in text for keyword in SECURITY_KEYWORDS) else 0
        for text in results['selected_texts']
    ]

    # Add newly labeled samples to the pipeline
    committee_pipeline.add_labeled_samples(
        results['selected_texts'],
        simulated_labels,
        results['selected_indices']
    )

    committee_results.append(results)
    print("\n" + "="*50)

print("Committee-Based Sampling Pipeline completed!")

## Results Summary and Comparison

In [None]:
# Summary of both pipelines
print("ACTIVE LEARNING PIPELINES SUMMARY")
print("=" * 50)

# Query each pipeline's pool sizes once, and report the strategy recorded by the
# last completed iteration instead of a hard-coded name, so the summary cannot
# drift from what the loops above actually ran.
NO_ITERATIONS = "n/a (no iterations run)"
uncertainty_sizes = uncertainty_pipeline.get_pool_sizes()
uncertainty_strategy = uncertainty_results[-1]['strategy'] if uncertainty_results else NO_ITERATIONS
committee_sizes = committee_pipeline.get_pool_sizes()
committee_strategy = committee_results[-1]['strategy'] if committee_results else NO_ITERATIONS

print("\n1. UNCERTAINTY SAMPLING PIPELINE:")
print(f"   - Final labeled pool size: {uncertainty_sizes['labeled']}")
print(f"   - Final unlabeled pool size: {uncertainty_sizes['unlabeled']}")
print(f"   - Classifier type: {uncertainty_pipeline.classifier_type}")
print(f"   - Sampling strategy: {uncertainty_strategy}")

print("\n2. COMMITTEE-BASED SAMPLING PIPELINE:")
print(f"   - Final labeled pool size: {committee_sizes['labeled']}")
print(f"   - Final unlabeled pool size: {committee_sizes['unlabeled']}")
print(f"   - Committee size: {len(committee_pipeline.committee)} classifiers")
print(f"   - Committee members: {[type(c).__name__ for c in committee_pipeline.committee]}")
print(f"   - Sampling strategy: {committee_strategy}")

print("\n" + "=" * 50)
print("Both pipelines completed successfully!")
print("You can now use your own Arabic CSV data by updating the data_path.")

## Using Your Own Data

To use your own Arabic text data:

1. **Prepare your CSV file** with columns:
   - `text`: Arabic text content
   - `label`: Binary labels (0 for non-security, 1 for security)

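2. **Update the data path** (and skip the *Create Sample Data* cell, which would otherwise overwrite `data_path` with the generated sample file):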
   ```python
   data_path = "path/to/your/arabic_data.csv"
   data_loader = DataLoader(data_path)
   ```

3. **Customize the pipelines**:
   - Adjust `initial_labeled_size` based on your budget
   - Choose different sampling strategies
   - Modify the number of iterations
   - Change the number of samples per iteration

4. **Replace simulated labeling** with real human annotation:
   ```python
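   # Instead of simulated_labels, get real labels from annotators.
   # `get_human_annotations` is a placeholder for your own annotation step, and
   # `pipeline` stands for either `uncertainty_pipeline` or `committee_pipeline`.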
   real_labels = get_human_annotations(results['selected_texts'])
   pipeline.add_labeled_samples(results['selected_texts'], real_labels, results['selected_indices'])
   ```