# Notebook 2: Pseudo-Labeling and Final Packaging

This script loads the engineered features, uses LLMs to generate labels,
filters for high-quality data, and packages the final train/val/test splits
and metadata file for model training.

## 1. SETUP AND INSTALLATION

In [None]:
print("🚀 Starting enhanced setup for Notebook 2...")

# Install required packages
!pip install -q pandas scikit-learn transformers torch huggingface_hub accelerate sentencepiece statsmodels bitsandbytes psutil

# Standard-library imports
import concurrent.futures
import gc
import json
import logging
import math
import multiprocessing as mp
import os
import pickle
import re
import threading
import time
import warnings
from collections import defaultdict, Counter
from functools import partial
from pathlib import Path
from queue import Queue, Empty

# Core third-party imports
import numpy as np
import pandas as pd
import psutil
from tqdm.auto import tqdm

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from statsmodels.stats.inter_rater import fleiss_kappa

# Transformers imports
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

# Colab setup
# Mount Google Drive under /content/drive; every path derived from BASE_DIR in
# the configuration cell lives below this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Configure logging
# Keep cell output readable: transformers only reports errors, and *all*
# Python warnings are suppressed for the rest of the session (this also hides
# deprecation and numerical warnings from pandas/numpy/torch).
transformers.logging.set_verbosity_error()
logging.getLogger("transformers.generation_utils").setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

# HuggingFace login
# NOTE(review): presumably required because some checkpoints in MODELS_TO_TRY
# are gated on the Hub -- confirm.
print("\n🔑 Please log in to Hugging Face...")
login(new_session=False)

print("\n✅ Setup complete!")
print("-" * 80)

🚀 Starting enhanced setup for Notebook 2...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

🔑 Please log in to Hugging Face...

✅ Setup complete!
--------------------------------------------------------------------------------


## 2. CONFIGURATION

In [79]:
print("⚙️ Loading enhanced configuration for parallel processing...")

# --- File Paths ---
# Everything is read from / written to one folder on the mounted Drive.
# ENGINEERED_FEATURES_FILE and ACTIVE_LEARNING_SAMPLES_FILE are inputs read by
# the loading cells; the remaining paths are presumably outputs of the
# packaging step later in the notebook (not visible in this part of the file).
BASE_DIR = "/content/drive/MyDrive/Tiktok_Hackaton/preprocessed_data"
ENGINEERED_FEATURES_FILE = os.path.join(BASE_DIR, "engineered_features.pkl")
PROCESSED_TRAIN_FILE = os.path.join(BASE_DIR, "processed_train.pkl")
PROCESSED_VAL_FILE = os.path.join(BASE_DIR, "processed_val.pkl")
PROCESSED_TEST_FILE = os.path.join(BASE_DIR, "processed_test.pkl")
METADATA_FILE = os.path.join(BASE_DIR, "feature_metadata.json")
ENCODERS_FILE = os.path.join(BASE_DIR, "encoders.pkl")
ACTIVE_LEARNING_SAMPLES_FILE = os.path.join(BASE_DIR, "active_learning_sample_user_ids.pkl")

# --- Model Configuration ---
# Hugging Face Hub ids of the LLM "raters".  EnhancedParallelClassifier.load_model
# applies 4-bit quantization to any name containing "7b" or "8b" when CUDA is
# available; other models are loaded unquantized.
MODELS_TO_TRY = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-2-2b-it",
    "meta-llama/Meta-Llama-3.1-8B-Instruct"
]

# --- Generation Parameters (optimized for consistency) ---
# Unpacked verbatim into `model.generate(...)` by the classifier.
# NOTE(review): `temperature` and `top_p` are sampling parameters; with
# `do_sample=False` they presumably have no effect -- confirm against the
# transformers generation documentation.
# NOTE(review): `use_cache=False` likely trades generation speed for memory
# (no key/value cache) -- confirm this is the intended trade-off.
GENERATION_CONFIG = {
    "max_new_tokens": 15,
    "min_new_tokens": 1,
    "temperature": 0.05,  # Very low for consistency
    "do_sample": False,   # Deterministic
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "pad_token_id": None,  # Set dynamically
    "use_cache": False     # Memory optimization
}

# --- Parallel Processing Configuration ---
# Read by EnhancedParallelClassifier: "enable_parallel", "text_batch_size",
# "processing_threads", "memory_cleanup_interval" and "queue_timeout" are used
# in its classify methods.  "preprocessing_workers" and "max_retries" are not
# referenced by the code visible in this part of the notebook.
PARALLEL_CONFIG = {
    "enable_parallel": True,
    "text_batch_size": 6,          # Smaller batches for stability
    "processing_threads": 3,        # Concurrent batch processing
    "preprocessing_workers": 4,     # Text preprocessing threads
    "memory_cleanup_interval": 8,   # Cleanup every N batches
    "max_retries": 3,
    "queue_timeout": 30
}

# --- Quality Thresholds ---
# "min_confidence_threshold" is the high-confidence cut-off reported by
# analyze_consensus_quality; the kappa/error-rate thresholds are presumably
# consumed by the quality gate later in the notebook.
QUALITY_CONFIG = {
    "min_confidence_threshold": 0.6,
    "target_fleiss_kappa": 0.75,
    "min_acceptable_kappa": 0.5,
    "max_error_rate": 0.25
}

# --- Data Configuration ---
# The three split ratios sum to 1.0.
DATA_CONFIG = {
    "train_ratio": 0.8,
    "val_ratio": 0.10,
    "test_ratio": 0.10,
    "sample_large_datasets": True,
    "max_sample_size": 8000
}

# --- Feature Lists ---
# Column names expected in the engineered-features frame.  The loading cell
# creates any column listed here that is missing and fills it with 0.
CATEGORICAL_FEATURES = ['review_day_of_week', 'review_hour_of_day', 'dominant_topic']
CONTINUOUS_FEATURES = [
    'sentiment_score', 'review_length_words', 'num_urls', 'num_emails',
    'num_phone_numbers', 'num_mentions', 'num_hashtags', 'num_irrelevant_words',
    'num_exclamations', 'num_questions', 'num_ellipsis', 'num_total_punctuation',
    'all_caps_word_count', 'elongated_word_count', 'caps_ratio', 'unique_word_ratio',
    'digit_ratio', 'punctuation_ratio', 'num_sentiment_words_pos', 'num_sentiment_words_neg',
    'num_emojis', 'sentiment_polarity_lex', 'pos_words_per_100w', 'neg_words_per_100w',
    'emojis_per_100w', 'user_review_count', 'num_same_day_reviews', 'avg_review_rating',
    'user_max_7d_burst', 'user_review_span_days', 'user_burst_ratio', 'user_place_diversity',
    'user_avg_length', 'user_text_share', 'user_response_rate', 'n_photos',
    'photo_url_length_mean', 'photo_google_cdn_flag', 'photo_url_length_max',
    'photo_url_length_std', 'photo_url_param_count_mean', 'photo_host_nunique',
    'photo_has_multi_hosts', 'photo_has_thumbnail_param', 'photo_url_is_duplicate_any',
    'has_photo_and_no_text', 'has_photo_and_short_text', 'has_photo_and_low_rating',
    'has_photo_and_high_rating', 'n_photos_user_z', 'user_photo_host_diversity',
    'user_avg_photo_url_len', 'user_median_n_photos', 'place_photo_host_diversity',
    'place_median_n_photos'
]

# The only strings accepted as a model prediction; anything else (including the
# "error"-style sentinels produced by the classifier) counts as invalid in the
# agreement and consensus functions.  These must match the labels spelled out
# in `policy_definitions`.
VALID_LABELS = ['advertisement', 'irrelevant', 'rant_without_visit', 'relevant_and_quality']
# Human-readable names; presumably index-aligned with VALID_LABELS -- confirm at the point of use.
CLASS_NAMES = ["Spam/Advertisement", "Irrelevant Content", "Rant/Complaint (without visit)", "Relevant and Quality"]

print("✅ Enhanced configuration loaded.")
print(f"Parallel processing: {'Enabled' if PARALLEL_CONFIG['enable_parallel'] else 'Disabled'}")
print(f"Target Fleiss' Kappa: {QUALITY_CONFIG['target_fleiss_kappa']}")
print("-" * 80)

⚙️ Loading enhanced configuration for parallel processing...
✅ Enhanced configuration loaded.
Parallel processing: Enabled
Target Fleiss' Kappa: 0.75
--------------------------------------------------------------------------------


## 3. LOAD ENGINEERED FEATURES

In [None]:
print("📥 Loading engineered features from Notebook 1...")

# Only the read itself is guarded, so that a failure in the validation steps
# below is not misreported as a missing or corrupted pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by Notebook 1 and are stored in a trusted location.
try:
    reviews_df = pd.read_pickle(ENGINEERED_FEATURES_FILE)
except FileNotFoundError as err:
    print(f"❌ Error: Could not find {ENGINEERED_FEATURES_FILE}")
    print("Please ensure you have run Notebook 1 (feature engineering) successfully.")
    print("The file should contain the engineered features from the preprocessing step.")
    # Chain the original error so the traceback still shows the OS-level cause.
    raise FileNotFoundError(
        f"Required file not found: {ENGINEERED_FEATURES_FILE}\n"
        "Run Notebook 1 first to generate engineered features."
    ) from err
except Exception as e:
    print(f"❌ Error loading data: {str(e)}")
    print("Please check the file path and ensure the pickle file is not corrupted.")
    raise

print(f"✅ Successfully loaded {len(reviews_df):,} reviews from engineered features file")
print(f"Columns available: {len(reviews_df.columns)}")

# Display basic info about the dataset
print(f"\nDataset Overview:")
print(f"Shape: {reviews_df.shape}")
print(f"Memory usage: {reviews_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Check for required columns
required_columns = ['cleaned_text', 'text']
missing_columns = [col for col in required_columns if col not in reviews_df.columns]
if missing_columns:
    print(f"⚠️ Missing required columns: {missing_columns}")
    if 'cleaned_text' not in reviews_df.columns:
        # Without either text column nothing downstream can work; fail here
        # with a clear message instead of an opaque KeyError further down.
        if 'text' not in reviews_df.columns:
            raise KeyError(
                "Neither 'cleaned_text' nor 'text' is present in "
                f"{ENGINEERED_FEATURES_FILE}; cannot build the text input."
            )
        # Fall back to the raw text when no cleaned version was produced.
        reviews_df['cleaned_text'] = reviews_df['text'].fillna('')
        print("✅ Created cleaned_text from text column")

# Ensure all required features exist; absent ones are created as constant 0.
missing_features = [
    col for col in CATEGORICAL_FEATURES + CONTINUOUS_FEATURES
    if col not in reviews_df.columns
]
for col in missing_features:
    reviews_df[col] = 0

if missing_features:
    print(f"⚠️ Added missing features with default values: {len(missing_features)} features")

# Data quality checks (missing text is counted as empty)
text_lengths = reviews_df['cleaned_text'].fillna('').str.len()
print(f"\nData Quality Checks:")
print(f"Non-null text entries: {reviews_df['cleaned_text'].notna().sum():,}")
print(f"Empty text entries: {(text_lengths == 0).sum():,}")

# Sample of text lengths
print(f"Text length stats:")
print(f"  Mean: {text_lengths.mean():.1f} characters")
print(f"  Median: {text_lengths.median():.1f} characters")
print(f"  Max: {text_lengths.max()} characters")

print("-" * 80)

📥 Loading engineered features from Notebook 1...
✅ Successfully loaded 50,000 reviews from engineered features file
Columns available: 70

Dataset Overview:
Shape: (50000, 70)
Memory usage: 66.1 MB

Data Quality Checks:
Non-null text entries: 50,000
Empty text entries: 0
Text length stats:
  Mean: 112.8 characters
  Median: 61.0 characters
  Max: 3560 characters
--------------------------------------------------------------------------------


In [None]:
# --- Your existing code to load reviews_df from file ---
# reviews_df = pd.read_pickle(ENGINEERED_FEATURES_FILE)

# --- Load Active Learning Samples ---
def load_active_learning_samples(file_path):
    """Read the pickled list of user IDs produced by the active-learning step.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled list, or an empty list when the file does not exist,
        cannot be unpickled, or holds something other than a ``list``.  A
        message describing the outcome is printed in every case.
    """
    if not os.path.exists(file_path):
        print(f"⚠️ Warning: Active learning sample file not found at {file_path}. Skipping.")
        return []

    # NOTE(review): pickle can execute arbitrary code on load; the file is
    # expected to come from this project's own pipeline.
    try:
        with open(file_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except Exception as e:
        print(f"❌ Error loading active learning samples: {e}")
        return []

    if isinstance(loaded, list):
        print(f"✅ Loaded {len(loaded):,} user IDs from active learning sample file.")
        return loaded

    print("❌ Error: Loaded data is not a list. Check the file format.")
    return []

# --- Filter the Data before running the pipeline ---
# Restrict `reviews_df` (loaded in the previous cell) to the users selected by
# the active-learning step, so that only their reviews are pseudo-labeled.
print("Filtering the main dataset using active learning samples...")

# Load the list of user IDs from the active learning output
active_learning_user_ids = load_active_learning_samples(ACTIVE_LEARNING_SAMPLES_FILE)

# Check if any samples were loaded.  The loader returns [] for a missing,
# unreadable or malformed file, so all of those cases stop the cell here.
if not active_learning_user_ids:
    print("No active learning samples found. Exiting.")
    raise SystemExit

# Filter the main DataFrame to include only the active learning samples.
# The match is on `user_id`, so *every* review written by a listed user is
# kept; the number of rows can therefore exceed the number of IDs.
reviews_df = reviews_df[reviews_df['user_id'].isin(active_learning_user_ids)].copy()

if reviews_df.empty:
    print("⚠️ No samples in the main dataset matched the active learning IDs. Exiting.")
    raise SystemExit

print(f"✅ The pipeline will now process {len(reviews_df):,} samples.")
print("-" * 60)


Filtering the main dataset using active learning samples...
✅ Loaded 6,082 user IDs from active learning sample file.
✅ The pipeline will now process 17,204 samples.
------------------------------------------------------------


## 4. PSEUDO-LABELING WITH MULTIPLE LLMS

## Policy Definition

In [None]:
# Instruction text placed verbatim at the top of every classification prompt
# (the classifier appends the review and a "Classification:" cue after it).
# The "(label: ...)" names below must stay identical to the entries of
# VALID_LABELS, because the model's reply is matched against those strings.
policy_definitions = """Classification Guidelines for Google Location Reviews:

Classify each review into exactly ONE category. Be consistent and precise.

1. ADVERTISEMENT (label: advertisement)
   - Primary purpose is promoting products, services, or websites.
   - Contains promotional links, discount codes, or commercial language.
   - Examples: "Visit our website for deals!", "Use promo code SAVE20", "Buy our products"

2. IRRELEVANT CONTENT (label: irrelevant)
   - Content that has NO connection to the business or location.
   - Includes personal stories, generic statements about the weather, or comments that are extremely generic and provide ZERO specific insight into the business. This includes short, one-word positive or negative reviews that don't mention any aspect of the business.
   - Examples: "My dog is cute", "The weather was nice today", "Happy birthday mom", "Great work!"

3. RANT WITHOUT VISIT (label: rant_without_visit)
   - Negative opinions or complaints that LACK specific details of a personal experience. This is for comments based on secondhand information, hearsay, or extremely generic, unsubstantiated negative statements.
   - CRITICAL DISTINCTION: If the review contains *any* specific detail about an in-person experience (e.g., a specific food item, service issue, or store layout), it is NOT a rant without a visit.
   - Examples: "Never been but heard it's terrible", "My friend told me it's bad", "pretty awful"

4. RELEVANT AND QUALITY (label: relevant_and_quality)
   - Based on a clear personal experience at the actual location.
   - Provides specific, useful information, regardless of length. This information can be positive, negative, or neutral.
   - CRITICAL DISTINCTION: This category includes any review that mentions a specific product, service, staff interaction, or a tangible aspect of the visit, even if the tone is a rant.
   - Examples: "Great food but slow service", "Staff was very helpful", "Parking was difficult", "Great food", "8 for two slices and a beer!", "They messed up an easy sandwich and do not wear gloves when making sandwiches"

CRITICAL: Respond with ONLY the exact label. No explanations, no punctuation, no extra words.
"""

print("📋 Refined policy definitions loaded for consistent classification")

📋 Refined policy definitions loaded for consistent classification


## System Monitoring Utilities

In [None]:
class SystemMonitor:
    """Static helpers for inspecting and reclaiming RAM / GPU memory."""

    @staticmethod
    def get_memory_info():
        """Return a snapshot of host RAM and, when CUDA is available, GPU memory.

        All sizes are expressed in GiB.  The ``'gpu'`` entry is ``None`` when
        CUDA is not available; otherwise it holds ``'allocated'``, ``'cached'``
        (torch's reserved memory) and ``'total'`` for device 0.
        """
        gib = 1024**3
        ram = psutil.virtual_memory()

        if torch.cuda.is_available():
            gpu_stats = {
                'allocated': torch.cuda.memory_allocated() / gib,
                'cached': torch.cuda.memory_reserved() / gib,
                'total': torch.cuda.get_device_properties(0).total_memory / gib,
            }
        else:
            gpu_stats = None

        return {
            'ram_used': ram.used / gib,
            'ram_total': ram.total / gib,
            'ram_percent': ram.percent,
            'gpu': gpu_stats,
        }

    @staticmethod
    def should_cleanup_memory():
        """Return ``(needs_cleanup, reason)`` based on current memory pressure.

        Cleanup is requested when RAM usage exceeds 80% or when more than 85%
        of the GPU's total memory is allocated.
        """
        snapshot = SystemMonitor.get_memory_info()

        if snapshot['ram_percent'] > 80:
            return True, "High RAM usage"

        gpu_stats = snapshot['gpu']
        if gpu_stats and gpu_stats['allocated'] / gpu_stats['total'] > 0.85:
            return True, "High GPU memory usage"

        return False, "Memory OK"

    @staticmethod
    def aggressive_cleanup():
        """Run the garbage collector, release cached CUDA memory, then pause 1 s."""
        gc.collect()
        cuda_present = torch.cuda.is_available()
        if cuda_present:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        time.sleep(1)

    @staticmethod
    def print_memory_status():
        """Print one line for RAM usage and, if a GPU is present, one for GPU usage."""
        snapshot = SystemMonitor.get_memory_info()
        print(f"RAM: {snapshot['ram_used']:.1f}GB / {snapshot['ram_total']:.1f}GB ({snapshot['ram_percent']:.1f}%)")

        gpu_stats = snapshot['gpu']
        if not gpu_stats:
            return
        gpu_pct = gpu_stats['allocated'] / gpu_stats['total'] * 100
        print(f"GPU: {gpu_stats['allocated']:.1f}GB / {gpu_stats['total']:.1f}GB ({gpu_pct:.1f}%)")

print("🖥️ System monitoring utilities loaded")

🖥️ System monitoring utilities loaded


## Enhanced Multi-Model Classifier with Parallel Processing











In [None]:
class EnhancedParallelClassifier:
    """Label review texts with one causal LLM at a time.

    A model is loaded with ``load_model`` and then used by
    ``classify_texts_parallel`` (or its sequential fallback) to turn each text
    into one of ``VALID_LABELS`` or an error sentinel (``"error"``,
    ``"preprocessing_error"``, ``"cuda_error"``, ``"processing_error"``).

    In the "parallel" path only prompt building / tokenisation runs on worker
    threads; generation is always serialised under ``model_lock``.
    """

    # Keyword fallbacks for replies that do not contain an exact label, checked
    # in this order.  Each alternative must *start* a word, so "rant" no longer
    # fires on "restaurant" and "heard" not on "unheard"; the two-letter "ad"
    # must additionally be a whole word ("ad"/"ads") so that "bad", "had" or
    # "address" are not read as advertisements ("advert..." is kept explicitly).
    _FUZZY_PATTERNS = {
        'advertisement': re.compile(r"\b(?:ads?\b|advert|spam|promo|commercial|marketing)"),
        'irrelevant': re.compile(r"\b(?:irrelevant|unrelated|off-topic|random)"),
        'rant_without_visit': re.compile(r"\b(?:rant|complaint|angry|never been|heard)"),
        'relevant_and_quality': re.compile(r"\b(?:relevant|quality|helpful|experience|visited)"),
    }

    def __init__(self, models_to_try):
        """Prepare an empty classifier.

        Args:
            models_to_try: Iterable of model identifiers that ``load_model``
                may later be asked to load.
        """
        self.current_model = None
        self.current_tokenizer = None
        self.current_model_name = None
        # Identity mapping today; kept as a dict so a name could be redirected
        # to a different path.
        self.model_paths = {model: model for model in models_to_try}
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Parallel processing setup
        self.model_lock = threading.Lock()    # serialises model loading and generation
        self.results_lock = threading.Lock()  # not used by the methods of this class

        # Quantization config (4-bit NF4 with double quantization, fp16 compute)
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

    def load_model(self, model_name, max_retries=3):
        """Replace the currently loaded model with ``model_name``.

        Any previously loaded model is released first.  Loading is retried up
        to ``max_retries`` times with a growing pause (5 s, 10 s, ...).

        Args:
            model_name: Key of ``self.model_paths`` to load.
            max_retries: Maximum number of load attempts.

        Returns:
            True when the model and tokenizer were loaded, False otherwise.
        """
        with self.model_lock:
            self._cleanup_model()

            for attempt in range(max_retries):
                try:
                    print(f"Loading {model_name} (attempt {attempt + 1}/{max_retries})...")
                    model_path = self.model_paths[model_name]

                    # NOTE(review): trust_remote_code=True allows code shipped
                    # with the model repository to run; keep the model list
                    # restricted to trusted publishers.
                    self.current_tokenizer = AutoTokenizer.from_pretrained(
                        model_path, trust_remote_code=True, use_fast=True
                    )
                    # Some tokenizers ship without a pad token; reuse EOS.
                    if self.current_tokenizer.pad_token is None:
                        self.current_tokenizer.pad_token = self.current_tokenizer.eos_token

                    # Quantize only the 7B/8B checkpoints, and only on GPU.
                    use_quantization = (torch.cuda.is_available() and
                                        any(x in model_name.lower() for x in ['8b', '7b']))

                    if use_quantization:
                        self.current_model = AutoModelForCausalLM.from_pretrained(
                            model_path,
                            quantization_config=self.quantization_config,
                            device_map="auto",
                            low_cpu_mem_usage=True,
                            trust_remote_code=True,
                            torch_dtype=torch.float16
                        )
                    else:
                        self.current_model = AutoModelForCausalLM.from_pretrained(
                            model_path,
                            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                            device_map="auto" if torch.cuda.is_available() else None,
                            low_cpu_mem_usage=True,
                            trust_remote_code=True
                        )

                    self.current_model_name = model_name

                    # The module-level generation settings are shared, so the
                    # pad token follows whichever model was loaded last.
                    GENERATION_CONFIG["pad_token_id"] = self.current_tokenizer.eos_token_id

                    print(f"✅ Successfully loaded {model_name}")
                    return True

                except Exception as e:
                    print(f"❌ Attempt {attempt + 1} failed: {str(e)[:100]}")
                    self._cleanup_model()
                    if attempt < max_retries - 1:
                        time.sleep(5 * (attempt + 1))

            return False

    def _cleanup_model(self):
        """Release the current model/tokenizer and reclaim memory.

        The instance state is cleared first so the classifier is consistent
        even if the best-effort move to CPU fails.
        """
        model = self.current_model
        self.current_model = None
        self.current_tokenizer = None
        self.current_model_name = None

        if model is not None and hasattr(model, 'cpu'):
            try:
                model.cpu()
            except Exception as e:
                # NOTE(review): some quantized / dispatched models are believed
                # to reject device moves; dropping the reference below is what
                # ultimately frees the weights, so this is not fatal.
                print(f"    Could not move model to CPU during cleanup: {str(e)[:100]}")
        del model

        SystemMonitor.aggressive_cleanup()

    @staticmethod
    def _coerce_text(text):
        """Return ``text`` as a string, mapping None/NaN-like values to ''."""
        if isinstance(text, str):
            return text
        try:
            if text is None or pd.isna(text):
                return ""
        except (TypeError, ValueError):
            # pd.isna returned something without a single truth value
            # (e.g. an array); fall through to the generic conversion.
            pass
        return str(text)

    def _preprocess_batch(self, texts, policy_definitions):
        """Build and tokenise one chat prompt per text.

        Args:
            texts: Iterable of review texts; missing values are treated as
                empty reviews.  Only the first 500 characters are used.
            policy_definitions: Instruction text placed before each review.

        Returns:
            A list aligned with ``texts`` holding the tokenizer output for
            each item, or ``None`` where preprocessing failed.
        """
        processed_inputs = []

        for text in texts:
            try:
                review = self._coerce_text(text)[:500]

                # Create prompt
                prompt = f"""{policy_definitions}

Review: "{review}"

Classification:"""

                messages = [{"role": "user", "content": prompt}]

                # Tokenize
                inputs = self.current_tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=1024
                )
                processed_inputs.append(inputs)
            except Exception as e:
                print(f"Preprocessing error: {e}")
                processed_inputs.append(None)

        return processed_inputs

    def _classify_single_batch(self, batch_inputs):
        """Generate and parse a label for every preprocessed item.

        Items are run through the model one at a time.  A ``None`` input
        yields ``"preprocessing_error"``, a CUDA out-of-memory error yields
        ``"cuda_error"`` and any other failure ``"processing_error"``.

        Returns:
            A list of labels / sentinels aligned with ``batch_inputs``.
        """
        results = []

        for inputs in batch_inputs:
            if inputs is None:
                results.append("preprocessing_error")
                continue

            try:
                # Move to device
                if torch.cuda.is_available() and self.current_model.device.type == 'cuda':
                    inputs = {k: v.to(self.current_model.device) for k, v in inputs.items()}

                # Generate
                with torch.inference_mode():
                    outputs = self.current_model.generate(
                        **inputs,
                        **GENERATION_CONFIG,
                        eos_token_id=self.current_tokenizer.eos_token_id
                    )

                # Decode only the newly generated tokens (skip the prompt)
                gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
                generated_text = self.current_tokenizer.decode(
                    gen_ids, skip_special_tokens=True
                ).strip().lower()

                # Extract label
                results.append(self._extract_label(generated_text))

            except torch.cuda.OutOfMemoryError:
                print("    CUDA OOM - cleaning up")
                SystemMonitor.aggressive_cleanup()
                results.append("cuda_error")
            except Exception as e:
                print(f"    Classification error: {str(e)[:50]}")
                results.append("processing_error")

        return results

    def _extract_label(self, generated_text):
        """Map a model reply to a label.

        An exact label appearing anywhere in the reply wins (checked in
        ``VALID_LABELS`` order); otherwise the word-anchored keyword patterns
        in ``_FUZZY_PATTERNS`` are tried.

        Returns:
            One of ``VALID_LABELS``, or ``"error"`` when nothing matches.
        """
        text = generated_text.lower().strip()

        # Direct matching
        for label in VALID_LABELS:
            if label in text:
                return label

        # Fuzzy matching
        for label, pattern in self._FUZZY_PATTERNS.items():
            if pattern.search(text):
                return label

        return "error"

    def classify_texts_parallel(self, texts, policy_definitions):
        """Classify ``texts`` and return one result per text, in input order.

        Falls back to ``_classify_sequential`` when parallel processing is
        disabled or fewer than 50 texts are given.  Otherwise batches are
        tokenised on a thread pool while generation runs one batch at a time
        under ``model_lock``.

        Returns:
            A list with exactly ``len(texts)`` entries.
        """
        if not PARALLEL_CONFIG["enable_parallel"] or len(texts) < 50:
            return self._classify_sequential(texts, policy_definitions)

        print(f"🔄 Processing {len(texts)} texts with parallel batching")

        # Split into batches
        batch_size = PARALLEL_CONFIG["text_batch_size"]
        batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

        batch_results = [None] * len(batches)
        completed_batches = 0

        # Process batches with limited parallelism
        max_workers = PARALLEL_CONFIG["processing_threads"]

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit preprocessing tasks
            future_to_batch = {
                executor.submit(self._preprocess_batch, batch, policy_definitions): (i, batch)
                for i, batch in enumerate(batches)
            }

            # Process completed preprocessing
            for future in tqdm(concurrent.futures.as_completed(future_to_batch),
                               total=len(batches), desc=f"Processing with {self.current_model_name}"):
                batch_idx, original_batch = future_to_batch[future]
                try:
                    preprocessed_inputs = future.result(timeout=PARALLEL_CONFIG["queue_timeout"])

                    # Classify this batch (sequential within batch for GPU safety)
                    with self.model_lock:
                        batch_classifications = self._classify_single_batch(preprocessed_inputs)

                    batch_results[batch_idx] = batch_classifications
                    completed_batches += 1

                    # Memory cleanup
                    if completed_batches % PARALLEL_CONFIG["memory_cleanup_interval"] == 0:
                        should_cleanup, reason = SystemMonitor.should_cleanup_memory()
                        if should_cleanup:
                            print(f"    Cleaning memory: {reason}")
                            SystemMonitor.aggressive_cleanup()

                except Exception as e:
                    print(f"Batch processing error: {e}")
                    batch_results[batch_idx] = ["error"] * len(original_batch)

        # Flatten per batch so every label stays aligned with its own text:
        # a missing or wrongly sized batch is repaired in place rather than
        # padded at the end (which would shift all later labels).
        all_results = []
        for batch, labels in zip(batches, batch_results):
            expected = len(batch)
            labels = list(labels) if labels is not None else []
            if len(labels) != expected:
                print(f"⚠️ Result length mismatch: {len(labels)} vs {expected}")
                labels = (labels + ["error"] * expected)[:expected]
            all_results.extend(labels)

        return all_results

    def _classify_sequential(self, texts, policy_definitions):
        """Classify ``texts`` batch by batch on the calling thread.

        Memory is cleaned up unconditionally every
        ``memory_cleanup_interval`` batches.
        """
        print(f"🔄 Processing {len(texts)} texts sequentially")
        results = []

        batch_size = PARALLEL_CONFIG["text_batch_size"]

        for i in tqdm(range(0, len(texts), batch_size), desc=f"Sequential {self.current_model_name}"):
            batch = texts[i:i + batch_size]
            preprocessed = self._preprocess_batch(batch, policy_definitions)
            results.extend(self._classify_single_batch(preprocessed))

            # Memory cleanup
            if (i // batch_size + 1) % PARALLEL_CONFIG["memory_cleanup_interval"] == 0:
                SystemMonitor.aggressive_cleanup()

        return results

    def cleanup(self):
        """Release the loaded model and tokenizer (call when finished)."""
        self._cleanup_model()

print("🤖 Enhanced parallel classifier loaded")

🤖 Enhanced parallel classifier loaded


## IRA Functions

In [None]:
def interpret_kappa(kappa_score):
    """Interpret Fleiss' kappa score according to Landis & Koch (1977).

    Args:
        kappa_score: The kappa value to describe.

    Returns:
        A short textual description of the agreement band, or
        ``"Undefined (kappa is NaN)"`` when the score is NaN.
    """
    # NaN fails every "<" comparison below and would otherwise fall through
    # to the top band and be reported as almost perfect agreement.
    if math.isnan(kappa_score):
        return "Undefined (kappa is NaN)"

    # (exclusive upper bound, description), in ascending order
    bands = (
        (0, "Poor (worse than chance)"),
        (0.20, "Slight agreement"),
        (0.40, "Fair agreement"),
        (0.60, "Moderate agreement"),
        (0.80, "Substantial agreement"),
    )
    for upper_bound, description in bands:
        if kappa_score < upper_bound:
            return description
    return "Almost perfect agreement"

In [None]:
def compute_enhanced_fleiss_kappa(df, model_columns):
    """Compute Fleiss' kappa over the rows where every model gave a valid label.

    Args:
        df: DataFrame holding one prediction column per model.
        model_columns: Names of those prediction columns (one "rater" each).

    Returns:
        A tuple ``(kappa, n_valid_rows, ratings_table, metrics)``:
        ``ratings_table`` has one row per valid item and one column per entry
        of ``VALID_LABELS`` counting the votes for that label; ``metrics``
        holds the number of rows with perfect, majority and no agreement.
        When no row is fully valid the result is ``(0.0, 0, empty frame, {})``;
        when the kappa computation fails it is
        ``(0.0, n_valid_rows, ratings_table, {})``.
    """
    model_columns = list(model_columns)
    n_raters = len(model_columns)

    # Keep only rows in which every model produced one of the valid labels
    valid_mask = df[model_columns].isin(VALID_LABELS).all(axis=1)
    valid_df = df[valid_mask]

    if len(valid_df) == 0:
        return 0.0, 0, pd.DataFrame(), {}

    print(f"Computing Fleiss' Kappa for {len(valid_df)} valid items")

    # Build the items x categories vote-count matrix
    valid_votes = valid_df[model_columns]
    ratings_table = pd.DataFrame(
        {label: (valid_votes == label).sum(axis=1).to_numpy() for label in VALID_LABELS},
        columns=VALID_LABELS,
    )

    # Compute kappa
    try:
        kappa_result = fleiss_kappa(ratings_table.values)
    except Exception as e:
        print(f"Error computing kappa: {e}")
        return 0.0, len(valid_df), ratings_table, {}

    # Additional metrics, based on the size of the largest voting bloc per item
    top_votes = ratings_table.max(axis=1)
    metrics = {
        'perfect_agreement': (top_votes == n_raters).sum(),
        # Strictly more than half: an even split (e.g. 2 vs 2) is not a majority
        'majority_agreement': (top_votes > n_raters / 2).sum(),
        'no_agreement': (top_votes == 1).sum()
    }

    total = len(valid_df)
    print(f"Perfect agreement: {metrics['perfect_agreement']}/{total} ({metrics['perfect_agreement']/total*100:.1f}%)")
    print(f"Majority agreement: {metrics['majority_agreement']}/{total} ({metrics['majority_agreement']/total*100:.1f}%)")

    return kappa_result, len(valid_df), ratings_table, metrics


In [None]:
def analyze_model_performance(df, model_columns):
    """Summarise, per model column, how many predictions are valid labels.

    Args:
        df: DataFrame holding one prediction column per model.
        model_columns: Names of the prediction columns to analyse.

    Returns:
        A dict keyed by column name; each value contains
        ``'valid_predictions'``, ``'error_rate'`` and ``'label_distribution'``
        (counts of each valid label).  The same figures are printed.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("MODEL PERFORMANCE ANALYSIS")
    print(banner)

    performance_stats = {}

    for col in model_columns:
        is_valid = df[col].isin(VALID_LABELS)
        valid_predictions = is_valid.sum()
        error_rate = (len(df) - valid_predictions) / len(df)

        # Distribution over the valid predictions only
        label_counts = df.loc[is_valid, col].value_counts().to_dict()

        performance_stats[col] = {
            'valid_predictions': valid_predictions,
            'error_rate': error_rate,
            'label_distribution': label_counts,
        }

        print(f"\n{col}:")
        print(f"  Valid predictions: {valid_predictions}/{len(df)} ({valid_predictions/len(df)*100:.1f}%)")
        print(f"  Error rate: {error_rate*100:.1f}%")

    return performance_stats

In [None]:
def analyze_consensus_quality(df, model_columns):
    """Print a quality report for the consensus labels and return basic counts.

    Args:
        df: DataFrame with ``'consensus_label'`` and ``'consensus_confidence'``
            columns; a label of ``'error'`` marks a failed item.
        model_columns: Accepted for signature compatibility; not used here.

    Returns:
        A tuple ``(valid_items, error_items)``.  An empty ``df`` yields
        ``(0, 0)``.
    """
    print("\n" + "="*60)
    print("CONSENSUS QUALITY ANALYSIS")
    print("="*60)

    # Basic statistics
    is_error = df['consensus_label'] == 'error'
    total_items = len(df)
    error_items = int(is_error.sum())
    valid_items = total_items - error_items

    print(f"Total items processed: {total_items:,}")
    print(f"Valid consensus labels: {valid_items:,}")
    print(f"Error/failed items: {error_items:,}")
    # The success rate is undefined for an empty frame, so skip that line
    # rather than dividing by zero.
    if total_items:
        print(f"Success rate: {(valid_items/total_items)*100:.2f}%")

    if valid_items == 0:
        print("❌ No valid consensus labels generated!")
        return valid_items, error_items

    # Confidence analysis
    valid_df = df[~is_error].copy()
    confidence = valid_df['consensus_confidence']

    print(f"\nCONFIDENCE DISTRIBUTION:")
    print(f"Mean confidence: {confidence.mean():.3f}")
    print(f"Median confidence: {confidence.median():.3f}")
    print(f"Std deviation: {confidence.std():.3f}")
    print(f"Min confidence: {confidence.min():.3f}")
    print(f"Max confidence: {confidence.max():.3f}")

    # High confidence analysis
    high_conf_threshold = QUALITY_CONFIG["min_confidence_threshold"]
    high_conf_items = valid_df[confidence >= high_conf_threshold]
    print(f"\nHigh confidence items (≥{high_conf_threshold}): {len(high_conf_items):,} ({(len(high_conf_items)/len(valid_df))*100:.1f}%)")

    # Label distribution analysis
    print(f"\nLABEL DISTRIBUTION:")
    label_counts = valid_df['consensus_label'].value_counts()
    for label in VALID_LABELS:
        count = label_counts.get(label, 0)
        percentage = (count / len(valid_df)) * 100
        print(f"  {label}: {count:,} ({percentage:.1f}%)")
    return valid_items, error_items

In [None]:
def compute_majority_vote_with_ties(df, model_columns, tie_breaking='confidence'):
    """Add majority-vote consensus columns to ``df`` and return it.

    Only predictions found in ``VALID_LABELS`` count as votes. The winning
    label's confidence is its share of the valid votes. When the top labels
    are tied, ``tie_breaking`` decides:

    * ``'confidence'``: first tied label in vote-count order, confidence x 0.8
    * ``'random'``: ``np.random.choice`` among the tied labels, confidence x 0.7
    * anything else: alphabetically first tied label, confidence x 0.8

    Rows without any valid vote get ``('error', 0.0)``.

    Args:
        df: DataFrame containing the prediction columns; modified in place.
        model_columns: Names of the per-model prediction columns.
        tie_breaking: Tie resolution strategy, see above.

    Returns:
        ``df`` with ``majority_consensus_label`` and
        ``majority_consensus_confidence`` columns added.
    """

    def resolve_majority_vote(row):
        """Return ``(label, confidence)`` for a single row."""
        ballots = Counter(
            row[col] for col in model_columns if row[col] in VALID_LABELS
        )
        n_valid = sum(ballots.values())
        if n_valid == 0:
            return 'error', 0.0

        ranked = ballots.most_common()
        top_label, top_count = ranked[0]
        top_share = top_count / n_valid

        # Clear winner: a single label, or strictly more votes than the runner-up.
        if len(ranked) == 1 or top_count > ranked[1][1]:
            return top_label, top_share

        # Tie: every label that matches the top count is a candidate.
        tied_labels = [label for label, count in ranked if count == top_count]

        if tie_breaking == 'confidence':
            # Falls back to the first tied label; confidence is reduced for ties.
            return tied_labels[0], top_share * 0.8
        if tie_breaking == 'random':
            return np.random.choice(tied_labels), top_share * 0.7
        # Default: pick first alphabetically
        return sorted(tied_labels)[0], top_share * 0.8

    print(f"Computing majority vote with tie-breaking: {tie_breaking}")

    outcomes = df.apply(resolve_majority_vote, axis=1)
    df['majority_consensus_label'] = [vote[0] for vote in outcomes]
    df['majority_consensus_confidence'] = [vote[1] for vote in outcomes]

    return df


In [None]:
# NOTE: a function with this same name and signature is defined again later in
# this notebook; when the cells run top to bottom, that later definition is the
# one left bound to the name.
def compute_confidence_weighted_consensus(df, model_columns, kappa_score):
    """Add plurality consensus labels with an agreement-based confidence.

    Per row, the most frequent valid prediction becomes ``consensus_label``.
    Its confidence is::

        min(1.0, agreement_ratio * kappa_factor * participation_factor
                 + unanimity_bonus + strong_majority_bonus)

    where ``agreement_ratio`` is the winner's share of the valid votes,
    ``kappa_factor`` is ``kappa_score + 0.3`` clamped to ``[0.2, 1.2]``,
    ``participation_factor`` is the fraction of models with a valid vote, and
    the bonuses are 0.15 (all valid votes agree) and 0.1 (agreement >= 0.8).
    Rows with fewer than two valid votes get ``('error', 0.0)``.

    Args:
        df: DataFrame containing the prediction columns; modified in place.
        model_columns: Names of the per-model prediction columns.
        kappa_score: Inter-rater agreement score used to scale confidence.

    Returns:
        ``df`` with ``consensus_label`` and ``consensus_confidence`` set.
    """

    def confidence_weighted_vote(row):
        """Return ``(label, confidence)`` for a single row."""
        votes = [row[col] for col in model_columns if row[col] in VALID_LABELS]
        n_votes = len(votes)
        if n_votes < 2:
            return 'error', 0.0

        winner, winner_votes = Counter(votes).most_common(1)[0]

        # Share of the valid votes that went to the winner
        agreement_ratio = winner_votes / n_votes

        # More models with a valid vote -> higher confidence
        participation_factor = n_votes / n_models

        unanimity_bonus = 0.15 if agreement_ratio == 1.0 else 0.0
        strong_majority_bonus = 0.1 if agreement_ratio >= 0.8 else 0.0

        scaled = agreement_ratio * kappa_factor * participation_factor
        return winner, min(1.0, scaled + unanimity_bonus + strong_majority_bonus)

    print(f"Computing confidence-weighted consensus (kappa factor: {kappa_score:.3f})")

    # Row-independent inputs of the vote, computed once before the row pass
    n_models = len(model_columns)
    kappa_factor = max(0.2, min(1.2, kappa_score + 0.3))

    results = df.apply(confidence_weighted_vote, axis=1)
    df['consensus_label'] = [result[0] for result in results]
    df['consensus_confidence'] = [result[1] for result in results]

    return df

In [None]:
def validate_consensus_quality(df, model_columns, min_confidence=0.6):
    """Return the rows of ``df`` whose consensus passes the confidence filter.

    Rows labelled ``'error'`` are dropped first; of the remainder only rows
    with ``consensus_confidence >= min_confidence`` are kept. Counts,
    percentages relative to the input size, and the label distribution of the
    kept rows (over ``VALID_LABELS``) are printed.

    Args:
        df: DataFrame with ``consensus_label`` and ``consensus_confidence``
            columns. Not modified.
        model_columns: Per-model prediction column names. Not used by this
            function.
        min_confidence: Minimum confidence a row needs to be kept.

    Returns:
        A copy of the high-confidence, non-error rows (possibly empty).
    """

    print(f"\nVALIDATING CONSENSUS QUALITY (min confidence: {min_confidence})")
    print("-" * 50)

    initial_count = len(df)
    valid_consensus = df[df['consensus_label'] != 'error'].copy()

    # Apply confidence filter
    high_quality = valid_consensus[valid_consensus['consensus_confidence'] >= min_confidence].copy()

    def _pct_of_input(count):
        # An empty input would otherwise raise ZeroDivisionError; report 0%.
        return count / initial_count * 100 if initial_count else 0.0

    print(f"Initial samples: {initial_count:,}")
    print(f"Valid consensus: {len(valid_consensus):,} ({_pct_of_input(len(valid_consensus)):.1f}%)")
    print(f"High quality (≥{min_confidence}): {len(high_quality):,} ({_pct_of_input(len(high_quality)):.1f}%)")

    # Quality metrics for high-quality subset
    if len(high_quality) > 0:
        print(f"\nHIGH QUALITY SUBSET ANALYSIS:")
        print(f"Mean confidence: {high_quality['consensus_confidence'].mean():.3f}")
        print(f"Label distribution:")

        for label in VALID_LABELS:
            count = (high_quality['consensus_label'] == label).sum()
            pct = count / len(high_quality) * 100
            print(f"  {label}: {count:,} ({pct:.1f}%)")

    return high_quality




In [None]:
def analyze_disagreement_patterns(df, model_columns, sample_size=200):
    """Report where the models disagree and return the disagreeing rows.

    A row is a disagreement when its valid predictions (values found in
    ``VALID_LABELS``) contain more than one distinct label. The function
    prints the number of such rows, the five most frequent label
    combinations, and the text plus per-model labels of up to three sampled
    cases.

    Args:
        df: DataFrame containing the prediction columns and, optionally, a
            ``cleaned_text`` or ``text`` column used for the previews.
        model_columns: Names of the per-model prediction columns.
        sample_size: Maximum number of disagreement rows drawn (seeded with
            ``random_state=42``) before the first three are displayed.

    Returns:
        A copy of the rows of ``df`` on which the models disagree.
    """

    print(f"\nANALYZING DISAGREEMENT PATTERNS")
    print("-" * 40)

    def valid_predictions(row):
        return [row[col] for col in model_columns if row[col] in VALID_LABELS]

    if len(df) == 0:
        # A row-wise apply over an empty frame does not yield a boolean mask,
        # and the percentage below would divide by zero.
        print("Total disagreements: 0 (0.0%)")
        return df.copy()

    # Find disagreement cases: more than one distinct valid label in the row
    def has_disagreement(row):
        return len(set(valid_predictions(row))) > 1

    disagreement_mask = df.apply(has_disagreement, axis=1)
    disagreement_df = df[disagreement_mask].copy()

    print(f"Total disagreements: {len(disagreement_df):,} ({len(disagreement_df)/len(df)*100:.1f}%)")

    if len(disagreement_df) == 0:
        print("✅ No disagreements found - perfect consensus!")
        return disagreement_df

    # Count each distinct combination of labels the models split over
    disagreement_patterns = Counter(
        tuple(sorted(set(valid_predictions(row))))
        for _, row in disagreement_df.iterrows()
    )

    print(f"\nMost common disagreement patterns:")
    for rank, (pattern, count) in enumerate(disagreement_patterns.most_common(5), start=1):
        print(f"  {rank}. {' vs '.join(pattern)}: {count} cases")

    # Sample disagreement cases for manual review
    if len(disagreement_df) > sample_size:
        sample_disagreements = disagreement_df.sample(n=sample_size, random_state=42)
    else:
        sample_disagreements = disagreement_df

    print(f"\nSample disagreement cases (showing first 3):")
    for case_no, (_, row) in enumerate(sample_disagreements.head(3).iterrows(), start=1):
        text = row.get('cleaned_text', row.get('text', 'N/A'))
        # A missing text is NaN/None and cannot be sliced; show a placeholder.
        preview = 'N/A' if pd.isna(text) else str(text)[:150]
        print(f"\nCase {case_no}:")
        print(f"Text: {preview}...")
        for col in model_columns:
            if row[col] in VALID_LABELS:
                # Strip only the trailing "_<suffix>" so model names that
                # themselves contain underscores are shown in full.
                print(f"  {col.rsplit('_', 1)[0]}: {row[col]}")

    return disagreement_df

## VOTING AND CONSENSUS FUNCTIONS








In [None]:
def compute_confidence_weighted_consensus(df, model_columns, kappa_score):
    """Add plurality consensus labels with an agreement-based confidence.

    Per row, the most frequent valid prediction becomes ``consensus_label``.
    Its confidence is::

        min(1.0, agreement_ratio * kappa_factor * participation_factor
                 + unanimity_bonus + strong_majority_bonus)

    where ``agreement_ratio`` is the winner's share of the valid votes,
    ``kappa_factor`` is ``kappa_score + 0.3`` clamped to ``[0.2, 1.2]``,
    ``participation_factor`` is the fraction of models with a valid vote, and
    the bonuses are 0.15 (all valid votes agree) and 0.1 (agreement >= 0.8).
    Rows with fewer than two valid votes get ``('error', 0.0)``.

    Args:
        df: DataFrame containing the prediction columns; modified in place.
        model_columns: Names of the per-model prediction columns.
        kappa_score: Inter-rater agreement score used to scale confidence.

    Returns:
        ``df`` with ``consensus_label`` and ``consensus_confidence`` set.
    """

    def confidence_weighted_vote(row):
        """Return ``(label, confidence)`` for a single row."""
        votes = [row[col] for col in model_columns if row[col] in VALID_LABELS]
        n_votes = len(votes)
        if n_votes < 2:
            return 'error', 0.0

        winner, winner_votes = Counter(votes).most_common(1)[0]

        # Share of the valid votes that went to the winner
        agreement_ratio = winner_votes / n_votes

        # More models with a valid vote -> higher confidence
        participation_factor = n_votes / n_models

        unanimity_bonus = 0.15 if agreement_ratio == 1.0 else 0.0
        strong_majority_bonus = 0.1 if agreement_ratio >= 0.8 else 0.0

        scaled = agreement_ratio * kappa_factor * participation_factor
        return winner, min(1.0, scaled + unanimity_bonus + strong_majority_bonus)

    print(f"Computing confidence-weighted consensus (kappa factor: {kappa_score:.3f})")

    # Row-independent inputs of the vote, computed once before the row pass
    n_models = len(model_columns)
    kappa_factor = max(0.2, min(1.2, kappa_score + 0.3))

    results = df.apply(confidence_weighted_vote, axis=1)
    df['consensus_label'] = [result[0] for result in results]
    df['consensus_confidence'] = [result[1] for result in results]

    return df

In [None]:
def compute_ensemble_consensus(df, model_columns, methods=None):
    """Compute the requested consensus methods and apply the preferred one.

    Each requested method runs on a copy of ``df``. The confidence-weighted
    result is applied when it was computed, otherwise the majority result.

    The kappa score for the confidence-weighted method is read from the
    function attribute ``compute_ensemble_consensus.last_kappa_score``; when
    that attribute has not been set, 0.5 is used.

    Args:
        df: DataFrame containing the prediction columns; the consensus
            columns are written to it in place.
        model_columns: Names of the per-model prediction columns.
        methods: Iterable containing ``'majority'`` and/or
            ``'confidence_weighted'``. Defaults to both.

    Returns:
        ``df`` with ``consensus_label``, ``consensus_confidence`` and
        ``consensus_method`` columns set.

    Raises:
        ValueError: If ``methods`` names neither supported method.
    """
    # None instead of a list literal avoids a shared mutable default.
    if methods is None:
        methods = ['majority', 'confidence_weighted']

    print(f"Computing ensemble consensus using methods: {methods}")

    consensus_results = {}

    # Compute different consensus methods
    if 'majority' in methods:
        df_temp = compute_majority_vote_with_ties(df.copy(), model_columns, 'confidence')
        consensus_results['majority'] = {
            'labels': df_temp['majority_consensus_label'].values,
            'confidences': df_temp['majority_consensus_confidence'].values
        }

    if 'confidence_weighted' in methods:
        kappa_score = getattr(compute_ensemble_consensus, 'last_kappa_score', 0.5)
        df_temp = compute_confidence_weighted_consensus(df.copy(), model_columns, kappa_score)
        consensus_results['confidence_weighted'] = {
            'labels': df_temp['consensus_label'].values,
            'confidences': df_temp['consensus_confidence'].values
        }

    # For now, use confidence_weighted as default, or majority if not available
    for candidate in ('confidence_weighted', 'majority'):
        if candidate in consensus_results:
            best_method = candidate
            break
    else:
        raise ValueError("No valid consensus method computed")

    print(f"Selected consensus method: {best_method}")

    # Apply the best consensus and record which method produced it
    df['consensus_label'] = consensus_results[best_method]['labels']
    df['consensus_confidence'] = consensus_results[best_method]['confidences']
    df['consensus_method'] = best_method

    return df


In [None]:
def enhanced_consensus_pipeline(df, model_columns, kappa_score):
    """Run consensus computation, disagreement analysis and quality filtering.

    Args:
        df: DataFrame containing the per-model prediction columns.
        model_columns: Names of the per-model prediction columns.
        kappa_score: Inter-rater agreement score handed to the consensus step.

    Returns:
        tuple: ``(df, high_quality_df, disagreement_df)`` - the frame with
        consensus columns, the rows that passed the confidence filter, and
        the rows on which the models disagreed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ENHANCED CONSENSUS AND VOTING PIPELINE")
    print(banner)

    # compute_ensemble_consensus picks the kappa score up from this attribute.
    compute_ensemble_consensus.last_kappa_score = kappa_score

    print("\n1. Computing multiple consensus methods...")
    df = compute_ensemble_consensus(
        df, model_columns, methods=['majority', 'confidence_weighted']
    )

    print("\n2. Analyzing disagreement patterns...")
    disagreement_df = analyze_disagreement_patterns(df, model_columns, sample_size=100)

    print("\n3. Validating consensus quality...")
    analyze_consensus_quality(df, model_columns)

    print("\n4. Applying quality filters...")
    confidence_floor = QUALITY_CONFIG["min_confidence_threshold"]
    high_quality_df = validate_consensus_quality(df, model_columns, confidence_floor)

    print("\n5. Final Quality Report:")
    print("-" * 30)

    n_all, n_kept = len(df), len(high_quality_df)
    retention_rate = n_kept / n_all * 100

    print(f"Original samples: {n_all:,}")
    print(f"High quality samples: {n_kept:,}")
    print(f"Retention rate: {retention_rate:.1f}%")

    # Rates between 50% and 80% (inclusive) produce no extra message.
    if retention_rate < 50:
        print("⚠️  WARNING: Low retention rate - consider lowering quality thresholds")
    elif retention_rate > 80:
        print("✅ EXCELLENT: High retention rate with quality filtering")

    return df, high_quality_df, disagreement_df

print("🗳️ Voting and consensus functions loaded")

🗳️ Voting and consensus functions loaded


## MAIN PSEUDO-LABELING PIPELINE

In [None]:
def run_enhanced_parallel_pipeline(reviews_df):
    """Pseudo-label reviews with each configured model and build consensus labels.

    Steps, in order:
      1. Down-sample to ``DATA_CONFIG["max_sample_size"]`` rows when
         ``DATA_CONFIG["sample_large_datasets"]`` is set and the input is larger.
      2. For every entry of ``MODELS_TO_TRY``: load the model, classify the
         ``cleaned_text`` column, store the predictions in a
         ``<model>_classification`` column, then clean up.
      3. Print per-model performance and compute Fleiss' kappa.
      4. Run ``enhanced_consensus_pipeline`` to add consensus labels.
      5. Apply the kappa quality gate; below
         ``QUALITY_CONFIG["min_acceptable_kappa"]`` the user is asked on stdin
         whether to continue.

    Args:
        reviews_df: DataFrame with a ``cleaned_text`` column. It is not
            modified; the pipeline works on a sample or a copy.

    Returns:
        dict with keys ``reviews_df``, ``high_quality_df``,
        ``disagreement_df``, ``successful_models``, ``kappa_score``,
        ``processing_stats``, ``consensus_stats`` and ``quality_status``.

    Raises:
        RuntimeError: If there are no reviews, if no model produced results,
            or if the user declines to continue after a low kappa score.
    """
    # Fail fast: with no reviews no model can produce results, so there is no
    # point in loading any of them first.
    if len(reviews_df) == 0:
        raise RuntimeError("No reviews to process!")

    print("🚀 Starting Enhanced Parallel Pseudo-Labeling Pipeline")
    print(f"Target: Fleiss' Kappa ≥ {QUALITY_CONFIG['target_fleiss_kappa']}")
    print("-" * 60)

    # Handle large datasets
    original_size = len(reviews_df)
    if (DATA_CONFIG["sample_large_datasets"] and
            original_size > DATA_CONFIG["max_sample_size"]):
        print(f"Large dataset detected: {original_size} samples")
        print(f"Sampling {DATA_CONFIG['max_sample_size']} for processing")
        reviews_df = reviews_df.sample(n=DATA_CONFIG["max_sample_size"], random_state=42)
    else:
        # Work on a copy so the columns added below never leak into the
        # caller's frame (the sampled branch already yields a new frame).
        reviews_df = reviews_df.copy()

    review_texts = reviews_df['cleaned_text'].fillna('').astype(str).tolist()
    print(f"Processing {len(review_texts)} reviews with {len(MODELS_TO_TRY)} models")

    # Show initial memory status
    print("\nInitial system status:")
    SystemMonitor.print_memory_status()

    # Initialize classifier
    classifier = EnhancedParallelClassifier(MODELS_TO_TRY)
    successful_models = []
    processing_stats = {}

    try:
        for model_no, model_name in enumerate(MODELS_TO_TRY, start=1):
            print(f"\n{'='*60}")
            print(f"MODEL {model_no}/{len(MODELS_TO_TRY)}: {model_name.upper()}")
            print('='*60)

            # Started before load_model, so the reported time includes loading.
            start_time = time.time()

            # Load model
            if not classifier.load_model(model_name):
                print(f"❌ Failed to load {model_name}")
                continue

            try:
                # Process with parallel/sequential based on config
                classifications = classifier.classify_texts_parallel(
                    review_texts, policy_definitions
                )

                processing_time = time.time() - start_time

                # One prediction per review is needed to attach the results
                # as a column; report a mismatch explicitly.
                total = len(classifications)
                if total != len(review_texts):
                    raise ValueError(
                        f"expected {len(review_texts)} classifications, got {total}"
                    )

                # Validate results
                valid_count = sum(1 for c in classifications if c in VALID_LABELS)
                error_rate = (total - valid_count) / total

                processing_stats[model_name] = {
                    'processing_time': processing_time,
                    'valid_predictions': valid_count,
                    'error_rate': error_rate
                }

                print(f"Results: {valid_count}/{total} valid ({valid_count/total*100:.1f}%)")
                print(f"Error rate: {error_rate*100:.1f}%")
                print(f"Processing time: {processing_time:.1f}s")

                # Results are kept regardless of the error rate; only the
                # status message differs.
                reviews_df[f'{model_name}_classification'] = classifications
                successful_models.append(model_name)
                if error_rate <= QUALITY_CONFIG["max_error_rate"]:
                    print(f"✅ {model_name} completed successfully")
                else:
                    print(f"⚠️ {model_name} has high error rate but keeping results")

            except Exception as e:
                # Best effort: a failing model is reported and skipped so the
                # remaining models still run.
                print(f"❌ Error processing {model_name}: {str(e)}")

            finally:
                # Cleanup after each model
                classifier.cleanup()
                SystemMonitor.aggressive_cleanup()
                print("Memory status after cleanup:")
                SystemMonitor.print_memory_status()

    finally:
        classifier.cleanup()

    if len(successful_models) == 0:
        raise RuntimeError("No models completed successfully!")

    print(f"\n🎉 Successfully processed with {len(successful_models)} models")

    # Analysis phase
    model_columns = [f'{model}_classification' for model in successful_models]

    # Performance analysis (called for its printed report)
    print("\n📊 MODEL PERFORMANCE ANALYSIS")
    print("="*50)
    analyze_model_performance(reviews_df, model_columns)

    # Agreement analysis; only the kappa score is used further on
    print("\n🤝 INTER-RATER AGREEMENT ANALYSIS")
    print("="*50)
    kappa_score, _valid_items, _ratings_table, _agreement_metrics = compute_enhanced_fleiss_kappa(
        reviews_df, model_columns
    )

    interpretation = interpret_kappa(kappa_score)
    print(f"\nFleiss' Kappa: {kappa_score:.4f} ({interpretation})")

    # Enhanced consensus and voting
    print("\n🗳️  CONSENSUS AND VOTING PHASE")
    print("="*60)

    # Run enhanced consensus pipeline
    reviews_df, high_quality_df, disagreement_df = enhanced_consensus_pipeline(
        reviews_df, model_columns, kappa_score
    )

    # Store results for later use
    consensus_stats = {
        'total_samples': len(reviews_df),
        'high_quality_samples': len(high_quality_df),
        'disagreement_cases': len(disagreement_df),
        'retention_rate': len(high_quality_df) / len(reviews_df) * 100
    }

    print("\n📊 CONSENSUS SUMMARY:")
    print(f"Retention rate: {consensus_stats['retention_rate']:.1f}%")
    print(f"Disagreement cases: {consensus_stats['disagreement_cases']:,}")

    # Final quality gate
    if kappa_score >= QUALITY_CONFIG["target_fleiss_kappa"]:
        print(f"\n✅ EXCELLENT: Achieved target kappa of {kappa_score:.3f}")
        quality_status = "EXCELLENT"
    elif kappa_score >= QUALITY_CONFIG["min_acceptable_kappa"]:
        print(f"\n✅ GOOD: Acceptable kappa of {kappa_score:.3f}")
        quality_status = "GOOD"
    else:
        print(f"\n⚠️ WARNING: Low kappa of {kappa_score:.3f}")
        print("Recommendations:")
        print("- Review and refine policy definitions")
        print("- Consider using different models")
        print("- Increase sample size for training")
        print("- Manual review of disagreement cases")

        # Blocks on stdin: anything other than "y" aborts the pipeline.
        response = input("Continue with data preparation? (y/n): ").lower().strip()
        if response != 'y':
            raise RuntimeError("Pipeline stopped due to low agreement")
        quality_status = "POOR"

    return {
        'reviews_df': reviews_df,
        'high_quality_df': high_quality_df,
        'disagreement_df': disagreement_df,
        'successful_models': successful_models,
        'kappa_score': kappa_score,
        'processing_stats': processing_stats,
        'consensus_stats': consensus_stats,
        'quality_status': quality_status
    }

In [None]:
# Kick off pseudo-labeling; the returned dict bundles every pipeline artefact.
print("🚀 Starting the enhanced parallel processing pipeline...")
pipeline_results = run_enhanced_parallel_pipeline(reviews_df)

# Unpack the entries the following cells refer to by name.
reviews_df, high_quality_df, kappa_score, successful_models = (
    pipeline_results[key]
    for key in ('reviews_df', 'high_quality_df', 'kappa_score', 'successful_models')
)

print("\n🎯 PIPELINE COMPLETED")
print(f"Final Fleiss' Kappa: {kappa_score:.4f}")
print(f"Quality Status: {pipeline_results['quality_status']}")

🚀 Starting the enhanced parallel processing pipeline...
🚀 Starting Enhanced Parallel Pseudo-Labeling Pipeline
Target: Fleiss' Kappa ≥ 0.75
------------------------------------------------------------
Large dataset detected: 17204 samples
Sampling 8000 for processing
Processing 8000 reviews with 3 models

Initial system status:
RAM: 10.6GB / 83.5GB (13.7%)
GPU: 0.0GB / 39.6GB (0.1%)

MODEL 1/3: MISTRALAI/MISTRAL-7B-INSTRUCT-V0.3
Loading mistralai/Mistral-7B-Instruct-v0.3 (attempt 1/3)...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Successfully loaded mistralai/Mistral-7B-Instruct-v0.3
🔄 Processing 8000 texts with parallel batching


Processing with mistralai/Mistral-7B-Instruct-v0.3:   0%|          | 0/1334 [00:00<?, ?it/s]

Results: 7998/8000 valid (100.0%)
Error rate: 0.0%
Processing time: 3742.9s
✅ mistralai/Mistral-7B-Instruct-v0.3 completed successfully
Memory status after cleanup:
RAM: 9.7GB / 83.5GB (12.7%)
GPU: 0.0GB / 39.6GB (0.1%)

MODEL 2/3: GOOGLE/GEMMA-2-2B-IT
Loading google/gemma-2-2b-it (attempt 1/3)...


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

✅ Successfully loaded google/gemma-2-2b-it
🔄 Processing 8000 texts with parallel batching


Processing with google/gemma-2-2b-it:   0%|          | 0/1334 [00:00<?, ?it/s]

Results: 8000/8000 valid (100.0%)
Error rate: 0.0%
Processing time: 2516.8s
✅ google/gemma-2-2b-it completed successfully
Memory status after cleanup:
RAM: 10.6GB / 83.5GB (13.7%)
GPU: 0.0GB / 39.6GB (0.1%)

MODEL 3/3: META-LLAMA/META-LLAMA-3.1-8B-INSTRUCT
Loading meta-llama/Meta-Llama-3.1-8B-Instruct (attempt 1/3)...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

✅ Successfully loaded meta-llama/Meta-Llama-3.1-8B-Instruct
🔄 Processing 8000 texts with parallel batching


Processing with meta-llama/Meta-Llama-3.1-8B-Instruct:   0%|          | 0/1334 [00:00<?, ?it/s]

Results: 8000/8000 valid (100.0%)
Error rate: 0.0%
Processing time: 2715.9s
✅ meta-llama/Meta-Llama-3.1-8B-Instruct completed successfully
Memory status after cleanup:
RAM: 11.4GB / 83.5GB (14.7%)
GPU: 0.0GB / 39.6GB (0.1%)

🎉 Successfully processed with 3 models

📊 MODEL PERFORMANCE ANALYSIS

MODEL PERFORMANCE ANALYSIS

mistralai/Mistral-7B-Instruct-v0.3_classification:
  Valid predictions: 7998/8000 (100.0%)
  Error rate: 0.0%

google/gemma-2-2b-it_classification:
  Valid predictions: 8000/8000 (100.0%)
  Error rate: 0.0%

meta-llama/Meta-Llama-3.1-8B-Instruct_classification:
  Valid predictions: 8000/8000 (100.0%)
  Error rate: 0.0%

🤝 INTER-RATER AGREEMENT ANALYSIS
Computing Fleiss' Kappa for 7998 valid items
Perfect agreement: 5015/7998 (62.7%)
Majority agreement: 7891/7998 (98.7%)

Fleiss' Kappa: 0.4740 (Moderate agreement)

🗳️  CONSENSUS AND VOTING PHASE

ENHANCED CONSENSUS AND VOTING PIPELINE

1. Computing multiple consensus methods...
Computing ensemble consensus using methods

## LABEL ENCODING AND DATA PREPARATION




In [74]:
print("🔧 Starting data preparation and encoding...")

# Training pool: the confidence-filtered rows when at least 60% of the samples
# survived filtering, otherwise every row with a non-error consensus label.
if pipeline_results['consensus_stats']['retention_rate'] >= 60:
    print("Using high-quality filtered dataset for training")
    df_filtered = high_quality_df.copy()
else:
    print("Using full consensus dataset (low retention rate)")
    df_filtered = reviews_df.loc[reviews_df['consensus_label'] != 'error'].copy()

print(f"Training dataset size: {len(df_filtered):,} samples")

# Every configured feature must exist as a column; absent ones are created
# and filled with 0.
print("Validating and preparing features...")
missing_features = []
for col in CATEGORICAL_FEATURES + CONTINUOUS_FEATURES:
    if col in df_filtered.columns:
        continue
    df_filtered[col] = 0
    missing_features.append(col)

if missing_features:
    print(f"Added missing features with default values: {len(missing_features)} features")

# Replace remaining NaN values with 0 in both feature groups
df_filtered[CONTINUOUS_FEATURES] = df_filtered[CONTINUOUS_FEATURES].fillna(0)
df_filtered[CATEGORICAL_FEATURES] = df_filtered[CATEGORICAL_FEATURES].fillna(0)

# Encoders and scaler are kept in module-level names for the cells that follow
print("Initializing encoders...")
label_encoder, scaler = LabelEncoder(), StandardScaler()
categorical_encoders = {}

# Target labels -> integer codes, plus the label-to-code lookup
df_filtered['labels'] = label_encoder.fit_transform(df_filtered['consensus_label'])
label_mapping = {
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print(f"Label encoding: {label_mapping}")

# One LabelEncoder per categorical feature
categorical_encoded = {}
for col in CATEGORICAL_FEATURES:
    encoder = LabelEncoder()
    # Cast to str so every value has the same type before encoding
    df_filtered[col] = df_filtered[col].astype(str)
    # A Series built from the encoded array carries a fresh 0..n-1 index
    categorical_encoded[col] = pd.Series(encoder.fit_transform(df_filtered[col]))
    categorical_encoders[col] = encoder
    print(f"Encoded {col}: {len(encoder.classes_)} unique values")


# Packaging columns. The index is reset first so that row positions line up
# with the 0..n-1 index of the categorical_encoded Series built above.
df_filtered = df_filtered.reset_index(drop=True)
df_filtered['review_id'] = df_filtered.index.astype(str)
df_filtered['review_text'] = (
    df_filtered
    .get('cleaned_text', df_filtered.get('text', ''))
    .fillna('')
)


print("✅ Data preparation complete!")

🔧 Starting data preparation and encoding...
Using high-quality filtered dataset for training
Training dataset size: 5,016 samples
Validating and preparing features...
Initializing encoders...
Label encoding: {'advertisement': np.int64(0), 'irrelevant': np.int64(1), 'rant_without_visit': np.int64(2), 'relevant_and_quality': np.int64(3)}
Encoded review_day_of_week: 7 unique values
Encoded review_hour_of_day: 24 unique values
Encoded dominant_topic: 5 unique values
✅ Data preparation complete!


## DATA SPLITTING AND NORMALIZATION

In [80]:
print("📊 Splitting data and normalizing features...")

# Class sizes, to flag classes that are too small to split reliably
label_counts = df_filtered['consensus_label'].value_counts()
min_samples = label_counts.min()
print(f"Label distribution: {dict(label_counts)}")
print(f"Minimum samples per class: {min_samples}")

if min_samples < 10:
    print("⚠️ WARNING: Some classes have very few samples. Consider collecting more data.")


def _split_train_val_test(frame, stratified):
    """Split ``frame`` into (train, holdout, val, test) using the DATA_CONFIG ratios.

    The holdout part is the val+test remainder of the first split; it is then
    divided into val and test. With ``stratified`` set, both splits stratify
    on the ``labels`` column.
    """
    holdout_ratio = DATA_CONFIG["val_ratio"] + DATA_CONFIG["test_ratio"]
    train_part, holdout_part = train_test_split(
        frame,
        test_size=holdout_ratio,
        random_state=42,
        stratify=frame['labels'] if stratified else None,
    )
    val_part, test_part = train_test_split(
        holdout_part,
        test_size=DATA_CONFIG["test_ratio"] / holdout_ratio,
        random_state=42,
        stratify=holdout_part['labels'] if stratified else None,
    )
    return train_part, holdout_part, val_part, test_part


# Prefer a stratified split to maintain the label distribution; fall back to
# a plain random split when stratification raises ValueError.
try:
    train_df, temp_df, val_df, test_df = _split_train_val_test(df_filtered, stratified=True)
except ValueError as e:
    print(f"⚠️ Stratification failed: {e}")
    print("Using random split instead...")
    train_df, temp_df, val_df, test_df = _split_train_val_test(df_filtered, stratified=False)

print(f"Data splits:")
for split_name, split_df in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"  {split_name}: {len(split_df):,} samples ({len(split_df)/len(df_filtered)*100:.1f}%)")

# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits.
print("Normalizing continuous features...")
train_continuous = scaler.fit_transform(train_df[CONTINUOUS_FEATURES])
val_continuous, test_continuous = (
    scaler.transform(part[CONTINUOUS_FEATURES]) for part in (val_df, test_df)
)

print("✅ Data splitting and normalization complete!")

📊 Splitting data and normalizing features...
Label distribution: {'relevant_and_quality': np.int64(3764), 'irrelevant': np.int64(1230), 'rant_without_visit': np.int64(12), 'advertisement': np.int64(10)}
Minimum samples per class: 10
Data splits:
  Training: 4,012 samples (80.0%)
  Validation: 502 samples (10.0%)
  Test: 502 samples (10.0%)
Normalizing continuous features...
✅ Data splitting and normalization complete!


## 5. DATA PACKAGING AND SAVING

In [81]:
print("📦 Packaging final datasets...")

def pack_dataset(df, continuous_features):
    """Package one data split into the dictionary format saved for training.

    Args:
        df: One split (train, val, or test). It must still carry the row
            index it had before splitting, because that index is what ties
            each row to its entry in the module-level ``categorical_encoded``.
        continuous_features: Array of continuous features for ``df``, one row
            per sample and in the same row order as ``df``.

    Returns:
        dict with list-valued entries ``review_text``, ``categorical_features``,
        ``continuous_features``, ``labels``, ``review_id``, ``original_labels``
        and ``confidence_scores``. All lists follow the row order of ``df``.

    Raises:
        ValueError: If ``continuous_features`` does not have exactly one row
            per row of ``df``.
        KeyError: If a row label of ``df`` is missing from
            ``categorical_encoded[col]``.
    """
    if len(continuous_features) != len(df):
        raise ValueError(
            f"continuous_features has {len(continuous_features)} rows "
            f"but the split has {len(df)} rows"
        )

    # Look up the encoded categorical values by the split's ORIGINAL index.
    # Indexing with the freshly reset 0..n-1 index would return the first n
    # rows of categorical_encoded for every split, detaching the categorical
    # features from the text and labels of the shuffled rows.
    # NOTE(review): assumes each categorical_encoded[col] is a Series indexed
    # like the frame the splits were drawn from (df_filtered) — confirm
    # against the cell that builds categorical_encoded.
    source_index = df.index
    categorical_array = np.column_stack([
        categorical_encoded[col].loc[source_index].to_numpy()
        for col in CATEGORICAL_FEATURES
    ])

    # A plain 0-based index is only needed for the per-split review_id below.
    df_reset = df.reset_index(drop=True)

    return {
        'review_text': df_reset['review_text'].tolist(),
        'categorical_features': categorical_array.tolist(),
        'continuous_features': continuous_features.tolist(),
        'labels': df_reset['labels'].tolist(),
        # review_id is the row position within this split, as a string; it is
        # not unique across splits.
        'review_id': df_reset.index.astype(str).tolist(),
        'original_labels': df_reset['consensus_label'].tolist(),
        'confidence_scores': df_reset['consensus_confidence'].tolist()
    }

# Package datasets
print("Creating dataset packages...")
# Pair every split with its scaled continuous matrix; insertion order stays
# train, val, test.
datasets = {
    split_name: pack_dataset(split_df, split_continuous)
    for split_name, split_df, split_continuous in (
        ('train', train_df, train_continuous),
        ('val', val_df, val_continuous),
        ('test', test_df, test_continuous),
    )
}

# Create comprehensive metadata
# Each section is assembled on its own and then combined below, in the order
# the sections should appear in the saved JSON file.
_consensus = pipeline_results['consensus_stats']

_pipeline_section = {
    "models_used": successful_models,
    "parallel_processing": PARALLEL_CONFIG["enable_parallel"],
    "quality_status": pipeline_results['quality_status'],
}

_ira_section = {
    "fleiss_kappa": float(kappa_score),
    "interpretation": interpret_kappa(kappa_score),
    "target_kappa": QUALITY_CONFIG["target_fleiss_kappa"],
    "valid_items_analyzed": _consensus['total_samples'],
    "retention_rate": _consensus['retention_rate'],
}

# Per-feature class list and class count taken from each fitted encoder
_encoder_summary = {}
for _cat_col in CATEGORICAL_FEATURES:
    _cat_classes = categorical_encoders[_cat_col].classes_
    _encoder_summary[_cat_col] = {
        "classes": _cat_classes.tolist(),
        "n_classes": len(_cat_classes),
    }

_categorical_section = {
    "feature_names": CATEGORICAL_FEATURES,
    "feature_dim": len(CATEGORICAL_FEATURES),
    "encoders": _encoder_summary,
}

_continuous_section = {
    "feature_names": CONTINUOUS_FEATURES,
    "feature_dim": len(CONTINUOUS_FEATURES),
    "normalization_stats": {
        "means": scaler.mean_.tolist(),
        "stds": scaler.scale_.tolist(),
    },
}

# Consensus-label counts for every split
_label_distribution = {
    split_name: split_df['consensus_label'].value_counts().to_dict()
    for split_name, split_df in (
        ("train", train_df),
        ("val", val_df),
        ("test", test_df),
    )
}

_label_classes = label_encoder.classes_
_labels_section = {
    "encoder_classes": _label_classes.tolist(),
    "num_classes": len(_label_classes),
    "class_names": CLASS_NAMES,
    "label_distribution": _label_distribution,
}

_stats_section = {
    "original_samples": _consensus['total_samples'],
    "high_quality_samples": _consensus['high_quality_samples'],
    "final_training_samples": len(df_filtered),
    "train_samples": len(train_df),
    "val_samples": len(val_df),
    "test_samples": len(test_df),
    "split_ratios": [
        DATA_CONFIG[ratio_key]
        for ratio_key in ("train_ratio", "val_ratio", "test_ratio")
    ],
}

# The consensus method is read from the first row of df_filtered; fall back
# to 'unknown' when the frame is empty or the column is absent.
if len(df_filtered) > 0 and 'consensus_method' in df_filtered.columns:
    _consensus_method = df_filtered['consensus_method'].iloc[0]
else:
    _consensus_method = 'unknown'

_quality_section = {
    "consensus_method": _consensus_method,
    "min_confidence_threshold": QUALITY_CONFIG["min_confidence_threshold"],
    "disagreement_cases": _consensus['disagreement_cases'],
}

metadata = {
    "version": "3.0",
    "creation_timestamp": pd.Timestamp.now().isoformat(),
    "pipeline_info": _pipeline_section,
    "ira_analysis": _ira_section,
    "categorical_features": _categorical_section,
    "continuous_features": _continuous_section,
    "labels": _labels_section,
    "dataset_stats": _stats_section,
    "processing_stats": pipeline_results['processing_stats'],
    "text_processing": {
        "max_length": 512,
        "text_column": "review_text",
    },
    "quality_metrics": _quality_section,
}

# Save all files
print("💾 Saving datasets and metadata...")

# Ensure directory exists
Path(BASE_DIR).mkdir(parents=True, exist_ok=True)

try:
    # Save datasets: one pickle per split, written in train / val / test order
    for _split_key, _split_path, _split_title in (
        ('train', PROCESSED_TRAIN_FILE, "training"),
        ('val', PROCESSED_VAL_FILE, "validation"),
        ('test', PROCESSED_TEST_FILE, "test"),
    ):
        with open(_split_path, "wb") as f:
            pickle.dump(datasets[_split_key], f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"✅ Saved {_split_title} data: {len(datasets[_split_key]['labels']):,} samples")

    # Save metadata as indented JSON
    with open(METADATA_FILE, "w") as f:
        json.dump(metadata, f, indent=4)
    print("✅ Saved metadata")

    # Save encoders for inference: the fitted label encoder, the per-feature
    # categorical encoders and the scaler go into one pickle.
    encoders_dict = {
        'label_encoder': label_encoder,
        'categorical_encoders': categorical_encoders,
        'scaler': scaler
    }
    with open(ENCODERS_FILE, "wb") as f:
        pickle.dump(encoders_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
    print("✅ Saved encoders")

    # Summarize what was written, listing file names without their directory
    print(f"\n📁 All files saved to: {BASE_DIR}")
    print("Files created:")
    for _saved_path in (
        PROCESSED_TRAIN_FILE,
        PROCESSED_VAL_FILE,
        PROCESSED_TEST_FILE,
        METADATA_FILE,
        ENCODERS_FILE,
    ):
        print(f"  - {os.path.basename(_saved_path)}")

except Exception as e:
    # Report the failure in the cell output, then re-raise so the run stops
    print(f"❌ Error saving files: {e!s}")
    raise

_banner_rule = "=" * 80
print("\n" + _banner_rule)
print("🎉 NOTEBOOK 2 COMPLETED SUCCESSFULLY!")
print(_banner_rule)
print("📊 FINAL")

📦 Packaging final datasets...
Creating dataset packages...
💾 Saving datasets and metadata...
✅ Saved training data: 4,012 samples
✅ Saved validation data: 502 samples
✅ Saved test data: 502 samples
✅ Saved metadata
✅ Saved encoders

📁 All files saved to: /content/drive/MyDrive/Tiktok_Hackaton/preprocessed_data
Files created:
  - processed_train.pkl
  - processed_val.pkl
  - processed_test.pkl
  - feature_metadata.json
  - encoders.pkl

🎉 NOTEBOOK 2 COMPLETED SUCCESSFULLY!
📊 FINAL
