In [1]:
import os
import zipfile

# Kaggle input directories that may contain zipped datasets
data_folders = [
    "/kaggle/input/test-labeled",
    "/kaggle/input/test-unlabeled",
    "/kaggle/input/train-data"
]

output_path = "/kaggle/working/"  # Each archive is extracted into its own subfolder here

# Track problems so the closing message reflects what actually happened
failed_archives = []
missing_folders = []

# Unzip every archive found directly inside each input folder
for folder in data_folders:
    if not os.path.isdir(folder):
        # Previously skipped silently; surface it so a wrong path is noticed
        missing_folders.append(folder)
        print(f"Warning: input folder not found - {folder}")
        continue

    for file in sorted(os.listdir(folder)):
        # Match the extension case-insensitively (.zip / .ZIP)
        if not file.lower().endswith(".zip"):
            continue

        file_path = os.path.join(folder, file)
        extract_path = os.path.join(output_path, os.path.splitext(file)[0])  # Extract to a subfolder

        if os.path.exists(extract_path):  # Avoid re-extracting
            print(f"Skipping extraction (already exists): {file}")
            continue

        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
            print(f"Extracted: {file} to {extract_path}")
        except zipfile.BadZipFile:
            failed_archives.append(file_path)
            print(f"Error: Corrupted zip file - {file}")

# Only claim success when nothing went wrong
if failed_archives or missing_folders:
    print(
        f"Finished with problems: {len(failed_archives)} corrupted archive(s), "
        f"{len(missing_folders)} missing folder(s)."
    )
else:
    print("All datasets processed successfully!")



All datasets processed successfully!


In [2]:
import os
import json
import pandas as pd
import zipfile
from pathlib import Path
import glob
import logging

# Configure logging: INFO and above, with timestamp and level in every line
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger used by the code defined in this cell
logger = logging.getLogger(__name__)

class MuShroomDataLoader:
    """
    Professional data loader for Mu-SHROOM hallucination detection project.
    Handles multiple file formats and provides comprehensive dataset analysis.
    """
    
    def __init__(self, base_paths=None):
        """
        Initialize the data loader with specified base paths.
        
        Args:
            base_paths (dict): Dictionary mapping dataset names to their paths
        """
        if base_paths is None:
            self.base_paths = {
                'test_labeled': '/kaggle/input/test-labeled/v1/',
                'test_unlabeled': '/kaggle/input/test-unlabeled/v1/',
                'train_data': '/kaggle/input/train-data/train/'
            }
        else:
            self.base_paths = base_paths
            
        self.loaded_datasets = {}
        self.file_registry = {}
        
    def discover_files(self):
        """
        Discover all files in the specified data directories.
        
        Returns:
            dict: Dictionary mapping dataset names to lists of file information
        """
        logger.info("Starting file discovery process")
        
        discovered_files = {}
        
        for dataset_name, path in self.base_paths.items():
            logger.info(f"Exploring dataset: {dataset_name}")
            logger.info(f"Path: {path}")
            
            if not os.path.exists(path):
                logger.warning(f"Path does not exist: {path}")
                discovered_files[dataset_name] = []
                continue
            
            files = []
            
            try:
                for root, dirs, filenames in os.walk(path):
                    for filename in filenames:
                        file_path = os.path.join(root, filename)
                        file_size = os.path.getsize(file_path)
                        file_ext = os.path.splitext(filename)[1].lower()
                        
                        relative_path = os.path.relpath(file_path, path)
                        
                        file_info = {
                            'name': relative_path,
                            'full_path': file_path,
                            'size_bytes': file_size,
                            'extension': file_ext,
                            'dataset': dataset_name
                        }
                        
                        files.append(file_info)
                        logger.debug(f"Found file: {relative_path} ({file_size} bytes)")
                
                logger.info(f"Dataset {dataset_name}: Found {len(files)} files")
                discovered_files[dataset_name] = files
                
            except Exception as e:
                logger.error(f"Error exploring {dataset_name}: {str(e)}")
                discovered_files[dataset_name] = []
        
        self.file_registry = discovered_files
        return discovered_files
    
    def load_json_file(self, file_path, max_records=None):
        """
        Load a JSON file and return its contents.
        
        Args:
            file_path (str): Path to the JSON file
            max_records (int, optional): Maximum number of records to load
            
        Returns:
            dict or list: Loaded JSON data
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            if isinstance(data, list) and max_records is not None:
                data = data[:max_records]
            
            logger.info(f"Successfully loaded JSON file: {os.path.basename(file_path)}")
            return data
            
        except Exception as e:
            logger.error(f"Error loading JSON file {file_path}: {str(e)}")
            return None
    
    def load_jsonl_file(self, file_path, max_records=None):
        """
        Load a JSONL file and return its contents as a list.
        
        Args:
            file_path (str): Path to the JSONL file
            max_records (int, optional): Maximum number of records to load
            
        Returns:
            list: List of JSON objects
        """
        try:
            data = []
            with open(file_path, 'r', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if max_records is not None and i >= max_records:
                        break
                    
                    line = line.strip()
                    if line:
                        data.append(json.loads(line))
            
            logger.info(f"Successfully loaded JSONL file: {os.path.basename(file_path)} ({len(data)} records)")
            return data
            
        except Exception as e:
            logger.error(f"Error loading JSONL file {file_path}: {str(e)}")
            return None
    
    def load_csv_file(self, file_path, max_records=None):
        """
        Load a CSV file and return as pandas DataFrame.
        
        Args:
            file_path (str): Path to the CSV file
            max_records (int, optional): Maximum number of records to load
            
        Returns:
            pandas.DataFrame: Loaded CSV data
        """
        try:
            if max_records is not None:
                data = pd.read_csv(file_path, nrows=max_records)
            else:
                data = pd.read_csv(file_path)
            
            logger.info(f"Successfully loaded CSV file: {os.path.basename(file_path)} ({data.shape[0]} rows)")
            return data
            
        except Exception as e:
            logger.error(f"Error loading CSV file {file_path}: {str(e)}")
            return None
    
    def extract_and_load_zip(self, file_path, max_records=None):
        """
        Extract and load contents from a ZIP file.
        
        Args:
            file_path (str): Path to the ZIP file
            max_records (int, optional): Maximum number of records to load per file
            
        Returns:
            dict: Dictionary mapping filenames to their loaded data
        """
        try:
            zip_contents = {}
            
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                file_list = zip_ref.namelist()
                logger.info(f"ZIP file contains {len(file_list)} files")
                
                for zip_filename in file_list:
                    if zip_filename.endswith('/'):  # Skip directories
                        continue
                    
                    file_ext = os.path.splitext(zip_filename)[1].lower()
                    
                    try:
                        if file_ext == '.json':
                            with zip_ref.open(zip_filename) as f:
                                data = json.load(f)
                                if isinstance(data, list) and max_records is not None:
                                    data = data[:max_records]
                                zip_contents[zip_filename] = data
                                
                        elif file_ext == '.jsonl':
                            data = []
                            with zip_ref.open(zip_filename) as f:
                                for i, line in enumerate(f):
                                    if max_records is not None and i >= max_records:
                                        break
                                    line = line.decode('utf-8').strip()
                                    if line:
                                        data.append(json.loads(line))
                            zip_contents[zip_filename] = data
                            
                        logger.debug(f"Loaded from ZIP: {zip_filename}")
                        
                    except Exception as e:
                        logger.warning(f"Could not load {zip_filename} from ZIP: {str(e)}")
            
            logger.info(f"Successfully processed ZIP file: {os.path.basename(file_path)}")
            return zip_contents
            
        except Exception as e:
            logger.error(f"Error processing ZIP file {file_path}: {str(e)}")
            return None
    
    def load_file(self, file_info, max_records=1000):
        """
        Load a single file based on its extension.
        
        Args:
            file_info (dict): File information dictionary
            max_records (int): Maximum number of records to load
            
        Returns:
            tuple: (success, data) where success is boolean and data is the loaded content
        """
        file_path = file_info['full_path']
        extension = file_info['extension']
        
        # Skip very small files (likely metadata or empty)
        if file_info['size_bytes'] < 50:
            logger.debug(f"Skipping small file: {file_info['name']} ({file_info['size_bytes']} bytes)")
            return False, None
        
        if extension == '.json':
            data = self.load_json_file(file_path, max_records)
            return data is not None, data
            
        elif extension == '.jsonl':
            data = self.load_jsonl_file(file_path, max_records)
            return data is not None, data
            
        elif extension == '.csv':
            data = self.load_csv_file(file_path, max_records)
            return data is not None, data
            
        elif extension == '.zip':
            data = self.extract_and_load_zip(file_path, max_records)
            return data is not None, data
            
        else:
            logger.debug(f"Unsupported file type: {extension}")
            return False, None
    
    def load_all_datasets(self, max_records_per_file=1000):
        """
        Load all discovered datasets.
        
        Args:
            max_records_per_file (int): Maximum records to load per file
            
        Returns:
            dict: Dictionary mapping dataset names to their loaded data
        """
        logger.info("Starting dataset loading process")
        
        if not self.file_registry:
            self.discover_files()
        
        loaded_data = {}
        
        for dataset_name, files in self.file_registry.items():
            logger.info(f"Loading dataset: {dataset_name}")
            
            dataset_data = {}
            successful_loads = 0
            
            for file_info in files:
                success, data = self.load_file(file_info, max_records_per_file)
                
                if success:
                    dataset_data[file_info['name']] = {
                        'data': data,
                        'file_info': file_info,
                        'loaded_at': pd.Timestamp.now()
                    }
                    successful_loads += 1
            
            loaded_data[dataset_name] = dataset_data
            logger.info(f"Dataset {dataset_name}: Successfully loaded {successful_loads}/{len(files)} files")
        
        self.loaded_datasets = loaded_data
        return loaded_data
    
    def analyze_dataset_structure(self, dataset_name=None):
        """
        Analyze the structure of loaded datasets.
        
        Args:
            dataset_name (str, optional): Specific dataset to analyze. If None, analyzes all.
            
        Returns:
            dict: Analysis results
        """
        if not self.loaded_datasets:
            logger.warning("No datasets loaded. Call load_all_datasets() first.")
            return {}
        
        datasets_to_analyze = [dataset_name] if dataset_name else list(self.loaded_datasets.keys())
        analysis_results = {}
        
        for ds_name in datasets_to_analyze:
            if ds_name not in self.loaded_datasets:
                logger.warning(f"Dataset {ds_name} not found in loaded datasets")
                continue
            
            logger.info(f"Analyzing dataset structure: {ds_name}")
            
            dataset = self.loaded_datasets[ds_name]
            analysis = {
                'file_count': len(dataset),
                'total_records': 0,
                'record_types': {},
                'common_keys': None,
                'sample_records': [],
                'data_schema': {}
            }
            
            all_keys_sets = []
            
            for filename, file_data in dataset.items():
                data = file_data['data']
                
                if isinstance(data, list):
                    analysis['total_records'] += len(data)
                    analysis['record_types'][filename] = f"list ({len(data)} items)"
                    
                    if len(data) > 0 and isinstance(data[0], dict):
                        keys = set(data[0].keys())
                        all_keys_sets.append(keys)
                        
                        # Store sample records
                        analysis['sample_records'].extend(data[:2])
                        
                        # Analyze data types
                        for key, value in data[0].items():
                            if key not in analysis['data_schema']:
                                analysis['data_schema'][key] = set()
                            analysis['data_schema'][key].add(type(value).__name__)
                
                elif isinstance(data, dict):
                    # Handle ZIP contents or nested dictionaries
                    for sub_key, sub_data in data.items():
                        if isinstance(sub_data, list) and len(sub_data) > 0:
                            analysis['total_records'] += len(sub_data)
                            if isinstance(sub_data[0], dict):
                                keys = set(sub_data[0].keys())
                                all_keys_sets.append(keys)
                                analysis['sample_records'].extend(sub_data[:2])
                
                elif isinstance(data, pd.DataFrame):
                    analysis['total_records'] += len(data)
                    analysis['record_types'][filename] = f"dataframe ({data.shape[0]}x{data.shape[1]})"
                    keys = set(data.columns)
                    all_keys_sets.append(keys)
                    analysis['sample_records'].extend(data.head(2).to_dict('records'))
            
            # Find common keys across all files
            if all_keys_sets:
                analysis['common_keys'] = set.intersection(*all_keys_sets) if all_keys_sets else set()
                analysis['all_unique_keys'] = set.union(*all_keys_sets) if all_keys_sets else set()
            
            # Clean up data schema
            for key, types in analysis['data_schema'].items():
                analysis['data_schema'][key] = list(types)
            
            analysis_results[ds_name] = analysis
            
            # Log analysis summary
            logger.info(f"Dataset {ds_name} analysis:")
            logger.info(f"  - Files: {analysis['file_count']}")
            logger.info(f"  - Total records: {analysis['total_records']}")
            logger.info(f"  - Common keys: {len(analysis['common_keys'])} keys")
            logger.info(f"  - Unique keys across all files: {len(analysis.get('all_unique_keys', set()))} keys")
        
        return analysis_results
    
    def get_summary_report(self):
        """
        Generate a comprehensive summary report of all loaded data.
        
        Returns:
            dict: Summary report
        """
        if not self.loaded_datasets:
            return {"error": "No datasets loaded"}
        
        total_files = 0
        total_records = 0
        datasets_summary = {}
        
        analysis = self.analyze_dataset_structure()
        
        for dataset_name, dataset_analysis in analysis.items():
            total_files += dataset_analysis['file_count']
            total_records += dataset_analysis['total_records']
            
            datasets_summary[dataset_name] = {
                'files': dataset_analysis['file_count'],
                'records': dataset_analysis['total_records'],
                'common_keys': list(dataset_analysis['common_keys']),
                'sample_schema': dataset_analysis['data_schema']
            }
        
        summary = {
            'total_datasets': len(self.loaded_datasets),
            'total_files': total_files,
            'total_records': total_records,
            'datasets': datasets_summary,
            'analysis_timestamp': pd.Timestamp.now().isoformat()
        }
        
        return summary

def main():
    """
    Run discovery, loading, analysis and reporting with the default loader,
    then print a short summary.

    Returns:
        tuple: (loader, loaded_data, analysis_results, summary)
    """
    loader = MuShroomDataLoader()

    # Discovery result is kept on the loader; the return value is not needed here
    logger.info("MU-SHROOM Data Loading Pipeline - Starting")
    loader.discover_files()

    loaded_data = loader.load_all_datasets(max_records_per_file=1000)
    analysis_results = loader.analyze_dataset_structure()
    summary = loader.get_summary_report()

    # Overall totals
    print("\nMU-SHROOM DATA LOADING SUMMARY")
    print("=" * 50)
    print(f"Datasets loaded: {summary['total_datasets']}")
    print(f"Total files: {summary['total_files']}")
    print(f"Total records: {summary['total_records']}")

    # Per-dataset figures
    print("\nDATASET BREAKDOWN:")
    report_fields = (("Files", 'files'), ("Records", 'records'), ("Common keys", 'common_keys'))
    for dataset_name, info in summary['datasets'].items():
        print(f"  {dataset_name}:")
        for label, field in report_fields:
            print(f"    {label}: {info[field]}")

    return loader, loaded_data, analysis_results, summary

# Execute the pipeline
# Runs only when this code executes as the top-level module. The four values
# returned by main() are bound at module scope under the names below.
if __name__ == "__main__":
    loader, data, analysis, summary = main()
    print("\nData loading complete. Ready for preprocessing and feature engineering.")


MU-SHROOM DATA LOADING SUMMARY
Datasets loaded: 3
Total files: 32
Total records: 6305

DATASET BREAKDOWN:
  test_labeled:
    Files: 14
    Records: 1902
    Common keys: ['model_output_tokens', 'hard_labels', 'soft_labels', 'lang', 'model_input', 'model_output_logits', 'model_output_text', 'model_id', 'id']
  test_unlabeled:
    Files: 14
    Records: 1902
    Common keys: ['model_output_tokens', 'lang', 'model_input', 'model_output_logits', 'model_output_text', 'model_id', 'id']
  train_data:
    Files: 4
    Records: 2501
    Common keys: ['model_output_tokens', 'lang', 'model_input', 'model_output_logits', 'model_output_text', 'model_id']

Data loading complete. Ready for preprocessing and feature engineering.


In [3]:
import pandas as pd
import numpy as np
import json
import re
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# NOTE(review): logging.basicConfig is documented to leave the root logger untouched
# when it already has handlers, so this call may have no effect if an earlier cell
# has already configured logging — confirm the repeat call is intended.
logging.basicConfig(level=logging.INFO)
# Logger used by the classes defined below
logger = logging.getLogger(__name__)

class SimpleMuShroomProcessor:
    """
    Simplified Mu-SHROOM processor that works without external NLTK dependencies.
    """
    
    def __init__(self):
        """Create the stopword set, the TF-IDF vectorizer and the feature scaler."""
        self.scaler = StandardScaler()
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        self.stop_words = self._get_stopwords()
        
    def _get_stopwords(self):
        """Basic English stopwords."""
        return {
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
            'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
            'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
            'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
            'by', 'for', 'with', 'through', 'during', 'before', 'after', 'above', 'below',
            'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
            'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
            'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just'
        }
    
    def simple_tokenize(self, text):
        """Simple tokenization without NLTK."""
        if pd.isna(text) or not text:
            return []
        text = str(text).lower()
        # Remove punctuation and split
        text = text.translate(str.maketrans('', '', string.punctuation))
        return [word for word in text.split() if word]
    
    def simple_sentence_split(self, text):
        """Simple sentence splitting."""
        if pd.isna(text) or not text:
            return []
        sentences = re.split(r'[.!?]+', str(text))
        return [s.strip() for s in sentences if s.strip()]
    
    def extract_basic_features(self, text):
        """Extract basic text features."""
        if pd.isna(text) or not text:
            return {
                'char_count': 0, 'word_count': 0, 'sentence_count': 0,
                'avg_word_length': 0, 'avg_sentence_length': 0,
                'punctuation_count': 0, 'uppercase_ratio': 0, 'digit_count': 0,
                'stopword_ratio': 0, 'exclamation_count': 0, 'question_count': 0
            }
        
        text = str(text)
        words = self.simple_tokenize(text)
        sentences = self.simple_sentence_split(text)
        
        # Basic statistics
        char_count = len(text)
        word_count = len(words)
        sentence_count = max(len(sentences), 1)  # Avoid division by zero
        
        avg_word_length = np.mean([len(word) for word in words]) if words else 0
        avg_sentence_length = word_count / sentence_count
        
        # Count features
        punctuation_count = sum(1 for char in text if char in '.,!?;:')
        uppercase_count = sum(1 for char in text if char.isupper())
        uppercase_ratio = uppercase_count / char_count if char_count > 0 else 0
        digit_count = sum(1 for char in text if char.isdigit())
        
        # Stopword ratio
        stopword_count = sum(1 for word in words if word in self.stop_words)
        stopword_ratio = stopword_count / word_count if word_count > 0 else 0
        
        # Special punctuation
        exclamation_count = text.count('!')
        question_count = text.count('?')
        
        return {
            'char_count': char_count,
            'word_count': word_count,
            'sentence_count': sentence_count,
            'avg_word_length': float(avg_word_length),
            'avg_sentence_length': float(avg_sentence_length),
            'punctuation_count': punctuation_count,
            'uppercase_ratio': float(uppercase_ratio),
            'digit_count': digit_count,
            'stopword_ratio': float(stopword_ratio),
            'exclamation_count': exclamation_count,
            'question_count': question_count
        }
    
    def extract_similarity_features(self, input_text, output_text):
        """Extract similarity features between input and output."""
        if pd.isna(input_text) or pd.isna(output_text) or not input_text or not output_text:
            return {
                'tfidf_similarity': 0.0, 'word_overlap_jaccard': 0.0,
                'word_overlap_dice': 0.0, 'length_ratio': 0.0, 'compression_ratio': 1.0
            }
        
        input_text = str(input_text)
        output_text = str(output_text)
        
        # TF-IDF similarity
        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform([input_text, output_text])
            tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        except:
            tfidf_similarity = 0.0
        
        # Word overlap
        input_words = set(self.simple_tokenize(input_text))
        output_words = set(self.simple_tokenize(output_text))
        
        intersection = len(input_words & output_words)
        union = len(input_words | output_words)
        
        jaccard_similarity = intersection / union if union > 0 else 0.0
        dice_similarity = (2 * intersection) / (len(input_words) + len(output_words)) if (len(input_words) + len(output_words)) > 0 else 0.0
        
        # Length ratios
        length_ratio = len(output_text) / len(input_text) if len(input_text) > 0 else 0.0
        compression_ratio = len(input_text) / len(output_text) if len(output_text) > 0 else 1.0
        
        return {
            'tfidf_similarity': float(tfidf_similarity),
            'word_overlap_jaccard': float(jaccard_similarity),
            'word_overlap_dice': float(dice_similarity),
            'length_ratio': float(length_ratio),
            'compression_ratio': float(compression_ratio)
        }
    
    def extract_logits_features(self, logits):
        """Extract features from logits."""
        if pd.isna(logits) or not logits:
            return {
                'logits_mean': 0.0, 'logits_std': 0.0, 'logits_max': 0.0,
                'logits_min': 0.0, 'logits_range': 0.0, 'logits_entropy': 0.0
            }
        
        try:
            if isinstance(logits, str):
                logits = json.loads(logits)
            
            logits_array = np.array(logits, dtype=float)
            
            # Basic statistics
            logits_mean = np.mean(logits_array)
            logits_std = np.std(logits_array)
            logits_max = np.max(logits_array)
            logits_min = np.min(logits_array)
            logits_range = logits_max - logits_min
            
            # Entropy (uncertainty measure)
            probs = np.exp(logits_array) / np.sum(np.exp(logits_array))
            entropy = -np.sum(probs * np.log(probs + 1e-10))
            
            return {
                'logits_mean': float(logits_mean),
                'logits_std': float(logits_std),
                'logits_max': float(logits_max),
                'logits_min': float(logits_min),
                'logits_range': float(logits_range),
                'logits_entropy': float(entropy)
            }
            
        except Exception as e:
            logger.warning(f"Error processing logits: {e}")
            return {
                'logits_mean': 0.0, 'logits_std': 0.0, 'logits_max': 0.0,
                'logits_min': 0.0, 'logits_range': 0.0, 'logits_entropy': 0.0
            }
    
    def extract_hallucination_indicators(self, input_text, output_text):
        """Extract hallucination indicator features."""
        if pd.isna(input_text) or pd.isna(output_text) or not input_text or not output_text:
            return {'new_numbers': 0, 'new_caps_words': 0, 'definitive_words': 0, 'uncertain_words': 0}
        
        input_text = str(input_text).lower()
        output_text = str(output_text).lower()
        
        # Numbers that appear in output but not input
        input_numbers = set(re.findall(r'\b\d+\b', input_text))
        output_numbers = set(re.findall(r'\b\d+\b', output_text))
        new_numbers = len(output_numbers - input_numbers)
        
        # Capitalized words (potential named entities)
        input_caps = set(re.findall(r'\b[A-Z][a-z]+\b', str(input_text)))
        output_caps = set(re.findall(r'\b[A-Z][a-z]+\b', str(output_text)))
        new_caps_words = len(output_caps - input_caps)
        
        # Definitive and uncertain language
        definitive_words = ['definitely', 'certainly', 'absolutely', 'clearly', 'obviously']
        uncertain_words = ['might', 'could', 'possibly', 'perhaps', 'maybe', 'allegedly']
        
        definitive_count = sum(output_text.count(word) for word in definitive_words)
        uncertain_count = sum(output_text.count(word) for word in uncertain_words)
        
        return {
            'new_numbers': new_numbers,
            'new_caps_words': new_caps_words,
            'definitive_words': definitive_count,
            'uncertain_words': uncertain_count
        }
    
    def process_record(self, record):
        """Process a single record and extract all features."""
        features = {}
        
        # Extract texts
        input_text = record.get('model_input', '')
        output_text = record.get('model_output_text', '')
        logits = record.get('model_output_logits', [])
        
        # Metadata
        features['model_id'] = record.get('model_id', '')
        features['lang'] = record.get('lang', 'unknown')
        
        # Basic features for input and output
        input_features = self.extract_basic_features(input_text)
        output_features = self.extract_basic_features(output_text)
        
        # Add prefixes
        for key, value in input_features.items():
            features[f'input_{key}'] = value
        for key, value in output_features.items():
            features[f'output_{key}'] = value
        
        # Similarity features
        similarity_features = self.extract_similarity_features(input_text, output_text)
        features.update(similarity_features)
        
        # Logits features
        logits_features = self.extract_logits_features(logits)
        features.update(logits_features)
        
        # Hallucination indicators
        hallucination_features = self.extract_hallucination_indicators(input_text, output_text)
        features.update(hallucination_features)
        
        # Labels (if available)
        if 'hard_labels' in record:
            features['label'] = record['hard_labels']
        if 'soft_labels' in record:
            features['soft_label'] = record['soft_labels']
        
        return features
    
    def process_dataset(self, dataset_data, dataset_name):
        """Process entire dataset."""
        logger.info(f"Processing {dataset_name}")
        
        all_records = []
        for filename, file_data in dataset_data.items():
            data = file_data['data']
            if isinstance(data, list):
                all_records.extend(data)
            elif isinstance(data, dict):
                for sub_key, sub_data in data.items():
                    if isinstance(sub_data, list):
                        all_records.extend(sub_data)
        
        processed_features = []
        for i, record in enumerate(all_records):
            try:
                features = self.process_record(record)
                features['dataset'] = dataset_name
                processed_features.append(features)
                
                if (i + 1) % 500 == 0:
                    logger.info(f"Processed {i + 1}/{len(all_records)} records")
                    
            except Exception as e:
                logger.warning(f"Error processing record {i}: {e}")
        
        return pd.DataFrame(processed_features)

class SimpleModelTrainer:
    """Simple model trainer for hallucination detection.

    Fits a fixed set of scikit-learn classifiers (random forest, logistic
    regression, SVM), scores each one with 5-fold stratified cross-validation,
    and evaluates the fitted models on a held-out split.
    """

    def __init__(self):
        # name -> fitted estimator; filled in by train_models()
        self.models = {}
        # name -> {'cv_f1_mean', 'cv_f1_std'}; filled in by train_models()
        self.results = {}

    def prepare_features(self, df_list, target_col='label'):
        """Prepare features for modeling.

        Args:
            df_list: DataFrames to stack row-wise (index is reset).
            target_col: Name of the column holding the target.

        Returns:
            Tuple ``(X, y, feature_cols, combined_df)`` where ``X`` is the
            feature frame with missing values replaced by 0, ``y`` is the
            target column (``None`` when ``target_col`` is absent),
            ``feature_cols`` is the list of feature column names and
            ``combined_df`` is the stacked input.
        """
        # Combine all dataframes
        combined_df = pd.concat(df_list, ignore_index=True)

        # Metadata and target columns must never be fed to the model. The
        # requested target is excluded as well so that a non-default
        # target_col cannot leak into X.
        excluded = {'model_id', 'lang', 'dataset', 'label', 'soft_label', target_col}
        feature_cols = [col for col in combined_df.columns if col not in excluded]

        X = combined_df[feature_cols].fillna(0)
        y = combined_df[target_col] if target_col in combined_df.columns else None

        return X, y, feature_cols, combined_df

    def train_models(self, X_train, y_train):
        """Train classification models.

        Each model is first scored with 5-fold stratified cross-validation
        (F1) and then refitted on the full training data.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels.

        Returns:
            Tuple ``(trained_models, results)``: fitted estimators by name and
            the per-model ``cv_f1_mean`` / ``cv_f1_std``. Both are also stored
            on ``self.models`` / ``self.results``.
        """
        models = {
            'random_forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
            'svm': SVC(probability=True, random_state=42)
        }

        results = {}
        trained_models = {}

        # Cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        for name, model in models.items():
            logger.info(f"Training {name}")

            # Cross-validation scores
            cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')

            # Train on full dataset
            model.fit(X_train, y_train)

            results[name] = {
                'cv_f1_mean': np.mean(cv_scores),
                'cv_f1_std': np.std(cv_scores)
            }
            trained_models[name] = model

            logger.info(f"{name}: CV F1 = {results[name]['cv_f1_mean']:.3f} (+/- {results[name]['cv_f1_std']:.3f})")

        self.models = trained_models
        self.results = results
        return trained_models, results

    def evaluate_on_test(self, X_test, y_test):
        """Evaluate trained models on test set.

        Args:
            X_test: Held-out feature matrix.
            y_test: Held-out labels.

        Returns:
            Dict mapping model name to ``{'f1_score', 'auc_score',
            'classification_report'}``. ``auc_score`` is ``None`` for models
            that do not expose ``predict_proba``.
        """
        test_results = {}

        for name, model in self.models.items():
            y_pred = model.predict(X_test)
            # Column 1 of predict_proba is used as the positive-class score.
            y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

            test_results[name] = {
                'f1_score': f1,
                'auc_score': auc,
                'classification_report': classification_report(y_test, y_pred)
            }

            # Format the AUC outside the f-string: a conditional placed after
            # the ':' of a replacement field is parsed as the format spec and
            # raises at runtime. Compare against None so 0.0 is still printed.
            auc_text = f"{auc:.3f}" if auc is not None else 'N/A'
            logger.info(f"{name} Test Results: F1={f1:.3f}, AUC={auc_text}")

        return test_results

def run_simple_pipeline(loader_data):
    """Featurize every loaded dataset and, when labels exist, train and evaluate models.

    Args:
        loader_data: Mapping of dataset name -> dataset payload, passed
            entry by entry to ``SimpleMuShroomProcessor.process_dataset``.

    Returns:
        Dict that always contains ``'processed_datasets'`` and
        ``'feature_columns'``. When the ``'test_labeled'`` dataset has a
        ``'label'`` column it also contains ``'models'``, ``'cv_results'``,
        ``'test_results'`` and ``'scaler'``; otherwise ``'feature_columns'``
        is ``None`` and a ``'message'`` explains that no training happened.
    """
    logger.info("Starting Simple Mu-SHROOM Pipeline")

    processor = SimpleMuShroomProcessor()

    # Featurize each dataset independently.
    processed_datasets = {}
    for dataset_name, dataset_data in loader_data.items():
        frame = processor.process_dataset(dataset_data, dataset_name)
        processed_datasets[dataset_name] = frame
        logger.info(f"Processed {dataset_name}: {len(frame)} records, {len(frame.columns)} features")

    trainer = SimpleModelTrainer()

    train_df = processed_datasets.get('train_data')
    test_labeled_df = processed_datasets.get('test_labeled')
    test_unlabeled_df = processed_datasets.get('test_unlabeled')

    # Report which of the expected datasets were found.
    logger.info("Dataset summary:")
    if train_df is not None:
        logger.info(f"  Training data: {len(train_df)} records")
    if test_labeled_df is not None:
        logger.info(f"  Test labeled: {len(test_labeled_df)} records")
        logger.info(f"  Labels available: {'label' in test_labeled_df.columns}")
    if test_unlabeled_df is not None:
        logger.info(f"  Test unlabeled: {len(test_unlabeled_df)} records")

    # Without labels there is nothing to fit; hand back the features only.
    has_labels = test_labeled_df is not None and 'label' in test_labeled_df.columns
    if not has_labels:
        logger.info("No labeled data available for supervised learning")
        return {
            'processed_datasets': processed_datasets,
            'feature_columns': None,
            'message': 'Data processed successfully, but no labels available for model training'
        }

    # The labeled test set is the only supervised source: split it 70/30.
    X, y, feature_cols, _ = trainer.prepare_features([test_labeled_df])

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler on the training part only, then apply it to both parts.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models, cv_results = trainer.train_models(X_train_scaled, y_train)
    test_results = trainer.evaluate_on_test(X_test_scaled, y_test)

    return {
        'processed_datasets': processed_datasets,
        'models': models,
        'cv_results': cv_results,
        'test_results': test_results,
        'feature_columns': feature_cols,
        'scaler': scaler
    }

# Example invocation (loaded_data maps dataset name -> dataset payload):
#   results = run_simple_pipeline(loaded_data)
print(
    "Simple Mu-SHROOM pipeline ready! "
    "Use: results = run_simple_pipeline(loaded_data)"
)

Simple Mu-SHROOM pipeline ready! Use: results = run_simple_pipeline(loaded_data)
