# In [None]:
import re
import nltk
import faiss
import requests
import numpy as np
import pandas as pd
import random
import json
import math
import time
import os
from datetime import datetime
from functools import lru_cache
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.cluster import normalized_mutual_info_score
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any
from dataclasses import dataclass
from transformers import pipeline

# ======================================
# ENVIRONMENT SETUP
# ======================================

# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning - confirm the intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Download the NLTK resources requested by this module ('stopwords' is read by
# preprocess(); 'punkt' and 'wordnet' presumably back word_tokenize and
# WordNetLemmatizer - verify). quiet=True is passed to every download call.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# ======================================
# GOLDEN EXAMPLES FOR CALIBRATION
# ======================================

# Hand-written reference groups used to calibrate coherence grading.
# Each entry holds a group name, three documents, a reference coherence score
# (10 = single unified theme, 5 = partial overlap, 1 = unrelated topics) and a
# written explanation of why that score applies.
GOLDEN_EXAMPLES = [
    {
        "group_name": "Golden Example: Technology",
        "documents": [
            "Cloud computing has transformed how businesses manage their IT infrastructure. Companies can now scale their computing resources on demand without investing in physical hardware. Services like AWS, Azure, and Google Cloud provide flexible options for storage, computation, and specialized services.",
            "Edge computing is gaining popularity as IoT devices proliferate. By processing data closer to where it's generated rather than sending everything to centralized cloud servers, edge computing reduces latency and bandwidth usage. Smart cities and autonomous vehicles benefit greatly from this distributed computing approach.",
            "Quantum computing promises to revolutionize computational capabilities for specific problems. Using quantum bits or qubits that can exist in multiple states simultaneously, these systems can potentially solve complex optimization problems exponentially faster than classical computers."
        ],
        "coherence_score": 10,
        "explanation": "These documents all discuss modern computing paradigms (cloud, edge, and quantum computing). They share technical vocabulary, focus on the same general domain of computing infrastructure, and each explains how a specific technology impacts computing capabilities. This group demonstrates perfect coherence with a clear unified theme."
    },
    {
        "group_name": "Golden Example: Partially Related",
        "documents": [
            "Renewable energy sources like solar and wind power are becoming increasingly important in the global energy mix. As technology improves and costs decrease, these clean energy options are becoming more competitive with fossil fuels.",
            "Electric vehicles are gaining market share in the automotive industry. Major manufacturers are investing billions in developing new EV models with longer ranges and shorter charging times to appeal to mainstream consumers.",
            "Urban planning in modern cities increasingly incorporates green spaces and pedestrian-friendly zones. These design choices help reduce urban heat islands and improve air quality for residents."
        ],
        "coherence_score": 5,
        "explanation": "These documents share some thematic connections around sustainability and modern infrastructure, but discuss different specific topics (energy production, transportation, and urban design). They have partial topical overlap through environmental themes, but each focuses on a distinct domain with different terminology and concepts. This represents moderate coherence with some connecting threads but no single unified topic."
    },
    {
        "group_name": "Golden Example: Mixed Topics",
        "documents": [
            "Photosynthesis is the process by which plants convert light energy into chemical energy. This process produces oxygen as a byproduct and is essential for maintaining Earth's atmosphere.",
            "The French Revolution began in 1789 and led to far-reaching social and political changes in France. Key events included the Storming of the Bastille and the Reign of Terror.",
            "JavaScript is a programming language commonly used for web development. It allows developers to create interactive elements on websites and runs directly in the user's browser."
        ],
        "coherence_score": 1,
        "explanation": "These documents cover completely different topics (biology, history, and computer science) with no meaningful connection between them. They use different terminology, discuss unrelated concepts, and share no common themes. This group demonstrates minimum coherence with no unified topic."
    }
]

# ======================================
# DATA CLASSES
# ======================================

@dataclass
class LLMResponse:
    """Container for one language-model reply and the values parsed from it.

    Only ``content`` is required. Every other field defaults to ``None``,
    except that a ``None`` ``metadata`` is swapped for a fresh empty dict
    right after construction.
    """
    content: str
    score: float = None
    topic: str = None
    confidence: float = None
    error: str = None
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        # Each instance gets its own dict rather than a shared mutable default.
        self.metadata = {} if self.metadata is None else self.metadata

@dataclass
class Document:
    """Represents a document with its content and embedding.

    ``embedding`` defaults to ``None`` until one is supplied. ``metadata``
    is always a dict after construction: a ``None`` value is replaced by a
    fresh empty dict so callers can read or update it without a None check.
    """
    content: str
    embedding: np.ndarray = None
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        # Per-instance dict; avoids both a shared mutable default and
        # None-handling at every call site.
        if self.metadata is None:
            self.metadata = {}

# ======================================
# RAG SYSTEM
# ======================================

class RAGSystem:
    """Retrieval-Augmented Generation system for document retrieval.

    Documents are embedded with a SentenceTransformer model, kept in an
    in-memory list (``document_store``) and indexed with a flat L2 FAISS
    index that is rebuilt from the whole store after every successful
    ``add_documents`` call.
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2"):
        """
        Initializes the RAG system:
        Loads the specified embedding model.
        Creates an empty document store and sets the FAISS index to None.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.document_store = []
        self.index = None

    @staticmethod
    def _to_numpy(embedding):
        """Return *embedding* as a float32 NumPy array.

        Tensor-like inputs (anything exposing ``detach``) are detached and
        moved to the CPU first, which works for any accelerator device, not
        only CUDA. Array-like inputs are converted directly.
        """
        if hasattr(embedding, "detach"):
            embedding = embedding.detach().cpu().numpy()
        return np.asarray(embedding, dtype="float32")

    def add_documents(self, documents: List[str]):
        """
        Adds documents to the system and prepares them for retrieval.

        Parameters:
        -----------
        documents : List[str]
            Texts to embed and store. A single string is treated as a
            one-document list; ``None`` or an empty list is a no-op.

        Errors are reported on stdout (with a traceback) rather than raised.
        The store is only extended once every embedding of the batch has
        been converted, so a failing batch leaves the store unchanged.
        """
        if isinstance(documents, str):
            # A bare string would otherwise be iterated character by character.
            documents = [documents]
        if documents is None or len(documents) == 0:
            return

        try:
            # Get embeddings from the model
            embeddings = self.embedding_model.encode(documents, convert_to_tensor=True)

            # Build the new entries first so a mid-batch failure cannot
            # leave the store out of sync with the index.
            new_entries = [
                Document(content=doc, embedding=self._to_numpy(emb))
                for doc, emb in zip(documents, embeddings)
            ]
            self.document_store.extend(new_entries)

            # Update the search index
            self._update_index()

        except Exception as e:
            print(f"Error adding documents to RAG system: {e}")
            # Provide more detailed error info for debugging
            import traceback
            traceback.print_exc()

    def _update_index(self):
        """
        Updates the FAISS index with embeddings from all stored documents.

        The index is recreated from scratch on each call. Failures are
        printed and leave the previous index (if any) in place.
        """
        try:
            if not self.document_store:
                print("Warning: Document store is empty, no index created")
                return

            # Stack all embeddings into one (n_documents, dimension) matrix
            embeddings = np.vstack([doc.embedding for doc in self.document_store])
            dimension = embeddings.shape[1]

            # Create and populate the index
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings.astype('float32'))

        except Exception as e:
            print(f"Error updating FAISS index: {e}")

    def retrieve_relevant_docs(self, query: str, k: int = 3):
        """
        Retrieves the top-k documents most relevant to a given query.

        Parameters:
        -----------
        query : str
            Text to search for.
        k : int, default=3
            Maximum number of documents to return. Capped at the number of
            stored documents.

        Returns:
        --------
        list
            Matching entries from ``document_store``, in the order returned
            by the index search. Empty when no index exists, ``k`` is not
            positive, or an error occurs (errors are printed, not raised).
        """
        try:
            # Explicit None check: do not rely on the truthiness of the
            # index object itself.
            if self.index is None:
                print("Warning: Index not initialized, no documents can be retrieved")
                return []

            store_size = len(self.document_store)
            k = min(k, store_size)
            if k <= 0:
                return []

            # Encode the query and shape it as a single-row float32 matrix
            query_embedding = self._to_numpy(
                self.embedding_model.encode([query])[0]
            ).reshape(1, -1)

            # Search the index
            _, neighbor_ids = self.index.search(query_embedding, k)

            # FAISS pads missing neighbours with -1; a bare upper-bound check
            # would let -1 through and silently return the last document.
            return [
                self.document_store[i]
                for i in neighbor_ids[0]
                if 0 <= i < store_size
            ]

        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []

# ======================================
# TEXT PROCESSING UTILITIES
# ======================================
def find_pattern_safely(pattern, text, default=None):
    """
    Return the stripped first capture group of *pattern* in *text*, or *default*.

    The search is case-insensitive, multiline and dot-all. Nothing is raised:
    falsy text, a non-matching pattern, or any error while matching (which is
    printed) all yield the default.

    Parameters:
    -----------
    pattern : str
        Regular expression with at least one capturing group.

    text : str or None
        Text to search. ``None`` or an empty string returns *default*.

    default : any, optional
        Value returned when there is no match or matching fails.
        Defaults to None.

    Returns:
    --------
    str or default
        Whitespace-stripped content of the first capturing group, or *default*.
    """
    if not text:
        return default

    search_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    try:
        found = re.search(pattern, text, search_flags)
        result = found.group(1).strip() if found else default
    except Exception as e:
        print(f"Error matching pattern {pattern}: {e}")
        result = default
    return result


def clean_topic(topic):
    """
    Clean and normalize a topic string by removing artifacts and standardizing format.

    Transformations, in order:
    - Trim leading/trailing whitespace (done first so the prefix patterns
      below, which are anchored at the start, see the real first character)
    - Remove a numeric prefix (e.g., "1. Topic" -> "Topic")
    - Remove a dash prefix (e.g., "- Topic" -> "Topic")
    - Remove terms of the form "<digits>millisecond"
    - Replace " and " with " & "
    - Collapse runs of whitespace to single spaces
    - Reject very short topics (fewer than 3 characters)

    Parameters:
    -----------
    topic : str or None
        The topic string to clean and normalize.

    Returns:
    --------
    str or None
        The cleaned topic, or None if the input is None, empty, not a string,
        or shorter than 3 characters after cleaning.
    """
    if not topic or not isinstance(topic, str):
        return None

    topic = topic.strip()
    topic = re.sub(r'^\d+\.\s*', '', topic)
    topic = re.sub(r'^-\s*', '', topic)
    topic = re.sub(r'\b\d+millisecond\b', '', topic)
    topic = re.sub(r'\s+and\s+', ' & ', topic)
    # split()/join both collapses inner whitespace and trims the ends.
    topic = ' '.join(topic.split())

    if len(topic) < 3:
        return None

    return topic


def clean_text(text):
    """
    Normalize raw text into a lowercase, single-spaced string.

    Applied in order:
    - Non-string input short-circuits to an empty string
    - Anything starting with "http" up to the next whitespace is dropped
    - Angle-bracket tags are dropped
    - Characters that are neither word characters nor whitespace become spaces
    - Digit runs become spaces
    - Whitespace runs collapse to one space
    - The result is trimmed and lowercased

    Parameters:
    -----------
    text : str or any
        The text to clean. If not a string, returns an empty string.

    Returns:
    --------
    str
        The cleaned, lowercased text.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs; order matters because URLs and tags must
    # be removed before their punctuation is turned into spaces.
    substitutions = (
        (r'http\S+', ''),
        (r'<.*?>', ''),
        (r'[^\w\s]', ' '),
        (r'\d+', ' '),
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()
    
@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and reuse it across calls."""
    return frozenset(stopwords.words('english'))


def preprocess(text):
    """
    Tokenize and preprocess text by removing stopwords and keeping only alphabetic tokens.

    The text is lowercased before tokenization. The stop-word set is built
    on first use and cached, so repeated calls do not reload the corpus.

    Parameters:
    -----------
    text : str
        Raw text to be preprocessed. Non-string input yields an empty list.

    Returns:
    --------
    List[str]
        List of tokenized words with stopwords removed
    """
    if not isinstance(text, str):
        return []

    stop_words = _english_stop_words()
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

def preprocess_text_minimal(text):
    """
    Lightly normalize a document while keeping most of its wording.

    Steps, in order:
    1. Empty or non-string input returns "empty document"
    2. Anything starting with "http" up to the next whitespace becomes 'URL'
    3. Digit runs become 'NUM'
    4. Characters that are neither word characters nor whitespace become spaces
    5. Whitespace runs collapse to one space
    6. Text is lowercased (so the placeholders end up as 'url' / 'num')
    7. A small fixed set of English stopwords is removed
    8. Tokens of 2 or fewer characters are removed

    Parameters:
    -----------
    text : Any
        The text to preprocess.

    Returns:
    --------
    str
        Space-separated surviving tokens, or "empty document" when the input
        is invalid or nothing survives filtering.
    """
    if not (text and isinstance(text, str)):
        return "empty document"

    # Very light cleaning; URLs are replaced before digits so that numbers
    # inside a URL do not produce extra NUM tokens.
    rewrite_rules = (
        (r'http\S+', 'URL'),
        (r'\d+', 'NUM'),
        (r'[^\w\s]', ' '),
        (r'\s+', ' '),
    )
    normalized = text
    for pattern, replacement in rewrite_rules:
        normalized = re.sub(pattern, replacement, normalized)

    # Remove only basic stopwords
    basic_stops = {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at'}

    kept = [
        word
        for word in normalized.lower().split()
        if len(word) > 2 and word not in basic_stops
    ]
    return " ".join(kept) or "empty document"

def preprocess_documents(documents, aggressive=False):
    """
    Turn a collection of documents into token lists, one list per input item.

    String items are run through preprocess_text_minimal and then NLTK's
    word_tokenize. Every input item produces exactly one output entry:
    non-string items, items that yield no tokens, and items whose processing
    raises (the error is printed) are all represented by ['placeholder'].

    Parameters:
    -----------
    documents : List[str] or Iterable[Any]
        The documents to process.

    aggressive : bool, default=False
        Accepted but not used by this implementation.

    Returns:
    --------
    List[List[str]]
        Token lists in input order.
    """
    tokenized_docs = []
    for item in documents:
        tokens = None
        if isinstance(item, str):
            try:
                # Use minimal preprocessing, then tokenize
                tokens = word_tokenize(preprocess_text_minimal(item))
            except Exception as e:
                print(f"Error preprocessing document: {e}")
        # A fresh list per entry so results never share the same object.
        tokenized_docs.append(tokens if tokens else ['placeholder'])

    return tokenized_docs


def convert_to_serializable(obj):
    """
    Convert NumPy objects to JSON-serializable Python types.

    Handled NumPy types:
    - np.bool_ -> Python bool
    - np.integer -> Python int
    - np.floating -> Python float
    - np.ndarray -> Python list (nested for multi-dimensional arrays)
    - Other types are returned unchanged

    Parameters:
    -----------
    obj : Any
        The object to convert. Non-NumPy objects are returned as-is.

    Returns:
    --------
    Any
        The Python equivalent of a NumPy scalar/array, or *obj* itself.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch;
    # without it json.dumps rejects NumPy booleans.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
# ======================================
# LLM PROMPTS
# ======================================

def get_improved_summarize_prompt():
    """Return the summarization prompt template.

    The template has a single ``{text}`` placeholder, intended to be filled
    with ``str.format``.
    """
    summarize_template = """You are a document analyzer. Extract the key content accurately.

TEXT TO ANALYZE:
{text}

INSTRUCTIONS:
1. Identify the ACTUAL topic (e.g., sports, religion, technology, politics)
2. Extract specific details mentioned in the text
3. Note any important context or background

PROVIDE EXACTLY THIS FORMAT:
MAIN_TOPIC: [what is this document actually about?]
CATEGORY: [broad category: sports, religion, technology, politics, science, etc.]
KEY_DETAILS: [specific facts, names, or events mentioned]
SUMMARY: [1-2 sentences capturing the essence]"""
    return summarize_template

def get_enhanced_grade_prompt():
    """Return the coherence-grading prompt template.

    The template embeds three calibration examples (scores 10, 5 and 1), a
    scoring guide and the required answer format. It has a single
    ``{documents}`` placeholder, intended to be filled with ``str.format``.
    """
    grade_template = """You are a document coherence evaluator. Assess how similar these documents are to each other.

CALIBRATION EXAMPLES:
Example 1 - Perfect Coherence (Score: 10):
Documents: [
  "Cloud computing has transformed how businesses manage their IT infrastructure. Companies can now scale their computing resources on demand without investing in physical hardware. Services like AWS, Azure, and Google Cloud provide flexible options for storage, computation, and specialized services.",
  "Edge computing is gaining popularity as IoT devices proliferate. By processing data closer to where it's generated rather than sending everything to centralized cloud servers, edge computing reduces latency and bandwidth usage. Smart cities and autonomous vehicles benefit greatly from this distributed computing approach.",
  "Quantum computing promises to revolutionize computational capabilities for specific problems. Using quantum bits or qubits that can exist in multiple states simultaneously, these systems can potentially solve complex optimization problems exponentially faster than classical computers."
]
Explanation: These documents all discuss modern computing paradigms (cloud, edge, and quantum computing). They share technical vocabulary, focus on the same general domain of computing infrastructure, and each explains how a specific technology impacts computing capabilities. This group demonstrates perfect coherence with a clear unified theme.

Example 2 - Moderate Coherence (Score: 5):
Documents: [
  "Renewable energy sources like solar and wind power are becoming increasingly important in the global energy mix. As technology improves and costs decrease, these clean energy options are becoming more competitive with fossil fuels.",
  "Electric vehicles are gaining market share in the automotive industry. Major manufacturers are investing billions in developing new EV models with longer ranges and shorter charging times to appeal to mainstream consumers.",
  "Urban planning in modern cities increasingly incorporates green spaces and pedestrian-friendly zones. These design choices help reduce urban heat islands and improve air quality for residents."
]
Explanation: These documents share some thematic connections around sustainability and modern infrastructure, but discuss different specific topics (energy production, transportation, and urban design). They have partial topical overlap through environmental themes, but each focuses on a distinct domain with different terminology and concepts. This represents moderate coherence with some connecting threads but no single unified topic.

Example 3 - No Coherence (Score: 1):
Documents: [
  "Photosynthesis is the process by which plants convert light energy into chemical energy. This process produces oxygen as a byproduct and is essential for maintaining Earth's atmosphere.",
  "The French Revolution began in 1789 and led to far-reaching social and political changes in France. Key events included the Storming of the Bastille and the Reign of Terror.",
  "JavaScript is a programming language commonly used for web development. It allows developers to create interactive elements on websites and runs directly in the user's browser."
]
Explanation: These documents cover completely different topics (biology, history, and computer science) with no meaningful connection between them. They use different terminology, discuss unrelated concepts, and share no common themes. This group demonstrates minimum coherence with no unified topic.

TARGET GROUP TO ANALYZE:
{documents}
SCORING GUIDE - YOU MUST USE THE FULL RANGE:
10: All documents are clearly about the SAME SPECIFIC topic (e.g., all about basketball strategies, all about AI applications)
8-9: Documents about the same topic with minimal variations or subtopic differences
6-7: Documents in the same general field but discussing different aspects/subtopics
4-5: Documents with weak connections, mostly different topics with some overlap
1-3: Documents on completely different topics with little to no meaningful connection

EVALUATION INSTRUCTIONS:
1. Identify the precise topic of each document
2. If ALL documents focus on the SAME SPECIFIC TOPIC (e.g., all about basketball), you MUST score 9-10
3. If documents share NO connection at all, you MUST score 1-2
4. Use the middle scores (3-8) only when documents have partial overlap
5. BE GENEROUS with high scores (9-10) when documents clearly share the same topic

PROVIDE EXACTLY THIS FORMAT:
COHERENCE_SCORE: [1-10]
MAIN_TOPICS: [list actual topics found in documents]
SHARED_ELEMENTS: [what connects them, if anything]
JUSTIFICATION: [why this score]"""
    return grade_template

def get_topic_prompt():
    """Return the topic-identification prompt template.

    The template has a single ``{text}`` placeholder, intended to be filled
    with ``str.format``.
    """
    topic_template = """Analyze this document's topic and category.

{text}

OUTPUT:
MAIN_TOPIC: [specific topic]
CATEGORY: [broad category like: religion, sports, technology, politics, science, etc.]
SUBTOPICS: [3-4 key themes]
CONFIDENCE: [0-1 score]"""
    return topic_template

# ======================================
# LLM PROCESSING
# ======================================

class ImprovedLLMProcessor:
    """Handles interaction with Language Models for document evaluation.

    A prompt template is selected per task ("summarize", "grade", "topic"),
    filled in, and POSTed as JSON to ``api_url``. The reply text is then
    parsed into an ``LLMResponse``. Failures never propagate to the caller:
    ``process_text`` returns an ``LLMResponse`` with ``error`` set instead.
    """

    # JSON keys probed, in order, for the generated text in an API reply.
    _CONTENT_KEYS = ("response", "text", "content", "output")

    def __init__(self, api_url: str, model: str = "deepseek-r1:32b",
                 timeout: float = 30, max_retries: int = 3):
        """
        Parameters:
        -----------
        api_url : str
            Endpoint that receives the JSON generation request.
        model : str
            Model name sent in the request payload.
        timeout : float, default=30
            Per-request timeout in seconds.
        max_retries : int, default=3
            Total number of attempts per request (values below 1 mean 1).
        """
        self.model = model
        self.api_url = api_url
        self.timeout = timeout
        self.max_retries = max_retries
        self.prompts = {
            "summarize": get_improved_summarize_prompt(),
            "grade": get_enhanced_grade_prompt(),
            "topic": get_topic_prompt()
        }

    def process_text(self, text: str, task: str, additional_context: Dict = None) -> "LLMResponse":
        """
        Process text with the LLM for different tasks (summarize, grade, topic).

        Parameters:
        -----------
        text : str
            Text inserted into the prompt's ``{text}`` placeholder.
        task : str
            Key of the prompt template to use.
        additional_context : Dict, optional
            Extra placeholder values; these override the defaults
            (``text``, ``documents``, ``other_groups``).

        Returns:
        --------
        LLMResponse
            For "grade": ``score`` plus ``main_topics``/``shared_elements``
            metadata. For "topic": ``topic``, ``confidence`` and the parsed
            fields as metadata. Otherwise only ``content``. On any failure
            (unknown task, request error, bad reply) ``content`` is empty
            and ``error`` holds the message.
        """
        try:
            prompt_template = self.prompts.get(task)
            if not prompt_template:
                raise ValueError(f"Unknown task: {task}")

            context = {
                "text": text,
                "documents": "",
                "other_groups": ""
            }
            if additional_context:
                context.update(additional_context)

            # The grade template only has a {documents} placeholder; without
            # this fallback a caller that passes the group as `text` would
            # send a prompt with nothing to grade.
            if task == "grade" and not context.get("documents"):
                context["documents"] = text

            prompt = prompt_template.format(**context)

            # Low temperatures keep output stable; grading uses 0.3, every
            # other task 0.2.
            temperature = 0.3 if task == "grade" else 0.2

            # NOTE(review): if api_url is an Ollama /api/generate endpoint,
            # sampling parameters are read from an "options" object - verify
            # the server honours top-level temperature/top_p.
            payload = {
                "model": self.model,
                "prompt": prompt,
                "temperature": temperature,
                "top_p": 0.9,
                "stream": False
            }

            json_response = self._post_with_retries(payload)
            content = self._extract_content(json_response)
            return self._build_response(task, content)

        except Exception as e:
            print(f"Error in processing {task}: {e}")
            return LLMResponse(content="", error=str(e))

    def _post_with_retries(self, payload):
        """POST *payload* and return the decoded JSON, retrying request errors.

        Waits 2**attempt seconds between attempts and re-raises the last
        request error once all attempts are used.
        """
        attempts = max(1, self.max_retries)
        for attempt in range(1, attempts + 1):
            try:
                response = requests.post(self.api_url, json=payload, timeout=self.timeout)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.RequestException as e:
                if attempt >= attempts:
                    raise
                print(f"API request failed, retrying ({attempt}/{attempts}): {e}")
                # Exponential backoff
                time.sleep(2 ** attempt)

    def _extract_content(self, json_response):
        """Pull the generated text out of the API reply.

        Returns the stripped value of the first known key present; if none
        is present the whole reply is stringified after printing a warning.
        """
        for key in self._CONTENT_KEYS:
            if key in json_response:
                return json_response[key].strip()
        print(f"Unexpected API response format: {json_response.keys()}")
        return str(json_response)

    def _build_response(self, task, content):
        """Wrap *content* in an LLMResponse, parsing task-specific fields."""
        if task == "grade":
            return LLMResponse(
                content=content,
                score=self._extract_score(content),
                metadata={
                    "main_topics": self._extract_value(content, "MAIN_TOPICS", ""),
                    "shared_elements": self._extract_value(content, "SHARED_ELEMENTS", "")
                }
            )
        if task == "topic":
            topic_info = self._extract_topic_info(content)
            return LLMResponse(
                content=content,
                topic=topic_info["main_topic"],
                confidence=topic_info["confidence"],
                metadata=topic_info
            )
        return LLMResponse(content=content)

    def _extract_score(self, response_text):
        """Extract coherence score from LLM response.

        Prefers the number following a COHERENCE_SCORE label (matched
        case-insensitively, tolerating markdown/bracket decoration between
        label and number). Only if no label is found does it fall back to
        the first standalone number from 1 to 10. Returns 0 when nothing is
        found or parsing fails.
        """
        try:
            # Up to 10 non-word characters (":", "*", "[", spaces, ...) may
            # sit between the label and the digits, e.g. "**COHERENCE_SCORE:** 8".
            match = re.search(
                r'COHERENCE[_ ]SCORE\W{0,10}?(\d+)', response_text, re.IGNORECASE
            )
            if match:
                return int(match.group(1))

            # Look for any number between 1-10
            numbers = re.findall(r'\b([1-9]|10)\b', response_text)
            if numbers:
                return int(numbers[0])

            return 0
        except Exception as e:
            print(f"Error extracting score: {e}")
            return 0

    def _extract_value(self, response_text, field, default=""):
        """Extract a field value from LLM response.

        Looks for ``FIELD:`` and returns the rest of that line, stripped of
        whitespace and surrounding markdown asterisks. If the line is empty,
        the next non-blank line is used unless it starts another upper-case
        ``LABEL:``, so an empty field never swallows the following field.
        Returns *default* when the field is missing or empty, or on error.
        """
        try:
            # [ \t]* rather than \s*: the value must not be read across a
            # line break by the pattern itself.
            pattern = re.escape(field) + r"\*{0,2}:\*{0,2}[ \t]*(.*)"
            match = re.search(pattern, response_text)
            if not match:
                return default

            value = match.group(1).strip().strip('*').strip()
            if not value:
                for line in response_text[match.end():].splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    # Next field label reached: this field really is empty.
                    if not re.match(r"\W*[A-Z][A-Z_]{2,}\W{0,2}:", line):
                        value = line.strip('*').strip()
                    break
            return value if value else default
        except Exception:
            return default

    def _extract_topic_info(self, response_text):
        """Extract topic information from LLM response.

        Each field is parsed independently, so an unparsable CONFIDENCE
        falls back to 0.5 without discarding the other fields. Subtopics
        are split on commas, stripped, and empty items dropped.
        """
        try:
            subtopics_raw = self._extract_value(response_text, "SUBTOPICS", "")
            subtopics = [part.strip() for part in subtopics_raw.split(',') if part.strip()]

            confidence_raw = self._extract_value(response_text, "CONFIDENCE", "0.5")
            try:
                confidence = float(confidence_raw.strip("[]() "))
            except ValueError:
                confidence = 0.5

            return {
                "main_topic": self._extract_value(response_text, "MAIN_TOPIC", "unknown"),
                "category": self._extract_value(response_text, "CATEGORY", "unknown"),
                "subtopics": subtopics,
                "confidence": confidence
            }
        except Exception:
            return {
                "main_topic": "unknown",
                "category": "unknown",
                "subtopics": [],
                "confidence": 0.5
            }

def update_llm_processor_with_calibration(llm_processor):
    """
    Install the enhanced grading prompt on an LLM processor.

    The processor's ``prompts`` mapping is modified in place: its "grade"
    entry is replaced with the result of ``get_enhanced_grade_prompt()``.

    Parameters:
    -----------
    llm_processor : object
        Processor exposing a mutable ``prompts`` mapping.

    Returns:
    --------
    object
        The same ``llm_processor`` instance, for call chaining.
    """
    enhanced_prompt = get_enhanced_grade_prompt()
    llm_processor.prompts["grade"] = enhanced_prompt
    return llm_processor

# ======================================
# COHERENCE CALCULATION
# ======================================

def calculate_coherence_scores(groups, dictionary, measure="c_v"):
    """
    Compute one coherence score per document group.

    Each group is passed to ``compute_coherence``. A group whose scoring
    raises is reported on stdout and contributes 0.0, so the result always
    has one entry per input group, in input order.

    Parameters:
    -----------
    groups : List[List[List[str]]]
        Document groups; each group is a list of tokenized documents.
    dictionary : Dictionary
        Accepted for interface compatibility but not used here;
        ``compute_coherence`` builds its own dictionary per group.
    measure : str, default="c_v"
        Accepted for interface compatibility but not used here;
        ``compute_coherence`` always uses "c_v".

    Returns:
    --------
    List[float]
        Coherence score for each group (0.0 for groups that failed).
    """
    def _score_or_zero(tokenized_group):
        # Best-effort: one bad group must not abort scoring of the others.
        try:
            return compute_coherence(tokenized_group)
        except Exception as e:
            print(f"Error calculating coherence for group: {e}")
            return 0.0

    return [_score_or_zero(tokenized_group) for tokenized_group in groups]
    
def compute_coherence(texts):
    """
    Score a group of tokenized documents with an LDA model and c_v coherence.

    Steps:
    1. Build a gensim Dictionary from the tokenized texts
    2. Convert every document to its bag-of-words representation
    3. Train a 2-topic LDA model (10 passes, fixed random_state=42)
    4. Evaluate that model with gensim's CoherenceModel using 'c_v'

    Parameters:
    -----------
    texts : List[List[str]]
        Tokenized documents, one list of tokens per document, e.g.
        [['cloud', 'computing', 'infrastructure'], ['data', 'processing']].
        The texts are used as given; no preprocessing is done here.

    Returns:
    --------
    float
        The c_v coherence value reported by gensim for the trained model.
    """
    vocabulary = Dictionary(texts)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in texts]

    # Fixed seed so repeated runs on the same texts train the same model.
    lda_settings = dict(num_topics=2, passes=10, random_state=42)
    topic_model = LdaModel(corpus=bow_corpus, id2word=vocabulary, **lda_settings)

    scorer = CoherenceModel(
        model=topic_model,
        texts=texts,
        dictionary=vocabulary,
        coherence='c_v'
    )
    return scorer.get_coherence()

def perform_lda_analysis(documents, n_topics=5, n_keywords=15):
    """
    Run LDA topic modeling on a collection of documents.

    Documents are vectorized with scikit-learn's CountVectorizer (English
    stop words removed, unigrams and bigrams, terms kept when they appear in
    at least 5% and at most 90% of documents) and modelled with
    LatentDirichletAllocation using a fixed random_state of 42.

    If the vocabulary has fewer than ``2 * n_topics`` features, a warning is
    printed and the topic count is reduced to ``max(2, n_features // 2)``.
    Any exception during vectorization or fitting is reported on stdout and
    results in the empty result.

    Parameters:
    -----------
    documents : List[str]
        Document texts. Non-string entries are treated as empty documents so
        that assignments stay aligned with the input positions.
    n_topics : int, default=5
        Number of topics requested. An empty result is returned when there
        are fewer documents than topics.
    n_keywords : int, default=15
        Maximum number of keywords reported per topic (non-negative). Fewer
        are returned when the vocabulary is smaller.

    Returns:
    --------
    dict
        - 'assigned_topics': list with the index of the most probable topic
          for each input document (empty on failure or invalid input)
        - 'topics_keywords': dict mapping topic index to its top keywords,
          ordered from highest to lowest weight (empty on failure)
    """
    if not documents or not isinstance(documents, list) or len(documents) < n_topics:
        return {"assigned_topics": [], "topics_keywords": {}}

    try:
        # Use minimal preprocessing
        vectorizer = CountVectorizer(
            stop_words='english',
            max_df=0.9,
            min_df=0.05,
            token_pattern=r'(?u)\b\w+\b',
            ngram_range=(1, 2)
        )

        # Create document-term matrix; row i corresponds to documents[i].
        X = vectorizer.fit_transform(
            [doc if isinstance(doc, str) else "" for doc in documents]
        )

        # Check if we have enough features for LDA
        n_features = X.shape[1]
        if n_features < n_topics * 2:
            print(f"Warning: Not enough features ({X.shape[1]}) for {n_topics} topics")
            n_topics = max(2, n_features // 2)

        # Fit LDA
        lda_model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=42,
            max_iter=50,
            learning_method='batch',
            learning_offset=50.0,
            doc_topic_prior=0.1,
            topic_word_prior=0.01
        )

        lda_model.fit(X)
        feature_names = vectorizer.get_feature_names_out()

        # Extract topics and keywords. Reverse the ascending argsort first and
        # then truncate: a single slice such as [:-15:-1] stops one element
        # early and yields only 14 keywords.
        topics = {
            topic_idx: [
                feature_names[i]
                for i in weights.argsort()[::-1][:n_keywords]
            ]
            for topic_idx, weights in enumerate(lda_model.components_)
        }

        # Get topic assignments for documents
        topic_assignments = lda_model.transform(X)
        assigned_topics = np.argmax(topic_assignments, axis=1)

        return {
            "assigned_topics": assigned_topics.tolist(),
            "topics_keywords": topics
        }

    except Exception as e:
        print(f"Error in LDA analysis: {str(e)}")
        return {"assigned_topics": [], "topics_keywords": {}}

# ======================================
# RAG UTILITIES
# ======================================


def create_coherence_calibration_rag(rag_system):
    """
    Load the GOLDEN_EXAMPLES into a RAG system as coherence reference documents.

    Every golden example is rendered as one text document containing, in
    order: a header with its coherence score (out of 10), the group name,
    the numbered documents of the group, the explanation, and a closing line
    restating the score. All rendered documents are then added to the RAG
    system in a single ``add_documents`` call. Progress is printed to stdout.

    Parameters:
    -----------
    rag_system : RAGSystem
        RAG system exposing ``add_documents(list_of_str)``.

    Returns:
    --------
    RAGSystem
        The same ``rag_system`` instance, after the examples were added.
    """
    print("Adding coherence calibration examples to RAG system...")

    def _render(example):
        # Build the pieces in order and join once at the end.
        score = example['coherence_score']
        pieces = [
            f"COHERENCE EXAMPLE - Score {score}/10\n\n",
            f"Group: {example['group_name']}\n\n",
            "Documents:\n",
        ]
        pieces.extend(
            f"Document {number}: {text}\n\n"
            for number, text in enumerate(example['documents'], start=1)
        )
        pieces.append(f"Explanation: {example['explanation']}\n")
        pieces.append(f"This is a reference example of coherence level {score}/10.")
        return "".join(pieces)

    calibration_docs = [_render(example) for example in GOLDEN_EXAMPLES]

    # Add the calibration documents to the RAG system
    rag_system.add_documents(calibration_docs)

    print(f"Added {len(calibration_docs)} calibration examples to RAG system")
    return rag_system

def retrieve_similar_coherence_examples(rag_system, documents, top_k=2):
    """
    Look up calibration examples in the RAG system for a group of documents.

    A single query string is built from a fixed instruction line followed by
    one numbered entry per document. Each entry holds the first 150
    characters of the document followed by "...". The query is passed to
    ``rag_system.retrieve_relevant_docs`` and its result is returned as is.

    Parameters:
    -----------
    rag_system : RAGSystem
        RAG system exposing ``retrieve_relevant_docs(query, k=...)``.
    documents : List[str]
        Document texts describing the group being evaluated.
    top_k : int, default=2
        Number of examples to request from the RAG system.

    Returns:
    --------
    Whatever ``retrieve_relevant_docs`` returns for the query
    (callers in this file read a ``content`` attribute on each item).
    """
    header = "Find similar document groups to assess coherence:\n\n"

    # Truncated snippets keep the query at a manageable size.
    snippets = [
        f"Document {number}: {text[:150]}...\n\n"
        for number, text in enumerate(documents, start=1)
    ]

    return rag_system.retrieve_relevant_docs(header + "".join(snippets), k=top_k)


def reinforce_calibration_examples(rag_system):
    """
    Add extra scoring guidance and reference examples to the RAG system.

    Three kinds of documents are added in a single ``add_documents`` call:

    1. Scoring guidelines - short instructions on using the full score range
    2. Bad division examples - reference groups scored 1, 3 and 4 out of 10
    3. Mixed division examples - reference groups scored 5 and 6 out of 10

    The multi-line examples are written as indented triple-quoted literals
    for readability and cleaned with ``inspect.cleandoc`` before being stored,
    so the stored text has no leading indentation and truly empty blank lines.

    Parameters:
    -----------
    rag_system : RAGSystem
        RAG system exposing ``add_documents(list_of_str)``.

    Returns:
    --------
    RAGSystem
        The same ``rag_system`` instance, after the documents were added.
    """
    # Local import: only this function needs it.
    import inspect

    print("Adding reinforcement calibration examples to RAG system...")

    # Create documents that emphasize correct scoring
    reinforcement_docs = [
        "SCORING GUIDE: When evaluating document coherence, you MUST use the full scoring range. Documents that are all about the same specific topic (e.g., all discussing basketball strategies) deserve scores of 9-10. Only give scores of 6-7 if documents are in the same general field but different subtopics. Documents with no connection should receive scores of 1-2.",

        "COHERENCE EVALUATION EXAMPLE: Documents about artificial intelligence applications, ethical concerns in AI, and advancements in NLP are all clearly about the same specific topic (Artificial Intelligence). This group demonstrates high coherence and must receive a score of 9-10.",

        "COHERENCE EVALUATION EXAMPLE: Documents discussing basketball offensive strategies, basketball defensive evolution, and basketball player development are all explicitly about the same specific domain (Basketball). This group demonstrates high coherence and must receive a score of 9-10."
    ]

    # Add examples of BAD DIVISIONS with low coherence scores
    bad_division_examples = [
        """COHERENCE EXAMPLE - Score 1/10

        Group: Completely Unrelated Topics

        Documents:
        Document 1: Climate change is accelerating with global temperatures rising at an unprecedented rate. Arctic ice melt and extreme weather events are among the most visible impacts currently affecting communities worldwide.

        Document 2: The history of classical music in Vienna during the 18th century was dominated by composers like Mozart and Haydn who established many of the formal structures still used in orchestral composition today.

        Document 3: Cryptocurrency mining operations require significant computational resources and energy consumption, raising concerns about their environmental impact and long-term sustainability.

        Document 4: Traditional cake recipes often include flour, sugar, eggs, and butter as base ingredients, with variations in proportions and additional flavorings determining the specific type of cake produced.

        Document 5: Ancient Egyptian burial practices involved elaborate preservation techniques for the deceased, including mummification and the construction of tombs filled with artifacts believed necessary for the afterlife.

        Explanation: These documents have absolutely no topical connection to each other, covering climate science, music history, cryptocurrency, baking, and archaeology. They share no vocabulary, themes, or concepts. This represents the lowest level of coherence with completely unrelated content.
        This is a reference example of coherence level 1/10.""",

        """COHERENCE EXAMPLE - Score 3/10

        Group: Mostly Disconnected with Minimal Overlap

        Documents:
        Document 1: Recent advances in artificial intelligence have enabled more accurate weather prediction models that can forecast severe storms up to 5 days in advance.

        Document 2: The global semiconductor shortage has severely impacted automotive production, with many manufacturers unable to complete vehicles due to missing electronic components.

        Document 3: Smart farming technologies using IoT sensors can monitor soil moisture and automatically adjust irrigation systems to conserve water while improving crop yields.

        Document 4: The rising cost of housing in urban centers has forced many families to commute longer distances from affordable suburban areas to their workplaces.

        Document 5: Online learning platforms experienced unprecedented growth during the pandemic as schools and universities transitioned to remote education models.

        Explanation: While these documents all relate to modern developments, they address entirely different domains (weather forecasting, manufacturing, agriculture, housing, and education). There is minimal conceptual overlap with only a loose connection through technology references in some documents. Most documents have no meaningful relationship to the others. This represents very low coherence.
        This is a reference example of coherence level 3/10.""",

        """COHERENCE EXAMPLE - Score 4/10

        Group: Weakly Connected Topics

        Documents:
        Document 1: The European Union's carbon tax legislation aims to reduce greenhouse gas emissions by putting a price on carbon-intensive industrial production.

        Document 2: Electric vehicles are becoming increasingly popular in urban centers where charging infrastructure is more developed and commute distances are shorter.

        Document 3: Corporate social responsibility reports now commonly include detailed sustainability metrics including waste reduction and energy efficiency measures.

        Document 4: The global fashion industry faces criticism for its environmental impact, including water pollution from textile manufacturing and the short lifecycle of fast fashion products.

        Document 5: Advances in quantum computing research focus primarily on theoretical applications rather than immediate commercial deployment due to the technical challenges involved.

        Explanation: While four documents share loose connections to environmental themes, they address different industries and aspects (legislation, transportation, corporate reporting, and fashion). The fifth document about quantum computing is entirely unrelated to this loose environmental theme. This group demonstrates low coherence with some weak connections between most documents but still lacks a unified topic.
        This is a reference example of coherence level 4/10."""
    ]

    # Add more specific examples showing mixed divisions with varied scores
    mixed_division_examples = [
        """COHERENCE EXAMPLE - Score 5/10

        Group: Mixed Topics with Some Thematic Connection

        Documents:
        Document 1: Major smartphone manufacturers release new models annually, with incremental hardware improvements and software features to entice consumers to upgrade.

        Document 2: Wearable fitness trackers can monitor heart rate, sleep patterns, and activity levels, providing users with health insights through connected mobile applications.

        Document 3: Social media platforms use algorithmic content curation to maximize user engagement, which has raised concerns about filter bubbles and information diversity.

        Document 4: The rise of streaming services has transformed how television content is produced and consumed, with binge-watching becoming a common viewing habit.

        Document 5: Digital privacy regulations like GDPR have forced technology companies to revise their data collection and storage practices globally.

        Explanation: These documents all relate broadly to consumer technology and digital trends, but address different specific areas (smartphones, wearables, social media, streaming entertainment, and privacy regulation). They share some vocabulary and conceptual overlap through digital technology, but each focuses on a different aspect with distinct concerns. This represents moderate coherence with a loose connecting theme but no single specific topic.
        This is a reference example of coherence level 5/10.""",

        """COHERENCE EXAMPLE - Score 6/10

        Group: Related Topics with Stronger Connections

        Documents:
        Document 1: Machine learning algorithms require large training datasets to achieve high accuracy in pattern recognition tasks.

        Document 2: Computer vision systems can now identify objects in images with near-human accuracy when properly trained on diverse visual data.

        Document 3: Natural language processing has improved significantly with the development of transformer models that better understand contextual relationships between words.

        Document 4: The ethical implications of algorithmic decision-making include concerns about bias, transparency, and accountability in automated systems.

        Document 5: Recent developments in robotics focus on improving sensory capabilities to allow machines to navigate complex environments more effectively.

        Explanation: These documents all relate to artificial intelligence and its applications, with each addressing different aspects of the field (general machine learning, computer vision, NLP, ethics, and robotics). They share technical vocabulary and conceptual frameworks while maintaining distinct focuses. This represents good coherence with a clear general field but variation in specific subtopics.
        This is a reference example of coherence level 6/10."""
    ]

    # Strip the source-code indentation from the multi-line literals so the
    # stored documents use the same flat layout as the single-line guides,
    # then collect everything into one list.
    reinforcement_docs.extend(
        inspect.cleandoc(example)
        for example in bad_division_examples + mixed_division_examples
    )

    # Add these reinforcement documents to the RAG system
    rag_system.add_documents(reinforcement_docs)

    print(f"Added {len(reinforcement_docs)} reinforcement examples to RAG system")
    return rag_system

# ======================================
# SMOOTHING & EVALUATION
# ======================================

def smooth_coherence_scores(scores, threshold=2.5, expected_score=None, mixing_pattern=None):
    """
    Smooth coherence scores to handle outliers and inconsistencies.

    Strategy:
    - No scores: 0.0. One score: that score.
    - Spread (max - min) within ``threshold``: plain mean.
    - Otherwise, if an expected score is known (given, or inferred from
      ``mixing_pattern``): weighted mean where each score's weight is
      ``1 / (|score - expected| + 0.5)``, so scores near the expectation
      dominate.
    - Otherwise, with 3+ scores: the median (mean of the two middle values
      for an even count).
    - Otherwise (exactly two scores): if one is below 2 and the other above
      7, 90% of the higher score; else the mean of the scores after clipping
      each to within ``threshold`` of their mean.

    Progress messages are printed whenever smoothing is applied.

    Parameters:
    -----------
    scores : List[float]
        Coherence scores from multiple iterations.
    threshold : float, default=2.5
        Maximum allowed spread between scores before smoothing is applied.
    expected_score : float, optional
        Expected score for the group, if known.
    mixing_pattern : str, optional
        Description of the document mix (e.g. "5 same", "10 same",
        "all different"); used to infer ``expected_score`` when that is None.

    Returns:
    --------
    float
        The smoothed score.
    """
    if not scores:
        return 0.0

    if len(scores) == 1:
        return scores[0]

    # If scores are close enough, use simple average
    if max(scores) - min(scores) <= threshold:
        return sum(scores) / len(scores)

    # If scores have high variance, apply smoothing
    print(f"  High variance detected in scores {scores}, applying smoothing...")

    # If mixing pattern is provided but expected score isn't, infer it.
    # Checks are substring tests evaluated in order; "10 same" must come
    # before "0 same" because the former contains the latter.
    # NOTE(review): "4 same" maps to 4.0 (10-document scale). A 5-document
    # reading (8.0) cannot be distinguished from the pattern text alone and
    # was unreachable in the previous chain; confirm the intended scale.
    if expected_score is None and mixing_pattern is not None:
        if "10 same" in mixing_pattern or "all same" in mixing_pattern.lower():
            expected_score = 10.0  # Expect high coherence for same category
        elif "all different" in mixing_pattern.lower() or "0 same" in mixing_pattern:
            expected_score = 1.0  # Expect low coherence for different categories
        elif "8 same" in mixing_pattern:
            expected_score = 8.0  # Mostly same
        elif "6 same" in mixing_pattern:
            expected_score = 6.0  # Mixed
        elif "4 same" in mixing_pattern:
            expected_score = 4.0  # Mostly different
        elif "5 same" in mixing_pattern:
            expected_score = 10.0  # Expect high coherence for same category
        elif "3 same" in mixing_pattern:
            expected_score = 6.0  # Mixed
        elif "2 same" in mixing_pattern:
            expected_score = 4.0  # Mostly different

    if expected_score is not None:
        # Weight by closeness to the expectation; +0.5 avoids division by
        # zero and caps the weight of an exact hit at 2.
        weights = [1 / (abs(score - expected_score) + 0.5) for score in scores]
        total_weight = sum(weights)
        smoothed = sum(score * weight for score, weight in zip(scores, weights)) / total_weight
        print(f"  Used expected-score smoothing: {smoothed:.2f} (expected {expected_score})")
    elif len(scores) >= 3:
        # Median: the middle value, or the mean of the two middle values
        # when the count is even (taking only the upper one biases high).
        ordered = sorted(scores)
        middle = len(ordered) // 2
        if len(ordered) % 2:
            smoothed = ordered[middle]
        else:
            smoothed = (ordered[middle - 1] + ordered[middle]) / 2
        print(f"  Used median smoothing: {smoothed:.2f}")
    elif min(scores) < 2.0 and max(scores) > 7.0:
        # Two wildly different scores: treat the very low one as a failed
        # evaluation and prefer the higher one, slightly discounted.
        smoothed = max(scores) * 0.9
        print(f"  Used high-score preference smoothing: {smoothed:.2f}")
    else:
        # Winsorized mean: clip each score to within threshold of the mean.
        mean = sum(scores) / len(scores)
        clipped_scores = [
            min(max(score, mean - threshold), mean + threshold)
            for score in scores
        ]
        smoothed = sum(clipped_scores) / len(clipped_scores)
        print(f"  Used winsorized mean smoothing: {smoothed:.2f}")

    return smoothed

def evaluate_with_rag_assistance(groups, topics, llm_processor, rag_system, num_iterations=2, skip_summarization=False):
    """
    Enhanced evaluation function that uses RAG to assist with coherence assessment.
    
    This function evaluates the coherence of groups of documents using both computational methods
    and language model (LLM) assessment with RAG assistance. The function:
    
    1. Initializes the RAG system with calibration examples if needed
    2. Optionally summarizes documents to extract key information
    3. Calculates computational coherence scores using topic modeling
    4. Performs LDA analysis to identify primary topics in each group
    5. Uses an LLM to evaluate coherence with multiple iterations for reliability
    6. Retrieves similar examples from the RAG system to provide context for LLM evaluation
    7. Calculates and returns comprehensive coherence metrics and analyses
    
    Parameters:
    -----------
    groups : List[List[str]]
        List of document groups, where each group is a list of document strings
    topics : List[str]
        List of topic labels corresponding to each document group
    llm_processor : ImprovedLLMProcessor
        Instance of LLM processor to use for text evaluation 
    rag_system : RAGSystem
        Instance of RAG system for retrieving similar coherence examples
    num_iterations : int, default=2
        Number of iterations for LLM coherence scoring (higher = more reliable)
    skip_summarization : bool, default=False
        If True, uses original documents without summarization
        
    Returns:
    --------
    dict
        A dictionary containing comprehensive evaluation results including:
        - LLM coherence scores for each topic
        - LLM analysis details (identified topics, shared elements)
        - Computational coherence scores
        - LDA topic analysis results
        - Document samples used for evaluation
        - RAG examples retrieved for context
        - Summary statistics for each topic
    """
    # Initialize RAG with calibration examples if not already done
    if not rag_system.document_store:
        rag_system = create_coherence_calibration_rag(rag_system)
    
    results = {
        'llm_scores': {topic: [] for topic in topics},
        'llm_analysis': {topic: [] for topic in topics},
        'coherence_scores': {topic: [] for topic in topics},
        'lda_results': {topic: None for topic in topics},
        'document_samples': {topic: [] for topic in topics},
        'rag_examples': {topic: [] for topic in topics}  # Store which examples were retrieved
    }
    
    # Skip summarization if requested
    if skip_summarization:
        print("Using original documents (skipping summarization)...")
        summarized_groups = groups
    else:
        # Summarize documents
        print("Summarizing documents...")
        summarized_groups = []
        for i, group in enumerate(groups):
            print(f"Summarizing group {i+1}/{len(groups)}: {topics[i]}")
            
            # Take a sample of documents
            subset_size = min(5, len(group))
            if len(group) > subset_size:
                np.random.seed(42)
                subset_indices = np.random.choice(len(group), subset_size, replace=False)
                group_subset = [group[i] for i in subset_indices]
            else:
                group_subset = group
            
            # Store document samples
            results['document_samples'][topics[i]] = [doc[:200] for doc in group_subset]
                
            # Summarize the documents
            summarized = []
            for doc in group_subset:
                try:
                    response = llm_processor.process_text(doc, "summarize")
                    if response and not response.error and response.content:
                        # Extract summary properly
                        summary_match = re.search(r'SUMMARY:\s*([^\n]+)', response.content)
                        if summary_match:
                            summary = summary_match.group(1).strip()
                        else:
                            summary = response.content[:300]
                        summarized.append(summary)
                    else:
                        # If summarization fails, use original document
                        summarized.append(doc[:500] + "...")
                except Exception as e:
                    print(f"Summarization error: {e}")
                    summarized.append(doc[:500] + "...")
            summarized_groups.append(summarized)
    
    # Preprocess all groups for coherence calculation
    print("Preprocessing documents...")
    tokenized_groups = []
    for group in groups:
        tokenized_group = []
        for doc in group:
            if isinstance(doc, str):
                # Use the preprocess function
                tokens = preprocess(doc)
                if tokens:  # Only append if we have tokens
                    tokenized_group.append(tokens)
                else:
                    tokenized_group.append(['placeholder'])
            else:
                tokenized_group.append(['placeholder'])
        tokenized_groups.append(tokenized_group)
    
    # Create dictionary from all documents
    all_docs_tokenized = [token for group in tokenized_groups for token in group]
    dictionary = Dictionary(all_docs_tokenized)
    dictionary.filter_extremes(no_below=1, no_above=0.95)
    
    # Calculate coherence scores
    print("Calculating coherence scores...")
    coherence_scores = calculate_coherence_scores(tokenized_groups, dictionary)
    for topic, score in zip(topics, coherence_scores):
        results['coherence_scores'][topic] = [score]
        
    # Perform LDA analysis on each group
    print("Performing LDA analysis...")
    for i, (group, topic) in enumerate(zip(groups, topics)):
        n_topics = min(3, max(2, len(group) // 2))
        lda_result = perform_lda_analysis(group, n_topics=n_topics)
        results['lda_results'][topic] = lda_result
    
    # LLM evaluations with RAG assistance
    if llm_processor:
        for i in range(num_iterations):
            print(f"\nIteration {i + 1}/{num_iterations}")
            print("Performing LLM evaluation with RAG assistance...")
            
            for j, (group, topic) in enumerate(zip(summarized_groups, topics)):
                # Skip groups that are too small for meaningful evaluation
                if len(group) < 2:
                    print(f"Skipping {topic} (too few documents)")
                    continue
                    
                # Get similar coherence examples from RAG
                similar_examples = retrieve_similar_coherence_examples(rag_system, group)
                results['rag_examples'][topic] = [ex.content for ex in similar_examples]
                
                # Format the documents with clear separation
                formatted_docs = []
                for idx, doc in enumerate(group):
                    formatted_docs.append(f"DOCUMENT {idx+1}:\n{doc}")
                docs_text = "\n\n".join(formatted_docs)
                
                # Add RAG-retrieved examples to the evaluation context
                rag_context = ""
                if similar_examples:
                    rag_context = "\n\nSIMILAR REFERENCE EXAMPLES FROM DATABASE:\n"
                    for ex_idx, example in enumerate(similar_examples):
                        rag_context += f"Example {ex_idx+1}:\n{example.content}\n\n"
                
                # Get LLM evaluation with the enhanced context
                try:
                    response = llm_processor.process_text("", "grade", {
                        "documents": docs_text + rag_context
                    })
                    
                    if response and not response.error:
                        results['llm_scores'][topic].append(response.score)
                        results['llm_analysis'][topic].append({
                            'score': response.score,
                            'main_topics': response.metadata.get('main_topics', ''),
                            'shared_elements': response.metadata.get('shared_elements', ''),
                            'full_response': response.content
                        })
                        
                        print(f"  Evaluated {topic}: Score = {response.score}")
                        print(f"    Topics: {response.metadata.get('main_topics', 'N/A')}")
                        print(f"    Retrieved {len(similar_examples)} similar examples from RAG")
                    else:
                        print(f"Error in LLM evaluation for {topic}")
                except Exception as e:
                    print(f"Exception during evaluation of {topic}: {e}")
    
    # Calculate summary statistics
    results['summary'] = {}
    for topic in topics:
        summary = {}
        
        # Coherence score summary
        if results['coherence_scores'][topic]:
            coherence_values = results['coherence_scores'][topic]
            summary['coherence_score'] = {
                'value': coherence_values[0] if coherence_values else 0,
                'normalized': 1 + 9 * coherence_values[0] if coherence_values else 0
            }
        
        # LLM score summary
        if results['llm_scores'][topic]:
            llm_scores = results['llm_scores'][topic]
            summary['llm_score'] = {
                'average': sum(llm_scores) / len(llm_scores),
                'all_scores': llm_scores,
                'identified_topics': [
                    analysis.get('main_topics', '') 
                    for analysis in results['llm_analysis'][topic]
                ]
            }
        
        results['summary'][topic] = summary
    
    return results

# ======================================
# UTILITY FUNCTIONS FOR EXPERIMENT
# ======================================

def get_mixed_dataset(newsgroups, categories, docs_per_category=1):
    """
    Get a mixed dataset with documents from multiple categories.

    For every requested category, the first `docs_per_category` documents
    whose target name equals that category are collected, in dataset order.

    Args:
        newsgroups: Object exposing parallel `data` and `target` sequences and
            a `target_names` list indexed by the values in `target`
            (presumably the result of `fetch_20newsgroups` - verify against
            the caller).
        categories: Iterable of category names to draw documents from.
        docs_per_category: Maximum number of documents to take per category.
            Categories with fewer matching documents contribute what they
            have; a value of 0 or less contributes nothing.

    Returns:
        A flat list of documents, grouped in the order of `categories`.
    """
    mixed_docs = []
    for category in categories:
        print(f"  - {category}: {docs_per_category} document")
        taken = 0
        for doc, target in zip(newsgroups.data, newsgroups.target):
            # Stop scanning as soon as this category's quota is filled.
            if taken >= docs_per_category:
                break
            if newsgroups.target_names[target] == category:
                mixed_docs.append(doc)
                taken += 1
    return mixed_docs

# ======================================
# MAIN EXECUTION
# ======================================

def build_team_documents(primary_label, selected_labels, label_docs,
                         same_count, diff_count, team_idx=0):
    """
    Assemble one team's documents and a human-readable composition string.

    The team gets the first `same_count` documents of `primary_label`, then
    one not-yet-used document from each of `diff_count` other labels. When
    there are fewer other labels than `diff_count`, the other labels are
    cycled. If the other labels cannot supply enough documents, the shortfall
    is filled with further documents of the primary label and the composition
    string says so.

    Args:
        primary_label: Label the team is built around.
        selected_labels: All labels taking part in the experiment.
        label_docs: Mapping of label -> list of sampled documents.
        same_count: Number of documents to take from the primary label.
        diff_count: Number of documents to take from other labels.
        team_idx: Zero-based team index, used only in the composition text.

    Returns:
        Tuple `(team_docs, composition)`.
    """
    team_docs = []
    composition = f"Team {team_idx+1:02d} (Primary: Label {primary_label}): "

    # Add documents from the same category (primary label)
    if same_count > 0:
        team_docs.extend(label_docs[primary_label][:same_count])
        composition += f"{same_count} from L{primary_label}"

    if diff_count <= 0:
        return team_docs, composition

    if same_count > 0:
        composition += ", "

    # For different categories, use documents from the other primary labels.
    different_labels = [l for l in selected_labels if l != primary_label]

    # Too few other labels: cycle through them. The emptiness check avoids a
    # division by zero when the primary label is the only selected label.
    if different_labels and len(different_labels) < diff_count:
        repeats = diff_count // len(different_labels) + 1
        different_labels = (different_labels * repeats)[:diff_count]

    used_labels = []
    for diff_label in different_labels[:diff_count]:
        # First document of this label that is not already in the team.
        fresh_doc = next(
            (doc for doc in label_docs[diff_label] if doc not in team_docs),
            None,
        )
        if fresh_doc is not None:
            team_docs.append(fresh_doc)
            used_labels.append(diff_label)

    # Only the *display* is limited to three labels; document selection above
    # always runs over all `diff_count` labels.
    composition += f"{diff_count} from: "
    composition += ", ".join(f"L{label}" for label in used_labels[:3])
    if len(used_labels) > 3:
        composition += ", ..."

    # If we couldn't get enough different documents, add some from the primary label
    missing = diff_count - len(used_labels)
    if missing > 0:
        team_docs.extend(label_docs[primary_label][same_count:same_count + missing])
        composition += f" (had to reuse {missing} docs from L{primary_label})"

    return team_docs, composition


# Main function to run 5 runs of experiments, each with 5 teams of 5 documents
if __name__ == "__main__":
    # Make sure to set your API URL for LLM before running
    try:
        url = url  # Replace with your actual API URL
    except NameError:
        # Give a clear message instead of a bare NameError on the placeholder.
        raise SystemExit(
            "LLM API URL is not set: define `url` before running the experiments."
        ) from None

    # Download NLTK resources if not already downloaded
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
    except LookupError:
        print("Downloading NLTK resources...")
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)

    # Set to False if you don't want separate files for each team
    SAVE_INDIVIDUAL_TEAM_RESULTS = True

    # Initial setup - load and preprocess data
    print("\n" + "="*80)
    print("COHERENCE EXPERIMENTS - SETUP")
    print("="*80)

    print("\nLoading dataset...")
    # Load the CSV dataset
    try:
        df = pd.read_csv("event2012.csv")
        print(f"Loaded {len(df)} rows from event2012.csv")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        # SystemExit works everywhere; the exit() helper is only injected by
        # the site module / interactive shells.
        raise SystemExit(1)

    # Clean the text data
    print("Preprocessing text data...")
    df['clean_text'] = df['text'].apply(clean_text)
    df = df[df['clean_text'].str.len() > 20]  # Keep only texts with more than 20 chars
    print(f"After cleaning: {len(df)} rows")

    # Add the tokenization step
    print("Tokenizing text data...")
    df['tokens'] = df['clean_text'].apply(preprocess)
    print(f"Tokenization complete for {len(df)} documents")

    # Get label distribution
    label_counts = df['label'].value_counts()
    print("\nLabel distribution (top 15):")
    for label, count in label_counts.iloc[:15].items():
        print(f"  Label {label}: {count} documents ({count/len(df)*100:.2f}%)")

    # Get labels with enough documents for our experiment (at least 10 docs)
    # We need at least 10 because we'll sample 5 per label and want some buffer
    excluded_labels = [8, 11, 157]
    valid_labels = [label for label, count in label_counts.items()
                    if count >= 10 and label not in excluded_labels]

    print(f"\nFound {len(valid_labels)} valid labels with at least 10 documents")

    # Check if we have enough valid labels for 5 teams
    if len(valid_labels) < 5:
        print(f"Warning: Not enough valid labels ({len(valid_labels)}). Need at least 5.")
        # Fallback if we don't have enough
        print(f"Will use {len(valid_labels)} primary labels")
    # Select (up to) 5 primary labels for our experiment teams
    selected_labels = valid_labels[:5]

    # Without any label no team can be built and every later step would fail.
    if not selected_labels:
        raise SystemExit("No valid labels available; cannot build any teams.")

    print("\nSelected primary labels for teams:")
    for label in selected_labels:
        count = label_counts[label]
        print(f"  Label {label}: {count} documents")

    # Dictionary to store documents for each selected label
    label_docs = {}
    for label in selected_labels:
        docs = df[df['label'] == label]['clean_text'].tolist()
        # Sample 5 documents (or all if less than 5)
        sample_size = min(5, len(docs))
        label_docs[label] = random.sample(docs, sample_size)
        print(f"  Sampled {sample_size} documents for Label {label}")

    # Define the 5 experiment runs based on mixing patterns
    experiment_runs = [
        {
            "name": "Run 1: All Same Category",
            "description": "5 documents from the same category for each team",
            "mixing_pattern": "5 same",
            "doc_counts": {"same": 5, "different": 0},
            "expected_score": 10.0  # High coherence expected
        },
        {
            "name": "Run 2: Mostly Same Category",
            "description": "4 documents from same category, 1 from different for each team",
            "mixing_pattern": "4 same + 1 different",
            "doc_counts": {"same": 4, "different": 1},
            "expected_score": 8.0  # Fairly high coherence expected
        },
        {
            "name": "Run 3: Mixed Categories",
            "description": "3 documents from same category, 2 from different for each team",
            "mixing_pattern": "3 same + 2 different",
            "doc_counts": {"same": 3, "different": 2},
            "expected_score": 6.0  # Moderate coherence expected
        },
        {
            "name": "Run 4: Mostly Different",
            "description": "2 documents from same category, 3 from different for each team",
            "mixing_pattern": "2 same + 3 different",
            "doc_counts": {"same": 2, "different": 3},
            "expected_score": 4.0  # Lower coherence expected
        },
        {
            "name": "Run 5: All Different",
            "description": "0 documents from same category, 5 from different for each team",
            "mixing_pattern": "0 same + 5 different",
            "doc_counts": {"same": 0, "different": 5},
            "expected_score": 1.0  # Low coherence expected
        }
    ]

    # Create a directory for results
    results_dir = f"coherence_experiments_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(results_dir, exist_ok=True)

    # Save the experiment configuration
    with open(f"{results_dir}/experiment_config.json", "w") as f:
        json.dump({
            "timestamp": datetime.now().isoformat(),
            "primary_labels": [int(l) for l in selected_labels],
            "runs": [
                {
                    "name": run["name"],
                    "description": run["description"],
                    "mixing_pattern": run["mixing_pattern"],
                    "doc_counts": run["doc_counts"],
                    "expected_score": run.get("expected_score")
                } for run in experiment_runs
            ]
        }, f, indent=2)

    # Run each experiment run; only runs that produced scores end up in here.
    all_results = {}

    for run_idx, run_config in enumerate(experiment_runs):
        run_name = run_config["name"]
        same_count = run_config["doc_counts"]["same"]
        diff_count = run_config["doc_counts"]["different"]
        expected_score = run_config.get("expected_score")
        mixing_pattern = run_config["mixing_pattern"]

        print("\n" + "="*80)
        print(f"RUNNING {run_name}")
        print("="*80)
        print(f"Mixing pattern: {mixing_pattern}")
        print(f"Document counts: {same_count} from same category, {diff_count} from different categories")
        print(f"Expected coherence score: {expected_score if expected_score else 'Not specified'}")

        # Initialize the RAG system and LLM processor for this run
        print("\nInitializing RAG system and LLM processor...")
        rag_system = RAGSystem(embedding_model="all-MiniLM-L6-v2")
        rag_system = create_coherence_calibration_rag(rag_system)
        rag_system = reinforce_calibration_examples(rag_system)
        llm_processor = ImprovedLLMProcessor(url)
        llm_processor = update_llm_processor_with_calibration(llm_processor)

        # Create teams for this run
        run_groups = []
        run_group_names = []
        run_group_compositions = []
        run_group_expected_scores = []

        for team_idx, primary_label in enumerate(selected_labels):
            # Create a group with the specified mix
            team_docs, composition = build_team_documents(
                primary_label,
                selected_labels,
                label_docs,
                same_count,
                diff_count,
                team_idx,
            )

            run_groups.append(team_docs)
            run_group_names.append(f"Team {team_idx+1:02d} (Label {primary_label})")
            run_group_compositions.append(composition)
            run_group_expected_scores.append(expected_score)

            print(f"\nCreated {composition}")
            print(f"  Team has {len(team_docs)} documents")
            for doc_idx, doc in enumerate(team_docs[:3]):  # Show first 3 docs
                print(f"  Doc {doc_idx+1}: {doc[:100]}...")
            if len(team_docs) > 3:
                print(f"  ... and {len(team_docs)-3} more documents")

        # Run evaluation
        print(f"\nEvaluating all teams for {run_name}...")
        results = evaluate_with_rag_assistance(
            groups=run_groups,
            topics=run_group_names,
            llm_processor=llm_processor,
            rag_system=rag_system,
            num_iterations=3,  # Three iterations for more data points
            skip_summarization=False
        )

        # Extract and store results. Per-team values live in `run_results`
        # keyed by team name, so a skipped team cannot shift the others.
        run_results = {}
        comp_scores = []
        llm_scores = []
        llm_raw_scores = []  # Store raw (unsmoothed) scores too

        print("\nTEAM RESULTS:")
        print("-" * 60)

        for team_idx, (team_name, composition) in enumerate(zip(run_group_names, run_group_compositions)):
            team_summary = results['summary'].get(team_name, {})
            coherence_summary = team_summary.get('coherence_score')
            llm_summary = team_summary.get('llm_score')

            # 'llm_score' is only present when at least one LLM evaluation
            # succeeded for the team; skip instead of raising KeyError.
            if not coherence_summary or not llm_summary:
                print(f"\n{team_name}: skipped (no coherence or LLM scores available)")
                continue

            comp_score = coherence_summary['normalized']
            raw_comp_score = coherence_summary['value']
            llm_scores_all = llm_summary['all_scores']
            identified_topics = llm_summary['identified_topics']

            # Apply smoothing to the LLM scores
            smoothed_llm_score = smooth_coherence_scores(
                llm_scores_all,
                threshold=2.5,
                expected_score=run_group_expected_scores[team_idx],
                mixing_pattern=mixing_pattern
            )

            # Calculate raw average for comparison
            raw_llm_avg = sum(llm_scores_all) / len(llm_scores_all) if llm_scores_all else 0
            smoothing_applied = abs(smoothed_llm_score - raw_llm_avg) > 0.01

            comp_scores.append(comp_score)
            llm_scores.append(smoothed_llm_score)
            llm_raw_scores.append(raw_llm_avg)

            # Print team results
            print(f"\n{team_name}:")
            print(f"Composition: {composition}")
            print(f"Computational coherence score: {comp_score:.2f} (raw: {raw_comp_score:.4f})")
            print(f"LLM raw scores: {llm_scores_all}")
            print(f"LLM raw average: {raw_llm_avg:.2f}")
            print(f"LLM smoothed score: {smoothed_llm_score:.2f}")
            if smoothing_applied:  # Only show if different
                print(f"  Smoothing applied: {raw_llm_avg:.2f} → {smoothed_llm_score:.2f}")

            print(f"Identified topics: {identified_topics[0] if identified_topics else 'None'}")

            # Store team results (cast to plain Python types for json.dump)
            run_results[team_name] = {
                "composition": composition,
                "computational_coherence": {
                    "raw": float(raw_comp_score),
                    "normalized": float(comp_score)
                },
                "llm_coherence": {
                    "raw_scores": [float(s) for s in llm_scores_all],
                    "raw_average": float(raw_llm_avg),
                    "smoothed_score": float(smoothed_llm_score),
                    "smoothing_applied": bool(smoothing_applied),
                    "topics": identified_topics
                }
            }

            # Save individual team results if enabled
            if SAVE_INDIVIDUAL_TEAM_RESULTS:
                with open(f"{results_dir}/run_{run_idx+1}_team_{team_idx+1:02d}_results.json", "w") as f:
                    json.dump(run_results[team_name], f, indent=2)

        # Add a summary table for all teams in this run
        print("\nSUMMARY OF ALL TEAMS IN RUN:")
        print(f"{'Team':<15} {'Comp':<7} {'LLM':<7}")
        print("-" * 30)
        for team_name in run_group_names:
            team_result = run_results.get(team_name)
            if team_result is None:
                continue
            comp = f"{team_result['computational_coherence']['normalized']:.2f}"
            llm = f"{team_result['llm_coherence']['smoothed_score']:.2f}"
            print(f"{team_name[:12]:<15} {comp:<7} {llm:<7}")

        # Add visualization of all team scores
        print("\nTEAM SCORE COMPARISON:")
        max_team_score = max(
            max(comp_scores) if comp_scores else 0,
            max(llm_scores) if llm_scores else 0
        )
        team_scale = 30 / max_team_score if max_team_score > 0 else 1

        for team_idx, team_name in enumerate(run_group_names):
            team_result = run_results.get(team_name)
            if team_result is None:
                continue
            team_short = f"Team {team_idx+1:02d}"
            comp = team_result['computational_coherence']['normalized']
            llm = team_result['llm_coherence']['smoothed_score']

            comp_bar = "█" * int(comp * team_scale)
            llm_bar = "█" * int(llm * team_scale)

            print(f"{team_short}: C:{comp:.1f} {comp_bar}  L:{llm:.1f} {llm_bar}")

        # Calculate run averages
        if comp_scores and llm_scores:
            avg_comp_score = sum(comp_scores) / len(comp_scores)
            avg_llm_score = sum(llm_scores) / len(llm_scores)
            avg_llm_raw_score = sum(llm_raw_scores) / len(llm_raw_scores)

            print("\nRUN AVERAGES:")
            print(f"Average computational coherence: {avg_comp_score:.2f}")
            print(f"Average LLM coherence (smoothed): {avg_llm_score:.2f}")
            print(f"Average LLM coherence (raw): {avg_llm_raw_score:.2f}")

            # Store run results with averages
            all_results[run_name] = {
                "config": run_config,
                "teams": run_results,
                "averages": {
                    "computational_coherence": float(avg_comp_score),
                    "llm_coherence_smoothed": float(avg_llm_score),
                    "llm_coherence_raw": float(avg_llm_raw_score),
                    "computational_scores": [float(s) for s in comp_scores],
                    "llm_scores_smoothed": [float(s) for s in llm_scores],
                    "llm_scores_raw": [float(s) for s in llm_raw_scores]
                }
            }

        # Save run summary results (a run without any scored team has none)
        if run_name in all_results:
            with open(f"{results_dir}/run_{run_idx+1}_results.json", "w") as f:
                json.dump(all_results[run_name], f, indent=2)
        else:
            print(f"\nNo team produced scores for {run_name}; run summary not saved")

        # Pause between runs to avoid rate limiting
        if run_idx < len(experiment_runs) - 1:
            print("\nPausing for 5 seconds before next run...")
            time.sleep(5)

    # Once all runs are complete, compile the final results
    print("\n" + "="*80)
    print("ALL RUNS COMPLETE - FINAL RESULTS")
    print("="*80)

    # Collect average scores for each completed run. The original run index
    # is kept next to each entry so the averages stay aligned with their run
    # even when some run produced no results.
    completed_runs = [
        (idx, run) for idx, run in enumerate(experiment_runs)
        if run["name"] in all_results
    ]
    avg_comp_scores = [
        all_results[run["name"]]["averages"]["computational_coherence"]
        for _, run in completed_runs
    ]
    avg_llm_scores = [
        all_results[run["name"]]["averages"]["llm_coherence_smoothed"]
        for _, run in completed_runs
    ]
    avg_llm_raw_scores = [
        all_results[run["name"]]["averages"]["llm_coherence_raw"]
        for _, run in completed_runs
    ]

    # Print score comparison
    print("\nAVERAGE COHERENCE SCORES BY RUN:")
    print("-" * 60)

    for (_, run), comp_score, llm_score, llm_raw in zip(
            completed_runs, avg_comp_scores, avg_llm_scores, avg_llm_raw_scores):
        print(f"\n{run['name']}")
        print(f"Mixing pattern: {run['mixing_pattern']}")
        print(f"Avg computational coherence: {comp_score:.2f}")
        print(f"Avg LLM coherence (smoothed): {llm_score:.2f}")
        print(f"Avg LLM coherence (raw): {llm_raw:.2f}")

    # Trend results default to None so the report below never depends on
    # variables left over from an earlier execution.
    is_comp_decreasing = None
    is_llm_decreasing = None
    is_llm_raw_decreasing = None
    comp_decrease = None
    llm_decrease = None
    llm_raw_decrease = None

    # Check if average scores follow the expected decreasing trend
    if len(avg_comp_scores) >= 5:
        is_comp_decreasing = all(
            earlier >= later
            for earlier, later in zip(avg_comp_scores, avg_comp_scores[1:])
        )
        is_llm_decreasing = all(
            earlier >= later
            for earlier, later in zip(avg_llm_scores, avg_llm_scores[1:])
        )
        is_llm_raw_decreasing = all(
            earlier >= later
            for earlier, later in zip(avg_llm_raw_scores, avg_llm_raw_scores[1:])
        )

        print("\nTREND ANALYSIS:")
        print(f"Computational scores follow expected decreasing trend: {is_comp_decreasing}")
        print(f"LLM scores (smoothed) follow expected decreasing trend: {is_llm_decreasing}")
        print(f"LLM scores (raw) follow expected decreasing trend: {is_llm_raw_decreasing}")

        # Check if smoothing improved trend adherence
        if is_llm_decreasing != is_llm_raw_decreasing:
            if is_llm_decreasing:
                print("Smoothing IMPROVED trend adherence (raw scores didn't follow expected trend)")
            else:
                print("Smoothing WORSENED trend adherence (raw scores followed expected trend)")

        # Calculate decrease percentage; every baseline used as a divisor
        # must be positive, including the raw LLM average.
        if (avg_comp_scores[0] > 0 and avg_llm_scores[0] > 0
                and avg_llm_raw_scores[0] > 0):
            comp_decrease = ((avg_comp_scores[0] - avg_comp_scores[-1]) / avg_comp_scores[0]) * 100
            llm_decrease = ((avg_llm_scores[0] - avg_llm_scores[-1]) / avg_llm_scores[0]) * 100
            llm_raw_decrease = ((avg_llm_raw_scores[0] - avg_llm_raw_scores[-1]) / avg_llm_raw_scores[0]) * 100

            print(f"Average computational coherence decreased by {comp_decrease:.1f}% from Run 1 to Run 5")
            print(f"Average LLM coherence (smoothed) decreased by {llm_decrease:.1f}% from Run 1 to Run 5")
            print(f"Average LLM coherence (raw) decreased by {llm_raw_decrease:.1f}% from Run 1 to Run 5")

    # Create a simple ASCII chart
    print("\nAVERAGE COHERENCE SCORE TRENDS:")
    print("-" * 60)

    # Find maximum score for scaling the chart
    max_score = max(
        max(avg_comp_scores) if avg_comp_scores else 0,
        max(avg_llm_scores) if avg_llm_scores else 0,
        max(avg_llm_raw_scores) if avg_llm_raw_scores else 0
    )
    scale = 40 / max_score if max_score > 0 else 1

    for (orig_idx, _), comp_score, llm_score, llm_raw in zip(
            completed_runs, avg_comp_scores, avg_llm_scores, avg_llm_raw_scores):
        run_short = f"Run {orig_idx+1}"

        comp_bar = "█" * int(comp_score * scale)
        llm_bar = "█" * int(llm_score * scale)
        llm_raw_bar = "▒" * int(llm_raw * scale)  # Different character for raw scores

        print(f"{run_short:8} Comp:     {comp_score:.2f} {comp_bar}")
        print(f"{' ':8} LLM Raw:  {llm_raw:.2f} {llm_raw_bar}")
        print(f"{' ':8} LLM Smth: {llm_score:.2f} {llm_bar}")
        print()

    # Save the final compiled results
    with open(f"{results_dir}/all_results.json", "w") as f:
        json.dump({
            "runs": all_results,
            "trend_analysis": {
                "avg_computational_scores": avg_comp_scores,
                "avg_llm_scores_smoothed": avg_llm_scores,
                "avg_llm_scores_raw": avg_llm_raw_scores,
                "computational_decreasing": is_comp_decreasing,
                "llm_smoothed_decreasing": is_llm_decreasing,
                "llm_raw_decreasing": is_llm_raw_decreasing,
                "computational_decrease_percentage": comp_decrease,
                "llm_smoothed_decrease_percentage": llm_decrease,
                "llm_raw_decrease_percentage": llm_raw_decrease
            }
        }, f, indent=2)

    # Also save a summary of the experiment
    with open(f"{results_dir}/experiment_summary.txt", "w") as f:
        f.write("COHERENCE EXPERIMENTS SUMMARY\n")
        f.write("=" * 40 + "\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Results directory: {results_dir}\n\n")

        f.write("AVERAGE COHERENCE SCORES BY RUN:\n")
        f.write("-" * 40 + "\n")
        for (_, run), comp_score, llm_score, llm_raw in zip(
                completed_runs, avg_comp_scores, avg_llm_scores, avg_llm_raw_scores):
            f.write(f"\n{run['name']}\n")
            f.write(f"Mixing pattern: {run['mixing_pattern']}\n")
            f.write(f"Avg computational coherence: {comp_score:.2f}\n")
            f.write(f"Avg LLM coherence (smoothed): {llm_score:.2f}\n")
            f.write(f"Avg LLM coherence (raw): {llm_raw:.2f}\n")

        if is_comp_decreasing is not None:
            f.write("\nTREND ANALYSIS:\n")
            f.write("-" * 40 + "\n")
            f.write(f"Computational scores follow expected decreasing trend: {is_comp_decreasing}\n")
            f.write(f"LLM scores (smoothed) follow expected decreasing trend: {is_llm_decreasing}\n")
            f.write(f"LLM scores (raw) follow expected decreasing trend: {is_llm_raw_decreasing}\n")

            if comp_decrease is not None:
                f.write(f"\nAverage computational coherence decreased by {comp_decrease:.1f}% from Run 1 to Run 5\n")
                f.write(f"Average LLM coherence (smoothed) decreased by {llm_decrease:.1f}% from Run 1 to Run 5\n")
                f.write(f"Average LLM coherence (raw) decreased by {llm_raw_decrease:.1f}% from Run 1 to Run 5\n")

    print(f"\nAll run results saved to {results_dir}/ directory")
    print("\nExperiments complete!")