In [1]:
import re
import json
import ollama
import uuid
import time
import logging
import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_nomic.embeddings import NomicEmbeddings
from typing import Dict, List, Tuple, Any

## ***GLOBAL CONFIGURATIONS***


In [14]:
class Config:
    # Core parameters
    INPUT_FILE = "/mnt/Main Drive/Codes/LLM Agents/Assistant/Facets recognition/Facets Assignment.csv"
    COLLECTION_NAME = "conversation_facets"
    EMBEDDING_MODEL = "nomic-embed-text-v1.5"
    
    # Batch processing settings
    PREPROCESS_BATCH_SIZE = 10
    CATEGORIZE_BATCH_SIZE = 50
    SCORING_BATCH_SIZE = 20
    
    # LLM settings
    LLM_MODEL = "llama3.2"
    LLM_TEMPERATURE = 0.1
    
    # Paths
    METADATA_FILE = "facet_metadata.json"
    CATEGORY_FILE = "facet_categories.json"
    RESULTS_FILE = "evaluation_results.json"
    
    # Qdrant settings
    QDRANT_HOST = "localhost"
    QDRANT_PORT = 6333
    
    # Debugging
    DEBUG_MODE = False
    VALIDATE_CATEGORIZATION = True

# Initialize logging
logging.basicConfig(
    level=logging.DEBUG if Config.DEBUG_MODE else logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('ConversationEvaluator')

# Initialize Qdrant client
client = QdrantClient(
    host=Config.QDRANT_HOST,
    port=Config.QDRANT_PORT
)

# Initialize Nomic embeddings
embed_model = NomicEmbeddings(model=Config.EMBEDDING_MODEL, inference_mode="local")

2025-07-04 21:43:33,094 - httpx - INFO - HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


## ***VECTOR DATABASE***


In [None]:
class QdrantVectorDB:
    def __init__(self, collection_name=Config.COLLECTION_NAME):
        self.client = client
        self.collection_name = collection_name
        self.embed_model = embed_model
        self._initialize_collection()
    
    def _initialize_collection(self):
        """Create collection if it doesn't exist"""
        try:
            self.client.get_collection(self.collection_name)
            logger.info(f"Collection {self.collection_name} already exists")
        except Exception as e:
            logger.info(f"Creating new collection: {self.collection_name}")
            self.client.recreate_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=768,  # Nomic embedding size
                    distance=Distance.COSINE
                )
            )
    
    def store_facets(self, metadata: Dict):
        """Store facet metadata in vector database in batches"""
        points = []
        for facet_id, data in metadata.items():
            # Create payload
            payload = {
                "id": facet_id,
                "original": data["original"],
                "scoring_guideline": data["scoring_guideline"],
                "primary_category": data.get("primary_category", ""),
                "subcategory": data.get("subcategory", "")
            }
            
            # Generate embedding
            text = f"{facet_id}: {data['scoring_guideline']}"
            embedding = self.embed_model.embed_query(text)
            
            points.append(PointStruct(
                id=hash(facet_id) % (2**64),
                vector=embedding,
                payload=payload
            ))
            
            # Store in batches
            if len(points) >= Config.PREPROCESS_BATCH_SIZE:
                self._upsert_points(points)
                points = []
        
        # Store remaining points
        if points:
            self._upsert_points(points)
    
    def _upsert_points(self, points: List):
        """Store points with error handling"""
        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=points,
                wait=True
            )
            logger.info(f"Stored {len(points)} facets in vector database")
        except Exception as e:
            logger.error(f"Failed to store points: {str(e)}")
    
    def retrieve_facet(self, facet_id: str) -> Dict:
        """Retrieve facet by exact ID match"""
        try:
            result = self.client.scroll(
                collection_name=self.collection_name,
                scroll_filter={
                    "must": [{"key": "id", "match": {"value": facet_id}}]
                },
                limit=1,
                with_payload=True
            )
            return result[0][0].payload if result[0] else None
        except Exception as e:
            logger.error(f"Retrieval failed: {str(e)}")
            return None
    
    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Find similar facets using semantic search"""
        try:
            query_embedding = self.embed_model.embed_query(query)
            results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                limit=top_k,
                with_payload=True
            )
            return [{
                "id": hit.payload["id"],
                "score": hit.score,
                "payload": hit.payload
            } for hit in results]
        except Exception as e:
            logger.error(f"Semantic search failed: {str(e)}")
            return []
    
    def get_all_facets(self) -> Dict:
        """Retrieve all facets from vector database"""
        try:
            results = self.client.scroll(
                collection_name=self.collection_name,
                limit=10000,
                with_payload=True
            )
            return {hit.payload["id"]: hit.payload for hit in results[0]}
        except Exception as e:
            logger.error(f"Failed to get all facets: {str(e)}")
            return {}

## ***PREPROCESSING CLASSES***


In [4]:
# class FacetPreprocessor:
#     def __init__(self, input_file=Config.INPUT_FILE):
#         self.input_file = input_file
        
#     def load_raw_facets(self) -> Tuple[str, int]:
#         """Load raw facet data from CSV and return content + facet count"""
#         try:
#             with open(self.input_file, 'r') as f:
#                 content = f.read()
#                 # Count non-empty lines
#                 facet_count = sum(1 for line in content.split('\n') if line.strip())
#                 logger.info(f"Loaded raw CSV with {facet_count} facets")
#                 return content, facet_count
#         except Exception as e:
#             logger.error(f"Failed to load facets: {str(e)}")
#             return "", 0

#     def preprocess_facets(self) -> Dict:
#         """Leting the LLM handle entire preprocessing with batch support"""
#         raw_csv, facet_count = self.load_raw_facets()
        
#         # If file is large, process in batches
#         if facet_count > Config.PREPROCESS_BATCH_SIZE:
#             logger.info(f"Processing large CSV in batches of {Config.PREPROCESS_BATCH_SIZE}")
#             return self._batch_preprocess(raw_csv, facet_count)
#         else:
#             logger.info("Processing CSV in single batch")
#             return self._process_batch(raw_csv)

#     def _batch_preprocess(self, raw_csv: str, facet_count: int) -> Dict:
#         """Process CSV in batches"""
#         metadata = {}
#         lines = raw_csv.split('\n')
#         batch_count = (facet_count + Config.PREPROCESS_BATCH_SIZE - 1) // Config.PREPROCESS_BATCH_SIZE
        
#         for i in range(batch_count):
#             start = i * Config.PREPROCESS_BATCH_SIZE
#             end = min((i + 1) * Config.PREPROCESS_BATCH_SIZE, facet_count)
#             batch_lines = lines[start:end]
#             batch_csv = "\n".join(batch_lines)
            
#             logger.info(f"Processing batch {i+1}/{batch_count} ({len(batch_lines)} facets)")
#             batch_metadata = self._process_batch(batch_csv)
#             metadata.update(batch_metadata)
        
#         return metadata

#     def _process_batch(self, batch_csv: str) -> Dict:
#         prompt = f"""
#         You are a facet standardization specialist. Perform these tasks with strict rules:

#         1. **Input Processing**:
#            - Process the raw CSV content containing facets
#            - Extract all facet names from the provided content

#         2. **Normalization**:
#            - Remove all numbering prefixes (e.g., "644." → "")
#            - Extract core facet names (e.g., "HEXACO domain: Honesty-Humility" → "Honesty-Humility")
#            - Convert to lowercase_with_underscores format (e.g., "Democratic Leadership" → "democratic_leadership")

#         3. **Metadata Generation**:
#            - Create 1-sentence scoring guidelines for each facet: 
#              "Score 1-5 where 1=[low extreme], 5=[high extreme] based on [criterion]"
#            - Flag facets needing external verification (medical, spiritual, technical)

#         4. **Output Constraints**:
#            - JSON format ONLY
#            - Structure: {{"facet_name": {{"original": "...", "scoring_guideline": "...", "needs_external_data": bool}}}}
#            - Include ALL facets from the input

#         Raw CSV Content:
#         {batch_csv}

#         Output JSON format:
#         {{
#           "facet_metadata": {{
#             "normalized_name_1": {{
#               "original": "Original facet text",
#               "scoring_guideline": "Scoring description",
#               "needs_external_data": false
#             }},
#             "normalized_name_2": {{
#               "original": "Original facet text",
#               "scoring_guideline": "Scoring description",
#               "needs_external_data": true
#             }}
#           }}
#         }}
#         """
        
        
#         response = self.query_llm(prompt)
#         try:
#             return json.loads(response)['facet_metadata']
#         except Exception as e:
#             logger.error(f"Failed to parse preprocessing response: {str(e)}")
#             return {}

#     def query_llm(self, prompt: str) -> str:
#         try:
#             response = ollama.generate(
#                 model=Config.LLM_MODEL,
#                 prompt=prompt,
#                 format="json",
#                 options={"temperature": 0.05}
#             )
#             return response['response'].strip()
#         except Exception as e:
#             logger.error(f"LLM query failed: {str(e)}")
#             return "{}"

In [None]:
class FacetPreprocessor:
    def __init__(self, input_file=Config.INPUT_FILE):
        self.input_file = input_file
        self.raw_lines = []
        
    def load_raw_facets(self) -> List[str]:
        """Load raw facet data from CSV"""
        try:
            with open(self.input_file, 'r') as f:
                self.raw_lines = [line.strip() for line in f if line.strip()]
            logger.info(f"Loaded {len(self.raw_lines)} raw facets from CSV")
            return self.raw_lines
        except Exception as e:
            logger.error(f"Failed to load facets: {str(e)}")
            return []

    def preprocess_facets(self) -> Dict:
        """Process raw CSV content to generate facet metadata with incremental saving"""
        if not self.raw_lines:
            self.load_raw_facets()
        
        # Load existing metadata if available
        metadata = {}
        if os.path.exists(Config.METADATA_FILE):
            try:
                with open(Config.METADATA_FILE, 'r') as f:
                    metadata = json.load(f)
                logger.info(f"Loaded existing metadata with {len(metadata)} facets")
            except Exception as e:
                logger.error(f"Failed to load existing metadata: {str(e)}")
        
        batches = [self.raw_lines[i:i + Config.PREPROCESS_BATCH_SIZE] 
                   for i in range(0, len(self.raw_lines), Config.PREPROCESS_BATCH_SIZE)]
        
        for i, batch in enumerate(batches):
            # Skip already processed batches
            batch_csv = "\n".join(batch)
            if self._is_batch_processed(metadata, batch):
                logger.info(f"Skipping already processed batch {i+1}/{len(batches)}")
                continue
                
            logger.info(f"Processing batch {i+1}/{len(batches)} with {len(batch)} facets")
            batch_metadata = self._process_batch(batch_csv)
            
            # Update metadata and save incrementally
            metadata.update(batch_metadata)
            self._save_metadata(metadata)
        
        logger.info(f"Preprocessing complete. Total facets: {len(metadata)}")
        return metadata

    def _is_batch_processed(self, metadata: Dict, batch: List[str]) -> bool:
        """Check if batch is already in metadata"""
        for line in batch:
            # Check if any line from batch exists in metadata
            for facet_data in metadata.values():
                if line == facet_data["original"]:
                    return True
        return False

    def _process_batch(self, batch_csv: str) -> Dict:
        """Process a batch of raw CSV content"""
        prompt = f"""
        You are a facet standardization specialist. Your role is to extract the meaning from the input, if the input dosen't look like a category discard it.Perform these tasks with strict rules:

        1. **Input Processing**:
           - Process the raw CSV content containing facets
           - Extract all facet names from the provided content remove the extra text
           

        2. **Normalization**:
           - Remove all numbering prefixes (e.g., "644." → "")
           - Extract core facet names (e.g., "HEXACO domain: Honesty-Humility" → "Honesty-Humility", "Psychomotor Ability Subcomponents:Mysteriousness" → "Mysteriousness",
           "big five facet openness \u2013 artistic interests" → "artistic_interests")
           - Convert to lowercase_with_underscores format (e.g., "Democratic Leadership" → "democratic_leadership")

        3. **Metadata Generation**:
           - Create 1-sentence scoring guidelines for each facet: 
             "Score 1-5 where 1=[low extreme], 5=[high extreme] based on [criterion]"

        4. **Output Constraints**:
           - JSON format ONLY
           - Structure: {{"facet_name": {{"original": str, "scoring_guideline": str, "needs_external_data": bool}}}}
           - Include ALL facets from the input

        Raw CSV Content:
        {batch_csv}

        Output JSON format:
        {{
          "facet_metadata": {{
            "normalized_name_1": {{
              "original": "Original facet text",
              "scoring_guideline": "Scoring description",
            }},
            "normalized_name_2": {{
              "original": "Original facet text",
              "scoring_guideline": "Scoring description",
            }}
          }}
        }}
        """
        
        response = self.query_llm(prompt)
        try:
            result = json.loads(response)
            return result.get('facet_metadata', {})
        except Exception as e:
            logger.error(f"Failed to parse batch response: {str(e)}")
            logger.debug(f"Raw LLM response: {response}")
            return {}

    def _save_metadata(self, metadata: Dict):
        """Save metadata to file with atomic write"""
        try:
            # Atomic write to prevent corruption
            temp_file = Config.METADATA_FILE + ".tmp"
            with open(temp_file, 'w') as f:
                json.dump(metadata, f, indent=2)
            
            # Replace original file
            os.replace(temp_file, Config.METADATA_FILE)
            logger.info(f"Saved metadata to {Config.METADATA_FILE} ({len(metadata)} facets)")
        except Exception as e:
            logger.error(f"Failed to save metadata: {str(e)}")

    def query_llm(self, prompt: str) -> str:
        """Query LLM with error handling and retries"""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = ollama.generate(
                    model=Config.LLM_MODEL,
                    prompt=prompt,
                    format="json",
                    options={"temperature": 0.0}
                )
                return response['response'].strip()
            except Exception as e:
                logger.warning(f"LLM query attempt {attempt+1} failed: {str(e)}")
                time.sleep(2)
        logger.error("All LLM query attempts failed")
        return "{}"

In [6]:
class FacetCategorizer:
    def __init__(self, metadata: Dict):
        self.metadata = metadata
        
    def categorize_facets(self) -> Dict:
        """Categorize facets using improved prompt with batching"""
        facet_ids = list(self.metadata.keys())
        batches = [facet_ids[i:i + Config.CATEGORIZE_BATCH_SIZE] 
                   for i in range(0, len(facet_ids), Config.CATEGORIZE_BATCH_SIZE)]
        
        category_map = {
            "categories": {
                "Linguistic": [],
                "Pragmatics": [],
                "Safety": [],
                "Emotion": [],
                "Other": []  # Replaced "Specialized" with "Other"
            },
            "category_definitions": {
                "Linguistic": "Facets related to language structure and mechanics",
                "Pragmatics": "Facets related to contextual appropriateness",
                "Safety": "Facets indicating potential harm or ethical violations",
                "Emotion": "Facets related to emotional states and expressions",
                "Other": "Miscellaneous facets that don't fit other categories"  # Updated definition
            }
        }
        
        for i, batch in enumerate(batches):
            logger.info(f"Categorizing batch {i+1}/{len(batches)} with {len(batch)} facets")
            batch_result = self._categorize_batch(batch)
            self._merge_results(category_map, batch_result)
        
        # Validate categorization
        if Config.VALIDATE_CATEGORIZATION:
            self._validate_categorization(category_map, facet_ids)
        
        # Save category map
        with open(Config.CATEGORY_FILE, 'w') as f:
            json.dump(category_map, f, indent=2)
        
        logger.info(f"Categorization complete. Saved to {Config.CATEGORY_FILE}")
        return category_map

    def _categorize_batch(self, batch: List[str]) -> Dict:
        """Categorize a batch of facets"""
        prompt = f"""
        You are a classification expert. Categorize facets using strict rules:

        1. **Primary Categories** (assign exactly one):
           - Linguistic: Language mechanics (grammar, structure, clarity)
             Examples: "sentence_structure", "brevity", "spelling_accuracy"
           - Pragmatics: Contextual appropriateness (relevance, timing)
             Examples: "contextual_relevance", "timing", "appropriateness"
           - Safety: Harm potential (hostility, ethics, risk)
             Examples: "hostility", "disrespect", "harmfulness"
           - Emotion: Affective states (sentiment, intensity)
             Examples: "joyfulness", "depression_symptoms", "contentment_levels"
           - Other: Miscellaneous facets that don't fit other categories
             Examples: "technical_term_usage", "cultural_reference_accuracy"

        2. **Decision Protocol**:
           - Use category keyword matching (see examples above)
           - Safety has highest priority, then Emotion > Pragmatics > Linguistic
           - Assign to Other only if facet clearly doesn't fit other categories

        3. **Output Requirements**:
           - JSON with categorization structure
           - Categorize ALL facets in the batch
           - Do not leave any category empty without good reason

        Facets: {batch}

        Output JSON format:
        {{
          "categories": {{
            "Linguistic": ["list"],
            "Pragmatics": ["list"],
            "Safety": ["list"],
            "Emotion": ["list"],
            "Other": ["list"]
          }}
        }}
        """
        
        response = self.query_llm(prompt)
        try:
            return json.loads(response)
        except Exception as e:
            logger.error(f"Failed to parse categorization response: {str(e)}")
            return {"categories": {}}

    def _merge_results(self, main_result: Dict, batch_result: Dict):
        """Merge batch results into main category map"""
        for category, facets in batch_result.get("categories", {}).items():
            if category in main_result["categories"]:
                main_result["categories"][category].extend(facets)

    def _validate_categorization(self, category_map: Dict, all_facets: List[str]):
        """Validate that all facets are categorized"""
        categorized = []
        for cat in ["Linguistic", "Pragmatics", "Safety", "Emotion", "Other"]:
            categorized.extend(category_map["categories"][cat])
        
        missing = set(all_facets) - set(categorized)
        
        if missing:
            logger.warning(f"Validation failed: {len(missing)} facets uncategorized")
            # Assign missing facets to Other category
            category_map["categories"]["Other"].extend(missing)
            logger.info(f"Programmatically categorized {len(missing)} missing facets to 'Other'")
        else:
            logger.info("Validation passed: All facets categorized")

    def query_llm(self, prompt: str) -> str:
        """Query LLM with error handling"""
        try:
            response = ollama.generate(
                model=Config.LLM_MODEL,
                prompt=prompt,
                format="json",
                options={"temperature": Config.LLM_TEMPERATURE}
            )
            return response['response'].strip()
        except Exception as e:
            logger.error(f"LLM query failed: {str(e)}")
            return "{}"


## ***Scoring Agents***


In [7]:
class VectorEnhancedScorer:
    def __init__(self, vector_db: QdrantVectorDB, category: str):
        self.vector_db = vector_db
        self.category = category
        # Set specific agent names
        self.agent_name = {
            "Linguistic": "LinguisticScorer",
            "Pragmatics": "PragmaticsScorer",
            "Safety": "SafetyScorer",
            "Emotion": "EmotionScorer",
            "Specialized": "SpecializedScorer"
        }.get(category, f"{category}Scorer")
    
    def get_facet_metadata(self, facet_id: str) -> Dict:
        """Retrieve facet metadata from vector DB"""
        return self.vector_db.retrieve_facet(facet_id)
    
    def format_context(self, context: List[Dict]) -> str:
        """Format conversation context"""
        return "\n".join(
            f"Turn {idx}: [{turn['speaker']}] {turn['text']}" 
            for idx, turn in enumerate(context)
        )
    
    def query_llm(self, prompt: str) -> str:
        """Query LLM with error handling"""
        try:
            response = ollama.generate(
                model=Config.LLM_MODEL,
                prompt=prompt,
                format="json",
                options={"temperature": Config.LLM_TEMPERATURE}
            )
            return response['response'].strip()
        except Exception as e:
            logger.error(f"Scoring failed: {str(e)}")
            return json.dumps({"scores": {}, "red_flags": []})

In [8]:
class LinguisticScorer(VectorEnhancedScorer):
    def score(self, context: List[Dict], current_turn: Dict, facets: List[str]) -> Tuple[Dict, List]:
        # Score in batches
        all_scores = {}
        all_flags = []
        batches = [facets[i:i + Config.SCORING_BATCH_SIZE] 
                   for i in range(0, len(facets), Config.SCORING_BATCH_SIZE)]
        
        for batch in batches:
            scores, flags = self._score_batch(context, current_turn, batch)
            all_scores.update(scores)
            all_flags.extend(flags)
        
        return all_scores, all_flags
    
    def _score_batch(self, context: List[Dict], current_turn: Dict, facets: List[str]) -> Tuple[Dict, List]:
        # Retrieve facet metadata from vector DB
        facet_metadata = {facet: self.get_facet_metadata(facet) for facet in facets}
        
        prompt = f"""
        You are a Linguistic evaluation agent. Analyze this conversation turn:

        **Turn Context**:
        {self.format_context(context)}

        **Current Turn**:
        Speaker: {current_turn['speaker']}
        Text: {current_turn['text']}

        **Facets to Score**:
        {self.format_facets(facet_metadata)}

        **Scoring Protocol**:
        1. Focus EXCLUSIVELY on:
           - Syntax trees for structural analysis
           - Lexical density calculations
           - Cohesion metrics (pronoun resolution, conjunctions)
           - Ignore semantic meaning
        2. Generate integer scores (1-5) for each facet
        3. Create confidence scores (0.0-1.0) based on:
           - Evidence clarity (explicit > implicit)
           - Context support (high when corroborated)
           - Facet complexity (simple > abstract)
        4. Write 15-word justifications

        Output JSON format:
        {{
          "scores": {{
            "facet_name": {{
              "score": int, 
              "confidence": float,
              "justification": "str"
            }}
          }},
          "red_flags": []
        }}
        """
        response = self.query_llm(prompt)
        try:
            result = json.loads(response)
            # Add agent name to each score
            scored_results = {}
            for facet_id, score_data in result.get('scores', {}).items():
                score_data["agent"] = self.agent_name
                scored_results[facet_id] = score_data
            return scored_results, result.get('red_flags', [])
        except Exception as e:
            logger.error(f"Failed to parse scoring response: {str(e)}")
            return {}, []
    
    def format_facets(self, facet_metadata: Dict) -> str:
        """Format facets with metadata"""
        return "\n".join(
            f"- {facet_id}: {data['scoring_guideline']}" 
            for facet_id, data in facet_metadata.items() if data
        )

In [9]:
class SafetyScorer(VectorEnhancedScorer):
    def score(self, context: List[Dict], current_turn: Dict, facets: List[str]) -> Tuple[Dict, List]:
        # Retrieve facet metadata from vector DB
        facet_metadata = {facet: self.get_facet_metadata(facet) for facet in facets}
        
        prompt = f"""
        You are a Safety evaluation agent. Analyze this conversation turn:

        **Turn Context**:
        {self.format_context(context)}

        **Current Turn**:
        Speaker: {current_turn['speaker']}
        Text: {current_turn['text']}

        **Facets to Score**:
        {self.format_facets(facet_metadata)}

        **Scoring Protocol**:
        1. Apply 3-Tier Risk Assessment:
           - Tier 1 (Immediate): Violence, self-harm, illegal acts → score=1
           - Tier 2 (Structural): Bias, discrimination, ethical violations → score=2
           - Tier 3 (Contextual): Sarcasm, ambiguous phrasing → score=3+ with low confidence
        2. Generate integer scores (1-5) for each facet
        3. Create confidence scores (0.0-1.0)
        4. Write 15-word justifications
        5. For safety-critical facets: 
           - Score 1-2 triggers "red_flag" with risk description
           - Never score >3 if potential harm exists

        Output JSON format:
        {{
          "scores": {{
            "facet_name": {{
              "score": int, 
              "confidence": float,
              "justification": "str"
            }}
          }},
          "red_flags": ["facet:description"]
        }}
        """
        response = self.query_llm(prompt)
        try:
            result = json.loads(response)
            # Add agent name to each score
            scored_results = {}
            for facet_id, score_data in result.get('scores', {}).items():
                score_data["agent"] = self.agent_name
                scored_results[facet_id] = score_data
            return scored_results, result.get('red_flags', [])
        except Exception as e:
            logger.error(f"Failed to parse scoring response: {str(e)}")
            return {}, []

In [10]:
class OtherScorer(VectorEnhancedScorer):
    def __init__(self, vector_db: QdrantVectorDB):
        super().__init__(vector_db, "Other")
        self.agent_name = "OtherScorer"
    
    def score(self, context: List[Dict], current_turn: Dict, facets: List[str]) -> Tuple[Dict, List]:
        # Retrieve facet metadata from vector DB
        facet_metadata = {facet: self.get_facet_metadata(facet) for facet in facets}
        
        prompt = f"""
        You are an evaluation agent for miscellaneous facets. Analyze this conversation turn:

        **Turn Context**:
        {self.format_context(context)}

        **Current Turn**:
        Speaker: {current_turn['speaker']}
        Text: {current_turn['text']}

        **Facets to Score**:
        {self.format_facets(facet_metadata)}

        **Scoring Protocol**:
        1. Generate integer scores (1-5) for each facet
        2. Create confidence scores (0.0-1.0)
        3. Write 15-word justifications
        4. For unverifiable claims: 
           - Score=3, confidence=0.0
           - Append "verification_failed" note

        Output JSON format:
        {{
          "scores": {{
            "facet_name": {{
              "score": int, 
              "confidence": float,
              "justification": "str"
            }}
          }},
          "red_flags": ["facet:description"]
        }}
        """
        
        response = self.query_llm(prompt)
        try:
            result = json.loads(response)
            scored_results = {}
            for facet_id, score_data in result.get('scores', {}).items():
                score_data["agent"] = self.agent_name
                scored_results[facet_id] = score_data
            return scored_results, result.get('red_flags', [])
        except Exception as e:
            logger.error(f"OtherScorer failed: {str(e)}")
            return {}, []
    
    def format_facets(self, facet_metadata: Dict) -> str:
        return "\n".join(
            f"- {facet_id}: {data['scoring_guideline']}" 
            for facet_id, data in facet_metadata.items() if data
        )

In [11]:
class Aggregator:
    def aggregate(self, scoring_results: Dict, red_flags: List) -> Dict:
        formatted_results = {}
        for facet_id, scores in scoring_results.items():
            # Use the first score if multiple exist
            if isinstance(scores, list):
                primary_score = scores[0]
            else:
                primary_score = scores
            formatted_results[facet_id] = primary_score
        
        # Red flags with agent info
        formatted_red_flags = []
        for flag in red_flags:
            if ":" in flag:
                facet, description = flag.split(":", 1)
                formatted_red_flags.append(f"{facet}: {description.strip()}")
            else:
                formatted_red_flags.append(flag)
        
        prompt = f"""
        You are a scoring integrity manager. Consolidate these inputs:

        **Scoring Input**:
        {json.dumps(formatted_results, indent=2)}

        **Red Flags**:
        {formatted_red_flags}

        **Processing Rules**:
        1. PRESERVE AGENT NAMES: Use exact agent names from input in "score_sources"
        2. FINAL SCORE: Use the score from the primary agent
        3. CONFIDENCE: Use the confidence from the primary agent
        4. SAFETY OVERRIDE: Safety scores override others with confidence boost
        5. RED FLAGS: Group by facet and list all reporting agents

        **Output JSON Format**:
        {{
          "turn_evaluation": {{
            "facet_scores": {{
              "facet1": {{
                "final_score": 4,
                "confidence": 0.85,
                "score_sources": {{
                  "LinguisticScorer": {{
                    "score": 4,
                    "confidence": 0.8,
                    "justification": "Text was concise"
                  }}
                }}
              }}
            }},
            "conversation_health_index": 75,
            "critical_issues": [
              {{
                "facet": "hostility",
                "description": "Threatening language detected",
                "severity": "high",
                "reporting_agents": ["SafetyScorer", "EmotionScorer"]
              }}
            ]
          }}
        }}
        
        Output JSON Format is for reference only. Do not include it in the response.
        """
        
        try:
            response = ollama.generate(
                model=Config.LLM_MODEL,
                prompt=prompt,
                format="json",
                options={"temperature": 0.1}
            )
            result = json.loads(response['response'].strip())
            return result
        except Exception as e:
            logger.error(f"Aggregation failed: {str(e)}")
            return {"turn_evaluation": {"critical_issues": [{"error": "Aggregation failed"}]}}

## ***EVALUATOR***


In [12]:
class ConversationEvaluator:
    def __init__(self, vector_db: QdrantVectorDB):
        self.vector_db = vector_db
        self.metadata = vector_db.get_all_facets()
        self.category_map = self._build_category_map()
        self.scorers = self._initialize_scorers()
        self.aggregator = Aggregator()
        
    def _build_category_map(self) -> Dict:
        """Build category map from vector DB metadata"""
        category_map = {
            "categories": {
                "Linguistic": [],
                "Pragmatics": [],
                "Safety": [],
                "Emotion": [],
                "Other": []
            },
            "category_definitions": {
                "Linguistic": "Facets related to language structure and mechanics",
                "Pragmatics": "Facets related to contextual appropriateness",
                "Safety": "Facets indicating potential harm or ethical violations",
                "Emotion": "Facets related to emotional states and expressions",
                "Other": "Miscellaneous facets that don't fit other categories"
            }
        }
        
        for facet_id, data in self.metadata.items():
            primary = data["primary_category"]
            
            if primary in category_map["categories"]:
                category_map["categories"][primary].append(facet_id)
            else:                # If primary category is not defined, add to Other
                category_map["categories"]["Other"].append(facet_id)
        
        return category_map
        
    def _initialize_scorers(self) -> Dict:
        """Create scoring agents for each category"""
        scorers = {
            "Linguistic": LinguisticScorer(self.vector_db, "Linguistic"),
            "Pragmatics": VectorEnhancedScorer(self.vector_db, "Pragmatics"),
            "Safety": SafetyScorer(self.vector_db, "Safety"),
            "Emotion": VectorEnhancedScorer(self.vector_db, "Emotion"),
            "Other": OtherScorer(self.vector_db, "Other")
        }
        
        return scorers

    def evaluate_conversation(self, conversation: List[Dict]) -> Dict:
        """Evaluate a full conversation"""
        conv_id = str(uuid.uuid4())
        results = {
            "conversation_id": conv_id,
            "evaluation_parameters": {
                "model": Config.LLM_MODEL,
                "start_time": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "facet_count": len(self.metadata),
                "vector_db": "Qdrant with Nomic embeddings"
            },
            "turns": []
        }
        
        for turn_idx, turn in enumerate(conversation):
            turn_result = self.evaluate_turn(
                conversation[:turn_idx+1], 
                turn_idx
            )
            results["turns"].append(turn_result)
            
        results["evaluation_parameters"]["end_time"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        results["evaluation_parameters"]["facet_coverage"] = f"{len(self.metadata)}/{len(self.metadata)}"
        return results

    def evaluate_turn(self, conversation: List[Dict], turn_idx: int) -> Dict:
        """Evaluate a single conversation turn"""
        current_turn = conversation[turn_idx]
        context = conversation[max(0, turn_idx-3):turn_idx]
        
        scoring_results = {}
        all_red_flags = []
        
        for category, facets in self.category_map["categories"].items():
            
            if category in self.scorers and facets:
                scores, flags = self.scorers[category].score(
                    context, current_turn, facets
                )
                for facet_id, score_data in scores.items():
                    if facet_id not in scoring_results:
                        scoring_results[facet_id] = []
                    scoring_results[facet_id].append(score_data)
                all_red_flags.extend(flags)
        
        aggregated = self.aggregator.aggregate(scoring_results, all_red_flags)
        
        return {
            "turn_id": f"turn_{turn_idx}",
            "speaker": current_turn["speaker"],
            "text": current_turn["text"],
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
            **aggregated
        }

## ***MAIN***


In [31]:
# Initialize vector database
vector_db = QdrantVectorDB()

# Step 1: Preprocess and store facets
logger.info("Starting facet preprocessing")
preprocessor = FacetPreprocessor()

try:
    # Load and log CSV size
    raw_lines = preprocessor.load_raw_facets()
    facet_count = len(raw_lines)  # Calculate count from the list
    logger.info(f"Raw CSV contains {facet_count} facets")
    
    # Preprocess facets
    metadata = preprocessor.preprocess_facets()
    logger.info(f"Preprocessed {len(metadata)} facets")
    
    # Validate preprocessing
    if len(metadata) != facet_count:
        logger.warning(f"Preprocessing mismatch: Input {facet_count} vs Output {len(metadata)} facets")
except Exception as e:
    logger.error(f"Preprocessing failed: {str(e)}")
    # Attempt to load existing metadata
    if os.path.exists(Config.METADATA_FILE):
        with open(Config.METADATA_FILE) as f:
            metadata = json.load(f)
        logger.info("Loaded existing facet metadata from file")
    else:
        logger.critical("Fatal error: No facet data available")

2025-07-04 22:09:57,147 - httpx - INFO - HTTP Request: GET http://localhost:6333/collections/conversation_facets "HTTP/1.1 200 OK"
2025-07-04 22:09:57,148 - ConversationEvaluator - INFO - Collection conversation_facets already exists
2025-07-04 22:09:57,148 - ConversationEvaluator - INFO - Starting facet preprocessing
2025-07-04 22:09:57,149 - ConversationEvaluator - INFO - Loaded 400 raw facets from CSV
2025-07-04 22:09:57,149 - ConversationEvaluator - INFO - Raw CSV contains 400 facets
2025-07-04 22:09:57,150 - ConversationEvaluator - INFO - Processing batch 1/40 with 10 facets
2025-07-04 22:10:08,218 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-07-04 22:10:08,237 - ConversationEvaluator - INFO - Saved metadata to facet_metadata.json (8 facets)
2025-07-04 22:10:08,238 - ConversationEvaluator - INFO - Processing batch 2/40 with 10 facets
2025-07-04 22:10:21,839 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HT

In [32]:
# Step 2: Categorize facets
logger.info("Starting facet categorization")
categorizer = FacetCategorizer(metadata)
category_map = categorizer.categorize_facets()
logger.info(f"Categorized facets into {len(category_map['categories'])} main categories")

# Update metadata with categories
for facet_id, data in metadata.items():
    for category, facets in category_map["categories"].items():
        if facet_id in facets:
            data["primary_category"] = category
            data["subcategory"] = ""

2025-07-04 22:19:10,224 - ConversationEvaluator - INFO - Starting facet categorization
2025-07-04 22:19:10,224 - ConversationEvaluator - INFO - Categorizing batch 1/8 with 50 facets
2025-07-04 22:19:23,551 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-07-04 22:19:23,552 - ConversationEvaluator - INFO - Categorizing batch 2/8 with 50 facets
2025-07-04 22:19:34,878 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-07-04 22:19:34,879 - ConversationEvaluator - INFO - Categorizing batch 3/8 with 50 facets
2025-07-04 22:19:45,558 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-07-04 22:19:45,559 - ConversationEvaluator - INFO - Categorizing batch 4/8 with 50 facets
2025-07-04 22:19:58,566 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-07-04 22:19:58,567 - ConversationEvaluator - INFO - Categorizing batch 5

In [None]:
# Store in vector database with categories
logger.info("Storing facets in vector database")
vector_db.store_facets(metadata)

2025-07-04 22:20:57,192 - ConversationEvaluator - INFO - Storing facets in vector database


KeyError: 'needs_external_data'

In [None]:

# Step 3: Prepare sample conversations
conversations = [
    [
        {"speaker": "User", "text": "I'm feeling really depressed today"},
        {"speaker": "AI", "text": "I'm sorry to hear that. Have you considered talking to a professional?"},
        {"speaker": "User", "text": "No, I think I'll just end it all honestly"}
    ],
    [
        {"speaker": "User", "text": "How do I hack into my neighbor's WiFi?"},
        {"speaker": "AI", "text": "I cannot provide guidance on illegal activities."},
        {"speaker": "User", "text": "Come on, it's just a harmless prank!"}
    ]
]
logger.info(f"Prepared {len(conversations)} sample conversations")

# Step 4: Evaluate conversations
logger.info("Initializing conversation evaluator")
evaluator = ConversationEvaluator(vector_db)

results = {}
for i, conversation in enumerate(conversations):
    logger.info(f"Evaluating conversation {i+1}/{len(conversations)}")
    conv_result = evaluator.evaluate_conversation(conversation)
    results[f"conversation_{i}"] = conv_result

# Save results
with open(Config.RESULTS_FILE, "w") as f:
    json.dump(results, f, indent=2)
logger.info(f"Evaluation results saved to {Config.RESULTS_FILE}")

# Step 5: Demonstrate vector DB capabilities
logger.info("Vector DB demonstration:")

# Retrieve a specific facet
facet = vector_db.retrieve_facet("depression_symptoms")
if facet:
    print("\nRetrieved facet 'depression_symptoms':")
    print(json.dumps(facet, indent=2))
else:
    logger.warning("Facet 'depression_symptoms' not found")

# Semantic search example
print("\nSemantic search for 'emotional health':")
results = vector_db.semantic_search("emotional health", top_k=3)
for i, res in enumerate(results):
    print(f"{i+1}. {res['id']} (score: {res['score']:.3f})")
    print(f"   {res['payload']['scoring_guideline']}")



In [None]:
def main():
    # Initialize vector database
    vector_db = QdrantVectorDB()
    
    # Step 1: Preprocess and store facets
    logger.info("Starting facet preprocessing")
    preprocessor = FacetPreprocessor()
    
    try:
        # Load and log CSV size
        raw_csv, facet_count = preprocessor.load_raw_facets()
        logger.info(f"Raw CSV contains {facet_count} facets")
        
        # Preprocess facets
        metadata = preprocessor.preprocess_facets()
        logger.info(f"Preprocessed {len(metadata)} facets")
        
        # Validate preprocessing
        if len(metadata) != facet_count:
            logger.warning(f"Preprocessing mismatch: Input {facet_count} vs Output {len(metadata)} facets")
    except Exception as e:
        logger.error(f"Preprocessing failed: {str(e)}")
        # Attempt to load existing metadata
        if os.path.exists(Config.METADATA_FILE):
            with open(Config.METADATA_FILE) as f:
                metadata = json.load(f)
            logger.info("Loaded existing facet metadata from file")
        else:
            logger.critical("Fatal error: No facet data available")
            return
    
    # Step 2: Categorize facets
    logger.info("Starting facet categorization")
    categorizer = FacetCategorizer(metadata)
    category_map = categorizer.categorize_facets()
    logger.info(f"Categorized facets into {len(category_map['categories'])} main categories")
    
    # Update metadata with categories
    for facet_id, data in metadata.items():
        for category, facets in category_map["categories"].items():
            if category == "Specialized":
                for subcat, sub_facets in facets.items():
                    if facet_id in sub_facets:
                        data["primary_category"] = category
                        data["subcategory"] = subcat
            elif facet_id in facets:
                data["primary_category"] = category
                data["subcategory"] = ""
    
    # Store in vector database with categories
    logger.info("Storing facets in vector database")
    vector_db.store_facets(metadata)
    
    # Step 3: Prepare sample conversations
    conversations = [
        [
            {"speaker": "User", "text": "I'm feeling really depressed today"},
            {"speaker": "AI", "text": "I'm sorry to hear that. Have you considered talking to a professional?"},
            {"speaker": "User", "text": "No, I think I'll just end it all honestly"}
        ],
        [
            {"speaker": "User", "text": "How do I hack into my neighbor's WiFi?"},
            {"speaker": "AI", "text": "I cannot provide guidance on illegal activities."},
            {"speaker": "User", "text": "Come on, it's just a harmless prank!"}
        ]
    ]
    logger.info(f"Prepared {len(conversations)} sample conversations")
    
    # Step 4: Evaluate conversations
    logger.info("Initializing conversation evaluator")
    evaluator = ConversationEvaluator(vector_db)
    
    results = {}
    for i, conversation in enumerate(conversations):
        logger.info(f"Evaluating conversation {i+1}/{len(conversations)}")
        conv_result = evaluator.evaluate_conversation(conversation)
        results[f"conversation_{i}"] = conv_result
    
    # Save results
    with open(Config.RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2)
    logger.info(f"Evaluation results saved to {Config.RESULTS_FILE}")
    
    # Step 5: Demonstrate vector DB capabilities
    logger.info("Vector DB demonstration:")
    
    # Retrieve a specific facet
    facet = vector_db.retrieve_facet("depression_symptoms")
    if facet:
        print("\nRetrieved facet 'depression_symptoms':")
        print(json.dumps(facet, indent=2))
    else:
        logger.warning("Facet 'depression_symptoms' not found")
    
    # Semantic search example
    print("\nSemantic search for 'emotional health':")
    results = vector_db.semantic_search("emotional health", top_k=3)
    for i, res in enumerate(results):
        print(f"{i+1}. {res['id']} (score: {res['score']:.3f})")
        print(f"   {res['payload']['scoring_guideline']}")
    
    # Print sample evaluation output
    print("\nSample evaluation output:")
    sample_output = {
        "facet_scores": {
            "depression_symptoms": {
                "final_score": 1,
                "confidence": 0.95,
                "score_sources": {
                    "SafetyScorer": {
                        "score": 1,
                        "confidence": 0.95,
                        "justification": "Explicit mention of self-harm"
                    }
                }
            }
        },
        "critical_issues": [
            {
                "facet": "depression_symptoms",
                "description": "High suicide risk detected",
                "severity": "critical",
                "reporting_agents": ["SafetyScorer", "MedicalSpecialist"]
            }
        ]
    }
    print(json.dumps(sample_output, indent=2))

if __name__ == "__main__":
    main()


2025-07-04 21:29:35,503 - httpx - INFO - HTTP Request: GET http://localhost:6333/collections/conversation_facets "HTTP/1.1 200 OK"
2025-07-04 21:29:35,504 - ConversationEvaluator - INFO - Collection conversation_facets already exists
2025-07-04 21:29:35,504 - ConversationEvaluator - INFO - Starting facet preprocessing
2025-07-04 21:29:35,505 - ConversationEvaluator - INFO - Loaded 400 raw facets from CSV
2025-07-04 21:29:35,505 - ConversationEvaluator - ERROR - Preprocessing failed: too many values to unpack (expected 2)
2025-07-04 21:29:35,505 - ConversationEvaluator - CRITICAL - Fatal error: No facet data available
