## Main Code

### 0. Libraries

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
import json
import time
import re
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

### 1. Configuration & Naming Standards

In [None]:
# ==========================================
# 1. CONFIGURATION & NAMING STANDARDS
# ==========================================

# GCP project used by both the Vertex AI and BigQuery clients below;
# LOCATION is only passed to vertexai.init().
PROJECT_ID = "project-nirvana-405904"  # <--- REPLACE THIS
LOCATION = "us-central1"

# Versions: tags/suffixes interpolated into the dataset and table names below.
PROJECT_TAG = "csv"
TRANSCRIPTION_VERSION = "005" # <--- CHANGE THIS TO THE CURRENT Synthetic Transcripts TABLE VERSION
SOURCE_VERSION = "003" # <--- CHANGE THIS TO THE CURRENT Signals Derived TABLE VERSION
# NOTE(review): SCRIPT_VERSION is not referenced anywhere else in the visible code.
SCRIPT_VERSION = "001"
DESTINATION_TABLE_VERSION = "001"

# Resources: every table lives in DATASET_ID and follows the pattern
# vel_<PROJECT_TAG>_<table purpose>_<version>.
DATASET_ID = f"vel_{PROJECT_TAG}_schema"
SOURCE_TABLE = f"vel_{PROJECT_TAG}_derived_signals_{SOURCE_VERSION}"
DESTINATION_TABLE = f"vel_{PROJECT_TAG}_signals_validation_{DESTINATION_TABLE_VERSION}"
TRANSCRIPTS_TABLE = f"vel_{PROJECT_TAG}_synthetic_transcripts_{TRANSCRIPTION_VERSION}"

# Initialize Vertex AI (module-level side effect: runs when this cell executes).
# `model` is the shared Gemini handle used for every validation request.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel("gemini-2.5-flash")

# Initialize BigQuery: one shared client for table creation, queries and load jobs.
bq_client = bigquery.Client(project=PROJECT_ID)

### 2. Validator System Prompt (Strict Auditor)

In [None]:
# ==========================================
# 2. VALIDATOR SYSTEM PROMPT (Strict Auditor)
# ==========================================

# Instruction text that is prepended to every validation request.
# The keys listed under "JSON OUTPUT SCHEMA" are the keys this file reads back
# from the model's reply (see TranscriptValidator.validate_row) and the keys of
# the fallback dict in clean_json_response, so any rename here must be mirrored
# in both places.
VALIDATOR_SYSTEM_PROMPT = """
You are an automated **Shadow Analyst & Validator**: a forensic auditor for an AI Signal Extractor.
Your job is to evaluate if the Extractor correctly analyzed a YouTube Creator vs. SPM transcript AND if it missed any critical business insights.

You will receive a JSON payload containing:
1. "original_transcript": The full raw conversation.
2. "extracted_signals": An ARRAY of all structured signals generated by the Extractor Agent for this transcript.

REQUIREMENTS & EVALUATION PILLARS:
1. Verify Existing Signals:
   - For EACH signal in the "extracted_signals" array, check for Evidence validity (no hallucinations) and Actionability logic.
2. Detect Blind Spots (Completeness Check):
   - Read the "original_transcript" yourself.
   - Identify if there are any MAJOR, undeniable business signals (e.g., severe churn risk, distinct product bugs, explicit creator frustration) that the Extractor completely missed.
   - Do NOT penalize for missing trivial details, only critical business signals.

SCORING RULES:
- Evaluate "extraction_quality_score" (0.0 to 10.0) based on accuracy of the existing signals.
- Evaluate "completeness_score" (0.0 to 10.0). Deduct points heavily if major signals were missed.

JSON OUTPUT SCHEMA (MANDATORY):
{
  "transcript_is_valid": <boolean>,
  "extraction_quality_score": <float>,
  "completeness_score": <float>,
  "missed_signals_detected": <boolean>,
  "missed_signals_summary": ["<string: Description of a major signal that was missed>", ...],
  "auditor_feedback": "<string: Overall feedback on accuracy and completeness>"
}
"""

### 3. Helper Functions

In [None]:
# ==========================================
# 3. HELPER FUNCTIONS
# ==========================================

def parse_raw_transcript(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parse a raw transcript blob into a list of dialogue turns.

    Two input shapes are supported:
      1. JSON, optionally wrapped in a Markdown code fence. A JSON array is
         returned as-is; a single JSON object is wrapped in a one-element list.
      2. Plain text with one "Speaker: utterance" turn per line. Lines without
         a colon are skipped. Speaker labels containing "SPM" or "Creator" are
         normalized to exactly "SPM" / "Creator".

    Args:
        raw_text: The transcript text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of turns. For the line-by-line path each turn is
        ``{"role": <speaker>, "content": <utterance>}``.
    """
    if not raw_text:
        return []

    # Strip surrounding whitespace BEFORE removing the fence: both patterns are
    # anchored (^ / $), so stray leading/trailing whitespace would otherwise
    # leave the fence in place and break the JSON attempt.
    candidate = raw_text.strip()
    candidate = re.sub(r"^```(?:json)?\s*", "", candidate)
    candidate = re.sub(r"\s*```$", "", candidate)

    # Attempt 1: parse as pure JSON.
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        # Honor the declared list return type for a single-object payload.
        return [parsed]

    # Attempt 2 (fallback): parse line-by-line. Also reached when the text is
    # valid JSON but only a scalar (number, string, null), which is not a
    # usable transcript structure.
    dialogue = []
    for line in raw_text.splitlines():
        role, separator, content = line.partition(":")
        if not separator:
            continue
        role = role.strip()
        # Normalize roles for the validator.
        if "SPM" in role:
            role = "SPM"
        elif "Creator" in role:
            role = "Creator"
        dialogue.append({"role": role, "content": content.strip()})
    return dialogue

def initialize_destination_table():
    """
    Create the destination table (Table B) if it does not exist yet.

    Required so the LEFT JOIN in the fetch step does not fail on a missing
    table. Only a "not found" response triggers creation; any other failure
    from the lookup (permissions, network, ...) propagates to the caller
    instead of being mistaken for a missing table.
    """
    # Local import: the file's top-level imports only bring in GoogleAPIError.
    from google.api_core.exceptions import NotFound

    table_ref = f"{PROJECT_ID}.{DATASET_ID}.{DESTINATION_TABLE}"

    # Schema of the standalone validation table.
    schema = [
        bigquery.SchemaField("transcript_id", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("transcript_is_valid", "BOOLEAN"),
        bigquery.SchemaField("extraction_quality_score", "FLOAT"),
        bigquery.SchemaField("completeness_score", "FLOAT"),
        bigquery.SchemaField("missed_signals_detected", "BOOLEAN"),
        bigquery.SchemaField("missed_signals_summary", "STRING", mode="REPEATED"),  # array of strings
        bigquery.SchemaField("validation_report", "JSON"),
        bigquery.SchemaField("audit_timestamp", "TIMESTAMP"),
        bigquery.SchemaField("spm_name", "STRING"),
        bigquery.SchemaField("creator_id", "STRING"),
        bigquery.SchemaField("creator_region", "STRING"),
    ]

    try:
        bq_client.get_table(table_ref)
    except NotFound:
        print(f"‚ö†Ô∏è Destination table not found. Creating {DESTINATION_TABLE}...")
        table = bigquery.Table(table_ref, schema=schema)
        # exists_ok tolerates another run creating the table between the
        # lookup above and this call.
        bq_client.create_table(table, exists_ok=True)
        print("‚úÖ Table created successfully.")
    else:
        print(f"‚úÖ Destination table {DESTINATION_TABLE} exists.")

def clean_json_response(response_text: str) -> Dict[str, Any]:
    """
    Parse the model's reply into a dict, tolerating a Markdown code fence.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON object. If the text is not a string, is not valid
        JSON, or decodes to something other than a JSON object, a fallback
        report is returned whose keys match the output schema in the system
        prompt (all scores 0.0, ``transcript_is_valid`` False).
    """
    try:
        if not isinstance(response_text, str):
            raise TypeError(
                f"expected str, got {type(response_text).__name__}"
            )
        # Strip first: the fence patterns are anchored, so leading/trailing
        # whitespace would otherwise stop them from matching.
        cleaned = response_text.strip()
        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
        cleaned = re.sub(r"\s*```$", "", cleaned).strip()
        parsed = json.loads(cleaned)
        if not isinstance(parsed, dict):
            # Callers read the result with .get(); a list or scalar is unusable.
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed
    except (TypeError, ValueError) as e:  # json.JSONDecodeError is a ValueError
        print(f"‚ùå JSON Parsing Error: {e}")
        # The fallback must match the schema in the current system prompt.
        return {
            "transcript_is_valid": False,
            "extraction_quality_score": 0.0,
            "completeness_score": 0.0,
            "missed_signals_detected": False,
            "missed_signals_summary": ["Validator Output Malformed"],
            "auditor_feedback": "Error parsing LLM response."
        }

### 4. Core Logic: The Validator Class

In [None]:
# ==========================================
# 4. Core Logic: The Validator Class
# ==========================================

class TranscriptValidator:
    """
    Sends one transcript plus its extracted signals to the model for auditing
    and turns the reply into a row shaped for the destination table.
    """

    def __init__(self):
        # Number of rows for which validate_row produced a result.
        self.processed_count = 0
        # Kept for interface compatibility; not populated by this class.
        self.validation_results = []

    @staticmethod
    def _as_bool(value: Any) -> bool:
        """Coerce a model-supplied value to a bool ("false"/"no"/"0" -> False)."""
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            return value.strip().lower() in {"true", "yes", "1"}
        if isinstance(value, (int, float)):
            return bool(value)
        return False

    @staticmethod
    def _as_score(value: Any) -> float:
        """Coerce a model-supplied value to a finite float, defaulting to 0.0."""
        import math

        try:
            score = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN / Infinity cannot be serialized as valid JSON for the load job.
        return score if math.isfinite(score) else 0.0

    @staticmethod
    def _as_string_list(value: Any) -> List[str]:
        """Coerce a model-supplied value to a list of strings (None -> [])."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    def validate_row(self, row) -> Optional[Dict[str, Any]]:
        """
        Audit one grouped transcript row.

        Args:
            row: Mapping-like row providing "transcript_id", "raw_transcript"
                and "extracted_signals" (an iterable of mapping-like signals).

        Returns:
            A dict with the audit fields for the destination table, or None
            if the request or the handling of the reply raised an exception
            (the error is printed and the row is skipped).
        """
        from datetime import timezone

        conv_id = row["transcript_id"]
        raw_text = row["raw_transcript"]

        # Convert the BQ array of structs to a list of plain dictionaries.
        signals_array = [dict(s) for s in row["extracted_signals"]]

        payload = {
            "original_transcript": raw_text,
            "extracted_signals": signals_array
        }

        try:
            prompt = f"{VALIDATOR_SYSTEM_PROMPT}\n\nUSER INPUT:\n```json\n{json.dumps(payload)}\n```"

            response = model.generate_content(
                prompt,
                generation_config=GenerationConfig(
                    temperature=0.0,
                    response_mime_type="application/json"
                )
            )

            audit_report = clean_json_response(response.text)

            # Coerce every model-supplied field to the destination column
            # type: all results are uploaded in a single load job, so one
            # wrongly typed value would otherwise fail the whole batch.
            result = {
                "transcript_id": conv_id,
                "transcript_is_valid": self._as_bool(audit_report.get("transcript_is_valid", False)),
                "extraction_quality_score": self._as_score(audit_report.get("extraction_quality_score", 0.0)),
                "completeness_score": self._as_score(audit_report.get("completeness_score", 0.0)),
                "missed_signals_detected": self._as_bool(audit_report.get("missed_signals_detected", False)),
                "missed_signals_summary": self._as_string_list(audit_report.get("missed_signals_summary", [])),
                "validation_report": json.dumps(audit_report),
                # Timezone-aware UTC; datetime.utcnow() is deprecated and naive.
                "audit_timestamp": datetime.now(timezone.utc).isoformat()
            }

        except Exception as e:
            # Best effort by design: one failing transcript must not stop the run.
            print(f"‚ö†Ô∏è Error validating {conv_id}: {str(e)}")
            return None

        self.processed_count += 1
        return result

### 5. BigQuery Upload

In [None]:
# ==========================================
# 5. BIGQUERY UPLOAD (DELTA LOGIC)
# ==========================================

def fetch_pending_transcripts(limit: int = 100):
    """
    Select transcripts from Source (A) that are NOT yet present in Destination (B).

    Pattern: LEFT JOIN ... WHERE B.id IS NULL. Signals are grouped per
    transcript, so each returned row carries the transcript text plus an
    ``extracted_signals`` array of structs.

    Args:
        limit: Maximum number of transcripts to return. Must be convertible
            to a non-negative integer.

    Returns:
        The row iterator produced by running the query.

    Raises:
        ValueError: If ``limit`` is negative or not a valid integer.
        TypeError: If ``limit`` cannot be converted to an integer at all.
    """
    # The value is interpolated into the SQL text, so force it to a plain
    # integer first; nothing but digits can reach the query.
    row_limit = int(limit)
    if row_limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit!r}")

    # NOTE(review): creator_id is not selected here, yet the main cell reads
    # row.get("creator_id") and the destination table has a creator_id column,
    # so that column is always NULL. Confirm whether the transcripts table
    # exposes creator_id before adding it to SELECT / GROUP BY.
    # NOTE(review): LIMIT without ORDER BY - which pending transcripts are
    # picked on a given run is unspecified.
    query = f"""
        SELECT
            orig.conversation_id AS transcript_id,
            orig.raw_transcript,
            orig.spm_name,
            orig.creator_region,
            ARRAY_AGG(
                STRUCT(
                    ext.signal_name,
                    ext.signal_category,
                    ext.signal_actionability,
                    ext.signal_description,
                    ext.signal_evidence,
                    ext.spm_score,
                    ext.spm_reasoning
                )
            ) as extracted_signals
        FROM `{PROJECT_ID}.{DATASET_ID}.{SOURCE_TABLE}` ext
        JOIN `{PROJECT_ID}.{DATASET_ID}.{TRANSCRIPTS_TABLE}` orig
          ON ext.transcript_id = orig.conversation_id
        LEFT JOIN `{PROJECT_ID}.{DATASET_ID}.{DESTINATION_TABLE}` val
          ON ext.transcript_id = val.transcript_id
        WHERE val.transcript_id IS NULL
        GROUP BY orig.conversation_id, orig.raw_transcript, orig.spm_name, orig.creator_region
        LIMIT {row_limit}
    """
    print("üì• Fetching pending grouped transcripts (Delta Load)...")
    return bq_client.query(query).result()

def upload_results_to_bq(results: List[Dict[str, Any]]) -> bool:
    """
    Append validated rows to the destination table with a single load job.

    Args:
        results: Rows to upload, keyed by destination column name.

    Returns:
        True if there was nothing to upload or the load job completed;
        False if the upload failed (the error and any job-level error
        details are printed). Callers that ignore the return value behave
        as before.
    """
    if not results:
        return True

    table_ref = f"{PROJECT_ID}.{DATASET_ID}.{DESTINATION_TABLE}"

    # Schema must match initialize_destination_table.
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_APPEND",
        schema=[
            bigquery.SchemaField("transcript_id", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("transcript_is_valid", "BOOLEAN"),
            bigquery.SchemaField("extraction_quality_score", "FLOAT"),
            bigquery.SchemaField("completeness_score", "FLOAT"),
            bigquery.SchemaField("missed_signals_detected", "BOOLEAN"),
            bigquery.SchemaField("missed_signals_summary", "STRING", mode="REPEATED"),  # array of strings
            bigquery.SchemaField("validation_report", "JSON"),
            bigquery.SchemaField("audit_timestamp", "TIMESTAMP"),
            bigquery.SchemaField("spm_name", "STRING"),
            bigquery.SchemaField("creator_id", "STRING"),
            bigquery.SchemaField("creator_region", "STRING")
        ]
    )

    job = None
    try:
        job = bq_client.load_table_from_json(results, table_ref, job_config=job_config)
        job.result()  # Wait for completion
    except Exception as e:
        # Best effort by design (the run is not aborted), but surface every
        # detail available: the exception text alone rarely names the bad row.
        print(f"‚ùå Upload failed: {e}")
        for detail in (getattr(job, "errors", None) or []):
            print(f"    - {detail}")
        return False

    print(f"‚úÖ Saved {len(results)} audited records to {DESTINATION_TABLE}")
    return True

## Run Main

In [None]:
if __name__ == "__main__":
    # Entry point: create the destination table if needed, fetch the
    # transcripts not yet audited, validate each one, upload the results in
    # one batch, then print timing information.
    import sys  # Needed for the clean early exit below
    start_time = time.perf_counter()

    print("üöÄ Starting Validator Pipeline (Delta Strategy)...")

    # 1. Initialize Destination Table (crucial: the fetch query joins on it)
    initialize_destination_table()

    validator = TranscriptValidator()

    # 2. Fetch Data (The Delta)
    rows = fetch_pending_transcripts(limit=120)
    rows_list = list(rows)

    if not rows_list:
        print("üò¥ No pending signals found (All rows in Source are already in Destination).")
        sys.exit()

    print(f"üîç Found {len(rows_list)} new signals to validate.")

    batch_results = []
    failed_ids = []  # transcripts whose validation returned no result

    # 3. Iterate and Validate
    for row in rows_list:
        result = validator.validate_row(row)

        if result:
            # Carry the descriptive columns over from the source row.
            result["spm_name"] = row.get("spm_name")
            # NOTE(review): the query in fetch_pending_transcripts does not
            # select creator_id, so this is always None as written.
            result["creator_id"] = row.get("creator_id")
            result["creator_region"] = row.get("creator_region")

            batch_results.append(result)

            # logging
            status_icon = "‚úÖ" if result["transcript_is_valid"] else "‚ùå"
            print(f"{status_icon} [{result['transcript_id']}] Q-Score: {result['extraction_quality_score']} | Completeness: {result['completeness_score']}")
        else:
            failed_ids.append(row["transcript_id"])

        # Rate Limiting
        time.sleep(0.5)

    end_time_sd = time.perf_counter()

    # 4. Upload Results
    upload_ok = True
    if batch_results:
        # `is not False` keeps this correct whether the uploader returns a
        # success flag or nothing at all.
        upload_ok = upload_results_to_bq(batch_results) is not False

    end_time = time.perf_counter()

    if failed_ids:
        # These rows were not written to the destination, so the delta query
        # will select them again on the next run.
        print(f"‚ö†Ô∏è {len(failed_ids)} transcript(s) could not be validated and were skipped: {failed_ids}")

    if upload_ok:
        print("üèÅ Pipeline finished successfully.")
    else:
        print("‚ùå Pipeline finished, but the upload failed; no results were saved.")

    duration_sd = str(timedelta(seconds=end_time_sd - start_time))
    duration_bq = str(timedelta(seconds=end_time - end_time_sd))
    duration_total = str(timedelta(seconds=end_time - start_time))

    print(f"Validation execution time: {duration_sd}")
    print(f"Upload execution time: {duration_bq}")
    print(f"Total execution time: {duration_total}")

üöÄ Starting Validator Pipeline (Delta Strategy)...
‚ö†Ô∏è Destination table not found. Creating vel_csv_signals_validation_001...
‚úÖ Table created successfully.
üì• Fetching pending grouped transcripts (Delta Load)...
üîç Found 72 new signals to validate.
‚úÖ [1_0_1771463705] Q-Score: 9.5 | Completeness: 9.8
‚úÖ [28_0_1771465305] Q-Score: 10.0 | Completeness: 6.5
‚úÖ [41_0_1771466063] Q-Score: 10.0 | Completeness: 8.0
‚úÖ [72_0_1771467872] Q-Score: 5.0 | Completeness: 3.0
‚úÖ [72_1_1771467932] Q-Score: 9.5 | Completeness: 7.0
‚úÖ [8_0_1771464161] Q-Score: 9.5 | Completeness: 7.0
‚úÖ [8_1_1771464210] Q-Score: 10.0 | Completeness: 10.0
‚úÖ [30_0_1771465427] Q-Score: 10.0 | Completeness: 10.0
‚úÖ [39_0_1771465931] Q-Score: 9.5 | Completeness: 9.5
‚úÖ [3_0_1771463776] Q-Score: 9.5 | Completeness: 7.5
‚úÖ [3_1_1771463831] Q-Score: 10.0 | Completeness: 5.0
‚úÖ [14_0_1771464510] Q-Score: 5.0 | Completeness: 6.0
‚úÖ [31_0_1771465461] Q-Score: 9.8 | Completeness: 6.5
‚úÖ [62_0_1771467201] 

On average, it takes ~17 seconds per transcript to validate all of its extracted signals and check for missed signals.