## Main Code

### 0. Libraries

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
import json
import time
import re
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

### 1. Configuration & Naming Standards

In [None]:
# ==========================================
# 1. CONFIGURATION & NAMING STANDARDS
# ==========================================
# Module-level settings shared by the cells below. Running this cell also
# initializes the Vertex AI SDK and creates the BigQuery client, so it must
# be executed before any cell that uses `model` or `bq_client`.

# GCP project and region passed to vertexai.init(); the project is also
# used for the BigQuery client and for every fully-qualified table name.
PROJECT_ID = "project-nirvana-405904"  # <--- REPLACE THIS
LOCATION = "us-central1"

# Naming components. PROJECT_TAG and SOURCE_VERSION are interpolated into the
# dataset/table names below.
# NOTE(review): SCRIPT_VERSION is not referenced by any code visible in this file.
PROJECT_TAG = "csv"
SCRIPT_VERSION = "002"
SOURCE_VERSION = "006" # <--- CHANGE THIS TO THE CURRENT TRANSCRIPTIONS TABLE VERSION

# Single Table Architecture: pending rows are read from TARGET_TABLE and the
# validation columns are written back to the same table.
DATASET_ID = f"vel_{PROJECT_TAG}_schema"
TARGET_TABLE = f"vel_{PROJECT_TAG}_synthetic_transcripts_{SOURCE_VERSION}"
STAGING_TABLE = f"{TARGET_TABLE}_staging_temp" # Temporary table for MERGE

# Passed as the row limit to fetch_pending_transcripts() in the run cell, so
# it caps how many transcripts are validated (and then merged) per run.
BATCH_SIZE = 200

# Initialize Vertex AI and create the Gemini model handle used for validation.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel("gemini-2.5-flash")

# Initialize BigQuery client shared by the fetch and MERGE helpers.
bq_client = bigquery.Client(project=PROJECT_ID)

### 2. Validator System Prompt (Strict Auditor)

In [None]:
# ==========================================
# 2. VALIDATOR SYSTEM PROMPT (Strict Auditor)
# ==========================================
# Instruction text that TranscriptValidator.validate_row() prepends to every
# request, followed by the JSON payload for one conversation.
#
# The keys in the JSON OUTPUT SCHEMA section are a contract with the Python
# side: validate_row() reads "hallucination_flag", "Realism_Human",
# "Policy_Precision" and "Narrative_Coherence" from the model's reply and
# computes the final score itself. Renaming a key here requires the same
# change there.
# NOTE(review): the prompt lists five detectors (A-E) but asks for "three
# dimensions"; only detectors D and E state a 1.0-5.0 scale explicitly.

VALIDATOR_SYSTEM_PROMPT = """
You are an automated **Agent Validator**: a strict, forensic auditor for synthetic conversational transcripts between:
 - Agent A: "YouTube Strategic Partner Manager" (SPM)
 - Agent B: "YouTube Creator"

Your job: read exactly one JSON payload (transcript + metadata) provided as the user message content, evaluate it against the rules below, and output **only** a single JSON object that strictly follows the JSON Output Schema.

REQUIREMENTS
1. Parse input JSON; expect keys: "transcript", "meta", "conversation_id".
2. Run the following detectors:
   A. Hallucination Detector (PRODUCT & POLICY PROMISES): Flag impossible outcomes (e.g., "I will manually remove a strike in 5 minutes").
   B. Character Coherence Detector: Flag mismatch (e.g., SPM using insults).
   C. Policy Accuracy Detector: Check for obviously incorrect instructions.
   D. Conversational Realism Detector (Human-ness): Score naturalness (1.0-5.0). Penalize "Robot-talk", reward "Human noise".
   E. Structural Narrative Detector: Score logical start -> problem -> resolution flow (1.0-5.0).

SCORING RULES:
- Evaluate the three dimensions on a scale of 1.0 to 5.0.
- Set hallucination_flag to true if a hallucination/policy error is detected, otherwise false.

JSON OUTPUT SCHEMA (MANDATORY):
{
  "hallucination_flag": <boolean>,
  "detected_errors": [<string>, ...],
  "correction_suggestions": [<string>, ...],
  "Realism_Human": <float>,
  "Policy_Precision": <float>,
  "Narrative_Coherence": <float>
}
"""

### 3. Helper Functions

In [None]:
# ==========================================
# 3. HELPER FUNCTIONS
# ==========================================

def _normalize_dialogue_items(items: List[Any]) -> List[Dict[str, Any]]:
    """Turn a parsed JSON list into clean ``{"role", "content"}`` turns.

    Also repairs the "Split-Object Bug" from older LLM generations, where one
    turn was emitted as two objects: ``{"role": "role", "content": <speaker>}``
    followed by ``{"role": "content", "content": <text>}``.
    """
    normalized: List[Dict[str, Any]] = []
    pending_role = None

    for item in items:
        # Only mappings can carry role/content. Scalars, strings and nested
        # lists are skipped instead of raising TypeError on the key lookup.
        if not isinstance(item, dict):
            continue
        if "role" not in item or "content" not in item:
            continue

        role = str(item["role"]).strip()
        content = str(item["content"]).strip()
        role_key = role.lower()

        if role_key in ("role", '"role"'):
            # First half of a split object: its content is the speaker name.
            pending_role = content
        elif role_key in ("content", '"content"'):
            # Second half: pair the text with the buffered speaker, if any.
            normalized.append({"role": pending_role or "Unknown", "content": content})
            pending_role = None
        else:
            normalized.append({"role": role, "content": content})

    return normalized


def _parse_dialogue_lines(raw_text: str) -> List[Dict[str, Any]]:
    """Recover turns from ``Speaker: text`` lines when JSON parsing is impossible."""
    dialogue: List[Dict[str, Any]] = []

    for line in raw_text.splitlines():
        line = line.replace('""', '"').strip('"')
        role_part, separator, content_part = line.partition(":")
        if not separator:
            continue

        role_upper = role_part.strip().replace('"', '').upper()
        content = content_part.strip().replace('"', '')

        final_role = None
        if "SPM" in role_upper:
            final_role = "SPM"
        # Checked last on purpose: "Creator" wins when both markers appear.
        if "CREATOR" in role_upper:
            final_role = "Creator"

        # Lines whose prefix names neither speaker are dropped.
        if final_role is not None:
            dialogue.append({"role": final_role, "content": content})

    return dialogue


def parse_raw_transcript(raw_data: Any) -> List[Dict[str, Any]]:
    """
    Parses the raw data from BQ into the list structure required by the Validator.

    Args:
        raw_data: The ``raw_transcript`` column value. A list is returned
            unchanged; a string goes through the legacy clean-up path; any
            other type (including ``None``) yields an empty list.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts, possibly empty.
    """
    # 1. Fast path: an already-parsed list is returned directly.
    if isinstance(raw_data, list):
        return raw_data

    # 2. Anything that is not a string (None, dict, ...) cannot be parsed here.
    if not isinstance(raw_data, str):
        return []

    clean_text = raw_data.strip()

    # Remove Markdown fences if present
    clean_text = re.sub(r"^```(?:json)?\s*", "", clean_text)
    clean_text = re.sub(r"\s*```$", "", clean_text)

    # Fix double-escaped CSV quotes
    if '""role""' in clean_text:
        clean_text = clean_text.replace('""', '"').strip('"')

    try:
        parsed_json = json.loads(clean_text)
    except json.JSONDecodeError:
        # Not valid JSON even after cleaning; fall through to line parsing.
        parsed_json = None

    if isinstance(parsed_json, list):
        normalized_dialogue = _normalize_dialogue_items(parsed_json)
        if normalized_dialogue:
            return normalized_dialogue

    # Ultimate fallback: line-by-line parsing of the original, uncleaned text.
    return _parse_dialogue_lines(raw_data)

def clean_json_response(response_text: str) -> Dict[str, Any]:
    """
    Sanitizes LLM output to ensure valid JSON parsing.

    Strips an optional leading/trailing Markdown code fence, parses the
    remainder as JSON and guarantees that the result is a JSON object.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The parsed audit report. If the text is not valid JSON, is not a
        string, or decodes to something other than an object, a worst-case
        report is returned instead (hallucination flagged, all scores 1.0).
    """
    try:
        cleaned = re.sub(r"^```(?:json)?\s*", "", response_text).strip()
        cleaned = re.sub(r"\s*```$", "", cleaned).strip()
        parsed = json.loads(cleaned)
        # Valid JSON is not enough: callers use .get(), so a top-level list
        # or scalar is just as unusable as a syntax error.
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        return parsed
    except (TypeError, ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
        print(f"‚ùå JSON Parsing Error: {e}")
        return {
            "hallucination_flag": True,
            "detected_errors": ["Validator Output Malformed"],
            # Included so the fallback has the same keys as the prompt's schema.
            "correction_suggestions": [],
            "Realism_Human": 1.0,
            "Policy_Precision": 1.0,
            "Narrative_Coherence": 1.0
        }

### 4. Core Logic: The Validator Class

In [None]:
# ==========================================
# 4. Core Logic: The Validator Class
# ==========================================

class TranscriptValidator:
    """Audits one transcript at a time with Gemini and scores the result.

    The model only supplies the three dimension scores and the hallucination
    flag; the weighted average, penalty and pass/fail decision are computed
    here in Python.
    """

    def __init__(self):
        # NOTE(review): neither attribute is updated by the code in this class.
        self.processed_count = 0
        self.validation_results = []

    @staticmethod
    def _coerce_score(value: Any) -> float:
        """Return *value* as a float clamped to 1.0-5.0; unusable values become 1.0."""
        try:
            score = float(value)
        except (TypeError, ValueError):
            # Missing key (None) or non-numeric text such as "n/a".
            return 1.0
        if score != score:  # NaN is the only float not equal to itself
            return 1.0
        return min(5.0, max(1.0, score))

    @staticmethod
    def _coerce_flag(value: Any) -> bool:
        """Interpret the hallucination flag, including string forms like "false"."""
        if isinstance(value, str):
            # bool("false") is True, so strings need an explicit comparison.
            return value.strip().lower() in ("true", "1", "yes")
        return bool(value)

    @staticmethod
    def _score_report(audit_report: Dict[str, Any]) -> tuple:
        """Compute ``(quality_score, hallucination_flag, is_valid)`` from a report.

        The score is a weighted average (realism 0.35, policy 0.40, narrative
        0.25), reduced by 1.0 when a hallucination is flagged and floored at
        1.0. A transcript is valid when the score is at least 3.2 and no
        hallucination was flagged.
        """
        realism = TranscriptValidator._coerce_score(audit_report.get("Realism_Human"))
        policy = TranscriptValidator._coerce_score(audit_report.get("Policy_Precision"))
        narrative = TranscriptValidator._coerce_score(audit_report.get("Narrative_Coherence"))
        hallucination = TranscriptValidator._coerce_flag(
            audit_report.get("hallucination_flag", False)
        )

        # Weighted average
        base_score = round((realism * 0.35) + (policy * 0.40) + (narrative * 0.25), 2)

        # Penalty
        final_score = base_score - 1.0 if hallucination else base_score
        final_score = max(1.0, round(final_score, 2))

        # Validation logic
        is_valid = (final_score >= 3.2) and (not hallucination)
        return final_score, hallucination, is_valid

    def validate_row(self, row) -> Optional[Dict[str, Any]]:
        """
        Takes a BigQuery Row object, runs validation, returns the result dict.

        Args:
            row: Mapping-like row exposing "conversation_id" and
                "raw_transcript".

        Returns:
            A record with the keys conversation_id, is_valid, quality_score,
            hallucination_flag, validation_report and audit_timestamp, or
            ``None`` when the model call or response handling raised.
        """
        conv_id = row["conversation_id"]
        raw_text = row["raw_transcript"]

        # 1. Parse Input
        transcript_data = parse_raw_transcript(raw_text)

        # 2. Construct Payload
        payload = {
            "conversation_id": conv_id,
            "transcript": transcript_data,
            "meta": {
                "role_label": {"A": "SPM", "B": "Creator"}
            }
        }

        # 3. Call Gemini
        try:
            prompt = f"{VALIDATOR_SYSTEM_PROMPT}\n\nUSER INPUT:\n```json\n{json.dumps(payload)}\n```"

            response = model.generate_content(
                prompt,
                generation_config=GenerationConfig(
                    temperature=0.0,
                    response_mime_type="application/json"
                )
            )

            # 4. Process Output
            audit_report = clean_json_response(response.text)

            # 5. PYTHON CALCULATES MATH AND LOGIC (Safe from LLM Hallucinations)
            final_score, hallucination, is_valid = self._score_report(audit_report)

            # 6. Return Record (Ready for MERGE)
            return {
                "conversation_id": conv_id,
                "is_valid": is_valid,
                "quality_score": final_score,
                "hallucination_flag": hallucination,
                "validation_report": audit_report,
                "audit_timestamp": datetime.utcnow().isoformat()
            }

        except Exception as e:
            # Best effort per row: one failed call must not abort the batch.
            print(f"‚ö†Ô∏è Error validating {conv_id}: {str(e)}")
            return None

### 5. BigQuery Upload

In [None]:
# ==========================================
# 5. BIGQUERY UPLOAD (IN-PLACE UPDATE VIA MERGE)
# ==========================================

def fetch_pending_transcripts(limit: int = 100):
    """
    Return up to ``limit`` rows of the target table that still lack a
    validation verdict or a validation report.

    Each row exposes ``conversation_id`` and ``raw_transcript``. The call
    blocks until the query job has finished.
    """
    pending_sql = f"""
        SELECT
            conversation_id,
            raw_transcript
        FROM `{PROJECT_ID}.{DATASET_ID}.{TARGET_TABLE}`
        WHERE is_valid IS NULL OR validation_report IS NULL
        LIMIT {limit}
    """
    print("üì• Fetching pending transcripts for validation...")
    # Submit the job, then wait for it and hand back the row iterator.
    query_job = bq_client.query(pending_sql)
    pending_rows = query_job.result()
    return pending_rows

def update_bq_via_merge(results: List[Dict[str, Any]]):
    """
    Uses a temporary staging table to bulk UPDATE the main table (MERGE).
    Avoids DML Quota limits.

    Args:
        results: Records as produced by TranscriptValidator.validate_row().

    Returns:
        True when there was nothing to do or the MERGE completed; False when
        the load or the MERGE failed. Errors are printed, not raised.
    """
    if not results:
        return True

    staging_ref = f"{PROJECT_ID}.{DATASET_ID}.{STAGING_TABLE}"
    target_ref = f"{PROJECT_ID}.{DATASET_ID}.{TARGET_TABLE}"

    # 1. Load results into temporary staging table
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE", # Replace if exists from previous failed run
        schema=[
            bigquery.SchemaField("conversation_id", "STRING"),
            bigquery.SchemaField("is_valid", "BOOLEAN"),
            bigquery.SchemaField("quality_score", "FLOAT"),
            bigquery.SchemaField("hallucination_flag", "BOOLEAN"),
            bigquery.SchemaField("validation_report", "JSON"),
            bigquery.SchemaField("audit_timestamp", "TIMESTAMP"),
        ]
    )

    merged = False
    cleaned_up = False
    try:
        print(f"üì¶ Loading {len(results)} records to temporary staging table...")
        job = bq_client.load_table_from_json(results, staging_ref, job_config=job_config)
        job.result()

        # 2. Execute MERGE query to update TARGET_TABLE
        merge_query = f"""
            MERGE `{target_ref}` T
            USING `{staging_ref}` S
            ON T.conversation_id = S.conversation_id
            WHEN MATCHED THEN
              UPDATE SET
                T.is_valid = S.is_valid,
                T.quality_score = S.quality_score,
                T.hallucination_flag = S.hallucination_flag,
                T.validation_report = S.validation_report,
                T.audit_timestamp = S.audit_timestamp
        """
        print("üîÑ Executing MERGE operation to update main table...")
        bq_client.query(merge_query).result()
        merged = True

    except Exception as e:
        print(f"‚ùå BigQuery update failed: {e}")

    finally:
        # 3. Clean up (Drop staging table) on failure too, so a failed run
        # does not leave the staging table behind.
        try:
            bq_client.delete_table(staging_ref, not_found_ok=True)
            cleaned_up = True
        except Exception as cleanup_error:
            # A cleanup problem must not be reported as a failed update.
            print(f"Staging table cleanup failed: {cleanup_error}")

    if merged and cleaned_up:
        print(f"‚úÖ Successfully updated {len(results)} rows and cleaned up staging.")

    return merged

## Run Main

In [None]:
# Entry point for one validation run: fetch up to BATCH_SIZE pending rows,
# validate them one by one, write the results back with a single MERGE, and
# report timings. The variables created here are module-level, so later cells
# can read them (the summary cell uses `batch_results`).
if __name__ == "__main__":
    import sys # Import needed for clean exit
    start_time = time.perf_counter()

    print("üöÄ Starting Validator Pipeline (In-Place Update Strategy)...")

    # 1. Instantiate Validator
    validator = TranscriptValidator()

    # 2. Fetch Data (Unvalidated rows only)
    rows = fetch_pending_transcripts(limit=BATCH_SIZE)
    # Materialize the iterator so the rows can be counted and then looped over.
    rows_list = list(rows)

    if not rows_list:
        print("üò¥ No pending transcripts found. Everything is up to date.")
        # Raises SystemExit, so nothing below runs and `batch_results` is
        # never defined in that case.
        sys.exit()

    print(f"üîç Found {len(rows_list)} new transcripts to validate.")

    # Successful validation records, accumulated for a single bulk upload.
    batch_results = []

    # 3. Iterate and Validate
    for row in rows_list:
        result = validator.validate_row(row)

        # validate_row returns None on error; those rows are skipped and stay
        # pending in the table.
        if result:
            batch_results.append(result)

            # Logging
            status_icon = "‚úÖ" if result["is_valid"] else "‚ùå"
            print(f"{status_icon} [{row['conversation_id']}] Score: {result['quality_score']}")

        # Rate Limiting: fixed pause after every row, including failed ones.
        time.sleep(0.5)

    # Marks the end of the validation phase / start of the upload phase.
    end_time_sd = time.perf_counter()

    # 4. Upload Results (Using MERGE)
    if batch_results:
        update_bq_via_merge(batch_results)

    end_time = time.perf_counter()

    # NOTE(review): update_bq_via_merge prints its own errors instead of
    # raising, so this message is printed even when the upload failed.
    print("üèÅ Pipeline finished successfully.")

    # Format the elapsed seconds as H:MM:SS strings for the report below.
    duration_sd = str(timedelta(seconds=end_time_sd - start_time))
    duration_bq = str(timedelta(seconds=end_time - end_time_sd))
    duration_total = str(timedelta(seconds=end_time - start_time))

    print(f"Validation execution time: {duration_sd}")
    print(f"Upload execution time: {duration_bq}")
    print(f"Total execution time: {duration_total}")

On average it takes:

~11.5 seconds to validate a transcript

~6 seconds to update the BigQuery table

In [None]:
# Summarize the validation outcome of the last run: counts of invalid and
# valid transcripts and the percentage that passed.
import pandas as pd

df_results = pd.DataFrame(batch_results)

if df_results.empty:
    # An empty batch has no "is_valid" column and would divide by zero below.
    print("No validation results to summarize.")
else:
    # Count each outcome once instead of filtering the frame repeatedly.
    invalid_count = int((df_results['is_valid'] == False).sum())
    valid_count = int((df_results['is_valid'] == True).sum())
    total_count = df_results.shape[0]

    print("Not Valid: ", invalid_count)
    print("Valid: ", valid_count)
    print("Porcentaje de transcripciones correctas: ", (valid_count/total_count)*100, "%")