## Main Code

### 0. Libraries

In [72]:
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
import json
import time
import re
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

### 1. Configuration & Naming Standards

In [73]:
# ==========================================
# 1. CONFIGURATION & NAMING STANDARDS
# ==========================================

# GCP project and region shared by the Vertex AI and BigQuery clients created below.
PROJECT_ID = "project-nirvana-405904"  # <--- REPLACE THIS
LOCATION = "us-central1"

# Naming components: every resource name below is derived from these tags/versions,
# so bumping a version here points the whole notebook at a different table.
PROJECT_TAG = "csv"
SCRIPT_VERSION = "001"
SOURCE_VERSION = "005" # <--- CHANGE THIS TO THE CURRENT TRANSCRIPTIONS TABLE VERSION
DESTINATION_TABLE_VERSION = "001"

# Derived resource names (all follow the vel_<PROJECT_TAG>_... convention).
# NOTE(review): JSONL_FILE is not referenced by any of the visible cells - confirm it is still needed.
JSONL_FILE = f"vel_{PROJECT_TAG}_validations_{SCRIPT_VERSION}.jsonl"
DATASET_ID = f"vel_{PROJECT_TAG}_schema"
SOURCE_TABLE = f"vel_{PROJECT_TAG}_synthetic_transcripts_{SOURCE_VERSION}"
DESTINATION_TABLE = f"vel_{PROJECT_TAG}_transcripts_validation_{DESTINATION_TABLE_VERSION}"

# Initialize Vertex AI
# `model` is a module-level handle reused for every validation call.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel("gemini-2.5-flash")

# Initialize BigQuery
# `bq_client` is the shared client used by the table-creation, fetch and upload helpers.
bq_client = bigquery.Client(project=PROJECT_ID)

### 2. Validator System Prompt (Strict Auditor)

In [74]:
# ==========================================
# 2. VALIDATOR SYSTEM PROMPT (Strict Auditor)
# ==========================================

# Instruction text for the auditing model. TranscriptValidator.validate_row prepends
# this string to the JSON payload of a single conversation before calling Gemini.
# The scoring arithmetic and the is_valid threshold described below are delegated
# to the model: no Python code in the visible cells recomputes or verifies them.
# The keys listed under "JSON OUTPUT SCHEMA" are the ones the downstream code reads
# ("is_valid", "quality_score", "hallucination_flag") or stores verbatim in the
# validation report, so the prompt and the parsing code must be kept in sync.
VALIDATOR_SYSTEM_PROMPT = """
You are an automated **Agent Validator**: a strict, forensic auditor for synthetic conversational transcripts between:
 - Agent A: "YouTube Strategic Partner Manager" (SPM)
 - Agent B: "YouTube Creator"

Your job: read exactly one JSON payload (transcript + metadata) provided as the user message content, evaluate it against the rules below, and output **only** a single JSON object that strictly follows the JSON Output Schema.

REQUIREMENTS
1. Parse input JSON; expect keys: "transcript", "meta", "conversation_id".
2. Treat yourself as a strict auditor. If input is malformed, return is_valid=false.

3. Run the following detectors:
   A. Hallucination Detector (PRODUCT & POLICY PROMISES):
      - Identify claims by SPM promising impossible outcomes (e.g., "I will manually remove a strike in 5 minutes").
      - Flag "absolute timelines" (verbs + specific times) for administrative actions as suspicious.
   B. Character Coherence Detector:
      - Compare utterance to persona. Flag mismatch (e.g., SPM using insults).
   C. Policy Accuracy Detector:
      - Check for obviously incorrect instructions (e.g., "delete account without password").
   D. Conversational Realism Detector (Human-ness):
      - Score naturalness (1.0-5.0).
      - PENALIZE "Robot-talk": Perfect grammar, zero fillers, no interruptions.
      - REWARD "Human noise": Fillers ("uh", "basically"), interruptions, hedging.
   E. Structural Narrative Detector:
      - Ensure start -> problem -> resolution flow.

SCORING RULES:
- Realism_Human, Policy_Precision, Narrative_Coherence (1.0-5.0).
- quality_score: weighted average = round( Realism_Human*0.35 + Policy_Precision*0.40 + Narrative_Coherence*0.25 , 2 )
- If hallucination_flag == true, reduce quality_score by 1.0.
- is_valid == true ONLY if: quality_score >= 3.2 AND hallucination_flag == false.

JSON OUTPUT SCHEMA (MANDATORY):
{
  "is_valid": <boolean>,
  "quality_score": <float>,
  "hallucination_flag": <boolean>,
  "detected_errors": [<string>, ...],
  "correction_suggestions": [<string>, ...],
  "Realism_Human": <float or null>,
  "Policy_Precision": <float or null>,
  "Narrative_Coherence": <float or null>
}
"""

### 3. Helper Functions

In [75]:
# ==========================================
# 3. HELPER FUNCTIONS
# ==========================================

def parse_raw_transcript(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parses the raw text blob from BQ into the list structure required by the Validator.

    Two formats are attempted, in order:
      1. JSON, optionally wrapped in a Markdown code fence (``` or ```json).
         The decoded value is returned unchanged.
      2. Line-by-line "Role: content" text. Each line is split on its first
         colon; lines without a colon are skipped.

    Args:
        raw_text: Transcript text. None, non-string or blank input yields [].

    Returns:
        The decoded JSON value, or a list of {"role": ..., "content": ...}
        dicts produced by the line-by-line fallback.
    """
    # A NULL cell arrives as None; treat it (and blank text) as "no dialogue"
    # instead of letting re.sub raise an uncaught TypeError.
    if not isinstance(raw_text, str) or not raw_text.strip():
        return []

    # Attempt 1: Parse as pure JSON.
    # Trim first: the fence patterns are anchored at ^ and $, so surrounding
    # whitespace would otherwise leave the fence in place and break json.loads.
    clean_text = raw_text.strip()
    clean_text = re.sub(r"^```(?:json)?\s*", "", clean_text)
    clean_text = re.sub(r"\s*```$", "", clean_text)
    try:
        return json.loads(clean_text)
    except json.JSONDecodeError:
        pass

    # Attempt 2: Parse Line-by-Line (Fallback)
    dialogue = []
    for line in raw_text.splitlines():
        if ":" not in line:
            continue
        role, content = (part.strip() for part in line.split(":", 1))
        # Normalize roles for the validator. Both checks run, so a label that
        # mentions both names resolves to "Creator".
        if "SPM" in role:
            role = "SPM"
        if "Creator" in role:
            role = "Creator"
        dialogue.append({"role": role, "content": content})
    return dialogue

def initialize_destination_table():
    """
    Creates the destination table (Table B) if it doesn't exist yet.
    Required so the 'Left Join' in the fetch step doesn't fail.

    Only a "table not found" response triggers creation. Any other failure
    from the lookup (permissions, network, quota, ...) propagates to the
    caller instead of being mistaken for a missing table.
    """
    # Local import: NotFound is only needed here and is not part of the
    # notebook's top-level import cell.
    from google.api_core.exceptions import NotFound

    table_ref = f"{PROJECT_ID}.{DATASET_ID}.{DESTINATION_TABLE}"

    # Define the schema for the standalone validation table.
    # Keep in sync with the schema used by upload_results_to_bq.
    schema = [
        bigquery.SchemaField("conversation_id", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("is_valid", "BOOLEAN"),
        bigquery.SchemaField("quality_score", "FLOAT"),
        bigquery.SchemaField("hallucination_flag", "BOOLEAN"),
        bigquery.SchemaField("validation_report", "JSON"),
        bigquery.SchemaField("audit_timestamp", "TIMESTAMP"),
        # Metadata columns (copied from source for easier analysis)
        bigquery.SchemaField("spm_name", "STRING"),
        bigquery.SchemaField("creator_id", "STRING"),
        bigquery.SchemaField("creator_region", "STRING")
    ]

    try:
        bq_client.get_table(table_ref)
    except NotFound:
        print(f"‚ö†Ô∏è Destination table not found. Creating {DESTINATION_TABLE}...")
        table = bigquery.Table(table_ref, schema=schema)
        # exists_ok tolerates another run creating the table between the
        # lookup above and this call.
        bq_client.create_table(table, exists_ok=True)
        print("‚úÖ Table created successfully.")
    else:
        print(f"‚úÖ Destination table {DESTINATION_TABLE} exists.")

def clean_json_response(response_text: str) -> Dict[str, Any]:
    """
    Sanitizes LLM output to ensure valid JSON parsing.

    Surrounding whitespace and an optional Markdown code fence (``` or
    ```json) are removed before decoding. The decoded value must be a JSON
    object; anything else is treated as malformed output.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON object, or a fixed "failed audit" record
        (is_valid=False, hallucination_flag=True) when the text cannot be
        decoded into an object. The fallback carries every key of the
        validator's output schema.
    """
    try:
        if not isinstance(response_text, str):
            raise TypeError(f"expected str, got {type(response_text).__name__}")
        # Trim before removing the fence: the patterns are anchored at ^ and $,
        # so leading/trailing whitespace would otherwise leave the fence in place.
        cleaned = response_text.strip()
        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
        cleaned = re.sub(r"\s*```$", "", cleaned).strip()
        parsed = json.loads(cleaned)
        # Callers use .get() on the result, so a list or scalar is as unusable
        # as unparseable text.
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        return parsed
    except Exception as e:
        print(f"‚ùå JSON Parsing Error: {e}")
        return {
            "is_valid": False,
            "quality_score": 1.0,
            "hallucination_flag": True,
            "detected_errors": ["Validator Output Malformed"],
            "correction_suggestions": [],
            "Realism_Human": 0.0,
            "Policy_Precision": 0.0,
            "Narrative_Coherence": 0.0
        }

### 4. Core Logic: The Validator Class

In [76]:
# ==========================================
# 4. Core Logic: The Validator Class
# ==========================================

class TranscriptValidator:
    """
    Audits one transcript at a time by sending it, together with
    VALIDATOR_SYSTEM_PROMPT, to the module-level Gemini `model`.
    """

    def __init__(self):
        # Number of rows for which validate_row produced a record.
        self.processed_count = 0
        # Not populated by any method of this class; left for callers to use.
        self.validation_results = []

    def validate_row(self, row) -> Optional[Dict[str, Any]]:
        """
        Takes a BigQuery Row object, runs validation, returns the result dict.

        Args:
            row: Mapping-like row exposing "conversation_id" and
                "raw_transcript", and optionally "creator_id" and
                "creator_region" through .get().

        Returns:
            A dict with conversation_id, is_valid, quality_score,
            hallucination_flag, validation_report (the full audit serialised
            as a JSON string) and audit_timestamp; or None when any step
            (parsing, model call, response handling) raises.
        """
        conv_id = row["conversation_id"]
        raw_text = row["raw_transcript"]

        try:
            # 1. Parse Input
            # Kept inside the try so a single unparseable row is reported and
            # skipped instead of aborting the caller's whole batch.
            transcript_data = parse_raw_transcript(raw_text)

            # 2. Construct Payload
            # `or "Unknown"` also covers NULL columns: .get() only applies its
            # default when the key is absent, not when the value is None.
            payload = {
                "conversation_id": conv_id,
                "transcript": transcript_data,
                "meta": {
                    "role_label": {"A": "SPM", "B": "Creator"},
                    "channel": row.get("creator_id", "Unknown") or "Unknown",
                    "region": row.get("creator_region", "Unknown") or "Unknown"
                }
            }

            # 3. Call Gemini
            prompt = f"{VALIDATOR_SYSTEM_PROMPT}\n\nUSER INPUT:\n```json\n{json.dumps(payload)}\n```"

            response = model.generate_content(
                prompt,
                generation_config=GenerationConfig(
                    temperature=0.0, # Deterministic for auditing
                    response_mime_type="application/json"
                )
            )

            # 4. Process Output
            audit_report = clean_json_response(response.text)

            # 5. Return Enriched Record
            record = {
                "conversation_id": conv_id,
                "is_valid": audit_report.get("is_valid", False),
                "quality_score": audit_report.get("quality_score", 0.0),
                "hallucination_flag": audit_report.get("hallucination_flag", False),
                "validation_report": json.dumps(audit_report), # Store full JSON as string
                "audit_timestamp": datetime.utcnow().isoformat()
            }
            self.processed_count += 1
            return record

        except Exception as e:
            # Best-effort by design: report the failure and let the caller
            # continue with the next row.
            print(f"‚ö†Ô∏è Error validating {conv_id}: {str(e)}")
            return None

### 5. BigQuery Upload

In [77]:
# ==========================================
# 5. BIGQUERY UPLOAD (DELTA LOGIC)
# ==========================================

def fetch_pending_transcripts(limit: int = 100):
    """
    Selects rows from Source (A) that are NOT present in Destination (B).
    Pattern: LEFT JOIN ... WHERE B.id IS NULL

    Args:
        limit: Maximum number of rows to return. Must be convertible to a
            non-negative integer.

    Returns:
        The result iterator of the BigQuery query, yielding rows with
        conversation_id, raw_transcript, creator_id, creator_region and
        spm_name.

    Raises:
        ValueError: If `limit` is not a non-negative integer.
    """
    # `limit` is spliced into the SQL text, so coerce and validate it before
    # building the query rather than sending arbitrary text to BigQuery.
    try:
        row_limit = int(limit)
    except (TypeError, ValueError) as exc:
        raise ValueError(f"limit must be a non-negative integer, got {limit!r}") from exc
    if row_limit < 0:
        raise ValueError(f"limit must be a non-negative integer, got {limit!r}")

    # No ORDER BY: which pending rows are returned first is up to BigQuery.
    query = f"""
        SELECT
            t.conversation_id,
            t.raw_transcript,
            t.creator_id,
            t.creator_region,
            t.spm_name
        FROM `{PROJECT_ID}.{DATASET_ID}.{SOURCE_TABLE}` t
        LEFT JOIN `{PROJECT_ID}.{DATASET_ID}.{DESTINATION_TABLE}` v
        ON t.conversation_id = v.conversation_id
        WHERE v.conversation_id IS NULL
        LIMIT {row_limit}
    """
    print("üì• Fetching pending transcripts (Delta Load)...")
    return bq_client.query(query).result()

def upload_results_to_bq(results: List[Dict[str, Any]]) -> bool:
    """
    Inserts validated rows directly into Destination Table.

    Args:
        results: Records to append, one dict per destination row. An empty
            or None value is a no-op.

    Returns:
        True when there was nothing to upload or the load job completed;
        False when the load failed (the error is printed, not raised).
    """
    if not results:
        return True

    table_ref = f"{PROJECT_ID}.{DATASET_ID}.{DESTINATION_TABLE}"

    # Schema must match initialize_destination_table
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_APPEND",
        schema=[
            bigquery.SchemaField("conversation_id", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("is_valid", "BOOLEAN"),
            bigquery.SchemaField("quality_score", "FLOAT"),
            bigquery.SchemaField("hallucination_flag", "BOOLEAN"),
            bigquery.SchemaField("validation_report", "JSON"),
            bigquery.SchemaField("audit_timestamp", "TIMESTAMP"),
            bigquery.SchemaField("spm_name", "STRING"),
            bigquery.SchemaField("creator_id", "STRING"),
            bigquery.SchemaField("creator_region", "STRING")
        ]
    )

    job = None
    try:
        job = bq_client.load_table_from_json(results, table_ref, job_config=job_config)
        job.result() # Wait for completion
    except Exception as e:
        # Still best-effort (no re-raise), but the caller now gets a False
        # return value and the per-row errors reported by the load job.
        print(f"‚ùå Upload failed: {e}")
        for detail in (getattr(job, "errors", None) or []):
            print(f"    {detail}")
        return False

    print(f"‚úÖ Saved {len(results)} audited records to {DESTINATION_TABLE}")
    return True

## Run Main

In [78]:
if __name__ == "__main__":
    # Run-level knobs, named so they are easy to find and tune.
    FETCH_LIMIT = 120           # maximum number of pending transcripts per run
    RATE_LIMIT_SECONDS = 0.5    # pause between consecutive model calls

    start_time = time.perf_counter()

    print("üöÄ Starting Validator Pipeline (Delta Strategy)...")

    # 1. Initialize Destination Table (Crucial Step)
    initialize_destination_table()

    validator = TranscriptValidator()

    # 2. Fetch Data (The Delta)
    rows = fetch_pending_transcripts(limit=FETCH_LIMIT)
    rows_list = list(rows)

    # Defined before the emptiness check so later cells that read
    # `batch_results` still find it when there was nothing to validate.
    batch_results = []
    failed_ids = []

    if not rows_list:
        # Plain branch instead of sys.exit(): raising SystemExit inside a
        # notebook cell surfaces as an error rather than a clean stop.
        print("üò¥ No pending transcripts found (All rows in Source are already in Destination).")
    else:
        print(f"üîç Found {len(rows_list)} new transcripts to validate.")

        # 3. Iterate and Validate
        for row in rows_list:
            result = validator.validate_row(row)

            if result:
                # ENRICHMENT: Add metadata from Source Row to Result Dict
                # This makes the destination table standalone/analytical
                result["spm_name"] = row.get("spm_name")
                result["creator_id"] = row.get("creator_id")
                result["creator_region"] = row.get("creator_region")

                batch_results.append(result)

                # Logging
                status_icon = "‚úÖ" if result["is_valid"] else "‚ùå"
                print(f"{status_icon} [{row['conversation_id']}] Score: {result['quality_score']}")
            else:
                failed_ids.append(row["conversation_id"])

            # Rate Limiting
            time.sleep(RATE_LIMIT_SECONDS)

        end_time_sd = time.perf_counter()

        # 4. Upload Results
        if batch_results:
            upload_results_to_bq(batch_results)

        end_time = time.perf_counter()

        # Rows without a result are never written to the destination table,
        # so the delta query will select them again on the next run.
        if failed_ids:
            print(f"WARNING: {len(failed_ids)} transcript(s) produced no result and remain pending: {failed_ids}")

        print("üèÅ Pipeline finished successfully.")

        duration_sd = str(timedelta(seconds=end_time_sd - start_time))
        duration_bq = str(timedelta(seconds=end_time - end_time_sd))
        duration_total = str(timedelta(seconds=end_time - start_time))

        print(f"Validation execution time: {duration_sd}")
        print(f"Upload execution time: {duration_bq}")
        print(f"Total execution time: {duration_total}")

üöÄ Starting Validator Pipeline (Delta Strategy)...
‚ö†Ô∏è Destination table not found. Creating vel_csv_transcripts_validation_001...
‚úÖ Table created successfully.
üì• Fetching pending transcripts (Delta Load)...
üîç Found 120 new transcripts to validate.
‚ùå [11_0_1771531592] Score: 0.0
‚ùå [11_1_1771531648] Score: None
‚ùå [7_0_1771531273] Score: 3.61
‚úÖ [7_1_1771531308] Score: 4.7
‚ùå [12_0_1771531701] Score: None
‚ùå [5_0_1771531169] Score: 0.0
‚úÖ [6_0_1771531227] Score: 4.93
‚ùå [9_0_1771531453] Score: None
‚úÖ [9_1_1771531512] Score: 4.88
‚úÖ [10_0_1771531556] Score: 4.93
‚úÖ [15_0_1771531852] Score: 4.93
‚úÖ [1_0_1771530949] Score: 4.93
‚úÖ [1_1_1771530986] Score: 4.86
‚ùå [13_0_1771531757] Score: None
‚úÖ [2_0_1771531023] Score: 4.35
‚úÖ [3_0_1771531075] Score: 4.83
‚úÖ [4_0_1771531118] Score: 4.83
‚ùå [8_0_1771531350] Score: 0.0
‚ùå [8_1_1771531407] Score: 3.83
‚ùå [14_0_1771531814] Score: 3.68
‚úÖ [1_0_1771463705] Score: 4.9
‚úÖ [28_0_1771465305] Score: 4.97
‚úÖ [41_0

On average, it takes ~20 seconds to validate one transcript.

In [79]:
import pandas as pd

# Tabulate the audited records and report how many failed / passed validation.
df_results = pd.DataFrame(batch_results)
validity = df_results["is_valid"]
for label, flag in (("Not Valid: ", False), ("Valid: ", True)):
    # Row count of the subset whose is_valid equals the flag.
    print(label, df_results[validity == flag].shape[0])

Not Valid:  47
Valid:  73
