## Main Code

### 0. Libraries

In [None]:
!pip install faker

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel
from google.cloud import bigquery
from google.api_core.exceptions import NotFound, ResourceExhausted
import json
import random
import time
import re
import uuid
from datetime import datetime, timedelta
from faker import Faker
from typing import Tuple, List, Dict, Any

### 1. Configuration & Naming Standards

In [None]:
# ==========================================
# 1. CONFIGURATION & NAMING STANDARDS
# ==========================================

# Google Cloud project and region passed to vertexai.init() below.
PROJECT_ID = "project-nirvana-405904"  # <--- REPLACE THIS
LOCATION = "us-central1"

# Naming components: the tag and version strings are interpolated into the
# output file, dataset, and table names defined just below.
PROJECT_TAG = "csv"
SCRIPT_VERSION = "006"
TABLE_VERSION = "006"

# Derived names. With the values above these resolve to
# "vel_csv_transcripts_006.jsonl", "vel_csv_schema" and
# "vel_csv_synthetic_transcripts_006".
# NOTE(review): DATASET_ID / TABLE_ID are presumably BigQuery identifiers
# (bigquery is imported at the top of the file) -- confirm against the loader.
JSONL_FILE = f"vel_{PROJECT_TAG}_transcripts_{SCRIPT_VERSION}.jsonl"
DATASET_ID = f"vel_{PROJECT_TAG}_schema"
TABLE_ID = f"vel_{PROJECT_TAG}_synthetic_transcripts_{TABLE_VERSION}"

# Initialize Vertex AI and create the shared model handle used for generation.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel("gemini-2.5-flash")

# Initialize Faker (shared generator for person names and filler words).
fake = Faker()

# Seeding is intentionally disabled. With a fixed seed, the first conversation
# of every run would get the same metadata (creator_name, topics, region, ...)
# as on previous days; only the IDs and raw transcripts would differ, so the
# same creator profile would reappear under a different creator_id.
# Re-enable the two lines below only when reproducible metadata is wanted.
#Faker.seed(42)
#random.seed(42)

### 2. Data Definitions (Personas, Products, Chaos)

In [None]:
# ==========================================
# 2. DATA DEFINITIONS (Personas, Products, Chaos)
# ==========================================

# Creator archetypes, keyed by persona name.
# In the visible code:
#   - CreatorAgent draws the subscriber count uniformly from
#     metrics["subs_range"] (both bounds inclusive, via random.randint).
#   - The prompt builder reads "tone" and only the FIRST entry of "pain_points".
#   - CreatorAgent also looks up an optional "experience_level" key; no persona
#     defines it, so the "intermediate" default is always used.
# "description", "spm_goal" and metrics["trend"] are not referenced in the
# visible code.
PERSONAS = {
    "Legacy Enterprise": {
        "description": "Established 7+ years ago. High subs (2M-10M) but declining views.",
        "pain_points": ["Subscribers not getting notifications", "Fear of irrelevance"],
        "metrics": {"subs_range": (2000000, 10000000), "trend": "Declining"},
        "spm_goal": "Strategic Pivot (modernization, community posts)",
        "tone": "Professional, concerned, nostalgic"
    },
    "Short-Form Specialist": {
        "description": "Rapid growth via Shorts (500k-5M subs). High views, low revenue.",
        "pain_points": ["Monetization gap (low RPM)", "Brand safety"],
        "metrics": {"subs_range": (500000, 5000000), "trend": "Explosive"},
        "spm_goal": "Diversification (long-form, sponsorships)",
        "tone": "Energetic, impatient, volume-focused"
    },
    "Niche Professional": {
        "description": "High-value verticals (Finance, Tech). Lower views but high RPM.",
        "pain_points": ["Limited Ads (Yellow Icon)", "Feature access"],
        "metrics": {"subs_range": (100000, 500000), "trend": "Steady"},
        "spm_goal": "Product Adoption (Shopping, Memberships)",
        "tone": "Transactional, data-driven, calm"
    },
    "Viral Newcomer": {
        "description": "Sudden massive spike. Lacks infrastructure. Overwhelmed.",
        "pain_points": ["Identity Verification", "Copyright Claims"],
        "metrics": {"subs_range": (10000, 200000), "trend": "Viral"},
        "spm_goal": "Operational Health (Policy basics, Security)",
        "tone": "Excited, chaotic, anxious, informal"
    },
    "Cross-Platform Hustler": {
        "description": "Massive following on TikTok/Twitch, trying to port success to YouTube.",
        "pain_points": ["Algorithm differences", "Conversion from Shorts to Long-form"],
        "metrics": {"subs_range": (100000, 1000000), "trend": "Growing"},
        "spm_goal": "Platform Loyalty (Live Streaming, Premieres)",
        "tone": "Business-focused, impatient, constantly comparing platforms"
    },
    "Burned-out Veteran": {
        "description": "Highly successful but mentally exhausted from the upload grind.",
        "pain_points": ["Upload Cadence Stress", "Team Scaling"],
        "metrics": {"subs_range": (1000000, 5000000), "trend": "Stagnant"},
        "spm_goal": "Sustainability (Evergreen content, pausing without penalty)",
        "tone": "Exhausted, cynical but open to help, overwhelmed"
    }
}

# Flat catalog of agenda topics. Every entry follows the pattern
# "[Category > Subcategory] Topic". SimulationEngine.run draws each call's
# agenda from this list with random.sample (no repeats within a call); the
# first sampled entry is treated as the primary topic.
TOPICS_CATALOG = [
    # Monetization
    "[Monetization > Ad Revenue Optimization] RPM",
    "[Monetization > Ad Revenue Optimization] CPM",
    "[Monetization > Ad Revenue Optimization] Geo Mix",
    "[Monetization > Ad Revenue Optimization] Seasonality",
    "[Monetization > Ad Revenue Optimization] Long Form vs Shorts",
    "[Monetization > Fan Funding Optimization] Channel Memberships",
    "[Monetization > Fan Funding Optimization] Super Chats",
    "[Monetization > Fan Funding Optimization] Super Thanks",
    "[Monetization > Fan Funding Optimization] Recurring Revenue",
    "[Monetization > Fan Funding Optimization] Churn",
    "[Monetization > Commerce Optimization] Shopping",
    "[Monetization > Commerce Optimization] Affiliate",
    "[Monetization > Commerce Optimization] Product Tagging",
    "[Monetization > Commerce Optimization] Conversion",
    "[Monetization > Brand Revenue Strategy] Brand Deals",
    "[Monetization > Brand Revenue Strategy] Brand Connect",
    "[Monetization > Brand Revenue Strategy] Sponsor Integrations",
    "[Monetization > Brand Revenue Strategy] Platform Ads",

    # Content & Formats
    "[Content & Formats > Shorts Strategy] Shorts growth vs revenue",
    "[Content & Formats > Shorts Strategy] Shorts Collab",
    "[Content & Formats > Shorts Strategy] Shorts Experimentation",
    "[Content & Formats > Live and Event Strategy] Live Streaming",
    "[Content & Formats > Live and Event Strategy] Premieres",
    "[Content & Formats > Live and Event Strategy] Redirect Strategy",
    "[Content & Formats > Live and Event Strategy] Real-Time Monetization",
    "[Content & Formats > Live and Event Strategy] Scheduled Launches",
    "[Content & Formats > Content Packaging] Titles, Thumbnails",
    "[Content & Formats > Content Packaging] Hooks",
    "[Content & Formats > Content Packaging] Chapters",
    "[Content & Formats > Content Packaging] Video Structure Strategy",

    # Tools and Policy
    "[Tools and Policy > Copyright and content] Claims vs Strikes",
    "[Tools and Policy > Copyright and content] Disputes",
    "[Tools and Policy > Copyright and content] Copyright Risk Management",
    "[Tools and Policy > Brand Safety and Ads] Yellow Icon",
    "[Tools and Policy > Brand Safety and Ads] Advertiser Suitability",
    "[Tools and Policy > Brand Safety and Ads] Self Certification",
    "[Tools and Policy > Brand Safety and Ads] Ad Restrictions",

    # Creator Health and Ops
    "[Creator Health and Ops > Sustainability and Ops] Burnout",
    "[Creator Health and Ops > Sustainability and Ops] Upload Cadence Stress",
    "[Creator Health and Ops > Sustainability and Ops] Team Scaling",
    "[Creator Health and Ops > Sustainability and Ops] Prod Workflow Strain",

    # Relationship and Strategic Support
    "[Relationship and Strategic Support > Strategic Partnership and Support] Feedback on SPM support",
    "[Relationship and Strategic Support > Strategic Partnership and Support] Need for Escalation",
    "[Relationship and Strategic Support > Strategic Partnership and Support] Milestone Logistics",
    "[Relationship and Strategic Support > Strategic Partnership and Support] Awards/Events",

    # Analytics and Growth
    "[Analytics and Growth > Retention and Discovery] Audience Retention Curves",
    "[Analytics and Growth > Retention and Discovery] Returning Viewers",
    "[Analytics and Growth > Retention and Discovery] Engagement Depth",
    "[Analytics and Growth > Retention and Discovery] Subscriber Conversion",
    "[Analytics and Growth > Traffic and Discovery] Browse vs Search vs Suggested",
    "[Analytics and Growth > Traffic and Discovery] Growth Volatility",
    "[Analytics and Growth > Traffic and Discovery] Traffic Source Dependency",
    "[Analytics and Growth > Topic and Demand Discovery] Research Tab",
    "[Analytics and Growth > Topic and Demand Discovery] Keyword Demand",
    "[Analytics and Growth > Topic and Demand Discovery] Content Ideation off Audience",
    "[Analytics and Growth > Topic and Demand Discovery] Search Trends Buff",
    "[Analytics and Growth > Performance Metric Interpretation] Confusion/Discussion on RPM vs CPM",
    "[Analytics and Growth > Performance Metric Interpretation] Impression",
    "[Analytics and Growth > Performance Metric Interpretation] CTR",
    "[Analytics and Growth > Performance Metric Interpretation] Analytics Insights to Gaps"
]

# Call scenarios; one is picked uniformly at random per call.
#   - "type" is shown in the prompt's SCENARIO line.
#   - "duration_minutes" selects the prompt's structure guide, the minimum
#     number of dialogue turns (duration * 1.2) and how many topics are sampled.
#   - "chaos_probability" is the chance that a chaos key is drawn from
#     CHAOS_INSTRUCTIONS instead of defaulting to "None".
# "focus" is not referenced in the visible code.
SCENARIOS = [
    {"type": "Escalation & Crisis Resolution", "duration_minutes": 45, "focus": "Urgent issue resolution", "chaos_probability": 0.6},
    {"type": "Quarterly Business Review", "duration_minutes": 60, "focus": "Strategic planning", "chaos_probability": 0.2},
    {"type": "Annual Strategy Workshop", "duration_minutes": 90, "focus": "Long-term growth & multiple product adoption", "chaos_probability": 0.1},
    {"type": "Content Strategy Deep Dive", "duration_minutes": 120, "focus": "Comprehensive content overhaul", "chaos_probability": 0.3}
]

# Disruption instructions keyed by chaos type; the selected value is inserted
# verbatim into the prompt's CHAOS line. The "None" key is the undisturbed
# baseline. Note that when a chaos key is drawn at random, "None" is one of
# the candidates, so a "chaotic" call can still come out undisturbed.
CHAOS_INSTRUCTIONS = {
    "Technical Glitch": "The Creator's audio crackles. They have to repeat themselves. The SPM sounds slightly annoyed but hides it.",
    "Interruption": "A loud background noise (dog, doorbell) forces a 2-turn pause. The conversation loses its momentum.",
    "Jargon Misunderstanding": "The Creator thinks 'RPM' is 'Revenue Per Million' and gets excited; the SPM must correct the math.",
    "Tangent": "The Creator spends 3 turns complaining about a recent movie or the weather. The SPM has to struggle to bring the conversation back to business.",
    "Budget Stress": "The Creator focuses obsessively on 'how much this costs' and ignores the security/policy benefits for several turns.",
    "None": "Standard professional flow with minimal verbal fillers."
}

# --- Pool of 20 synthetic SPMs, each with its own ID ---
# Each entry is {"name": <Faker name>, "id": "SPM_" + 6 uppercase hex chars of
# a uuid4}. The pool is rebuilt (with new names and IDs) every time this cell
# runs; CreatorAgent assigns one entry at random to each creator.
SPM_DB = [{"name": fake.name(), "id": f"SPM_{uuid.uuid4().hex[:6].upper()}"} for _ in range(20)]

# Content niches a creator can be assigned (picked uniformly at random).
# CreatorAgent indexes CHANNEL_PREFIXES and CHANNEL_SUFFIXES by the chosen
# niche, so every entry here needs a matching key in both dictionaries.
NICHES = [
    # Originals
    "Gaming", "Beauty", "Tech Review", "Finance", "Vlog/Lifestyle", "Cooking",

    # Entertainment & Culture
    "True Crime", "Comedy/Sketch", "Movie/TV Reviews", "Commentary/Drama", "Animation", "ASMR",

    # Education & Personal Development
    "Education/Edutainment", "Productivity", "Self-Help/Motivation", "BookTube/Literature", "Language Learning",

    # Health, Wellness & Sports
    "Fitness/Workout", "Mental Health/Wellness", "Sports/Highlights",

    # Hobbies & Skills
    "DIY/Crafts", "Art/Drawing", "Music Production/Covers", "Photography/Videography", "Automotive/Cars", "Gardening",

    # Specific Lifestyle
    "Travel", "Fashion", "Parenting/Family", "Pets/Animals", "Real Estate", "Minimalism"
]

# First half of a generated channel name, keyed by niche (keys mirror NICHES).
# CreatorAgent builds the name as <prefix><suffix><tail>, where the tail is a
# random number or a capitalized Faker word.
CHANNEL_PREFIXES = {
    # Originals
    "Gaming": ["Pixel", "Neon", "Retro", "Speed", "Shadow", "Elite", "Pro", "Quest", "Cyber", "Glitch"],
    "Beauty": ["Glow", "Pure", "Luxe", "Velvet", "Chic", "Radiant", "Bella", "Silk", "Trend", "Glam"],
    "Tech Review": ["Future", "Smart", "Tech", "Binary", "Gadget", "Silicon", "Digital", "Byte", "Hardware", "Logic"],
    "Finance": ["Wealth", "Market", "Crypto", "Asset", "Value", "Capital", "Bull", "Fiscal", "Invest", "Money"],
    "Vlog/Lifestyle": ["Daily", "Urban", "Wild", "Simple", "Happy", "Travel", "LifeWith", "Vibe", "Core", "Just"],
    "Cooking": ["Tasty", "Chef", "Golden", "Spicy", "Fresh", "Yummy", "Kitchen", "Baked", "Savory", "Sweet"],

    # Entertainment & Culture
    "True Crime": ["Dark", "Cold", "Mystery", "Night", "Shadow", "Silent", "Crime", "Deep", "Hidden", "Case"],
    "Comedy/Sketch": ["Funny", "Laugh", "Joker", "Sketch", "HaHa", "Epic", "Wild", "Crazy", "Chuckle", "Giggle"],
    "Movie/TV Reviews": ["Screen", "Film", "Cinema", "Plot", "Binge", "Review", "Frame", "Movie", "Scene", "Roll"],
    "Commentary/Drama": ["Tea", "Spill", "Topic", "Point", "Open", "Real", "Truth", "Talk", "Voice", "Fair"],
    "Animation": ["Toon", "Ink", "Draw", "Frame", "Vector", "Pixel", "Motion", "Sketch", "Doodle", "Art"],
    "ASMR": ["Soft", "Quiet", "Tingle", "Gentle", "Pure", "Zen", "Calm", "Relax", "Deep", "Echo"],

    # Education & Personal Development
    "Education/Edutainment": ["Smart", "Learn", "Brain", "Fact", "Know", "Mind", "Study", "Bright", "Quick", "Pure"],
    "Productivity": ["Focus", "Prime", "Flow", "Efficient", "Peak", "Done", "Work", "Plan", "Swift", "Method"],
    "Self-Help/Motivation": ["Rise", "Growth", "Vibe", "Soul", "Path", "Goal", "Strong", "Ever", "Higher", "Will"],
    "BookTube/Literature": ["Page", "Novel", "Chapter", "Ink", "Book", "Shelf", "Read", "Verse", "Text", "Story"],
    "Language Learning": ["Fluent", "Speak", "Word", "Lingo", "Poly", "Talk", "Global", "Native", "Bridge", "Key"],

    # Health, Wellness & Sports
    "Fitness/Workout": ["Iron", "Core", "Flex", "Power", "Active", "Fit", "Grind", "Pulse", "Strong", "Titan"],
    "Mental Health/Wellness": ["Soul", "Peace", "Calm", "Mindful", "Zen", "Heart", "Heal", "Space", "Balance", "Clear"],
    "Sports/Highlights": ["Game", "Pro", "Goal", "Sport", "Fast", "Clutch", "Apex", "Field", "Court", "Fan"],

    # Hobbies & Skills
    "DIY/Crafts": ["Handy", "Make", "Build", "Craft", "Home", "Tool", "Create", "Design", "Fix", "Pro"],
    "Art/Drawing": ["Ink", "Brush", "Canvas", "Color", "Palette", "Stroke", "Art", "Visual", "Hue", "Paint"],
    "Music Production/Covers": ["Beat", "Sound", "Audio", "Note", "Studio", "Vocal", "Melody", "Track", "Wave", "Mix"],
    "Photography/Videography": ["Lens", "Shutter", "Frame", "Capture", "View", "Focus", "Angle", "Snap", "Raw", "Flash"],
    "Automotive/Cars": ["Turbo", "Shift", "Drive", "Motor", "Engine", "Auto", "Gear", "Torque", "Piston", "Race"],
    "Gardening": ["Green", "Root", "Leaf", "Bloom", "Soil", "Nature", "Wild", "Seed", "Farm", "Eco"],

    # Specific Lifestyle
    "Travel": ["Wander", "Nomad", "Route", "Global", "Trip", "Atlas", "Vista", "Way", "Map", "Bound"],
    "Fashion": ["Style", "Trend", "Vogue", "Luxe", "Fit", "Chic", "Mode", "Clout", "Drip", "Look"],
    "Parenting/Family": ["Home", "Nest", "Kind", "Parent", "Little", "Sweet", "Daily", "Joy", "Root", "Life"],
    "Pets/Animals": ["Paws", "Wild", "Tail", "Pet", "Fur", "Bark", "Cute", "Safe", "Critter", "Nature"],
    "Real Estate": ["Pro", "Estate", "Home", "Land", "Key", "Prime", "Urban", "Metro", "Yield", "Prop"],
    "Minimalism": ["Pure", "Simple", "Less", "Clean", "Zen", "Blank", "Core", "Base", "Sleek", "Plain"]
}

# Second half of a generated channel name, keyed by niche (keys mirror NICHES).
# Prefix and suffix are drawn independently, so a word that appears in both
# lists for a niche can be doubled (e.g. "QuestQuest" for Gaming).
CHANNEL_SUFFIXES = {
    # Originals
    "Gaming": ["Plays", "Gaming", "Arcade", "Zone", "Quest", "TV", "Live", "Station", "Hub", "Verse"],
    "Beauty": ["Beauty", "Cosmetics", "Skin", "Style", "Makeup", "Looks", "Secrets", "Studio", "Room", "Diaries"],
    "Tech Review": ["Reviews", "Lab", "Unboxed", "Flow", "Hub", "Central", "Insights", "Talks", "Breakdown", "Zone"],
    "Finance": ["Watch", "Flow", "Capital", "Sense", "Moves", "Tips", "Guru", "Strategies", "Roadmap", "Hustle"],
    "Vlog/Lifestyle": ["Vlogs", "Life", "Journeys", "Adventures", "Stories", "Days", "Moments", "World", "Lens", "Focus"],
    "Cooking": ["Kitchen", "Eats", "Bites", "Table", "Recipes", "Bakery", "Cooks", "Flavors", "Delights", "Spot"],

    # Entertainment & Culture
    "True Crime": ["Files", "Vault", "Tales", "Secrets", "Chronicles", "Reports", "Incident", "Theory", "Investigation", "Records"],
    "Comedy/Sketch": ["Show", "Laughs", "Comedy", "Sketches", "Humor", "Jokes", "Fun", "Gags", "Vibes", "Stuff"],
    "Movie/TV Reviews": ["Box", "Reel", "Critic", "Review", "Analysis", "Fix", "Central", "Guide", "Tube", "Watch"],
    "Commentary/Drama": ["Tea", "Central", "Opinions", "Talk", "Drama", "Exposed", "Buzz", "DeepDive", "News", "Reality"],
    "Animation": ["Studio", "Toons", "Anims", "Shorts", "World", "Design", "Creation", "Works", "Box", "Lab"],
    "ASMR": ["Tingles", "Whispers", "Sleep", "Relax", "Therapy", "Sound", "Calm", "Vibes", "Zen", "Zone"],

    # Education & Personal Development
    "Education/Edutainment": ["University", "Facts", "Explained", "101", "Hub", "Academy", "Class", "Lab", "School", "Portal"],
    "Productivity": ["System", "Methods", "Growth", "Hacks", "Journal", "Pro", "Labs", "Mastery", "Success", "Way"],
    "Self-Help/Motivation": ["Mindset", "Daily", "Vision", "Impact", "Path", "Way", "Journey", "Life", "Core", "Rise"],
    "BookTube/Literature": ["Shelf", "Reads", "Library", "Review", "Lovers", "Corner", "Nook", "Verse", "Pages", "World"],
    "Language Learning": ["Method", "Course", "Talk", "Path", "Way", "Fluent", "Academy", "Bridge", "World", "Steps"],

    # Health, Wellness & Sports
    "Fitness/Workout": ["Gym", "Fit", "Performance", "Coaching", "Muscle", "Results", "System", "Flow", "Life", "Training"],
    "Mental Health/Wellness": ["Healing", "Mind", "Spirit", "Peace", "Wellness", "Daily", "Path", "Soul", "Well", "Center"],
    "Sports/Highlights": ["Highlights", "TV", "Fan", "Replay", "Clips", "Scout", "Report", "Talk", "Hub", "Zone"],

    # Hobbies & Skills
    "DIY/Crafts": ["Fix", "Made", "Workshop", "Project", "Ideas", "Craft", "Home", "Builds", "Solutions", "Lab"],
    "Art/Drawing": ["Art", "Gallery", "Sketchbook", "Studio", "Draws", "Ink", "Portfolio", "Workshop", "Space", "Concept"],
    "Music Production/Covers": ["Music", "Records", "Beats", "Covers", "Sound", "Studio", "Tracks", "Session", "Mix", "Production"],
    "Photography/Videography": ["Photo", "Video", "Media", "Visuals", "Productions", "Studio", "Lens", "Shoots", "Academy", "Lab"],
    "Automotive/Cars": ["Garage", "Mods", "Review", "Spec", "Works", "Drive", "Customs", "Builds", "Performance", "Hub"],
    "Gardening": ["Garden", "Farm", "Plants", "Land", "Nature", "Green", "Roots", "Yard", "Acres", "Patch"],

    # Specific Lifestyle
    "Travel": ["Travels", "Explores", "Wander", "Nomad", "Diary", "Vlogs", "Trip", "World", "Adventure", "Way"],
    "Fashion": ["Hauls", "Style", "Edit", "Fashion", "Looks", "Collection", "Trends", "Wardrobe", "Wear", "Fit"],
    "Parenting/Family": ["Family", "Kids", "Mom", "Dad", "Life", "Diaries", "Days", "Nest", "Home", "Chaos"],
    "Pets/Animals": ["Pets", "Dogs", "Cats", "Wild", "Rescue", "Life", "Care", "World", "Tails", "Buddies"],
    "Real Estate": ["Investing", "Prop", "Homes", "Market", "Advisors", "Yield", "Strategies", "Living", "Metro", "View"],
    "Minimalism": ["Life", "Simple", "Essentials", "Living", "Mind", "Way", "Home", "Space", "Minimal", "Method"]
}

# Creator regions keyed by display name; CreatorAgent picks one uniformly at
# random. The prompt builder shows the region name and its "style" on the
# Creator's "Region" line. "code" and "cultural_context" are not referenced
# in the visible code.
# NOTE(review): every "code" uses the "en-" language prefix, which suggests
# all transcripts are meant to be in English regardless of region -- confirm.
REGIONS = {
    "North America (USA)": {
        "code": "en-US",
        "style": "Direct, energetic, uses American idioms (bucks, awesome, dude).",
        "cultural_context": "Western business casual."
    },
    "Europe (UK)": {
        "code": "en-GB",
        "style": "Polite, perhaps slightly dry humor, uses British terms (cheers, mate, brilliant).",
        "cultural_context": "European formality mixed with wit."
    },
    "Asia Pacific (India)": {
        "code": "en-IN",
        "style": "Respectful, expressive, formal but warm, uses specific Indian English phrasing.",
        "cultural_context": "High deference to authority/policy."
    },
    "Latin America (Brazil)": {
        "code": "en-BR",
        "style": "Warm, engaging, enthusiastic, speaks English with a slight Portuguese cadence or phrasing.",
        "cultural_context": "Relationship-focused."
    },
    "Europe (Germany)": {
        "code": "en-DE",
        "style": "Direct, precise, efficient, less small talk, focuses on facts.",
        "cultural_context": "Efficiency-focused."
    }
}

### 3. Helper Functions

In [None]:
# ==========================================
# 3. HELPER FUNCTIONS
# ==========================================

# Roles accepted verbatim from well-formed JSON output.
_VALID_ROLES = ("SPM", "Creator", "Action")

# Matches one `"role": "X", "content": "Y"` pair anywhere in the text, ignoring
# line breaks and surrounding structural damage. The content group is a JSON
# string body: runs of characters that are neither quote nor backslash, or a
# backslash followed by any character. Consuming escapes in pairs means that
# `\"` never terminates the string while `\\"` (escaped backslash + closing
# quote) does.
_TURN_PATTERN = re.compile(
    r'"role"\s*:\s*"([^"]+)"\s*,\s*"content"\s*:\s*"((?:[^"\\]|\\.)*)"',
    re.DOTALL | re.IGNORECASE,
)


def _repair_dialogue_items(items: List[Any]) -> List[Dict[str, str]]:
    """Turn parsed JSON items into dialogue turns, repairing split objects."""
    repaired: List[Dict[str, str]] = []
    pending_speaker = None  # speaker announced by a split object, awaiting its text

    for item in items:
        # Skip anything that is not a complete {"role", "content"} object
        # instead of aborting and losing the turns already collected.
        if not isinstance(item, dict) or "role" not in item or "content" not in item:
            continue

        role, content = item["role"], item["content"]

        if role in _VALID_ROLES:
            # Ideal case: a complete, well-formed turn.
            repaired.append({"role": role, "content": content})
            pending_speaker = None
        elif role == "role" or content in ("SPM", "Creator"):
            # First half of a split object: the "content" holds the speaker.
            pending_speaker = content if content in _VALID_ROLES else None
        elif role == "content" and pending_speaker:
            # Second half of a split object: the actual utterance.
            repaired.append({"role": pending_speaker, "content": content})
            pending_speaker = None

    return repaired


def _decode_json_fragment(fragment: str) -> str:
    """Decode the body of a JSON string literal captured by the regex."""
    try:
        # strict=False tolerates raw line breaks inside the captured text.
        return json.loads(f'"{fragment}"', strict=False)
    except ValueError:
        # Invalid escape sequence: fall back to the two replacements that
        # matter most for readability.
        return fragment.replace('\\n', '\n').replace('\\"', '"')


def _salvage_turns_with_regex(raw_text: str) -> List[Dict[str, str]]:
    """Recover role/content pairs from text that is not valid JSON."""
    salvaged: List[Dict[str, str]] = []

    for match in _TURN_PATTERN.finditer(raw_text):
        role_lower = match.group(1).strip().lower()

        # Normalize the role; the same three roles as the structured path.
        if "spm" in role_lower:
            role = "SPM"
        elif "creator" in role_lower:
            role = "Creator"
        elif "action" in role_lower:
            role = "Action"
        else:
            continue  # Ignore garbage matches

        content = _decode_json_fragment(match.group(2).strip())
        salvaged.append({"role": role, "content": content})

    return salvaged


def parse_and_clean_transcript(raw_text: str, start_time: datetime, expected_duration_minutes: int) -> Tuple[List[Dict[str, str]], str]:
    """
    Data Cleansing at Source: Parses the LLM output, fixes structural errors,
    and returns a strict list of {"role": "...", "content": "..."} objects.

    Args:
        raw_text: Raw model output, expected to be a JSON array of turns,
            optionally wrapped in a Markdown code fence. Anything that is not
            a string is treated as empty output.
        start_time: Start of the call; the end time is computed from it.
        expected_duration_minutes: Nominal call length in minutes.

    Returns:
        A tuple ``(dialogue_list, end_time_iso)``. ``dialogue_list`` is empty
        when nothing could be recovered. ``end_time_iso`` is ``start_time``
        plus the expected duration scaled by a random factor in [0.90, 1.10],
        formatted with ``isoformat()``.
    """
    # Guard against a missing/non-text payload so that neither the structured
    # parse nor the regex fallback can crash on it.
    text = raw_text if isinstance(raw_text, str) else ""

    try:
        # 1. Strip the Markdown fence and undo CSV-style double-escaped keys.
        #    Stripping first lets the ^-anchored fence pattern match even when
        #    the model emits leading whitespace or a blank line.
        clean_text = text.strip()
        clean_text = re.sub(r"^```(?:json)?\s*", "", clean_text)
        clean_text = re.sub(r"\s*```$", "", clean_text)
        clean_text = re.sub(r'""(role|content)""', r'"\1"', clean_text)

        dialogue_list_raw = json.loads(clean_text)

        if not isinstance(dialogue_list_raw, list):
            raise ValueError("Parsed JSON is not an array.")

        # 2. Repair logic (split-object hallucination).
        dialogue_list = _repair_dialogue_items(dialogue_list_raw)

    except (ValueError, TypeError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError, so malformed JSON lands here.
        print(f"‚ö†Ô∏è JSON Parsing Error (Fallback active): {e}")
        dialogue_list = _salvage_turns_with_regex(text)

    # 3. Total call duration, keeping a +/-10% variance around the nominal length.
    variance_factor = random.uniform(0.90, 1.10)
    total_seconds = int(expected_duration_minutes * 60 * variance_factor)
    end_time_iso = (start_time + timedelta(seconds=total_seconds)).isoformat()

    return dialogue_list, end_time_iso


def normalize_record_for_jsonl(rec: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a shallow copy of ``rec`` with JSON-friendly field types.

    - ``recording_start`` / ``recording_end``: datetime objects become ISO-8601
      strings; values that already parse as ISO-8601 are kept unchanged;
      anything else (including a missing key) becomes an empty string.
    - ``duration_minutes``: coerced to float, falling back to 0.0 when the
      value is missing or not convertible.

    The input record itself is not modified.
    """
    cleaned = rec.copy()

    for field_name in ("recording_start", "recording_end"):
        raw_value = cleaned.get(field_name)

        if isinstance(raw_value, datetime):
            cleaned[field_name] = raw_value.isoformat()
            continue

        # Validation only: the parsed datetime is discarded on purpose.
        try:
            datetime.fromisoformat(raw_value)
        except Exception:
            cleaned[field_name] = ""
        else:
            cleaned[field_name] = raw_value

    try:
        duration_value = float(cleaned.get("duration_minutes", 0.0))
    except Exception:
        duration_value = 0.0
    cleaned["duration_minutes"] = duration_value

    return cleaned

### 4. Generation Engine

In [None]:
# ==========================================
# 4. GENERATION ENGINE
# ==========================================

class CreatorAgent:
    """
    A synthetic creator profile built from a persona name.

    On construction the agent receives a fresh ID, a random niche, region and
    subscriber count (within the persona's range), a generated channel name,
    a Faker display name and a randomly assigned SPM from SPM_DB. ``history``
    starts empty and is appended to by the simulation after each call.
    """

    def __init__(self, persona_name: str):
        # ID layout: C_<epoch seconds>_<6 uppercase hex chars>, e.g. C_173000_A1B2C3.
        # The random suffix keeps IDs distinct when several agents are created
        # within the same second.
        id_suffix = uuid.uuid4().hex[:6].upper()
        self.id = f"C_{int(time.time())}_{id_suffix}"

        persona = PERSONAS[persona_name]
        self.persona_name = persona_name
        self.persona_data = persona
        self.niche = random.choice(NICHES)
        # No persona currently defines "experience_level"; the default applies.
        self.experience_level = persona.get("experience_level", "intermediate")
        subs_bounds = persona['metrics']['subs_range']
        self.subs = random.randint(subs_bounds[0], subs_bounds[1])

        region_names = list(REGIONS)
        self.region_name = random.choice(region_names)
        self.region_data = REGIONS[self.region_name]

        # Channel name = niche prefix + niche suffix + a tail that is a number
        # (1-999) about 60% of the time and a capitalized Faker word otherwise.
        name_head = random.choice(CHANNEL_PREFIXES[self.niche])
        name_body = random.choice(CHANNEL_SUFFIXES[self.niche])
        if random.random() < 0.6:
            name_tail = str(random.randint(1, 999))
        else:
            name_tail = fake.word().capitalize()
        self.channel_name = f"{name_head}{name_body}{name_tail}"

        self.creator_display_name = fake.name()

        # Pick the SPM responsible for this creator from the shared pool.
        spm_record = random.choice(SPM_DB)
        self.assigned_spm_name = spm_record["name"]
        self.assigned_spm_id = spm_record["id"]

        self.history: List[str] = []

class SimulationEngine:
    def __init__(self, target_rows=10):
        self.target_rows = target_rows
        self.transcript_db = []

    def generate_system_prompt(self, creator, scenario, chaos_key, topic_list, is_followup):
        """
        Build the generation prompt for one SPM/Creator call transcript.

        Args:
            creator: CreatorAgent whose persona, niche, region, channel name,
                assigned SPM and call history are woven into the prompt.
            scenario: One SCENARIOS entry; "type" and "duration_minutes" are
                read (duration defaults to 60 when missing).
            chaos_key: Key into CHAOS_INSTRUCTIONS; unknown keys fall back to
                "No interruptions.".
            topic_list: Agenda topics, in discussion order. May be empty.
            is_followup: When True and the creator has history, the latest
                summary is included as previous context.

        Returns:
            The full prompt text, ending with the required JSON output format.
        """
        history_context = f"PREVIOUS CONTEXT: {creator.history[-1]}" if is_followup and creator.history else "This is the first call."
        chaos_instruction = CHAOS_INSTRUCTIONS.get(chaos_key, "No interruptions.")
        duration = scenario.get('duration_minutes', 60)

        # Calculate turns based on duration (approx 1.2 turns per minute for long calls to stay within token limits)
        target_turns = int(duration * 1.2)

        # Format the list of topics into a bulleted string
        agenda_items = "\n".join([f"   - {t}" for t in topic_list])

        # Dynamic Structure Guide based on DURATION
        if duration >= 90: # Very Long Call (1.5 - 2 hours)
            # Reserve the last 40 minutes for brainstorming (20) and closing (20)
            # so the timeline always ends exactly at `duration`. For a 120-minute
            # call this gives the 20-80 / 80-100 / 100-120 split; for 90 minutes
            # it gives 20-50 / 50-70 / 70-90 instead of running past the call end.
            brainstorm_start = duration - 40
            closing_start = duration - 20
            structure_guide = f"""
            **MANDATORY STRUCTURE (DEEP DIVE CALL - {duration} mins):**
            1. [0-10 min] Warm-up, relationship building, and catching up on personal life/culture.
            2. [10-20 min] Channel Health Check: Reviewing analytics (Views, RPM, Subscribers).
            3. [20-{brainstorm_start} min] **THE AGENDA (Core Discussion):** Discuss the following topics sequentially. Allow natural transitions between them:
            {agenda_items}
            4. [{brainstorm_start}-{closing_start} min] Brainstorming session for future content ideas based on these topics.
            5. [{closing_start}-{duration} min] Defining clear action items and closing.
            """
        elif duration >= 45: # Standard QBR (45-60 mins)
            structure_guide = f"""
            **MANDATORY STRUCTURE (STRATEGY CALL - {duration} mins):**
            1. [0-5 min] Intro and check-in.
            2. [5-15 min] Performance Review.
            3. [15-{duration-10} min] **Main Topics:** Cover these points in detail:
            {agenda_items}
            4. [{duration-10}-{duration} min] Q&A and Closing.
            """
        else: # Short Call (Crisis)
            # An empty agenda must not crash prompt building.
            main_crisis = topic_list[0] if topic_list else "the creator's most urgent issue"
            structure_guide = f"""
            **MANDATORY STRUCTURE (SHORT/CRISIS CALL - {duration} mins):**
            1. [0-5 min] Immediate Triage: Address the main crisis ({main_crisis}).
            2. [5-{duration-5} min] Brief check on secondary topics if time permits:
            {agenda_items}
            3. [{duration-5}-{duration} min] Wrap up.
            """

        realism_guidelines = """
        HUMAN REALISM GUIDELINES:
        - Add false starts, partial overlaps, self-interruptions.
        - Include jargon misunderstanding.
        - Allow topic drift.
        - Make it feel recorded live, not scripted.
        """

        # BLOCK: JSON INTEGRITY RULES
        # Explicitly forbids the split-object hallucination observed in the incident report.
        json_integrity_rules = """
        **CRITICAL JSON FORMATTING RULES (STRICT ENFORCEMENT):**
        1. **NO SPLIT OBJECTS:** Do NOT separate the role and the content into different objects.
           - ‚ùå INCORRECT: [{"role": "role", "content": "SPM"}, {"role": "content", "content": "Hello"}]
           - ‚úÖ CORRECT:   [{"role": "SPM", "content": "Hello"}]
        2. **NO DOUBLE QUOTES ON KEYS:** Do not use CSV-style double escaping.
           - ‚ùå INCORRECT: [{""role"": ""SPM""}]
           - ‚úÖ CORRECT:   [{"role": "SPM"}]
        3. **SINGLE OBJECT PER TURN:** Each list item must contain BOTH "role" and "content".
        4. **NO DOUBLE QUOTES IN CONTENT:** Do not use double quotes (") within the content strings. If quotation is needed, use single quotes (') instead. This ensures the JSON remains parsable.
        """

        return f"""
        Generate a realistic, verbatim call transcript between a YouTube SPM and a Creator.

        {realism_guidelines}

        **ROLE 1: SPM ({creator.assigned_spm_name})**
        - Goal: Cover the full agenda of {len(topic_list)} topics while maintaining the relationship.
        - Tone: Professional, uses jargon (YPP, CTR), empathetic.

        **ROLE 2: CREATOR ({creator.channel_name})**
        - Persona: {creator.persona_name} ({creator.persona_data['tone']}).
        - Niche: {creator.niche}
        - Region: {creator.region_name} ({creator.region_data['style']})
        - Pain Point: {creator.persona_data['pain_points'][0]}.

        **SCENARIO:** {scenario['type']} ({duration} minutes).
        **CHAOS:** {chaos_instruction}
        **CONTEXT:** {history_context}

        **TOPICS TO DISCUSS (AGENDA):**
        {agenda_items}

        {structure_guide}

        {json_integrity_rules}

        **INSTRUCTIONS:**
        1. **LENGTH:** At least **{target_turns} dialogue turns**. DO NOT SUMMARIZE.
        2. **DEPTH:** You must touch upon ALL topics listed in the Agenda.
        3. **FORMAT:** Output a VALID JSON ARRAY:
           [
             {{"role": "SPM", "content": "..."}},
             {{"role": "Creator", "content": "..."}}
           ]
        """

    def run(self):
        print(f"üöÄ Starting generation. Target: {self.target_rows} rows...")

        rows_generated = 0
        all_possible_topics = TOPICS_CATALOG

        while rows_generated < self.target_rows:
            creator = CreatorAgent(random.choice(list(PERSONAS.keys())))
            num_calls = random.choices([1, 2, 3, 4], weights=[50, 30, 15, 5], k=1)[0]

            print(f"Processing {creator.channel_name} ({creator.region_name}) - Planning {num_calls} call(s)...")

            for call_idx in range(num_calls):
                if rows_generated >= self.target_rows: break

                scenario = random.choice(SCENARIOS)
                duration = scenario.get('duration_minutes', 60)
                chaos_key = "None" if random.random() > scenario['chaos_probability'] else random.choice(list(CHAOS_INSTRUCTIONS.keys()))

                # --- TOPIC SELECTION LOGIC ---
                if duration >= 120: num_topics = random.randint(12, 18)
                elif duration >= 90: num_topics = random.randint(8, 12)
                elif duration >= 60: num_topics = random.randint(5, 8)
                else: num_topics = random.randint(3, 5)

                selected_topics = random.sample(all_possible_topics, min(num_topics, len(all_possible_topics)))
                primary_topic = selected_topics[0]
                full_topic_string = "; ".join(selected_topics)

                start_dt = datetime.now() - timedelta(days=random.randint(1, 30))

                # Clean ID format
                conv_id = f"{creator.id}_Call{call_idx + 1}"

                # --- OPTION B: RETRY LOOP (ANTI-GAPS) ---
                MAX_RETRIES = 3
                attempts = 0
                call_success = False
                quotas_attempts = 1

                while attempts < MAX_RETRIES and not call_success:
                    attempts += 1
                    try:
                        # 1. Generate Raw Transcript
                        prompt = self.generate_system_prompt(creator, scenario, chaos_key, selected_topics, call_idx > 0)

                        response = model.generate_content(
                            prompt,
                            generation_config={
                                "temperature": 0.7,
                                "response_mime_type": "application/json",
                                "max_output_tokens": 8192
                            }
                        )

                        try:
                            raw_transcript_json_string = response.text
                        except ValueError:
                            parts = response.candidates[0].content.parts
                            raw_transcript_json_string = "".join([part.text for part in parts])

                        # 2. Parse & Clean
                        clean_dialogue_list, real_end_time_iso = parse_and_clean_transcript(
                            raw_transcript_json_string,
                            start_dt,
                            duration
                        )

                        # --- SUCCESS EVALUATION ---
                        if len(clean_dialogue_list) == 0:
                            print(f"  ‚ö†Ô∏è Attempt {attempts}/{MAX_RETRIES} failed for {conv_id} (Empty list). Retrying...")
                            time.sleep(1)
                            continue # Returns to the start of the while loop to retry

                        # If it reaches here, it successfully rescued data
                        call_success = True

                        # 3. Summarize
                        summary = model.generate_content(f"Summarize in 1 sentence: {raw_transcript_json_string[:2000]}").text.strip()
                        creator.history.append(summary)

                        # 4. Calculate duration
                        try:
                            start_iso = start_dt
                            end_iso = datetime.fromisoformat(real_end_time_iso)
                            duration_minutes_real = round((end_iso - start_iso).total_seconds() / 60.0, 2)
                        except Exception:
                            duration_minutes_real = float(duration)

                        # 5. Build Record
                        record = {
                            "conversation_id": conv_id,
                            "creator_id": creator.id,
                            "channel_name": creator.channel_name,
                            "creator_niche": creator.niche,
                            "creator_persona": creator.persona_name,
                            "creator_tone": creator.persona_data['tone'],
                            "spm_id": creator.assigned_spm_id,
                            "spm_name": creator.assigned_spm_name,
                            "creator_region": creator.region_name,
                            "language_code": creator.region_data['code'],
                            "scenario": scenario['type'],
                            "product_topic": full_topic_string,
                            "duration_minutes": duration_minutes_real,
                            "recording_start": start_dt.isoformat(),
                            "recording_end": real_end_time_iso,
                            "raw_transcript": clean_dialogue_list,

                            # Pre-allocated Validator Columns (NULL)
                            "is_valid": None,
                            "quality_score": None,
                            "hallucination_flag": None,
                            "validation_report": None,
                            "audit_timestamp": None
                        }

                        self.transcript_db.append(record)
                        rows_generated += 1

                        if attempts > 1:
                            print(f"  ‚úÖ Successfully generated on attempt {attempts}: {conv_id} ({len(selected_topics)} topics)")
                        else:
                            print(f"  ‚úÖ Generated: {conv_id} ({len(selected_topics)} topics, ~{duration_minutes_real} mins)")

                        time.sleep(1)

                    except ResourceExhausted as e:
                        # If Google pauses us, we wait a few seconds to clear the minute window
                        print(f"  ‚è≥ Quota limit reached (ResourceExhausted). Pausing for {2**quotas_attempts} seconds...")
                        time.sleep(2**quotas_attempts)
                        attempts -= 1  # We don't burn an attempt if it was a quota issue
                        quotas_attempts += 1

                    except Exception as e:
                        # For any other random API errors
                        print(f"  ‚ùå General API Error on attempt {attempts}/{MAX_RETRIES} for {conv_id}: {e}")
                        time.sleep(1)

                # If it exited the while loop and call_success is still False, give up on that call
                if not call_success:
                    print(f"üíÄ ABORTED: Unable to generate {conv_id} after {MAX_RETRIES} attempts. Leaving intentional gap.")

        # Final Save
        with open(JSONL_FILE, 'w', encoding='utf-8') as f:
            for entry in self.transcript_db:
                clean_entry = normalize_record_for_jsonl(entry)
                f.write(json.dumps(clean_entry, ensure_ascii=False, separators=(',', ':')) + '\n')
        print(f"‚úÖ Saved {rows_generated} transcripts to: {JSONL_FILE}")

### 5. BigQuery Upload

In [None]:
# ==========================================
# 5. BIGQUERY UPLOAD
# ==========================================

def upload_to_bigquery():
    """Load the generated JSONL transcripts file into BigQuery.

    Makes sure the dataset ``DATASET_ID`` exists in the client's project
    (creating it in the ``US`` location when it is missing), then appends every
    row of ``JSONL_FILE`` to the table ``TABLE_ID`` using an explicit schema and
    blocks until the load job has finished.

    Exceptions raised by the BigQuery client (failed request, failed load job)
    and by ``open`` are not caught here; they propagate to the caller.
    """
    client = bigquery.Client(project=PROJECT_ID)

    # Build the reference directly: Client.dataset() is deprecated in favour of
    # the DatasetReference constructor. client.project is the PROJECT_ID passed
    # above, so this points at the same dataset as before.
    dataset_ref = bigquery.DatasetReference(client.project, DATASET_ID)

    try:
        client.get_dataset(dataset_ref)
    except NotFound:
        # First run against this project: create the dataset before loading.
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        client.create_dataset(dataset)
        print(f"‚úÖ Created dataset {DATASET_ID}")

    table_ref = dataset_ref.table(TABLE_ID)

    # --- KEY CHANGE: EXPLICIT SCHEMA ---
    # Autodetection is turned off so column types are fixed by the list below
    # instead of being inferred from the file contents.
    schema_columns = [
        # Base Columns
        ("conversation_id", "STRING"),
        ("creator_id", "STRING"),
        ("channel_name", "STRING"),
        ("creator_niche", "STRING"),
        ("creator_persona", "STRING"),
        ("creator_tone", "STRING"),
        ("spm_id", "STRING"),
        ("spm_name", "STRING"),
        ("creator_region", "STRING"),
        ("language_code", "STRING"),
        ("scenario", "STRING"),
        ("product_topic", "STRING"),
        ("duration_minutes", "FLOAT"),
        ("recording_start", "TIMESTAMP"),
        ("recording_end", "TIMESTAMP"),
        ("raw_transcript", "JSON"),
        # Validator Columns (Reserved / Null initially)
        ("is_valid", "BOOLEAN"),
        ("quality_score", "FLOAT"),
        ("hallucination_flag", "BOOLEAN"),
        ("validation_report", "JSON"),
        ("audit_timestamp", "TIMESTAMP"),
    ]

    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = False
    job_config.schema = [
        bigquery.SchemaField(column_name, column_type)
        for column_name, column_type in schema_columns
    ]

    # Rows are appended to whatever the table already contains.
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

    with open(JSONL_FILE, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

    # Block until the load job completes; a failed job raises here.
    job.result()
    print(f"üéâ Success! Loaded {job.output_rows} rows.")

## Run Main

In [None]:
if __name__ == "__main__":
    # Three checkpoints on the performance counter: before generation,
    # after generation, and after the BigQuery upload.
    start_time = time.perf_counter()

    # Step A: generate the synthetic transcripts.
    sim = SimulationEngine(target_rows=100)
    sim.run()
    end_time_sd = time.perf_counter()

    # Step B: load the generated file into BigQuery.
    upload_to_bigquery()
    end_time = time.perf_counter()

    # Turn each (start, stop) checkpoint pair into a printable timedelta string.
    duration_sd, duration_bq, duration_total = (
        str(timedelta(seconds=stop - start))
        for start, stop in (
            (start_time, end_time_sd),
            (end_time_sd, end_time),
            (start_time, end_time),
        )
    )

    print(
        f"Synthetic Data execution time (HH:MM:SS): {duration_sd}",
        f"Upload to BigQuery execution time (HH:MM:SS): {duration_bq}",
        f"Total execution time (HH:MM:SS): {duration_total}",
        sep="\n",
    )

On average, it takes:

~45 seconds to create a transcription

~3 seconds to upload new transcriptions