In [100]:
import json
from langchain_neo4j import Neo4jGraph
import json
import hashlib
from typing import List, Literal, Optional
from openai import OpenAI
from pydantic import BaseModel, ValidationError
from google import genai
from google.genai import types
import re
import os
from dotenv import load_dotenv

In [101]:
# icp_query = """
# MATCH (ic:IdealTargetCustomer)
# RETURN 
#     ic.segment AS segment,
#     ic.age_range AS age_range,
#     ic.location AS location,
#     ic.job_titles AS job_titles,
#     ic.company_size AS company_size,
#     ic.purchase_frequency AS purchase_frequency,
#     ic.preferences AS preferences,
#     ic.top_pain_points AS pain_points,
#     ic.objection_templates AS objection_templates,
#     ic.affinity_language_style AS affinity_language_style,
#     ic.meeting_conversion_rate AS meeting_conversion_rate,
#     ic.personal_interests AS personal_interests,
#     ic.professional_interests AS professional_interests
# """

# results = graph.query(icp_query)
# count = 1

# icp_segments = []
# for r in results:
    
#     if r["segment"] == "General":
#         continue  # Skip fallback node
    
#     icp_segments.append({
#         f"segment {count}": r["segment"],
#         "age_range": r.get("age_range", ""),
#         "location": r.get("location", ""),
#         "job_titles": r.get("job_titles", []),
#         "company_size": r.get("company_size", ""),
#         "purchase_frequency": r.get("purchase_frequency", ""),
#         "preferences": r.get("preferences", []),
#         "pain_points": r.get("pain_points", []),
#         "objection_templates": r.get("objection_templates", ""),
#         "affinity_language_style": r.get("affinity_language_style", ""),
#         "personal_interests": r.get("personal_interests", ""),
#         "professional_interests": r.get("professional_interests", ""),
#     })
#     count += 1

# icp_segments

In [102]:
# ==============================================================================
# 1. THE BULLETPROOF VALIDATOR (PYDANTIC MODELS) - UPDATED
# ==============================================================================
class CallSession(BaseModel):
    session_id: str
    outcome: Literal["Meeting Scheduled", "Rejected", "Gatekeeper Block", "Voicemail", "Follow-up Required", "Wrong Person", "Call Dropped"]
    product_focus: str
    matched_icp_segment: Optional[str] = None

class Participant(BaseModel):
    name: str
    role: Literal["Agent", "Recipient", "Gatekeeper"]
    organization: Optional[str] = None

class DialogueTurn(BaseModel):
    turn_number: int
    speaker_name: str
    text: str
    # UPDATED to match the new, specific turn types from the final prompt
    turn_type: Literal[
        "Opening", 
        "Closing", 
        "Gatekeeper_Dialogue", 
        "Agent_Question",
        "Agent_Response",
        "Rapport_Building",
        "Customer_Question", 
        "Customer_Response", 
        "Customer_Objection", 
        "Customer_Pain_Point", 
        "Customer_Buying_Signal", 
        "Technical_Issue",

    ]

class DialogueGraphData(BaseModel):
    call_session: CallSession
    participants: List[Participant]
    dialogue_turns: List[DialogueTurn]

# ==============================================================================
# 2. THE UPDATED SCORER
# ==============================================================================

def score_dialogue_extraction(validated_data: DialogueGraphData) -> dict:
    score = 0
    notes = []

    score += len(validated_data.participants) * 5
    if any(p.role == "Gatekeeper" for p in validated_data.participants):
        score += 10
        notes.append("SUCCESS: Gatekeeper correctly identified.")
    
    if validated_data.call_session.outcome == "Meeting Scheduled":
        score += 10
        notes.append("SUCCESS: Meeting outcome captured.")
        
    customer_objections = [t for t in validated_data.dialogue_turns if t.turn_type == 'Customer_Objection']
    customer_pain_points = [t for t in validated_data.dialogue_turns if t.turn_type == 'Customer_Pain_Point']
    customer_buying_signals = [t for t in validated_data.dialogue_turns if t.turn_type == 'Customer_Buying_Signal']

    score += len(customer_objections) * 5
    score += len(customer_pain_points) * 10
    score += len(customer_buying_signals) * 8

    if customer_objections:
        notes.append(f"SUCCESS: Identified {len(customer_objections)} customer objection(s).")
    if customer_pain_points:
        notes.append(f"SUCCESS: Identified {len(customer_pain_points)} customer pain point(s).")
        
    status = "High-confidence" if score > 30 else "Review Recommended"

    return {"final_score": score, "status": status, "notes": notes}

# # ==============================================================================
# # 3. THE ADVANCED INGESTION SCRIPT - UPDATED
# # ==============================================================================

# def ingest_dialogue_flow(graph: Neo4jGraph, validated_data: DialogueGraphData, quality_report: dict):
#     session_query = """
#     MERGE (cs:CallSession {session_id: $session_id})
#     SET cs += $session_details,
#         cs.quality_score = $quality_score,
#         cs.quality_status = $quality_status
    
#     MERGE (p:Product {name: $product_name})
#     MERGE (cs)-[:FOCUSES_ON]->(p)
    
#     WITH cs
#     UNWIND $participants AS participant_data
#     MERGE (person:Person {name: participant_data.name})
#     SET person.role = participant_data.role
#     MERGE (person)-[:PARTICIPATED_IN]->(cs)
#     """
#     graph.query(session_query, params={
#         'session_id': validated_data.call_session.session_id,
#         'session_details': validated_data.call_session.dict(),
#         'quality_score': quality_report['final_score'],
#         'quality_status': quality_report['status'],
#         'product_name': validated_data.call_session.product_focus,
#         'participants': [p.dict() for p in validated_data.participants]
#     })
#     print(f"Ingested CallSession {validated_data.call_session.session_id} with score {quality_report['final_score']}")

#     previous_turn_node_id = None
#     last_customer_objection_id = None

#     for turn in sorted(validated_data.dialogue_turns, key=lambda x: x.turn_number):
#         turn_type_label = "".join(filter(str.isalnum, turn.turn_type))
#         turn_query = f"""
#         MATCH (cs:CallSession {{session_id: $session_id}})
#         MATCH (speaker:Person {{name: $speaker_name}})
#         CREATE (turn_node:{turn_type_label} {{text: $text, turn_number: $turn_number}})
#         CREATE (speaker)-[:MADE_BY]->(turn_node)
#         CREATE (turn_node)-[:RAISED_IN]->(cs)
#         WITH turn_node
#         RETURN id(turn_node) AS current_turn_id
#         """
#         result = graph.query(turn_query, params={
#             'session_id': validated_data.call_session.session_id,
#             'speaker_name': turn.speaker_name, 'text': turn.text,
#             'turn_number': turn.turn_number
#         })
#         current_turn_node_id = result[0]['current_turn_id']

#         if previous_turn_node_id is not None:
#             graph.query(
#                 "MATCH (p) WHERE id(p) = $prev_id MATCH (c) WHERE id(c) = $curr_id CREATE (p)-[:NEXT]->(c)",
#                 {'prev_id': previous_turn_node_id, 'curr_id': current_turn_node_id}
#             )
        
#         # **UPDATED LOGIC** to link Agent_Response to Customer_Objection
#         if turn.turn_type == "Agent_Response" and last_customer_objection_id is not None:
#             graph.query(
#                 "MATCH (r) WHERE id(r) = $resp_id MATCH (o) WHERE id(o) = $obj_id CREATE (r)-[:RESPONDS_TO]->(o)",
#                 {'resp_id': current_turn_node_id, 'obj_id': last_customer_objection_id}
#             )
#             last_customer_objection_id = None # Reset after a response is linked

#         previous_turn_node_id = current_turn_node_id
#         # **UPDATED LOGIC** to track the last objection
#         if turn.turn_type == "Customer_Objection":
#             last_customer_objection_id = current_turn_node_id

#     print(f"Built dialogue chain for CallSession {validated_data.call_session.session_id}")

# ==============================================================================
# 3. THE ADVANCED INGESTION SCRIPT - FINAL, ROBUST VERSION
# ==============================================================================
def ingest_dialogue_flow(graph: Neo4jGraph, validated_data: DialogueGraphData, quality_report: dict):
    # 1. Create/Merge the CallSession and link it to a Product
    session_query = """
    MERGE (cs:CallSession {session_id: $session_id})
    SET cs += $session_details,
        cs.quality_score = $quality_score,
        cs.quality_status = $quality_status
    MERGE (p:Product {name: $product_name})
    MERGE (cs)-[:FOCUSES_ON]->(p)
    """
    graph.query(session_query, params={
        'session_id': validated_data.call_session.session_id,
        'session_details': validated_data.call_session.dict(),
        'quality_score': quality_report['final_score'],
        'quality_status': quality_report['status'],
        'product_name': validated_data.call_session.product_focus
    })
    print(f"Ingested CallSession {validated_data.call_session.session_id} with score {quality_report['final_score']}")

    # 2. Ensure all participants from the list exist as nodes and are linked to the call
    # This step is a good "pre-creation" step to set their roles correctly.
    participants_query = """
    MATCH (cs:CallSession {session_id: $session_id})
    UNWIND $participants AS participant_data
    MERGE (p:Person {name: participant_data.name})
    SET p.role = participant_data.role
    MERGE (p)-[:PARTICIPATED_IN]->(cs)
    """
    graph.query(participants_query, params={
        'session_id': validated_data.call_session.session_id,
        'participants': [p.dict() for p in validated_data.participants]
    })

    # 3. Build the conversation chain
    previous_turn_node_id = None
    last_customer_objection_id = None

    for turn in sorted(validated_data.dialogue_turns, key=lambda x: x.turn_number):
        turn_type_label = "".join(filter(str.isalnum, turn.turn_type))
        
        # **THIS IS THE CRITICAL FIX**
        # We use MERGE for the speaker. This finds the speaker if they exist
        # OR creates them if the LLM hallucinated a speaker name not in the
        # original participants list. This prevents the MATCH from failing.
        turn_query = f"""
        MATCH (cs:CallSession {{session_id: $session_id}})
        MERGE (speaker:Person {{name: $speaker_name}})
        
        CREATE (turn_node:{turn_type_label} {{
            text: $text, 
            turn_number: $turn_number
        }})
        
        CREATE (speaker)-[:MADE_BY]->(turn_node)
        CREATE (turn_node)-[:RAISED_IN]->(cs)
        
        WITH turn_node
        RETURN id(turn_node) AS current_turn_id
        """
        
        result = graph.query(turn_query, params={
            'session_id': validated_data.call_session.session_id,
            'speaker_name': turn.speaker_name,
            'text': turn.text,
            'turn_number': turn.turn_number
        })

        # This check prevents the IndexError
        if not result or 'current_turn_id' not in result[0]:
            print(f"WARNING: Turn creation failed for turn number {turn.turn_number}. Skipping relationship creation for this turn.")
            continue # Skip to the next turn

        current_turn_node_id = result[0]['current_turn_id']

        if previous_turn_node_id is not None:
            graph.query(
                "MATCH (p) WHERE id(p) = $prev_id MATCH (c) WHERE id(c) = $curr_id CREATE (p)-[:NEXT]->(c)",
                {'prev_id': previous_turn_node_id, 'curr_id': current_turn_node_id}
            )
        
        # Updated logic to use the specific turn types
        if turn.turn_type == "Agent_Response" and last_customer_objection_id is not None:
            graph.query(
                "MATCH (r) WHERE id(r) = $resp_id MATCH (o) WHERE id(o) = $obj_id CREATE (r)-[:RESPONDS_TO]->(o)",
                {'resp_id': current_turn_node_id, 'obj_id': last_customer_objection_id}
            )
            last_customer_objection_id = None

        previous_turn_node_id = current_turn_node_id
        if turn.turn_type == "Customer_Objection":
            last_customer_objection_id = current_turn_node_id

    print(f"Built dialogue chain for CallSession {validated_data.call_session.session_id}")


# ==============================================================================
# 4. FUNCTION TO GET THE NEXT AVAILABLE SESSION ID FROM NEO4J
# ==============================================================================
def get_next_session_id(graph: Neo4jGraph) -> int:
    """
    Queries the graph to find the highest existing session_id and returns the next number.
    """
    query = """
    MATCH (cs:CallSession)
    // We extract the numeric part of the session_id
    WITH toInteger(split(cs.session_id, '_')[-1]) AS session_num
    RETURN max(session_num) AS max_id
    """
    try:
        result = graph.query(query)
        if result and result[0]['max_id'] is not None:
            # If there are existing calls, return the max ID + 1
            return result[0]['max_id'] + 1
        else:
            # If the graph is empty, start at 1
            return 1
    except Exception as e:
        print(f"Could not query for max session ID. Defaulting to 1. Error: {e}")
        return 1
    


# ==============================================================================
# 5. FUNCTION TO BUILD THE DIALOGUE FLOW JSON USING THE NER PROMPT
# ==============================================================================
def dialogue_flow_ner(apiClient_key, transcript_text):
    """
    Create the dialogue flow of the raw transcript text into the graph.
    """

    ner_system_prompt = """You are a master data architect. Your task is to convert a call transcript into a structured JSON object representing the dialogue flow. Follow the schema and definitions with extreme precision.

### CRITICAL RULES:
1.  **MODEL THE DIALOGUE FLOW:** The `dialogue_turns` array MUST be a chronological sequence of the entire conversation.
2.  **USE THE PROVIDED DEFINITIONS:** You MUST use the exact `turn_type` definitions provided below. Do not deviate.

---

### JSON SCHEMA:

1.  **`call_session` (Object):**
    *   `session_id`: A unique ID for the call (e.g., "call_transcript_1").
    *   `outcome`: You MUST choose EXACTLY one value from this list: `["Meeting Scheduled", "Rejected", "Gatekeeper Block", "Voicemail", "Follow-up Required", "Wrong Person", "Call Dropped"]`.
    *   `product_focus`: The main product discussed (e.g., "NLA Security Solution").

2.  **`participants` (Array of Objects):**
    *   `name`: Full name of the participant (e.g., "Arison Josh", "Dale Spear", "Unnamed Gatekeeper").
    *   `role`: MUST be one of: `["Agent", "Recipient", "Gatekeeper"]`.
    *   `organization`: The organization they belong to.

3.  **`dialogue_turns` (Array of Chronological Objects):**
    *   Each object represents one speaking turn and must have:
        *   `turn_number`: A sequential integer starting from 1.
        *   `speaker_name`: The name of the speaker.
        *   `text`: The EXACT quote.
        *   `turn_type`: **You MUST choose EXACTLY one value from this list based on these PLAIN ENGLISH definitions:**
            ### Agent-Centric Labels
            - **Opening**: The agent's very first lines to the decision-maker. (e.g., "Hi Sarah, this is James from Acme Corp. Thanks for taking the time.")
            - **Closing**: The agent's last few lines before the call ends. (e.g., "Thankyou for the time, I'll send over the info by email.")
            - **Gatekeeper_Dialogue**: Any exchange that happens `before` the decision-maker is on the line (includes both agent + gatekeeper). (e.g., "Can I speak with Mr. Davis?", "He's not available right now")
            - **Agent_Question**: When the agent asks a question to gather information or move the conversation forward. (e.g., "Are you currently using outside counsel for compliance?")
            - **Agent_Response**: When the agent directly answers a customer's question, objection, pain point, or buying signal. (e.g., "We already have a vendor. → Agent: Totally understand. Many of our clients started in the same place.")
            - **Rapport_Building**: Non-business, relationship-focused statements or light humor. (e.g., "Hope you had a good weekend.", "I'm a pancake person too.")

            ### Customer-Centric Labels
            - **Customer_Question**: Customer asks for clarity, detail, or next steps. (e.g., "Can you explain how that works", "What's the cost?")
            - **Customer_Response**: Neutral or factual statements that do not qualify as a question, objection, pain point, or buying signal. (e.g., "No, I'm not the right person.", "I work in finance, not compliance.", "Okay.")
              ⚠️ **Note:** A simple "No" or "Not really" → goes here **unless** it explicitly conveys resistance to the product/meeting (in which case it is a `Customer_Objection`).
            - **Customer_Objection**: Customer resists, refuses, or dismisses the agent's proposal. (e.g., "We're not interested.", "We already have a provider.")
              ⚠️ **Important:** A normal "No" or short denial **does not count as an objection** unless it is clearly rejecting the agent's pitch or request.
            - **Customer_Pain_Point**: Customer explicitly mentions a business problem, risk, or dissatisfaction. (e.g., "We've had compliance gaps in the past.", "That would probably result in a lawsuit.")
            - **Customer_Buying_Signal**: Customer shows agreement, alignment, or willingness to continue. (e.g., "Yes, that sounds relevant," "Yes, I am available," "That's correct.").

            ### System / Meta Labels
            - **Technical_Issue**: Mentions of call quality problems, pauses, or audio glitches. (e.g., "Are you there?", "There's a long pause.")

---

Example of a Perfect Output Structure:
```json
{
  "call_session": {
    "session_id": "call_transcript_1",
    "outcome": "Meeting Scheduled",
    "product_focus": "Georgia Senate Bill 68 compliance solution"
  },
  "participants": [
    {
      "name": "Arison",
      "role": "Agent",
      "organization": "NLA Investigative Division"
    },
    {
      "name": "Unnamed Gatekeeper",
      "role": "Gatekeeper",
      "organization": "Not Provided"
    },
    {
      "name": "Dale Spear",
      "role": "Recipient",
      "organization": "Not Provided"
    }
  ],
  "dialogue_turns": [
    {
      "speaker": "Agent",
      "text": "Hi Sarah, this is James from Acme Corp. Thanks for taking the time.",
      "labels": "Opening"
    },
    {
      "speaker": "Customer",
      "text": "What does your company do exactly?",
      "labels": "Customer_Question"
    },
    {
      "speaker": "Agent",
      "text": "Great question — we help firms reduce compliance risk by automating workflows.",
      "labels": "Agent_Response"
    },
    {
      "speaker": "Customer",
      "text": "We've had issues in the past with missed filings.",
      "labels": "Customer_Pain_Point"
    },
    {
      "speaker": "Agent",
      "text": "That's exactly where we can help — most clients see a 40% reduction in errors.",
      "labels": "Agent_Response"
    },
    {
      "speaker": "Customer",
      "text": "Yes, that sounds relevant. Can you send me more info?",
      "labels": "Customer_Buying_Signal"
    },
    {
      "speaker": "Agent",
      "text": "Absolutely, I'll follow up with details this afternoon.",
      "labels": "Closing"
    }
  ]
}```"""

    user_prompt = f"""Transcript:
    {transcript_text}
"""

    
    client = OpenAI(api_key=apiClient_key)
    response = client.responses.create(
        model="gpt-5-mini",
        instructions=ner_system_prompt,
        input=user_prompt,
    )
    output = response.output_text
    clean_text = re.sub(r"^```json\s*|\s*```$", "", output.strip())
    
    
    return clean_text
    # return json.loads(clean_text)


# ==============================================================================
# 6. **NEW**: THE ICP CLASSIFIER AND LINKER
# ==============================================================================
def classify_and_link_icp(graph, api_key, transcript_text, recipient_name):
    """
    Classifies a transcript against existing ICPs in the graph and links the recipient.
    """

    # Step A: Classify the transcript using the LLM
    system_classification_prompt = f"""You are an expert sales analyst. 
Classify the call transcript into one of these ICP segments. 

Available Segments:
1. Retail-Enterprise
   - Typical job titles: VP Procurement, Chief Security Officer, Property Manager
   - Key pain points: cost, vendor lock-in, integration, compliance
   - Industries: retail chains, consumer goods, property management

2. Healthcare-Enterprise
   - Typical job titles: CIO, CTO, Director IT Security, CCSFP
   - Key pain points: HIPAA compliance, patient data security, interoperability
   - Industries: hospitals, healthcare systems

3. Manufacturing-Enterprise
   - Typical job titles: COO, VP Safety, Head of Logistics
   - Key pain points: downtime, workforce compliance, legacy system upgrades
   - Industries: factories, industrial facilities

4. Financial-SME
   - Typical job titles: Head of Security, Executive Protection Leader, Lawyer, Attorney, Partner (Law Firm)
   - Key pain points: integration, vendor support, budget
   - Industries: finance, small firms, family offices, law firms / legal practices

5. Film-Entertainment
   - Typical job titles: Studio Exec, Producer, Head of Distribution
   - Key pain points: IP leakage, production delays, talent management
   - Industries: studios, entertainment, media

Rules:
- Look for **industry keywords** and **job titles** first (highest priority).
- Use **pain points only if industry clues are missing**.
- If no match is clear, output "General".

**Response format:** Output only the segment name (e.g., Retail-Enterprise).
"""

    user_prompt = f"""Call Transcript:
    {transcript_text}
    """

    client = OpenAI(api_key=api_key)
    response = client.responses.create(
        model="gpt-5-mini",
        instructions=system_classification_prompt,
        input=user_prompt,
    )
    matched_segment = response.output_text
    
    print(f"LLM classified recipient profile as: {matched_segment}")

    # Step B: Create the relationship in the graph and increment the counter
    if matched_segment != "General":
        link_query = """
        MATCH (p:Person {name: $recipient_name})
        MATCH (ic:IdealTargetCustomer {segment: $segment})
        MERGE (p)-[:MATCHES_PROFILE]->(ic)
        SET ic.completed_call_count = coalesce(ic.completed_call_count, 0) + 1
        """
        graph.query(link_query, params={'recipient_name': recipient_name, 'segment': matched_segment})
        print(f"Linked '{recipient_name}' to '{matched_segment}' and incremented counter.")
        
    return matched_segment


# ==============================================================================
# FINAL MAIN WORKFLOW FUNCTION - UPDATED WITH YOUR IDEA
# ==============================================================================
def process_single_transcript(graph, api_key, raw_transcript_text, session_id,path):
    
    # STEP 1: EXTRACT (LLM Call 1 - The Heavy Lifter)
    try:
        llm_output = dialogue_flow_ner(api_key, raw_transcript_text)
    except:
        print(f"ERROR: Could not generate file. Aborting...")
        return None
    
    # Inject the correct sequential ID
    data = json.loads(llm_output)
    data['call_session']['session_id'] = f"call_transcript_{session_id}"
    llm_output_json_with_id = json.dumps(data)

    # Save raw LLM output to a file for auditing
    with open(f"{path}/call_transcript_{session_id}_raw.json", "w") as f:
        f.write(llm_output_json_with_id)
    print(f"[call_transcript_{session_id}] Raw LLM output saved.")

    # STEP 2(A): VALIDATE the initial structure
    try:
        validated_data = DialogueGraphData.parse_raw(llm_output_json_with_id)
        print(f"[{validated_data.call_session.session_id}] Initial Pydantic Validation SUCCESSFUL.")
    except ValidationError as e:
        print(f"[{validated_data.call_session.session_id}] VALIDATION FAILED: {e}")
        return
    
    # STEP 2(B): NORMALIZE SPEAKER NAMES
    participant_names = {p.name.lower().strip(): p.name for p in validated_data.participants}
    for turn in validated_data.dialogue_turns:
        norm = turn.speaker_name.lower().strip()
        if norm in participant_names:
            turn.speaker_name = participant_names[norm]
        else:
            print(f"WARNING: Speaker '{turn.speaker_name}' not found in participants "
                  f"for session '{validated_data.call_session.session_id}' "
                  f"(turn {turn.turn_number})")

    # STEP 3: **NEW** - CLASSIFY and ENRICH the validated data object
    recipient = next((p for p in validated_data.participants if p.role == "Recipient"), None)
    if recipient:
        # This function returns the name of the matched segment (e.g., "Healthcare-Enterprise")
        matched_segment = classify_and_link_icp(graph, api_key, raw_transcript_text, recipient.name)
        
        # We add the result directly to our data object.
        validated_data.call_session.matched_icp_segment = matched_segment
        print(f"[{validated_data.call_session.session_id}] Data enriched with ICP segment: '{matched_segment}'")

    # Save enriched data to a file for auditing
    with open(f"{path}/call_transcript_{session_id}_enriched.json", "w") as f:
        f.write(validated_data.model_dump_json(indent=2))
    print(f"[{validated_data.call_session.session_id}] Enriched data saved.")

    # STEP 4: SCORE the now-enriched, clean data
    quality_report = score_dialogue_extraction(validated_data)
    print(f"[{validated_data.call_session.session_id}] Quality Score Calculated: {quality_report['final_score']}")

    # Save quality report to a file for auditing
    with open(f"{path}/call_transcript_{session_id}_quality_report.json", "w") as f:
        json.dump(quality_report, f, indent=2)
    print(f"[{validated_data.call_session.session_id}] Quality report saved.")

    # STEP 5: INGEST the final, enriched data
    ingest_dialogue_flow(graph, validated_data, quality_report)
    
    print(f"[{validated_data.call_session.session_id}] Full processing complete.")
    
    # Return the final, complete data object for inspection
    return validated_data

In [103]:
# ==============================================================================
# MAIN EXECUTION BLOCK
# ==============================================================================

# --- Connection Details ---
load_dotenv(override=True)

# Read environment variables
NEO4J_URL = os.getenv("NEO4J_URL")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORDD = os.getenv("NEO4J_PASSWORDD")
NEO4J_DATABASE1 = os.getenv("NEO4J_DATABASE1")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

print(NEO4J_USERNAME, NEO4J_PASSWORDD, NEO4J_URL, NEO4J_DATABASE1, OPENAI_API_KEY)

neo4j speartg123 bolt://127.0.0.1:7687 neo4j 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


In [104]:
# Initialize Neo4j connection
try:
    graph = Neo4jGraph(
        url=NEO4J_URL,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORDD,
        database=NEO4J_DATABASE1
    )
    print("Successfully connected to Neo4j.")
except Exception as e:
    print(f"Failed to connect to Neo4j: {e}")
    exit()

Successfully connected to Neo4j.


In [106]:
print("Clearing existing graph for a fresh start...")
graph.query("MATCH (n) DETACH DELETE n")

Clearing existing graph for a fresh start...


[]

In [107]:
graph.refresh_schema()
print(graph.schema)

Node properties:

Relationship properties:

The relationships:



In [108]:
icp_query = """CREATE (ideal1:IdealTargetCustomer {
  segment: "Retail-Enterprise",
  age_range: "35-50",
  location: "Major US Urban Centers",
  job_titles: ["VP Procurement", "Chief Security Officer"],
  company_size: "500+ employees",
  purchase_frequency: "High",
  preferences: ["compliance", "risk reduction", "scalability"],
  top_pain_points: ["cost", "vendor lock-in", "integration"],
  objection_templates: '[
    { type: "cost", response: "ROI-focused pricing plans" },
    { type: "integration", response: "White-glove onboarding and support" }
  ]',
  affinity_language_style: "consultative",
  meeting_conversion_rate: 0.82,
  personal_interests: '[
    { label: "golf", url: "https://www.pgatour.com/" },
    { label: "modern art", url: "https://www.tate.org.uk/art" }
  ]',
  professional_interests: '[
    { label: "physical security", url: "https://www.asisonline.org/" },
    { label: "supply chain digitalization", url: "https://www.supplychaindive.com/" }
  ]',
  linkedin_url: "https://www.linkedin.com/in/brigitte-broyard-8966bb6/",
  twitter_url: "",
  x_url: "",
  facebook_url: "",
  meta_url: "",
  blog_url: "",
  reddit_url: "",
  substack_url: "",
  completed_call_count: 0
}),
(ideal2:IdealTargetCustomer {
  segment: "Healthcare-Enterprise",
  age_range: "40-60",
  location: "Nationwide urban & suburban hospitals",
  job_titles: [
  "CIO",
  "CTO",
  "Director IT Security",
  "CISA",
  "CCSFP",
  "Security Governance, Risk, and Compliance Initiatives Program Manager"],
  company_size: "1000+ beds",
  purchase_frequency: "Medium",
  preferences: ["patient data security", "automation", "regulatory alignment"],
  top_pain_points: ["HIPAA compliance", "system interoperability", "reputation risk"],
  objection_templates: '[
    { type: "compliance", response: "Regular third-party audits" },
    { type: "system upgrade", response: "Custom integrations & 24/7 support" }
  ]',
  affinity_language_style: "authoritative, data-driven",
  meeting_conversion_rate: 0.76,
  personal_interests: '[
    { label: "triathlon", url: "https://www.triathlete.com/" },
    { label: "classical piano", url: "https://www.classicfm.com/discover-music/periods-genres/classical/" }
  ]',
  professional_interests: '[
    { label: "HIPAA best practices", url: "https://www.hhs.gov/hipaa/index.html" },
    { label: "medical device cybersecurity", url: "https://www.mddionline.com/cybersecurity" }
  ]',
  linkedin_url: "https://www.linkedin.com/in/veronica-orlando-cisa-ccsfp-81884b4/",
  twitter_url: "",
  x_url: "",
  facebook_url: "",
  meta_url: "",
  blog_url: "",
  reddit_url: "",
  substack_url: "/",
  completed_call_count: 0
}),
(ideal3:IdealTargetCustomer {
  segment: "Financial-SME",
  age_range: "28-45",
  location: "Global financial centers",
  job_titles: ["Head of Security", "Director, Global Protective Services", "Head of EP”, “Executive Protection Leader"],
  company_size: "50-300 employees",
  purchase_frequency: "Low",
  preferences: ["integration", "cloud security", "cost efficiency"],
  top_pain_points: ["integration", "vendor support", "budget"],
  objection_templates: '[
    { type: "integration", response: "private security solutions" },
    { type: "cost", response: "Flexible subscription tiers" }
  ]',
  affinity_language_style: "technical",
  meeting_conversion_rate: 0.68,
  personal_interests: '[
    { label: "mountain biking", url: "https://www.singletracks.com/" },
    { label: "equestrian", url: "https://www.usef.org/media/equestrian-magazine" }
  ]',
  professional_interests: '[
    { label: "family office", url: "https://andsimple.co/family-office-events/" },
    { label: "cloud security", url: "https://www.cloudsecurityalliance.org/" }
  ]',
  linkedin_url: "https://www.linkedin.com/in/carl-agnelli-211998105/",
  twitter_url: "",
  x_url: "",
  facebook_url: "",
  meta_url: "",
  blog_url: "",
  reddit_url: "",
  substack_url: "",
  completed_call_count: 0
}),
(ideal4:IdealTargetCustomer {
  segment: "Manufacturing-Enterprise",
  age_range: "38-58",
  location: "Industrial hubs, Midwest & Asia-Pacific",
  job_titles: ["COO", "Head of Logistics", "VP Safety"],
  company_size: "1000-5000 employees",
  purchase_frequency: "Medium",
  preferences: ["efficiency", "workplace safety", "IoT integration"],
  top_pain_points: ["downtime", "workforce compliance", "legacy system upgrades"],
  objection_templates: '[
    { type: "downtime", response: "Rapid deployment, minimal workflow impact" },
    { type: "safety", response: "Certified protocols, OSH training" }
  ]',
  affinity_language_style: "pragmatic, operational",
  meeting_conversion_rate: 0.75,
  personal_interests: '[
    { label: "classic cars", url: "https://www.hemmings.com/" },
    { label: "Brazilian Jiu Jitsu", url: "https://www.nabjjf.com/belt-system" }
  ]',
  professional_interests: '[
    { label: "lean manufacturing", url: "https://www.lean.org/" },
    { label: "industrial IoT", url: "https://www.iiconsortium.org/" }
  ]',
  linkedin_url: "https://www.linkedin.com/in/peter-reyher/",
  twitter_url: "",
  x_url: "",
  facebook_url: "",
  meta_url: "",
  blog_url: "",
  reddit_url: "",
  substack_url: "",
  completed_call_count: 0
}),
(ideal5:IdealTargetCustomer {
  segment: "Film-Entertainment",
  age_range: "28-55",
  location: "Los Angeles, New York, London, Atlanta",
  job_titles: [
    "Studio Executive",
    "Executive Producer",
    "Film Finance Director",
    "Head of Distribution",
    "Head of Production",
    "VP Content Acquisition"
  ],
  company_size: "50-10,000 employees",
  purchase_frequency: "Project-based/Recurring for major launches",
  preferences: [
    "creative talent access",
    "rights management",
    "distributed/remote collaboration tools",
    "content security",
    "analytics for audience targeting"
  ],
  top_pain_points: [
    "intellectual property leakage",
    "production delays",
    "streaming/OTT competition",
    "regulatory compliance",
    "talent management"
  ],
  objection_templates: '[
    { type: "cost", response: "Flexible licensing and package deals for multi-project studios" },
    { type: "integration", response: "Platform APIs connect directly to industry-standard tools (Avid, Adobe, Final Draft)" },
    { type: "security", response: "MPAA-compliant encryption and traceable watermarking" },
    { type: "resistance to change", response: "White-glove onboarding with industry consultants" }
  ]',
  affinity_language_style: "creative, credible, industry-savvy",
  meeting_conversion_rate: 0.74,
  personal_interests: '[
    { label: "art house cinema", url: "https://www.indiewire.com/" },
    { label: "live music", url: "https://www.billboard.com/" },
    { label: "international travel", url: "https://www.cntraveler.com/" }
  ]',
  professional_interests: '[
    { label: "film distribution trends", url: "https://variety.com/v/film/" },
    { label: "rights management", url: "https://www.wipo.int/" },
    { label: "digital post-production", url: "https://www.hollywoodreporter.com/tech/digital/" }
  ]',
  linkedin_url: "https://www.linkedin.com/in/brenda-hope-lindo-a8a93165//",
  twitter_url: "",
  x_url: "",
  facebook_url: "",
  meta_url: "",
  blog_url: "",
  reddit_url: "",
  substack_url: "",
  completed_call_count: 0
}),
(general:IdealTargetCustomer {
  segment: "General",
  description: "Fallback segment for unmatched or ambiguous calls",
  completed_call_count: 0
});"""

print("Creating ICP Profiles for a fresh start...")
graph.query(icp_query)

Creating ICP Profiles for a fresh start...


[]

In [109]:
get_next_session_id(graph) 

1

In [110]:

# Get the starting ID for this batch
current_session_id = get_next_session_id(graph) 

print(f"\n--- Processing Transcript # {current_session_id} ---")

with open(f"/home/GraphRAG/call transcripts/call {current_session_id}.txt", "r") as f:
    transcript_text = f.read()

# create seprate path to save outputs
path = f"/home/GraphRAG/call outputs/call_{current_session_id}"
if not os.path.exists(path):
    os.makedirs(path)

process_single_transcript(graph, OPENAI_API_KEY, transcript_text, current_session_id, path)



--- Processing Transcript # 1 ---


[call_transcript_1] Raw LLM output saved.
[call_transcript_1] Initial Pydantic Validation SUCCESSFUL.


/tmp/ipykernel_435710/20923151.py:509: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_data = DialogueGraphData.parse_raw(llm_output_json_with_id)


LLM classified recipient profile as: Retail-Enterprise
Linked 'Dale Spear' to 'Retail-Enterprise' and incremented counter.
[call_transcript_1] Data enriched with ICP segment: 'Retail-Enterprise'
[call_transcript_1] Enriched data saved.
[call_transcript_1] Quality Score Calculated: 83
[call_transcript_1] Quality report saved.
Ingested CallSession call_transcript_1 with score 83


/tmp/ipykernel_435710/20923151.py:162: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'session_details': validated_data.call_session.dict(),
/tmp/ipykernel_435710/20923151.py:180: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'participants': [p.dict() for p in validated_data.participants]


Built dialogue chain for CallSession call_transcript_1
[call_transcript_1] Full processing complete.


DialogueGraphData(call_session=CallSession(session_id='call_transcript_1', outcome='Meeting Scheduled', product_focus='Georgia Senate Bill 68 compliance solution', matched_icp_segment='Retail-Enterprise'), participants=[Participant(name='Arison', role='Agent', organization='NLA Investigative Division'), Participant(name='Unnamed Gatekeeper', role='Gatekeeper', organization='Not Provided'), Participant(name='Dale Spear', role='Recipient', organization='Not Provided')], dialogue_turns=[DialogueTurn(turn_number=1, speaker_name='Arison', text='Hi. Is compliance available?', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=2, speaker_name='Unnamed Gatekeeper', text='Is not available.', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=3, speaker_name='Arison', text='Thank you for letting me know. Could you please connect me with someone who can assist with compliance regarding Georgia Senate bill 8?', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=4, speaker_n

In [111]:
get_next_session_id(graph)

2

In [112]:

# Get the starting ID for this batch
current_session_id = get_next_session_id(graph) 

print(f"\n--- Processing Transcript # {current_session_id} ---")

with open(f"/home/GraphRAG/call transcripts/call {current_session_id}.txt", "r") as f:
    transcript_text = f.read()

# create seprate path to save outputs
path = f"/home/GraphRAG/call outputs/call_{current_session_id}"
if not os.path.exists(path):
    os.makedirs(path)

process_single_transcript(graph, OPENAI_API_KEY, transcript_text, current_session_id, path)



--- Processing Transcript # 2 ---
[call_transcript_2] Raw LLM output saved.
[call_transcript_2] Initial Pydantic Validation SUCCESSFUL.


/tmp/ipykernel_435710/20923151.py:509: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_data = DialogueGraphData.parse_raw(llm_output_json_with_id)


LLM classified recipient profile as: Retail-Enterprise
Linked 'Eric Spear' to 'Retail-Enterprise' and incremented counter.
[call_transcript_2] Data enriched with ICP segment: 'Retail-Enterprise'
[call_transcript_2] Enriched data saved.
[call_transcript_2] Quality Score Calculated: 101
[call_transcript_2] Quality report saved.
Ingested CallSession call_transcript_2 with score 101


/tmp/ipykernel_435710/20923151.py:162: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'session_details': validated_data.call_session.dict(),
/tmp/ipykernel_435710/20923151.py:180: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'participants': [p.dict() for p in validated_data.participants]


Built dialogue chain for CallSession call_transcript_2
[call_transcript_2] Full processing complete.


DialogueGraphData(call_session=CallSession(session_id='call_transcript_2', outcome='Meeting Scheduled', product_focus='Georgia Senate Bill 68 compliance solution', matched_icp_segment='Retail-Enterprise'), participants=[Participant(name='Arison', role='Agent', organization='NLA Investigative Division'), Participant(name='Unnamed Gatekeeper', role='Gatekeeper', organization='Not Provided'), Participant(name='Eric Spear', role='Recipient', organization='Not Provided')], dialogue_turns=[DialogueTurn(turn_number=1, speaker_name='Arison', text='Hi, is compliance available?', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=2, speaker_name='Unnamed Gatekeeper', text="Uh, no. Compliance isn't in today. Who's calling?", turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=3, speaker_name='Arison', text="It's Arison. Thank you for asking.", turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=4, speaker_name='Unnamed Gatekeeper', text='And who are you calling with?', turn_

In [113]:
get_next_session_id(graph)

3

In [114]:

# Get the starting ID for this batch
current_session_id = get_next_session_id(graph) 

print(f"\n--- Processing Transcript # {current_session_id} ---")

with open(f"/home/GraphRAG/call transcripts/call {current_session_id}.txt", "r") as f:
    transcript_text = f.read()

# create seprate path to save outputs
path = f"/home/GraphRAG/call outputs/call_{current_session_id}"
if not os.path.exists(path):
    os.makedirs(path)

process_single_transcript(graph, OPENAI_API_KEY, transcript_text, current_session_id, path)



--- Processing Transcript # 3 ---
[call_transcript_3] Raw LLM output saved.
[call_transcript_3] Initial Pydantic Validation SUCCESSFUL.


/tmp/ipykernel_435710/20923151.py:509: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_data = DialogueGraphData.parse_raw(llm_output_json_with_id)
/tmp/ipykernel_435710/20923151.py:162: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'session_details': validated_data.call_session.dict(),
/tmp/ipykernel_435710/20923151.py:180: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'participants': [p

LLM classified recipient profile as: Retail-Enterprise
Linked 'Dale Spear' to 'Retail-Enterprise' and incremented counter.
[call_transcript_3] Data enriched with ICP segment: 'Retail-Enterprise'
[call_transcript_3] Enriched data saved.
[call_transcript_3] Quality Score Calculated: 75
[call_transcript_3] Quality report saved.
Ingested CallSession call_transcript_3 with score 75




Built dialogue chain for CallSession call_transcript_3
[call_transcript_3] Full processing complete.


DialogueGraphData(call_session=CallSession(session_id='call_transcript_3', outcome='Meeting Scheduled', product_focus='Georgia Senate Bill 68 compliance solution', matched_icp_segment='Retail-Enterprise'), participants=[Participant(name='Arison', role='Agent', organization='NLA Investigative Division'), Participant(name='Unnamed Gatekeeper', role='Gatekeeper', organization='Not Provided'), Participant(name='Dale Spear', role='Recipient', organization='Not Provided')], dialogue_turns=[DialogueTurn(turn_number=1, speaker_name='Arison', text='Hi. Is compliance available?', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=2, speaker_name='Unnamed Gatekeeper', text='Is not available.', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=3, speaker_name='Arison', text='Thank you for letting me know. Could you please connect me with someone who can assist with compliance regarding Georgia Senate bill 8?', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=4, speaker_n

In [115]:
get_next_session_id(graph)

4

In [116]:

# Get the starting ID for this batch
current_session_id = get_next_session_id(graph) 

print(f"\n--- Processing Transcript # {current_session_id} ---")

with open(f"/home/GraphRAG/call transcripts/call {current_session_id}.txt", "r") as f:
    transcript_text = f.read()

# create seprate path to save outputs
path = f"/home/GraphRAG/call outputs/call_{current_session_id}"
if not os.path.exists(path):
    os.makedirs(path)

process_single_transcript(graph, OPENAI_API_KEY, transcript_text, current_session_id, path)



--- Processing Transcript # 4 ---
[call_transcript_4] Raw LLM output saved.
[call_transcript_4] Initial Pydantic Validation SUCCESSFUL.


/tmp/ipykernel_435710/20923151.py:509: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_data = DialogueGraphData.parse_raw(llm_output_json_with_id)
/tmp/ipykernel_435710/20923151.py:162: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'session_details': validated_data.call_session.dict(),
/tmp/ipykernel_435710/20923151.py:180: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  'participants': [p

LLM classified recipient profile as: Retail-Enterprise
Linked 'Ted' to 'Retail-Enterprise' and incremented counter.
[call_transcript_4] Data enriched with ICP segment: 'Retail-Enterprise'
[call_transcript_4] Enriched data saved.
[call_transcript_4] Quality Score Calculated: 62
[call_transcript_4] Quality report saved.
Ingested CallSession call_transcript_4 with score 62




Built dialogue chain for CallSession call_transcript_4
[call_transcript_4] Full processing complete.


DialogueGraphData(call_session=CallSession(session_id='call_transcript_4', outcome='Meeting Scheduled', product_focus='Georgia Senate Bill 68 and 69 compliance solution', matched_icp_segment='Retail-Enterprise'), participants=[Participant(name='Arison', role='Agent', organization='NLA Investigative Division'), Participant(name='Ted', role='Recipient', organization='Not Provided')], dialogue_turns=[DialogueTurn(turn_number=1, speaker_name='Arison', text='Hi, is compliance available?', turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=2, speaker_name='Ted', text="Hey. Uh, who's calling?", turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=3, speaker_name='Arison', text="It's Arison. Thank you for asking.", turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=4, speaker_name='Ted', text="Uh, you're welcome. Who'd you guys call them with?", turn_type='Gatekeeper_Dialogue'), DialogueTurn(turn_number=5, speaker_name='Arison', text='NLA investigative division. Thank you