In [19]:
import py_files.llm_pipeline.twcs_processor as processor
import pandas as pd
import json
from Notebooks.VectorDBStructure.query import query_similar
from py_files.llm_pipeline.reranker import CrossEncoderReranker
from sklearn.preprocessing import StandardScaler
from typing import List, Dict

In [None]:
import openai
from dotenv import load_dotenv
load_dotenv()
import os

from py_files.CONFIG import ENDBOT_PROMPT

# Read the API key from the process environment (load_dotenv() has already run above).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Create the OpenAI client used for the final chat-completion request.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [21]:
# Example customer message that drives the rest of the notebook.
# NOTE(review): the cleaned text displayed further down reads "twiceVX6...", so
# the em dash appears to be dropped without leaving a space between the two
# words — confirm that the cleaning step is meant to behave this way.
user_input = "I accidentally booked the same flight twice—VX666 and VX667. Please refund one."

In [22]:
# Clean the raw message, then convert the cleaned text into the structured
# conversation form stored in the `structured_conversations` column below.
# NOTE(review): this rebinds `user_input` to its cleaned version, so the raw
# message is no longer available and re-running the cell feeds already-cleaned
# text back into the cleaner. Later cells (reranking, final payload) read the
# cleaned value.
# NOTE(review): both helpers are underscore-prefixed and are called on the
# class itself — presumably static/class methods; confirm against TWCSProcessor.
user_input = processor.TWCSProcessor._clean_single(user_input)
user_input_processed = processor.TWCSProcessor._convert_to_conversation(user_input)

In [23]:
# Wrap the cleaned text and its structured form in a one-row DataFrame,
# one column per representation.
user_input_df = pd.DataFrame({
    'cleaned_conversation': [user_input],
    'structured_conversations': [user_input_processed],
})

In [24]:
# Display the one-row input frame (cleaned text + structured conversation).
user_input_df

Unnamed: 0,cleaned_conversation,structured_conversations
0,I accidentally booked the same flight twiceVX6...,"{'conversation': [{'role': 'Customer', 'messag..."


In [25]:
from py_files.llm_pipeline.llm_extractor import LLMExtractor
# Build the extractor over the one-row input frame. The three steps below are
# called in order on the same `pipe` object.
# NOTE(review): presumably each step builds on state left by the previous one
# inside `pipe` — confirm against LLMExtractor before reordering.
pipe = LLMExtractor(dataframe = user_input_df)

# Step 1: extract only products / issue-types / services
df1 = pipe.extract_entities()

# Step 2: pack them into a single JSON field
df2 = pipe.process_entities_json()

# Step 3: create RDF triples
# NOTE(review): only `df3` is read by later visible cells (columns `entities`,
# `relationship`, `cleaned_conversation`); `df1` and `df2` are not referenced
# again.
df3 = pipe.extract_relationships()

2025-06-16 16:20:49,610 [INFO] LLMExtractor: Loaded data – 1 rows
2025-06-16 16:20:49,611 [INFO] LLMExtractor: STEP 1 – Extracting issue‑types, products, services
LLM steps:   0%|          | 0/1 [00:00<?, ?it/s]2025-06-16 16:20:50,352 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM steps: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
LLM steps:   0%|          | 0/1 [00:00<?, ?it/s]2025-06-16 16:20:51,269 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM steps: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
LLM steps:   0%|          | 0/1 [00:00<?, ?it/s]2025-06-16 16:20:51,646 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM steps: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]
2025-06-16 16:20:51,649 [INFO] LLMExtractor: STEP 2 – Packing entities into single JSON field
LLM steps: 100%|██████████| 1/1 [00:00<?, ?it/s]
2025-06-16 16:20:51,651 [

In [26]:
from Notebooks.VectorDBStructure.db_structure import DatabaseStructure

# Helper object that provides relationship fixing and text-to-embedding
# conversion for the cells below.
db = DatabaseStructure()

# NOTE(review): `cleaned_conversations` (plural) is not read by any later
# visible cell and is easy to confuse with `cleaned_conversation` below.
cleaned_conversations = user_input_processed

# The extractor output holds a single row, so take the first element of each
# column of interest.
entities = df3['entities'].iloc[0]
relationship = df3['relationship'].iloc[0]
cleaned_conversation = df3['cleaned_conversation'].iloc[0]

fixed_relationships = db.fix_relationships(relationship)

2025-06-16 16:20:54,329 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cuda
2025-06-16 16:20:54,329 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [27]:
# Embed the cleaned text together with its entities and fixed relationships,
# then convert the result to a plain Python list for the similarity query.
embedding = db.text_to_embedding(
    cleaned_conversation,
    entities,
    fixed_relationships,
).tolist()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Number of nearest neighbours requested from Elastic. The reranker keeps the
# same number, so in the normal case every hit receives a cross-encoder score.
N_CANDIDATES = 50

# Step 1: Initialize the reranker
reranker = CrossEncoderReranker(top_k=N_CANDIDATES)

# Step 2: Query Elastic and get hits
hits = query_similar(embedding, k=N_CANDIDATES)

# Step 3: Extract conversation candidates
candidates = [hit["_source"]["Conversation_History"]["conversation"] for hit in hits]

# Step 4: Rerank with cross-encoder; the result is iterated as (conversation, score) pairs
reranked = reranker.rerank(user_input, candidates)

# Step 5: Create a mapping: conversation -> (score, rank)
score_rank_map = {
    conv: (score, rank)
    for rank, (conv, score) in enumerate(reranked, start=1)  # rank is 1-based
}

# Fallback score for hits the reranker did not return. Cross-encoder scores in
# this notebook are mostly negative, so a literal 0.0 would place an unscored
# hit above most scored ones; use the worst observed score instead.
worst_rerank_score = min((score for _, score in reranked), default=0.0)

# Step 6: Construct final rows with rank
rows = []
for hit in hits:
    src = hit["_source"]
    conv = src["Conversation_History"]["conversation"]
    # Not reranked if not in top_k: rank stays None and the score falls back
    # to the worst observed rerank score.
    score, rank = score_rank_map.get(conv, (worst_rerank_score, None))

    rows.append({
        "prompt": user_input,
        "id": hit["_id"],
        "similarity_score": hit["_score"],
        "rerank_score": score,
        "rerank_rank": rank,
        "ChatID": src["ChatID"],
        "Company_name": src["Company_name"],
        "Conversation_History": conv,
        # Entities / Relationships are stored as JSON text so they can be
        # compared and de-duplicated as plain strings later on.
        "Entities": json.dumps(src["Entities"]),
        "Relationships": json.dumps(src["Relationships"])
    })

# Step 7: Create DataFrame sorted by rerank_rank (unranked rows last)
reranked_qa = pd.DataFrame(rows).sort_values(by="rerank_rank", na_position="last").reset_index(drop=True)


2025-06-16 16:20:56,727 [INFO] elastic_transport.transport: GET http://localhost:9200/chat_embeddings/_mapping [status:200 duration:0.003s]
2025-06-16 16:20:56,743 [INFO] elastic_transport.transport: POST http://localhost:9200/chat_embeddings/_search [status:200 duration:0.015s]


In [29]:
# Standardise both score columns over the current candidate set so they are on
# a comparable scale before being mixed. Missing scores are treated as 0.
score_columns = ["similarity_score", "rerank_score"]
scaler = StandardScaler()
standardised_scores = scaler.fit_transform(reranked_qa[score_columns].fillna(0))
reranked_qa[["sim_norm", "rerank_norm"]] = standardised_scores

# Hybrid score: weight 0.7 on vector similarity, 0.3 on the rerank score.
reranked_qa["hybrid_score"] = 0.7 * reranked_qa["sim_norm"] + 0.3 * reranked_qa["rerank_norm"]

In [30]:
# Take the 10 best candidates by hybrid score, then keep at most 5 of them
# with distinct (Entities, Relationships) pairs. Both columns hold JSON text,
# so duplicates are detected by plain string equality; the best-scored row of
# each duplicate group is the one kept.
top_candidates = reranked_qa.sort_values(by="hybrid_score", ascending=False).head(10)

# drop_duplicates keeps the original dtypes and index labels, and still yields
# a frame with all columns when there are no candidates at all.
top5_df = (
    top_candidates
    .drop_duplicates(subset=["Entities", "Relationships"], keep="first")
    .head(5)
)

In [31]:
# Display the de-duplicated top candidates selected above.
top5_df

Unnamed: 0,prompt,id,similarity_score,rerank_score,rerank_rank,ChatID,Company_name,Conversation_History,Entities,Relationships,sim_norm,rerank_norm,hybrid_score
26,I accidentally booked the same flight twiceVX6...,MM3LeJcB1089BAnsM783,1.656683,-7.155315,27,1210,VirginAmerica,Customer Better hope I make it in time if not ...,"{""products"": [], ""services"": [], ""issue_types""...",[],3.490571,-0.225382,2.375785
11,I accidentally booked the same flight twiceVX6...,rs3LeJcB1089BAnsMr2C,1.622004,-5.17057,12,824,VirginAmerica,Customer Thank you for being amazing rebooking...,"{""products"": [""ticket""], ""services"": [], ""issu...",[],2.132549,0.75777,1.720116
10,I accidentally booked the same flight twiceVX6...,5s3LeJcB1089BAnsM8DO,1.614958,-4.878383,11,1648,VirginAmerica,Customer without a doubt the worse airline to ...,"{""products"": [], ""services"": [""VirginAmerica r...",[],1.856619,0.902506,1.570385
0,I accidentally booked the same flight twiceVX6...,bM3LeJcB1089BAnsM8DO,1.581204,0.448395,1,1526,VirginAmerica,Customer you guys are so incompetent and ridic...,"{""products"": [""VirginAmerica flight vx1960"", ""...",[],0.534825,3.541148,1.436722
5,I accidentally booked the same flight twiceVX6...,Fs3LeJcB1089BAnsMLui,1.6066,-4.61611,6,160,VirginAmerica,Customer cancelled flight entire plane full of...,"{""products"": [""flight"", ""Ticket attendants"", ""...",[],1.529319,1.032424,1.380251


In [32]:
def parse_conversation(text: str) -> List[Dict[str, str]]:
    """Split a plain-text transcript into a list of role-tagged messages.

    Each line of ``text`` may start with a speaker label, ``Customer`` or
    ``Company`` (matched case-insensitively, leading whitespace ignored),
    optionally followed by a colon. A label is only recognised when it is a
    whole word, so a line such as ``"Customers keep asking"`` is treated as
    message text rather than as a ``Customer`` label.

    Lines without a label are continuations and inherit the most recently seen
    speaker; before any label has been seen the speaker is ``"Customer"``.

    Args:
        text: Transcript with one message per line (``"\\n"``-separated).

    Returns:
        A list of ``{"role": ..., "message": ...}`` dicts in line order. Lines
        whose message is empty after trimming are omitted.
    """
    role_labels = ("Customer", "Company")
    parsed: List[Dict[str, str]] = []
    current_role = "Customer"  # speaker assumed until the first label appears

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        msg = line
        for label in role_labels:
            head, rest = line[:len(label)], line[len(label):]
            if head.lower() != label.lower():
                continue
            # Require a word boundary after the label: end of line, whitespace
            # or a colon. Otherwise the line merely starts with the same letters.
            if rest and not (rest[0].isspace() or rest[0] == ":"):
                continue
            # Remember the speaker even if the label stands alone on its line,
            # so that the following continuation lines are attributed to it.
            current_role = label
            msg = rest.lstrip()
            if msg.startswith(":"):
                msg = msg[1:]
            msg = msg.strip()
            break

        if msg:
            parsed.append({"role": current_role, "message": msg})
    return parsed



In [33]:
def _json_or_default(raw, default):
    """Decode ``raw`` as JSON text, returning ``default`` if it cannot be decoded."""
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # TypeError: ``raw`` is not str/bytes (e.g. None or NaN).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return default


def build_payload_per_qa(df_top5, query: str) -> str:
    """Serialise the query and the retrieved conversations into one JSON string.

    Args:
        df_top5: DataFrame with the columns ``Conversation_History``,
            ``Company_name``, ``Entities`` and ``Relationships``. One entry is
            emitted per row, in row order.
        query: The user query; surrounding whitespace is stripped.

    Returns:
        A pretty-printed JSON document (``indent=2``, non-ASCII characters kept
        as-is) of the form ``{"query": ..., "retrieved_answers": [...]}``.
        Each answer has the keys ``company_name``, ``conversation``,
        ``intents`` (decoded ``Entities``) and ``relationships``.

    Notes:
        * A string ``Conversation_History`` is used as-is when it decodes to a
          JSON list or object; any other string is parsed line by line with
          ``parse_conversation``. Non-string values are passed through.
        * ``Entities`` / ``Relationships`` that cannot be decoded fall back to
          ``{}`` and ``[]`` respectively instead of raising.
    """
    results = []
    for _, row in df_top5.iterrows():
        conv = row["Conversation_History"]

        if isinstance(conv, str):
            decoded = _json_or_default(conv, None)
            if isinstance(decoded, (list, dict)):
                conversation = decoded  # Already structured JSON
            else:
                # Plain transcript text, or JSON that decodes to a bare scalar
                # such as "123" or "null": treat it as transcript text.
                conversation = parse_conversation(conv)
        else:
            conversation = conv

        results.append({
            "company_name": row["Company_name"],
            "conversation": conversation,
            "intents": _json_or_default(row["Entities"], {}),
            "relationships": _json_or_default(row["Relationships"], []),
        })

    full_payload = {
        "query": query.strip(),
        "retrieved_answers": results
    }

    return json.dumps(full_payload, ensure_ascii=False, indent=2)


In [34]:
# Order the selected rows by hybrid score (best first, at most five) and
# serialise them together with the cleaned query into the JSON payload.
top5_ranked = top5_df.sort_values(by="hybrid_score", ascending=False).head(5)
payload = build_payload_per_qa(df_top5=top5_ranked, query=user_input)

In [36]:
# The system turn carries the bot instructions; the user turn is the JSON
# payload holding the query and the retrieved conversations.
chat_messages = [
    {"role": "system", "content": ENDBOT_PROMPT},
    {"role": "user", "content": payload},
]
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=chat_messages,
    temperature=0,
    top_p=0.95,
)
# Last expression of the cell: show the generated reply text.
response.choices[0].message.content

2025-06-16 16:20:58,167 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


"I'm sorry to hear about your situation; please contact our reservations team at 1-877-359-8474 for assistance with your refund request."