In [1]:
import Py_files.twcs_processor as processor
import pandas as pd
import json
from Notebooks.VectorDBStructure.query import query_similar
from Py_files.reranker import CrossEncoderReranker
from sklearn.preprocessing import StandardScaler
from typing import List, Dict

In [2]:
# Raw customer query that drives the rest of the notebook.
# It is reassigned to its cleaned form in the next cell.
user_input = "My Echo keeps playing the same song over and over, how do I fix recommendations?"

In [3]:
# Clean the raw query, then wrap the cleaned text in the conversation structure.
# NOTE(review): `user_input` is overwritten with its cleaned form, so re-running
# this cell feeds already-cleaned text back into `_clean_single`. Later cells
# (reranking, the "prompt" column) read the cleaned string, not the raw one.
# NOTE(review): both helpers are underscore-prefixed and are called on the class
# rather than on an instance — presumably static methods; confirm in twcs_processor.
user_input = processor.TWCSProcessor._clean_single(user_input)
user_input_processed = processor.TWCSProcessor._convert_to_conversation(user_input)

In [4]:
# Wrap the processed query in a one-row DataFrame whose only column is
# `cleaned_conversations`; this frame is what LLMExtractor receives below.
user_input_df = pd.DataFrame({"cleaned_conversations": [user_input_processed]})

In [5]:
from Py_files.llm_extractor import LLMExtractor
# Build the extractor over the one-row query frame. The four steps below are all
# invoked on this same `pipe` object, in this order.
# NOTE(review): presumably each step builds on columns produced by the previous
# one, so the order matters — confirm in llm_extractor.
# The logged output shows HTTP requests to the OpenAI chat-completions endpoint,
# so this cell needs network access.
pipe = LLMExtractor(dataframe = user_input_df)

# Step 1: extract only products / issue-types / services
df1 = pipe.extract_entities()

# Step 2: pack them into a single JSON field
df2 = pipe.process_entities_json()

# Step 3: create RDF triples
df3 = pipe.extract_relationships()

# Step 4: produce suggested resolutions
# Only `df4` is read by the later cells visible in this notebook.
df4 = pipe.extract_resolution()

2025-05-12 22:54:18,301 [INFO] LLMExtractor: Loaded data – 1 rows
2025-05-12 22:54:18,301 [INFO] LLMExtractor: STEP 1 – Extracting issue-types, products, services
LLM steps:   0%|          | 0/1 [00:00<?, ?it/s]2025-05-12 22:54:19,689 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM steps: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]
LLM steps:   0%|          | 0/1 [00:00<?, ?it/s]2025-05-12 22:54:20,376 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM steps: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]
LLM steps:   0%|          | 0/1 [00:00<?, ?it/s]2025-05-12 22:54:21,046 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM steps: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]
2025-05-12 22:54:21,048 [INFO] LLMExtractor: STEP 2 – Packing entities into single JSON field
LLM steps: 100%|██████████| 1/1 [00:00<00:00, 1000.55it/s]
2025-05-12 22:5

In [6]:
# Display the extractor's final frame (a single row, one per input query).
df4

Unnamed: 0,cleaned_conversations,Issue Type,Product,Services,entities,relationship,resolution
0,"{'conversation': [{'role': 'Customer', 'messag...","{\n ""Company_name"": ""company_name_here"",\n ""...","{\n ""Company_name"": ""company_name_here"",\n ""...","{\n ""Company_name"": ""company_name_here"",\n ""...","{""products"": [""Echo""], ""services"": [], ""issue_...","[\n {""subject"": ""Echo"", ""predicate"": ""hasIs...","Based on the provided conversation, the custom..."


In [7]:
from Notebooks.VectorDBStructure.db_structure import DatabaseStructure

# Helper object used below to fix up relationships and to build the query embedding.
db = DatabaseStructure()

# Pull the extracted fields for the single query row (row 0) out of df4.
cleaned_conversations = user_input_processed  # NOTE(review): not referenced again in the cells visible here
entities = df4['entities'].values[0]
resolution = df4['resolution'].values[0]
relationship = df4['relationship'].values[0]

# NOTE(review): what `fix_relationships` does with the resolution text is not
# visible from this notebook — confirm in db_structure before relying on it.
fixed_relationships = db.fix_relationships(relationship,resolution)

2025-05-12 22:54:29,068 [INFO] datasets: PyTorch version 2.1.1+cu121 available.
2025-05-12 22:54:29,070 [INFO] datasets: TensorFlow version 2.12.1 available.
2025-05-12 22:54:29,182 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cuda
2025-05-12 22:54:29,182 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [8]:
# Embed the query's entities together with its fixed relationships.
# `.tolist()` turns the result (presumably a NumPy array or tensor — confirm) into
# a plain Python list before it is handed to `query_similar`.
embedding = db.text_to_embedding(entities, fixed_relationships).tolist()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Number of nearest neighbours fetched from Elastic AND kept by the reranker.
# Using one constant keeps the two in sync, so every hit normally gets a rerank score.
N_CANDIDATES = 50

# Column order of the result frame; also guarantees the columns exist when
# there are no hits, so the sort below cannot fail with a KeyError.
RESULT_COLUMNS = [
    "prompt",
    "id",
    "similarity_score",
    "rerank_score",
    "rerank_rank",
    "ChatID",
    "Company_name",
    "Conversation_History",
    "Entities",
    "Relationships",
]

# Step 1: Initialize the reranker
reranker = CrossEncoderReranker(top_k=N_CANDIDATES)

# Step 2: Query Elastic and get hits
hits = query_similar(embedding, k=N_CANDIDATES)

# Step 3: Extract conversation candidates
candidates = [hit["_source"]["Conversation_History"]["conversation"] for hit in hits]

# Step 4: Rerank with cross-encoder (uses the cleaned query text)
reranked = reranker.rerank(user_input, candidates)

# Step 5: Create a mapping: conversation -> (score, rank)
score_rank_map = {
    conv: (score, rank + 1)  # rank is 1-based
    for rank, (conv, score) in enumerate(reranked)
}

# Hits the reranker did not return must never outrank reranked ones.
# Cross-encoder scores can be negative (see the displayed results), so a 0.0
# default would put un-reranked hits on top; use the worst observed score instead.
fallback_score = min((score for score, _ in score_rank_map.values()), default=0.0)

# Step 6: Construct final rows with rank
rows = []
for hit in hits:
    src = hit["_source"]
    conv = src["Conversation_History"]["conversation"]
    # rank stays None for hits that were not reranked (outside top_k)
    score, rank = score_rank_map.get(conv, (fallback_score, None))

    rows.append({
        "prompt": user_input,
        "id": hit["_id"],
        "similarity_score": hit["_score"],
        "rerank_score": score,
        "rerank_rank": rank,
        "ChatID": src["ChatID"],
        "Company_name": src["Company_name"],
        "Conversation_History": conv,
        "Entities": json.dumps(src["Entities"]),
        "Relationships": json.dumps(src["Relationships"])
    })

# Step 7: Create DataFrame sorted by rerank_rank (un-reranked hits last)
reranked_qa = (
    pd.DataFrame(rows, columns=RESULT_COLUMNS)
    .sort_values(by="rerank_rank", na_position="last")
    .reset_index(drop=True)
)


2025-05-12 22:54:32,096 [INFO] elastic_transport.transport: GET http://localhost:9200/chat_embeddings/_mapping [status:200 duration:0.050s]
2025-05-12 22:54:32,119 [INFO] elastic_transport.transport: POST http://localhost:9200/chat_embeddings/_search [status:200 duration:0.022s]


In [10]:
# Standardise the two raw scores so they are on a comparable scale before blending.
scaler = StandardScaler()
score_inputs = reranked_qa[["similarity_score", "rerank_score"]].fillna(0)
normalised_scores = scaler.fit_transform(score_inputs)
reranked_qa[["sim_norm", "rerank_norm"]] = normalised_scores

# Weighted blend: 70 % cross-encoder rerank score, 30 % vector similarity.
reranked_qa["hybrid_score"] = (
    0.7 * reranked_qa["rerank_norm"] + 0.3 * reranked_qa["sim_norm"]
)

In [11]:
# Consider only the ten best hits by hybrid score.
top_candidates = reranked_qa.sort_values(by="hybrid_score", ascending=False).head(10)

# Keep the first occurrence of each (Entities, Relationships) pair, stopping
# once five distinct rows have been collected.
seen_combinations = set()
filtered_rows = []

for _, candidate in top_candidates.iterrows():
    signature = (candidate["Entities"], candidate["Relationships"])
    if signature in seen_combinations:
        continue
    seen_combinations.add(signature)
    filtered_rows.append(candidate)
    if len(filtered_rows) == 5:
        break

top5_df = pd.DataFrame(filtered_rows)

In [12]:
# Display the de-duplicated top hits (at most five rows).
top5_df

Unnamed: 0,prompt,id,similarity_score,rerank_score,rerank_rank,ChatID,Company_name,Conversation_History,Entities,Relationships,sim_norm,rerank_norm,hybrid_score
0,My Echo keeps playing the same song over and o...,S6hsxZYBEfCvEamxqVZ5,1.819998,-2.16346,1,3482,AmazonHelp,Customer kids listening to disney soundtracks ...,"{""products"": [""Echo""], ""services"": [], ""issue_...",[],5.271612,5.231974,5.243865
1,My Echo keeps playing the same song over and o...,YKhsxZYBEfCvEamxpk62,1.597994,-5.203549,2,1455,AmazonHelp,Customer echo app is virtually useless Wont le...,"{""products"": [""echo app"", ""Apple Music"", ""Amaz...","[{""subject"": ""echo app"", ""predicate"": ""hasIssu...",0.065198,3.394398,2.395638
2,My Echo keeps playing the same song over and o...,BqhsxZYBEfCvEamxq1yR,1.661558,-8.449758,3,4949,AmazonHelp,Customer got my Echo Dot today and its so cool...,"{""products"": [""Echo Dot""], ""services"": [""Echo ...",[],1.555912,1.432233,1.469337
3,My Echo keeps playing the same song over and o...,c6hsxZYBEfCvEamxq1kG,1.655916,-8.606674,4,4290,SpotifyCares,Customer Hi Spotify my saved songs list keeps ...,"{""products"": [], ""services"": [], ""issue_types""...",[],1.42358,1.337386,1.363244
4,My Echo keeps playing the same song over and o...,hqhsxZYBEfCvEamxqFEb,1.596483,-8.814091,5,2261,SpotifyCares,Customer Oh when your stupid ads dont run and ...,"{""products"": [], ""services"": [], ""issue_types""...",[],0.02976,1.212013,0.857337


In [13]:
def parse_conversation(text: str) -> List[Dict[str, str]]:
    """Split a plain-text transcript into a list of role/message turns.

    Each line is expected to start with a speaker tag, ``Customer`` or
    ``Company`` (case-insensitive, leading whitespace allowed), followed by
    whitespace or a colon and then the message. A line without a speaker tag
    is attributed to the speaker of the previous kept turn, or to ``Customer``
    when it is the first turn.

    The tag must be a whole word: ``"Customers are waiting"`` and
    ``"Companywide outage"`` are ordinary text, not speaker tags.

    Args:
        text: Transcript with one turn per ``"\\n"``-separated line.

    Returns:
        A list of ``{"role": ..., "message": ...}`` dicts in line order.
        Lines whose message is empty after stripping are skipped.
    """
    parsed: List[Dict[str, str]] = []
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue

        role = None
        msg = stripped
        for candidate in ("Customer", "Company"):
            head = stripped[:len(candidate)]
            rest = stripped[len(candidate):]
            if head.lower() != candidate.lower():
                continue
            # Require a word boundary after the tag so that words which merely
            # begin with it ("Customers", "Companywide") are not truncated.
            if rest and not (rest[0].isspace() or rest[0] == ":"):
                continue
            role = candidate
            # Drop a ":" that directly follows the tag ("Customer: hi").
            msg = rest[1:].strip() if rest[:1] == ":" else rest.strip()
            break

        if role is None:
            # fallback: continue the previous speaker, or Customer for the first turn
            role = parsed[-1]["role"] if parsed else "Customer"

        if msg:
            parsed.append({"role": role, "message": msg})
    return parsed



In [14]:
def _decode_json_field(value, default):
    """Return ``value`` decoded from JSON, or ``default`` if it cannot be decoded.

    Values that are already a dict or list are passed through unchanged;
    anything else that is not a string (None, NaN, ...) yields ``default``.
    """
    if isinstance(value, (dict, list)):
        return value
    if not isinstance(value, str):
        return default
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        return default


def build_payload_per_qa(df_top5, query: str) -> str:
    """Serialise the retrieved conversations and the user query into one JSON string.

    Args:
        df_top5: DataFrame with the columns ``Conversation_History``,
            ``Entities``, ``Relationships`` and ``Company_name``; one row per
            retrieved answer.
        query: The user query; surrounding whitespace is stripped.

    Returns:
        A pretty-printed (indent=2, non-ASCII preserved) JSON document of the
        form ``{"query": ..., "retrieved_answers": [...]}`` where each answer
        holds ``company_name``, ``conversation``, ``intents`` and
        ``relationships``.

    Notes:
        * A string ``Conversation_History`` is used as JSON only when it decodes
          to a list or dict; any other string (including text that happens to be
          a JSON scalar such as ``"42"``) goes through ``parse_conversation``.
          Non-string values are used as-is.
        * Undecodable ``Entities`` / ``Relationships`` fall back to ``{}`` /
          ``[]``. Only JSON decoding errors are caught, so unrelated failures
          (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    results = []
    for _, row in df_top5.iterrows():
        conv = row["Conversation_History"]

        if isinstance(conv, str):
            try:
                decoded = json.loads(conv)
            except json.JSONDecodeError:
                decoded = None
            if isinstance(decoded, (list, dict)):
                conversation = decoded  # Already structured
            else:
                conversation = parse_conversation(conv)
        else:
            conversation = conv

        results.append({
            "company_name": row["Company_name"],
            "conversation": conversation,
            "intents": _decode_json_field(row["Entities"], {}),
            "relationships": _decode_json_field(row["Relationships"], [])
        })

    full_payload = {
        "query": query.strip(),
        "retrieved_answers": results
    }

    return json.dumps(full_payload, ensure_ascii=False, indent=2)


In [15]:
# Build the JSON payload from the de-duplicated top hits.
# The query comes from `user_input` (the cleaned prompt that was used for
# retrieval and reranking) instead of a hard-coded copy of the text, so the
# payload cannot drift out of sync when the query cell is edited.
# NOTE(review): assumes the cleaned `user_input` equals the previously
# hard-coded string — confirm against the printed payload.
payload = build_payload_per_qa(
    df_top5=top5_df.sort_values(by="hybrid_score", ascending=False).head(5),
    query=user_input
)

print(payload)

{
  "query": "My Echo keeps playing the same song over and over how do I fix recommendations",
  "retrieved_answers": [
    {
      "company_name": "AmazonHelp",
      "conversation": [
        {
          "role": "Customer",
          "message": "kids listening to disney soundtracks etc on Echo has completely ruined my recommendations Any solution"
        },
        {
          "role": "Company",
          "message": "The option isnt avail but were always looking for ways to improve Heres how to submit suggestions JO"
        }
      ],
      "intents": {
        "products": [
          "Echo"
        ],
        "services": [],
        "issue_types": [
          "recommendation issue"
        ]
      },
      "relationships": []
    },
    {
      "company_name": "AmazonHelp",
      "conversation": [
        {
          "role": "Customer",
          "message": "echo app is virtually useless Wont let me import my music which is 90 of the reason I bought it Very difficult to navigate a