In [None]:
import py_files.llm_pipeline.twcs_processor as processor
import pandas as pd
import json
from py_files.VectorDBStructure.query import query_similar
from py_files.llm_pipeline.reranker import CrossEncoderReranker
from sklearn.preprocessing import StandardScaler
from typing import List, Dict

import openai
from dotenv import load_dotenv


# Load variables from a .env file into the process environment. This runs
# before the CONFIG import and the getenv() call below.
load_dotenv()
import os

from CONFIG import ENDBOT_PROMPT

# Read the OpenAI API key from the environment (None when it is not set).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# OpenAI client used for the chat-completion request issued for every prompt.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Test prompts: the 'Conversation' column of this workbook is iterated below.
prompts = pd.read_excel("data/processed/test_data/airway_test_data.xlsx")
results = []  # one record per processed prompt; exported once the loop is done
count = 0     # number of the prompt being processed, used in the progress message

# Initialize the models once, outside the per-prompt loop.
from py_files.VectorDBStructure.db_structure import DatabaseStructure
db = DatabaseStructure()
reranker = CrossEncoderReranker(top_k=50)

# Imported once, ahead of the loop, instead of re-running the import on every
# iteration.
from py_files.llm_pipeline.llm_extractor import LLMExtractor

# Retrieval / ranking settings used by the loop below.
RETRIEVAL_K = 50           # hits requested from the vector index per prompt
SIMILARITY_WEIGHT = 0.7    # weight of the standardised vector-similarity score
RERANK_WEIGHT = 0.3        # weight of the standardised cross-encoder score
CANDIDATE_POOL_SIZE = 10   # best hybrid-scored rows considered for de-duplication
MAX_CONTEXT_ROWS = 5       # distinct rows kept for answer generation
RETRIEVAL_COLUMNS = [
    "prompt",
    "id",
    "similarity_score",
    "rerank_score",
    "rerank_rank",
    "ChatID",
    "Company_name",
    "Conversation_History",
    "Entities",
    "Relationships",
]

for count, user_input in enumerate(prompts['Conversation'], start=1):
    # Clean the raw prompt, then derive its structured-conversation form.
    user_input = processor.TWCSProcessor._clean_single(user_input)
    user_input_processed = processor.TWCSProcessor._convert_to_conversation(user_input)

    # The extractor takes a DataFrame, so wrap the single prompt in a one-row frame.
    user_input_df = pd.DataFrame(
        [[user_input, user_input_processed]],
        columns=['cleaned_conversation', 'structured_conversations'],
    )
    pipe = LLMExtractor(dataframe=user_input_df)

    # The stages run in the same order as before; only the frame returned by
    # the last one is read here, so the earlier return values are not bound.
    pipe.extract_entities()              # only products / issue-types / services
    pipe.process_entities_json()         # pack them into a single JSON field
    df3 = pipe.extract_relationships()   # create RDF triples

    entities = df3['entities'].values[0]
    relationship = df3['relationship'].values[0]
    cleaned_conversation = df3['cleaned_conversation'].values[0]

    fixed_relationships = db.fix_relationships(relationship)
    embedding = db.text_to_embedding(cleaned_conversation, entities, fixed_relationships).tolist()

    # Step 2: Query Elastic and get hits
    hits = query_similar(embedding, k=RETRIEVAL_K)

    # Step 3: Extract conversation candidates
    candidates = [hit["_source"]["Conversation_History"]["conversation"] for hit in hits]

    # Step 4: Rerank with cross-encoder (skipped when there is nothing to rank)
    reranked = reranker.rerank(user_input, candidates) if candidates else []

    # Step 5: Create a mapping: conversation -> (score, rank), rank is 1-based.
    # NOTE: the map is keyed by conversation text, so hits with identical text
    # share one entry (the last one yielded by the reranker).
    score_rank_map = {
        conv: (score, rank)
        for rank, (conv, score) in enumerate(reranked, start=1)
    }

    # Step 6: Construct final rows with rank
    rows = []
    for hit in hits:
        src = hit["_source"]
        conv = src["Conversation_History"]["conversation"]
        score, rank = score_rank_map.get(conv, (0.0, None))  # Not reranked if not in top_k

        rows.append({
            "prompt": user_input,
            "id": hit["_id"],
            "similarity_score": hit["_score"],
            "rerank_score": score,
            "rerank_rank": rank,
            "ChatID": src["ChatID"],
            "Company_name": src["Company_name"],
            "Conversation_History": conv,
            "Entities": json.dumps(src["Entities"]),
            "Relationships": json.dumps(src["Relationships"]),
        })

    # Step 7: Create DataFrame sorted by rerank_rank. Passing the column list
    # keeps the frame well-formed (all columns present) even with zero hits.
    reranked_qa = (
        pd.DataFrame(rows, columns=RETRIEVAL_COLUMNS)
        .sort_values(by="rerank_rank", na_position="last")
        .reset_index(drop=True)
    )

    # Standardise both scores so they can be blended on a common scale.
    if reranked_qa.empty:
        # StandardScaler cannot be fitted on zero rows; add empty score
        # columns so the remaining steps still find the columns they expect.
        reranked_qa["sim_norm"] = pd.Series(dtype="float64")
        reranked_qa["rerank_norm"] = pd.Series(dtype="float64")
    else:
        reranked_qa[["sim_norm", "rerank_norm"]] = StandardScaler().fit_transform(
            reranked_qa[["similarity_score", "rerank_score"]].fillna(0)
        )
    reranked_qa["hybrid_score"] = (
        SIMILARITY_WEIGHT * reranked_qa["sim_norm"]
        + RERANK_WEIGHT * reranked_qa["rerank_norm"]
    )

    top_candidates = (
        reranked_qa.sort_values(by="hybrid_score", ascending=False)
        .head(CANDIDATE_POOL_SIZE)
    )

    # Keep the best-scored row of every distinct (Entities, Relationships)
    # combination, up to MAX_CONTEXT_ROWS rows, in hybrid-score order.
    top5_df = (
        top_candidates.drop_duplicates(subset=["Entities", "Relationships"])
        .head(MAX_CONTEXT_ROWS)
    )

    def parse_conversation(text: str) -> List[Dict[str, str]]:
        """Split a plain-text conversation into role-tagged messages.

        Each line is expected to start with a ``Customer`` or ``Company``
        speaker label (matched case-insensitively), optionally followed by a
        ``:`` separator. A line without a label is attributed to the speaker
        of the previously kept message, or to ``Customer`` when it is the
        first kept line. Lines whose message is empty are dropped.

        The label only counts when it is a whole word (followed by ``:``,
        whitespace or end of line), so a line such as ``Customers should ...``
        is kept intact as a continuation instead of being truncated. The
        ``:`` separator itself is not part of the returned message.

        Args:
            text: Conversation text with one utterance per line.

        Returns:
            A list of ``{"role": ..., "message": ...}`` dicts in line order.
        """
        parsed: List[Dict[str, str]] = []
        for line in text.split("\n"):
            role = None
            msg = line.strip()
            lowered = line.lower()
            for label in ("Customer", "Company"):
                if not lowered.startswith(label.lower()):
                    continue
                rest = line[len(label):]
                # Require a word boundary after the label.
                if rest and not (rest[0] == ":" or rest[0].isspace()):
                    continue
                rest = rest.lstrip()
                # Drop a single separator colon; later colons belong to the
                # message itself (e.g. an emoticon).
                if rest.startswith(":"):
                    rest = rest[1:]
                role, msg = label, rest.strip()
                break
            if role is None:
                # fallback: use last role, or Customer for a leading line
                role = parsed[-1]["role"] if parsed else "Customer"
            if msg:
                parsed.append({"role": role, "message": msg})
        return parsed

    def build_payload_per_qa(df_top5, query: str) -> tuple:
        """Build the JSON payload sent to the answer-generation model.

        Args:
            df_top5: DataFrame of retrieved rows. The columns read are
                ``Conversation_History``, ``Company_name``, ``Entities`` and
                ``Relationships``.
            query: The user query; surrounding whitespace is stripped.

        Returns:
            A ``(payload, retrieved)`` tuple: ``payload`` is the JSON string
            holding the query and the retrieved answers, ``retrieved`` is the
            list of per-row dicts embedded in that payload.
        """
        def _loads_or(raw, fallback):
            # Best-effort JSON decoding: anything that is not valid JSON text
            # (including a missing or non-string value) yields the fallback.
            try:
                return json.loads(raw)
            except (TypeError, ValueError):
                return fallback

        retrieved = []
        for _, row in df_top5.iterrows():
            conv = row["Conversation_History"]

            if isinstance(conv, str):
                try:
                    conversation = json.loads(conv)  # already structured JSON
                except ValueError:
                    # Plain text: split it into role-tagged messages.
                    conversation = parse_conversation(conv)
            else:
                conversation = conv

            retrieved.append({
                "company_name": row["Company_name"],
                "conversation": conversation,
                "intents": _loads_or(row.get("Entities"), {}),
                "relationships": _loads_or(row.get("Relationships"), []),
            })

        full_payload = {
            "query": query.strip(),
            "retrieved_answers": retrieved,
        }

        return json.dumps(full_payload, ensure_ascii=False, indent=2), retrieved

    # Pass the best hybrid-scored rows (at most five) to the payload builder.
    ranked_context = top5_df.sort_values(by="hybrid_score", ascending=False).head(5)
    payload, retrievals = build_payload_per_qa(df_top5=ranked_context, query=user_input)

    # System prompt plus the JSON payload as the single user message.
    chat_messages = [
        {"role": "system", "content": ENDBOT_PROMPT},
        {"role": "user", "content": payload},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
        top_p=0.95,
    )
    # The text of the first choice is the generated answer.
    answer = response.choices[0].message.content

    # Record the prompt, what was retrieved for it, and the model's answer.
    record = {
        "prompts": user_input,
        "retrievals": retrievals,
        "answers": answer,
    }
    results.append(record)
    print(f"Row {count} completed.")

# Gather the per-prompt records into one table and write it to an Excel workbook.
result = pd.DataFrame(data=results)
result.to_excel(excel_writer='results.xlsx')