In [4]:
import os
import sys
import re
import pickle
import joblib
import logging
import datetime
import time
import random
from dotenv import load_dotenv

import pandas as pd
import numpy as np

# ML/NLP Libraries
import google.generativeai as genai
import torch
import spacy
import faiss
from sentence_transformers import SentenceTransformer, util, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader

# Scikit-learn & LightGBM
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                           mean_absolute_error)
import lightgbm as lgb

In [5]:
# Route INFO-and-above log records to stdout so they show up under each cell.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger()

# Pick the torch device once, then align spaCy with the same choice.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
    spacy.prefer_gpu()  # Ask spaCy to run on the GPU when it can
    logger.info("spaCy GPU preference set.")
else:
    logger.info("GPU not available, using CPU.")
    spacy.require_cpu()
    logger.info("spaCy CPU preference set.")


2025-04-06 11:59:51,065 - INFO - GPU available: NVIDIA A100-SXM4-80GB MIG 1g.20gb
2025-04-06 11:59:51,274 - INFO - spaCy GPU preference set.


In [6]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook. load_dotenv() lets a local, git-ignored .env file supply
# GEMINI_API_KEY during development.
# SECURITY: any key that was ever committed in this cell must be treated as
# leaked and revoked/rotated in the Google AI console.
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    logger.warning("GEMINI_API_KEY not found in environment variables.")
else:
    logger.info("GEMINI_API_KEY loaded.")
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        logger.info("Gemini configured.")
    except Exception as e:
        # Configuration problems are logged, not raised, so the rest of the
        # notebook (NER, retrieval, routing) can still run without Gemini.
        logger.error(f"Failed to configure Gemini: {e}")

2025-04-06 12:00:52,417 - INFO - GEMINI_API_KEY loaded.
2025-04-06 12:00:52,418 - INFO - Gemini configured.


In [7]:

# --- Model IDs & Paths ---
# Identifiers of the pretrained models referenced by this notebook.
SUMMARIZER_MODEL_ID_FINETUNE = "google/flan-t5-base" # Base for fine-tuning
ENTITY_EXTRACTOR_MODEL_SPACY = "en_core_web_trf" # spaCy model
ENTITY_EXTRACTOR_MODEL_ID_FINETUNE = "bert-base-cased" # Base for fine-tuning NER
SENTENCE_TRANSFORMER_MODEL_ID = "paraphrase-MiniLM-L6-v2" # Bi-encoder

# --- Local Paths (within notebook environment) ---
# Everything the notebook reads or writes lives under DATA_DIR.
DATA_DIR = "./data" # Create this directory if it doesn't exist
MODEL_DIR = os.path.join(DATA_DIR, "models")
INDEX_DIR = os.path.join(DATA_DIR, "index")
HISTORICAL_TICKETS_PATH = os.path.join(DATA_DIR, "dummy_historical_tickets.csv")
RETRIEVER_INDEX_PATH = os.path.join(INDEX_DIR, "ticket_index_gpu.index") # Use different name for GPU index
RETRIEVER_MAP_PATH = RETRIEVER_INDEX_PATH + ".map"  # Pickled id -> ticket-info map stored next to the index
ROUTER_MODEL_PATH = os.path.join(MODEL_DIR, "router_pipeline.joblib")
TTR_ESTIMATOR_MODEL_PATH = os.path.join(MODEL_DIR, "ttr_pipeline.joblib")

# Create directories if they don't exist
# (makedirs also creates DATA_DIR, since both directories are nested inside it)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)

# --- Training Hyperparameters (Examples) ---
SFT_EPOCHS = 2
SFT_LR = 1e-4
NER_EPOCHS = 3
NER_LR = 3e-5
RETRIEVER_BATCH_SIZE = 32
RETRIEVER_EPOCHS = 1

# --- Other Settings ---
# The last entry of DEFAULT_QUEUES is used as the fallback queue by predict_queue.
DEFAULT_QUEUES = ["Sync-Team", "Payments", "Networking", "General"]
RETRIEVER_TOP_K = 3
# Maps each business-level entity name to the spaCy NER labels accepted for it.
# A single spaCy label may feed several business entities (e.g. PRODUCT).
TARGET_ENTITY_LABELS = { # Adjust based on actual needs and model used
    "issue_category": ["PRODUCT", "ORG", "EVENT", "WORK_OF_ART"],
    "device": ["PRODUCT"],
    "error_code": ["CARDINAL", "MONEY", "NORP", "FAC"], # Wider net, needs refinement/rules
    "escalation_target": ["ORG", "PERSON"],
    "promised_follow_up_date": ["DATE", "TIME"]
}
# Time-to-resolution (TTR) estimator: target column and feature groups.
TTR_TARGET_VARIABLE = 'ttr_hours'
TTR_CATEGORICAL_FEATURES = ['predicted_queue', 'priority', 'day_of_week']
TTR_NUMERICAL_FEATURES = ['sentiment_score', 'solution_similarity_score', 'agent_load_at_open']
TTR_BOOLEAN_FEATURES = ['business_hours_flag']
# Column order used when building the TTR feature matrix.
TTR_ALL_FEATURES = TTR_CATEGORICAL_FEATURES + TTR_NUMERICAL_FEATURES + TTR_BOOLEAN_FEATURES

In [8]:
def _build_dummy_tickets():
    """Build the small in-memory demo ticket frame used when no CSV exists."""
    dummy_data = {
        'ticket_id': [1, 2, 3, 4, 5, 6, 7],
        'dialog': ["User: My screen is black. Agent: Did you try turning it off and on again? User: Yes. Agent: Okay, let's check the cable.",
                   "User: Can't login. Agent: Reset password link sent. User: Got it, thanks!",
                   "User: Payment failed. Agent: I see the error code X45. It's a bank issue. User: Ah okay.",
                   "User: How do I sync my calendar? Agent: Go to Settings > Accounts > Sync Now. User: Worked!",
                   "User: Wifi slow. Agent: Have you rebooted the router? User: Doing it now. Agent: Let me know. User: Better, thanks!",
                   "User: The app crashes when I click save. Error 503 pops up. Agent: Thanks, is this on iOS or Android? User: iOS. Agent: Okay, reporting bug to Sync-Team.",
                   "User: My bill seems wrong. Charge for service XYZ is too high. Agent: Let me check your account... yes, I see the discrepancy. I'll adjust it. Should reflect in 24h."],
        'summary': ["User reported black screen, agent suggested reboot & cable check.",
                    "User couldn't login, agent sent password reset link.",
                    "User payment failed due to bank error X45.",
                    "Agent guided user on how to sync calendar.",
                    "User reported slow wifi, resolved after router reboot.",
                    "App crashes with error 503 on iOS, agent reported bug.",
                    "User disputed bill charge, agent found error and adjusted."],
        'resolution': ["Check display cable connection.",
                       "Sent password reset link.",
                       "Advised user it was a bank-side issue.",
                       "Guided user through Settings > Accounts > Sync Now.",
                       "Advised user to reboot router.",
                       "Reported bug (Error 503, iOS) to Sync-Team.",
                       "Corrected billing error for service XYZ."],
        'queue': ["Networking", "General", "Payments", "Sync-Team", "Networking", "Sync-Team", "Payments"],
        'ttr_hours': [2.5, 0.2, 0.5, 0.1, 0.3, 48.0, 24.0], # Added longer TTRs
        'status': ["solved", "solved", "solved", "solved", "solved", "open", "solved"]
    }
    df = pd.DataFrame(dummy_data)
    n_rows = len(df)

    # Synthetic features for the TTR estimator. Values are random, so they
    # differ between runs unless the caller seeds `random` beforehand.
    df['priority'] = random.choices(['Low', 'Medium', 'High'], k=n_rows)
    now = datetime.datetime.now()
    df['timestamp'] = pd.to_datetime(
        [now - datetime.timedelta(hours=random.randint(1, 100)) for _ in range(n_rows)]
    )
    df['day_of_week'] = df['timestamp'].dt.strftime('%a')
    # Stored as the strings 'True'/'False' so the flag can be ordinal-encoded.
    df['business_hours_flag'] = df['timestamp'].apply(
        lambda ts: 8 <= ts.hour < 18 and ts.weekday() < 5
    ).astype(str)
    df['sentiment_score'] = [round(random.uniform(-0.8, 0.8), 2) for _ in range(n_rows)]
    df['solution_similarity_score'] = [round(random.uniform(0.3, 0.95), 3) for _ in range(n_rows)]
    df['agent_load_at_open'] = [random.randint(1, 8) for _ in range(n_rows)]
    return df


def load_historical_data(file_path=HISTORICAL_TICKETS_PATH):
    """Load historical ticket data, creating dummy data if the CSV is missing.

    Args:
        file_path: Location of the historical tickets CSV.

    Returns:
        A DataFrame with the tickets. If the file does not exist, a small
        dummy frame is generated, written to ``file_path`` (best effort) and
        returned. An empty DataFrame is returned only when the file exists
        but cannot be read, or when the dummy data cannot be built.
    """
    try:
        df = pd.read_csv(file_path)
        logger.info(f"Loaded historical data from {file_path}")
        return df
    except FileNotFoundError:
        logger.warning(f"{file_path} not found. Creating dummy data.")
    except Exception as e:
        logger.error(f"Error loading/creating historical data: {e}")
        return pd.DataFrame()

    try:
        df = _build_dummy_tickets()
    except Exception as e:
        logger.error(f"Error loading/creating historical data: {e}")
        return pd.DataFrame()

    # Persisting the dummy data is a convenience only: a failed write must not
    # throw away the frame we just built.
    try:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(file_path, index=False)
        logger.info(f"Created and saved dummy data to {file_path}")
    except Exception as e:
        logger.error(f"Could not save dummy data to {file_path}: {e}")
    return df

def preprocess_text_for_embedding(text):
    """Normalise text for embedding: lower-case it and trim surrounding whitespace.

    Anything that is not a ``str`` (e.g. ``None`` or NaN) becomes an empty string.
    """
    return text.lower().strip() if isinstance(text, str) else ""

def log_feedback_notebook(ticket_id, component, feedback_data):
    """Print a single-line feedback record (stand-in for persistent logging).

    The record carries the current ISO timestamp, the ticket id, the component
    the feedback refers to, and the feedback payload. Nothing is written to
    disk; a real deployment would append the record to a file or DataFrame.
    """
    fields = (
        f"FEEDBACK @ {datetime.datetime.now().isoformat()}",
        f"Ticket: {ticket_id}",
        f"Component: {component}",
        f"Data: {feedback_data}",
    )
    print(" | ".join(fields))

In [9]:
# Create the Gemini client only when an API key is configured; otherwise the
# summarizer stays disabled (gemini_model remains None).
gemini_model = None
if not GEMINI_API_KEY:
    logger.warning("Gemini API Key not set. Summarization will be unavailable.")
else:
    try:
        gemini_model = genai.GenerativeModel("gemini-1.5-flash") # Or "gemini-pro"
        logger.info("Gemini model 'gemini-1.5-flash' initialized.")
    except Exception as e:
        logger.error(f"Failed to initialize Gemini model: {e}")

# --- Summarization Function ---
def _split_summary_and_actions(response_text: str) -> tuple:
    """Split a model reply into its ``(summary, actions)`` sections.

    Accepts section headers with or without Markdown bold (``**Summary:**``)
    and content that starts on the header line or on the following line.
    The ``Actions`` header must begin a line, so the word "actions:" inside
    the summary sentence is not mistaken for the section break.
    Placeholder strings are returned for sections that cannot be found.
    """
    flags = re.DOTALL | re.IGNORECASE | re.MULTILINE
    summary_match = re.search(
        r"Summary\s*:(?:\*\*)?\s*(.*?)\s*^[ \t]*(?:\*\*)?Actions\s*:",
        response_text, flags)
    # Only a bold marker glued to the colon is skipped, so the leading "*" of
    # the first bullet is preserved.
    actions_match = re.search(
        r"^[ \t]*(?:\*\*)?Actions\s*:(?:\*\*)?\s*(.*)\Z",
        response_text, flags)

    summary = summary_match.group(1).strip() if summary_match else "Summary not extracted."
    actions = actions_match.group(1).strip() if actions_match else "Actions not extracted."
    return summary, actions


def summarize_conversation_gemini(conversation: str) -> str:
    """Summarize a support conversation with the Gemini API.

    Args:
        conversation: Raw dialog text between user and agent.

    Returns:
        The formatted "Summary / Actions" text produced by
        ``post_process_summary``, or a human-readable message when the model
        is unavailable, the conversation is empty/blank, or the API call
        fails (errors are logged and reported in the returned string rather
        than raised).
    """
    if not gemini_model:
        return "Error: Gemini model not available."
    # Blank input would only waste an API call.
    if not conversation or (isinstance(conversation, str) and not conversation.strip()):
        return "No conversation provided."

    prompt = f"""You are a customer support assistant. Analyze the following conversation.
1. Provide a crisp, 2-3 sentence summary focusing on the issue, agent actions, and outcome.
2. Provide a bulleted list of the main action items or steps taken by the agent or promised to the user.

Conversation:
{conversation}

Summary:
[Your 2-3 sentence summary here]

Actions:
* [Action 1]
* [Action 2]
...
"""
    try:
        response = gemini_model.generate_content(prompt)
        summary_text = response.text.strip()
        summary, actions = _split_summary_and_actions(summary_text)

        # Post-process and format
        formatted_output = post_process_summary(summary, actions)
        logger.info("Summarization successful using Gemini.")
        return formatted_output

    except Exception as e:
        logger.error(f"Error during Gemini summarization: {e}")
        return f"Error: Could not generate summary ({e})"

# --- Post-processing ---
def post_process_summary(summary: str, actions: str) -> str:
    """Format summary and actions, appending heuristic quality warnings.

    A warning is added when the summary contains none of the known action
    verbs and/or none of the known resolution hints (plain substring checks
    on the lower-cased summary). Actions that do not already start with ``*``
    are turned into a ``*`` bullet list, one bullet per line.
    """
    action_verbs = ('sent', 'guided', 'checked', 'reset', 'advised', 'suggested',
                    'resolved', 'fixed', 'updated', 'reported', 'adjusted', 'escalated')
    resolution_hints = ('resolved', 'worked', 'fixed', 'thanks', 'okay', 'got it',
                        'better', 'sent', 'success', 'adjusted', 'corrected')
    lowered = summary.lower()

    notes = []
    if not any(verb in lowered for verb in action_verbs):
        notes.append("[Warning: Summary may lack clear action verb.]")
    if not any(hint in lowered for hint in resolution_hints):
        notes.append("[Warning: Summary may lack clear resolution hint.]")

    # Ensure the actions read as a bullet list.
    if actions.startswith('*'):
        bullet_block = actions
    else:
        bullet_block = "* " + actions.replace('\n', '\n* ')

    lines = [f"Summary: {summary.strip()}", "Actions:", bullet_block.strip()]
    if notes:
        lines.append(" ".join(notes))
    return "\n".join(lines)

# --- Main Summarizer Function ---
def get_summary(conversation: str) -> str:
    """Return a formatted summary for ``conversation``.

    Thin entry point that delegates to ``summarize_conversation_gemini`` and
    returns its result unchanged.
    """
    # In this notebook, we primarily rely on Gemini
    return summarize_conversation_gemini(conversation)

2025-04-06 12:02:03,780 - INFO - Gemini model 'gemini-1.5-flash' initialized.


In [10]:
# Demo: run the summarizer on a sample Wifi-troubleshooting dialog and print
# the formatted "Summary / Actions" text it returns.
test_convo_summary = """
Agent: Support, how can I help?
User: Hi, the SmartWidget X1 keeps disconnecting from my Wifi network every few hours. I already rebooted it.
Agent: Got it. Let's check the signal strength. Can you access the widget's settings page? Go to Network > Status. What does it say for RSSI?
User: Okay, accessing now... It says RSSI -75 dBm.
Agent: Ah, that's quite low. Ideally, it should be better than -65 dBm. Is the widget far from your router?
User: It's in the next room, maybe 20 feet away through one wall.
Agent: Okay, -75 is borderline. Could you try moving the widget closer to the router, just temporarily, to see if the connection stabilizes?
User: Sure, I'll move it now... Okay, it's about 5 feet away. The RSSI now shows -58 dBm.
Agent: Excellent, that's much better. Let's keep it there for a while and see if the disconnections stop. If it stays connected, the location was likely the issue. We might need to look into a Wifi extender if you need it in the original spot.
User: Okay, makes sense. I'll monitor it. Thanks!
Agent: You're welcome! Let us know if it drops again.
"""
summary_result = get_summary(test_convo_summary)
print(summary_result)

2025-04-06 12:02:17,934 - INFO - Summarization successful using Gemini.
Summary: The user reported their SmartWidget X1 was frequently disconnecting from their Wifi. The agent diagnosed a weak signal (low RSSI value) as the likely cause.  The user successfully improved the connection by moving the widget closer to the router, and will monitor its performance.
Actions:
* Checked the SmartWidget's RSSI value via the device's settings page.
* Determined that a low RSSI value (-75 dBm) indicated a weak signal strength.
* Suggested and guided the user to move the SmartWidget closer to the router to improve signal strength.
* Confirmed improved signal strength (-58 dBm) after relocation.
* Advised the user to monitor the connection and consider a Wifi extender if the problem persists in the original location.
* Provided troubleshooting advice and support throughout the interaction.


In [11]:
# Load spaCy Model
# Tries the configured model first and falls back to 'en_core_web_sm'.
# If neither can be loaded, nlp_ner stays None and extract_entities reports
# that NER is unavailable.
nlp_ner = None
try:
    nlp_ner = spacy.load(ENTITY_EXTRACTOR_MODEL_SPACY, disable=["parser"]) # Keep tagger, NER
    # The parser is disabled above, so a sentencizer is added to the pipeline.
    nlp_ner.add_pipe('sentencizer')
    logger.info(f"Loaded spaCy NER model '{ENTITY_EXTRACTOR_MODEL_SPACY}' onto {DEVICE}.")
except OSError:
    # spacy.load raised OSError for the configured model; try the small one.
    logger.error(f"Failed to load spaCy model '{ENTITY_EXTRACTOR_MODEL_SPACY}'. Download it first.")
    logger.info("Attempting to load 'en_core_web_sm' as fallback...")
    try:
        nlp_ner = spacy.load("en_core_web_sm", disable=["parser"])
        nlp_ner.add_pipe('sentencizer')
        logger.info("Loaded fallback spaCy model 'en_core_web_sm'.")
    except OSError:
         logger.error("Fallback spaCy model also not found. NER disabled.")

# --- Entity Extraction Function ---
def extract_entities(text: str) -> dict:
    """Pull the configured business entities out of ``text`` with spaCy NER.

    Each recognised span is assigned to every target entity whose accepted
    spaCy labels (``TARGET_ENTITY_LABELS``) include the span's label.
    ``error_code`` candidates must additionally be purely alphanumeric and at
    least three characters long. Values are de-duplicated per entity in
    first-seen order, and entities with no values are omitted.

    Returns:
        ``{entity_name: [values, ...]}``; ``{}`` for empty input; or
        ``{"error": ...}`` when no NER model is loaded.
    """
    if not nlp_ner:
        return {"error": "NER model not available"}
    if not text:
        return {}

    parsed = nlp_ner(text)
    buckets = {name: [] for name in TARGET_ENTITY_LABELS}
    error_code_pattern = re.compile(r'^[A-Za-z0-9]{3,}$')

    logger.debug(f"NER: Processing text ({len(text)} chars), found {len(parsed.ents)} potential entities.")
    for span in parsed.ents:
        logger.debug(f" - Entity: '{span.text}' ({span.label_})")
        cleaned = span.text.strip()
        for name, accepted_labels in TARGET_ENTITY_LABELS.items():
            if span.label_ not in accepted_labels:
                continue
            # Error codes get an extra shape check to cut down on noise.
            if name == "error_code" and error_code_pattern.match(cleaned) is None:
                continue
            if cleaned in buckets[name]:
                continue  # already recorded for this entity
            buckets[name].append(cleaned)

    # Keep only entities that actually matched something.
    final_entities = {name: values for name, values in buckets.items() if values}
    logger.info(f"Extracted entities: {final_entities}")
    return final_entities

2025-04-06 12:02:50,995 - INFO - Loaded spaCy NER model 'en_core_web_trf' onto cuda.


In [12]:
# Demo: extract entities from a sample payment-escalation dialog and print the
# resulting {entity_name: [values]} dictionary.
test_dialog_ner = """
User: My payment failed with error X45 on the website using my Visa ending in 1234. This was for the Acme Inc subscription.
Agent: Okay, I see error X45 indicates a bank verification issue. I must escalate this to the Payments dept. Can I get your account ID?
User: It's account-9876. Please ask them to check by tomorrow EOD, that's 2024-07-28.
Agent: Will do. Escalation filed to Payments. They should review by 2024-07-28. Also, my SmartWidget X1 keeps crashing.
"""
entities_result = extract_entities(test_dialog_ner)
print(entities_result)

2025-04-06 12:03:56,445 - INFO - Extracted entities: {'issue_category': ['Visa', 'Acme Inc', 'X45', 'Payments', 'SmartWidget X1'], 'device': ['X45', 'SmartWidget X1'], 'escalation_target': ['Visa', 'Acme Inc', 'Payments'], 'promised_follow_up_date': ['tomorrow', '2024-07-28']}
{'issue_category': ['Visa', 'Acme Inc', 'X45', 'Payments', 'SmartWidget X1'], 'device': ['X45', 'SmartWidget X1'], 'escalation_target': ['Visa', 'Acme Inc', 'Payments'], 'promised_follow_up_date': ['tomorrow', '2024-07-28']}


In [None]:
# Load the bi-encoder on the selected device. On any failure both globals are
# left as None so downstream code can detect that retrieval is unavailable.
bi_encoder, EMBEDDING_DIM = None, None
try:
    bi_encoder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL_ID, device=DEVICE)
    EMBEDDING_DIM = bi_encoder.get_sentence_embedding_dimension()
    logger.info(f"Loaded Sentence Transformer '{SENTENCE_TRANSFORMER_MODEL_ID}' onto {DEVICE}. Dim: {EMBEDDING_DIM}")
except Exception as e:
    logger.error(f"Failed to load Sentence Transformer model: {e}")
    bi_encoder, EMBEDDING_DIM = None, None

# --- FAISS Indexing ---
# Retriever state, filled in by build_or_load_retriever_index().
faiss_index = None
index_to_ticket_data = {}  # FAISS row id -> {'resolution': ..., 'ticket_id': ...}

def build_or_load_retriever_index(force_rebuild=False):
    """Load the persisted FAISS ticket index, or build and save a new one.

    Populates the module-level ``faiss_index`` and ``index_to_ticket_data``.

    Args:
        force_rebuild: When True, skip the on-disk index and rebuild it from
            the historical tickets.

    Returns:
        True when an index was loaded or built; False when the sentence
        encoder is not loaded, no historical data is available, or no ticket
        has status 'solved'. A failure to save a freshly built index is only
        logged — the in-memory index is still usable and True is returned.
    """
    global faiss_index, index_to_ticket_data

    # Fast path: reuse the persisted index and its id -> ticket map.
    if not force_rebuild and os.path.exists(RETRIEVER_INDEX_PATH) and os.path.exists(RETRIEVER_MAP_PATH):
        try:
            logger.info(f"Loading existing FAISS index from {RETRIEVER_INDEX_PATH}")
            # Load CPU index first, then move to GPU
            # cpu_index = faiss.read_index(RETRIEVER_INDEX_PATH)
            # Load directly if saved from GPU resource? Test this.
            faiss_index = faiss.read_index(RETRIEVER_INDEX_PATH)

            if DEVICE.type == 'cuda':
                 # If GPU available, try moving the loaded index to GPU
                 # This requires the index to be compatible or requires cloning
                 res = faiss.StandardGpuResources() # Get GPU resources
                 co = faiss.GpuClonerOptions()
                 # Clone to GPU (might need specific index types)
                 try:
                      faiss_index = faiss.index_cpu_to_gpu(res, DEVICE.index or 0, faiss_index, co)
                      logger.info("Successfully moved loaded FAISS index to GPU.")
                 except Exception as clone_err:
                      logger.warning(f"Could not automatically move loaded index to GPU: {clone_err}. Using CPU index.")
                      # Fallback or keep the CPU index if cloning fails

            # SECURITY NOTE: pickle.load can execute arbitrary code; only load
            # map files that were written by this notebook.
            with open(RETRIEVER_MAP_PATH, 'rb') as f:
                index_to_ticket_data = pickle.load(f)
            logger.info(f"Loaded FAISS index with {faiss_index.ntotal} vectors and map.")
            return True
        except Exception as e:
            # Any load problem falls through to a full rebuild below.
            logger.error(f"Error loading index: {e}. Rebuilding.")

    logger.info("Building new retriever index...")
    if not bi_encoder:
        logger.error("Cannot build index: Sentence Transformer model not loaded.")
        return False

    df_history = load_historical_data()
    if df_history.empty: logger.error("No historical data."); return False

    # Index based on dialog for better context
    # Only tickets whose status is 'solved' (case-insensitive) are indexed.
    df_solved = df_history[df_history['status'].str.lower() == 'solved'].copy()
    if df_solved.empty: logger.error("No 'solved' tickets found."); return False
    logger.info(f"Indexing {len(df_solved)} solved tickets...")

    texts_to_embed = [preprocess_text_for_embedding(text) for text in df_solved['dialog'].fillna('')]
    ticket_ids = df_solved['ticket_id'].tolist()
    resolutions = df_solved['resolution'].fillna('N/A').tolist()

    logger.info("Encoding corpus (using GPU if available)...")
    corpus_embeddings = bi_encoder.encode(texts_to_embed, convert_to_tensor=True, show_progress_bar=True, device=DEVICE)
    corpus_embeddings_np = corpus_embeddings.cpu().numpy() # Move to CPU for FAISS build/normalize
    faiss.normalize_L2(corpus_embeddings_np) # Normalize for cosine similarity

    logger.info("Building FAISS index...")
    # Use IndexFlatL2 for CPU, potentially different for GPU optimization later
    cpu_index = faiss.IndexFlatL2(EMBEDDING_DIM)
    if DEVICE.type == 'cuda':
        try:
            res = faiss.StandardGpuResources() # Use default GPU resources
            gpu_index = faiss.index_cpu_to_gpu(res, DEVICE.index or 0, cpu_index)
            logger.info("Created FAISS index on GPU.")
            faiss_index = gpu_index
        except Exception as gpu_err:
            logger.warning(f"Failed to create FAISS index on GPU: {gpu_err}. Using CPU index.")
            faiss_index = cpu_index # Fallback to CPU index
    else:
        faiss_index = cpu_index # Use CPU index if no GPU

    faiss_index.add(corpus_embeddings_np) # Add normalized embeddings

    # Create mapping from FAISS index position (0 to n-1) to ticket info
    # (`res` here is the comprehension's own loop variable, not the GPU resources above)
    index_to_ticket_data = {i: {'resolution': res, 'ticket_id': tid}
                           for i, (tid, res) in enumerate(zip(ticket_ids, resolutions))}

    logger.info(f"Index build complete. Indexed {faiss_index.ntotal} documents.")

    # Save the index (move back to CPU first if it's on GPU) and map
    try:
        index_to_save = faiss_index
        if DEVICE.type == 'cuda' and isinstance(faiss_index, faiss.GpuIndex):
             logger.info("Moving FAISS index to CPU for saving.")
             index_to_save = faiss.index_gpu_to_cpu(faiss_index)

        faiss.write_index(index_to_save, RETRIEVER_INDEX_PATH)
        with open(RETRIEVER_MAP_PATH, 'wb') as f:
            pickle.dump(index_to_ticket_data, f)
        logger.info(f"Saved FAISS index to {RETRIEVER_INDEX_PATH} and map to {RETRIEVER_MAP_PATH}")
    except Exception as e:
        # Saving is best effort; the in-memory index remains usable.
        logger.error(f"Error saving index: {e}")

    return True

# --- Retrieval Function ---
def retrieve_solutions(query_text: str, top_k: int = RETRIEVER_TOP_K) -> list[dict]:
    """Retrieve the ``top_k`` most similar historical resolutions for a query.

    The query is embedded with the bi-encoder, L2-normalised and searched
    against the FAISS index.

    Args:
        query_text: Free-text description of the issue.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``{"ticket_id", "resolution", "score"}`` dicts sorted by
        descending cosine similarity; ``[]`` for an empty query or a
        non-positive ``top_k``; or ``[{"error": ...}]`` when the index or the
        encoder is not available.
    """
    if faiss_index is None or not index_to_ticket_data:
        logger.error("Retriever index not ready.")
        return [{"error": "Retriever not initialized"}]
    if bi_encoder is None:
        # The index can be loaded from disk even when the encoder failed to load.
        logger.error("Retriever encoder not loaded.")
        return [{"error": "Retriever not initialized"}]
    if not query_text or top_k <= 0:
        return []

    logger.info(f"Retrieving solutions for query: '{query_text[:100]}...'")
    query_embedding = bi_encoder.encode(preprocess_text_for_embedding(query_text),
                                        convert_to_tensor=True, device=DEVICE)
    query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1)
    faiss.normalize_L2(query_embedding_np) # Normalize query

    # Search the index
    distances, indices = faiss_index.search(query_embedding_np, top_k)

    results = []
    if indices.size > 0:
        for rank in range(indices.shape[1]):
            idx = int(indices[0, rank])
            if idx != -1 and idx in index_to_ticket_data:
                # FAISS L2 indexes report *squared* Euclidean distances. For
                # unit-length vectors ||a - b||^2 = 2 - 2*cos(a, b), therefore
                # cos = 1 - d^2 / 2 and the value must not be squared again.
                squared_distance = float(distances[0, rank])
                similarity = 1.0 - squared_distance / 2.0
                ticket_info = index_to_ticket_data[idx]
                results.append({
                    "ticket_id": ticket_info['ticket_id'],
                    "resolution": ticket_info['resolution'],
                    "score": similarity # Cosine similarity
                })
            else:
                # FAISS pads with -1 when fewer than top_k vectors are indexed.
                logger.warning(f"Retrieved invalid index {idx} or not found in map.")

    # Sort by similarity score (higher is better)
    results = sorted(results, key=lambda x: x['score'], reverse=True)
    logger.info(f"Retrieved {len(results)} solutions.")
    return results[:top_k]


In [None]:
# Build index (can take time depending on data size and hardware)
# Set force_rebuild=True if you updated the data or model
# index_ready is True when a retriever index was loaded or built, and is
# checked by later cells before running retrieval.
index_ready = build_or_load_retriever_index(force_rebuild=False)
if not index_ready:
    logger.error("Failed to build or load the retriever index.")

In [None]:
# Demo: query the retriever and print the recommended resolutions.
if index_ready:
    test_query_retrieval = "user wifi keeps dropping very slow"
    recommendations = retrieve_solutions(test_query_retrieval)
    print("\n--- Recommended Resolutions ---")
    for rec in recommendations:
        # The retriever signals failure with an {"error": ...} entry, which has
        # no ticket fields; report it instead of raising KeyError.
        if "error" in rec:
            print(f"- Retrieval error: {rec['error']}")
            continue
        print(f"- Ticket ID: {rec['ticket_id']}, Score: {rec['score']:.4f}, Resolution: {rec['resolution']}")
else:
    print("\nRetriever index not available. Skipping retrieval example.")

In [None]:
# Fitted routing pipeline; set by train_router() / load_router_model().
router_pipeline = None

# --- Router Training Function ---
def train_router():
    """Train the TF-IDF + logistic-regression queue router and save it.

    Uses the ticket ``summary`` text to predict the ``queue`` label. The data
    is split 75/25; the split is stratified when every class is large enough,
    and falls back to a plain random split otherwise.

    Returns:
        True when the model was trained and saved (``router_pipeline`` is then
        updated); False when the data is insufficient or saving fails.
    """
    global router_pipeline
    logger.info("Starting task router training...")
    df_history = load_historical_data()
    target_col = 'queue'
    text_feature = 'summary' # Use summary for routing

    if df_history.empty or target_col not in df_history.columns or text_feature not in df_history.columns:
        logger.error("Insufficient data for training router.")
        return False

    df_train = df_history.dropna(subset=[text_feature, target_col]).copy()
    if len(df_train) < 10:
        logger.error(f"Need more data ({len(df_train)} rows) for router.")
        return False

    X = df_train[text_feature]
    y = df_train[target_col]
    class_counts = y.value_counts()
    known_classes = sorted(class_counts.index)
    logger.info(f"Router classes: {known_classes}")
    if len(known_classes) < 2:
        logger.error("Router training needs at least two distinct queues.")
        return False

    # train_test_split raises ValueError for a stratified split when a class
    # has a single member or when either side of the split is smaller than the
    # number of classes, so only stratify when it is actually possible.
    test_fraction = 0.25
    n_samples = len(y)
    n_test = int(np.ceil(test_fraction * n_samples))
    n_classes = len(known_classes)
    can_stratify = (
        class_counts.min() >= 2
        and n_test >= n_classes
        and (n_samples - n_test) >= n_classes
    )
    if not can_stratify:
        logger.warning("Too few samples per queue for a stratified split; using a random split.")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=42,
        stratify=y if can_stratify else None,
    )
    if y_train.nunique() < 2:
        logger.error("Training split contains a single queue; cannot train router.")
        return False

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))),
        ('clf', LogisticRegression(random_state=42, class_weight='balanced', C=1.0, max_iter=1000))
    ])

    logger.info("Training the router pipeline...")
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    logger.info(f"Router Eval - Accuracy: {accuracy:.4f}, Macro F1: {macro_f1:.4f}")

    try:
        joblib.dump(pipeline, ROUTER_MODEL_PATH)
        logger.info(f"Saved trained router model to {ROUTER_MODEL_PATH}")
        router_pipeline = pipeline # Update global var
        return True
    except Exception as e:
        logger.error(f"Error saving router model: {e}")
        return False

# --- Router Loading Function ---
def load_router_model():
    """Load the saved router pipeline into ``router_pipeline``.

    Returns:
        True when the file exists and was loaded; False when the file is
        missing or loading fails (in the latter case ``router_pipeline`` is
        reset to None and the error is logged).
    """
    global router_pipeline
    if not os.path.exists(ROUTER_MODEL_PATH):
        return False
    try:
        # joblib files are pickle-based: only load artifacts you created.
        router_pipeline = joblib.load(ROUTER_MODEL_PATH)
        logger.info(f"Loaded task router model from {ROUTER_MODEL_PATH}")
        return True
    except Exception as e:
        logger.error(f"Error loading router model: {e}")
        router_pipeline = None
        return False

# --- Router Prediction Function ---
def predict_queue(ticket_summary: str) -> str:
    """Predict the support queue for a ticket summary.

    Falls back to the last entry of ``DEFAULT_QUEUES`` when no router model is
    loaded or when prediction raises.
    """
    if router_pipeline:
        try:
            # The pipeline expects an iterable of documents, hence the list.
            predicted = router_pipeline.predict([ticket_summary])[0]
            logger.info(f"Predicted queue: {predicted}")
            return predicted
        except Exception as e:
            logger.error(f"Error during queue prediction: {e}")
            return DEFAULT_QUEUES[-1] # Fallback

    logger.warning("Router model not loaded. Returning default queue.")
    return DEFAULT_QUEUES[-1] # General queue



In [None]:

# Prefer a previously saved router; train a new one only when none can be loaded.
if load_router_model():
    logger.info("Router model loaded successfully.")
else:
    logger.info("Router model not found, attempting to train...")
    train_router()

In [None]:

# Demo: route two example summaries when a router model is available.
if not router_pipeline:
    print("Router model not available for prediction example.")
else:
    test_summary_route = "App crashes with error 503 on iOS during sync"
    predicted_q = predict_queue(test_summary_route)
    print(f"Summary: '{test_summary_route}' -> Predicted Queue: {predicted_q}")

    test_summary_route_2 = "cannot make payment with visa card error x45"
    predicted_q_2 = predict_queue(test_summary_route_2)
    print(f"Summary: '{test_summary_route_2}' -> Predicted Queue: {predicted_q_2}")

In [None]:

# Fitted TTR pipeline and the feature names emitted by its preprocessor;
# both are set by train_ttr_estimator() / load_ttr_model().
ttr_pipeline = None
ttr_feature_names_out = None

# Transformer for log/exp target transformation
# Forward function is log1p, inverse is expm1.
# NOTE(review): scikit-learn documents validate=True as running input
# validation on X before the function is applied — verify that callers pass
# data in the shape that validation accepts (a 1-D target may be rejected).
log_transformer = FunctionTransformer(np.log1p, np.expm1, validate=True)

# --- TTR Training Function ---
def train_ttr_estimator():
    """Train the LightGBM time-to-resolution (TTR) regressor and save it.

    The target (``TTR_TARGET_VARIABLE``) is modelled in log1p space and
    predictions are mapped back with expm1 for evaluation. Categorical and
    boolean features are ordinal-encoded; numerical features are standardised.

    Returns:
        True when the pipeline was trained and saved (``ttr_pipeline`` and
        ``ttr_feature_names_out`` are then updated); False when required
        columns are missing, there are fewer than 10 usable rows, or saving
        fails.
    """
    global ttr_pipeline, ttr_feature_names_out
    logger.info("Starting TTR estimator training...")
    df_history = load_historical_data()

    if df_history.empty or TTR_TARGET_VARIABLE not in df_history.columns:
        logger.error(f"TTR training failed: Need '{TTR_TARGET_VARIABLE}' column.")
        return False

    # Ensure all required feature columns exist (might need dummy data for initial run)
    missing_cols = [col for col in TTR_ALL_FEATURES if col not in df_history.columns]
    if missing_cols:
        logger.error(f"TTR training failed: Missing required feature columns: {missing_cols}")
        logger.error("Ensure load_historical_data provides these or add them.")
        return False # Stop if features are missing

    df_train = df_history.dropna(subset=[TTR_TARGET_VARIABLE] + TTR_ALL_FEATURES).copy()

    # Convert boolean flags to string for categorical encoding
    for col in TTR_BOOLEAN_FEATURES:
        df_train[col] = df_train[col].astype(str)

    if len(df_train) < 10:
        logger.error(f"Need more data ({len(df_train)} rows) for TTR.")
        return False

    X = df_train[TTR_ALL_FEATURES]
    y = df_train[TTR_TARGET_VARIABLE]
    # Log-transform the target directly with numpy. A FunctionTransformer with
    # validate=True expects 2-D input and rejects a 1-D target Series; np.log1p
    # also mirrors the np.expm1 inverse used during evaluation below.
    y_log = np.log1p(y)

    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.25, random_state=42)

    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), TTR_CATEGORICAL_FEATURES + TTR_BOOLEAN_FEATURES),
            ('num', StandardScaler(), TTR_NUMERICAL_FEATURES)
        ],
        remainder='passthrough' # Should be empty if X includes only specified features
    )

    # Define Model (LightGBM)
    # Note: For GPU with LightGBM, need 'device':'gpu' and potentially build from source with GPU support
    lgbm = lgb.LGBMRegressor(random_state=42, n_estimators=100, learning_rate=0.1, num_leaves=31)

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('regressor', lgbm)
    ])

    logger.info("Training the TTR pipeline...")
    pipeline.fit(X_train, y_train_log)

    # Get feature names after preprocessing (important for prediction).
    # They are kept as a plain list of str: truth-testing a multi-element
    # numpy array raises ValueError, and the fallback below is a list as well.
    try:
        ttr_feature_names_out = [
            str(name) for name in pipeline.named_steps['preprocess'].get_feature_names_out()
        ]
        logger.info(f"TTR Feature names out: {ttr_feature_names_out}")
    except Exception as e:
        logger.warning(f"Could not get feature names from TTR preprocessor: {e}")
        ttr_feature_names_out = X_train.columns.tolist() # Fallback

    # Evaluate
    y_pred_log = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log) # Inverse transform predictions
    y_test = np.expm1(y_test_log) # Inverse transform true values
    mae = mean_absolute_error(y_test, y_pred)
    logger.info(f"TTR Eval - MAE: {mae:.4f} hours")

    try:
        # Save pipeline AND feature names
        joblib.dump({'pipeline': pipeline, 'feature_names': ttr_feature_names_out}, TTR_ESTIMATOR_MODEL_PATH)
        logger.info(f"Saved trained TTR estimator model to {TTR_ESTIMATOR_MODEL_PATH}")
        ttr_pipeline = pipeline # Update global var
        return True
    except Exception as e:
        logger.error(f"Error saving TTR estimator model: {e}")
        return False

# --- TTR Loading Function ---
def load_ttr_model():
    """Load the saved TTR pipeline and its feature names into module globals.

    Reads the joblib artifact at ``TTR_ESTIMATOR_MODEL_PATH``, which is expected
    to be a dict with the keys ``'pipeline'`` and ``'feature_names'``, and
    stores both values in ``ttr_pipeline`` / ``ttr_feature_names_out``.

    Returns:
        bool: True when the artifact exists and was loaded; False when the file
        is missing or loading failed (in the failure case both globals are
        reset to None).
    """
    global ttr_pipeline, ttr_feature_names_out
    if not os.path.exists(TTR_ESTIMATOR_MODEL_PATH):
        return False

    try:
        # NOTE: joblib.load unpickles arbitrary objects - only load artifacts
        # that were written by this notebook / a trusted source.
        data = joblib.load(TTR_ESTIMATOR_MODEL_PATH)
        ttr_pipeline = data['pipeline']
        ttr_feature_names_out = data['feature_names']
        logger.info(f"Loaded TTR estimator model from {TTR_ESTIMATOR_MODEL_PATH}")
        # The names may be a NumPy array (whose truth value is ambiguous) or a
        # plain list (which has no .tolist()), so test and convert explicitly.
        if ttr_feature_names_out is not None and len(ttr_feature_names_out) > 0:
            logger.info(f"Loaded TTR feature names: {np.asarray(ttr_feature_names_out).tolist()}")
        return True
    except Exception as e:
        logger.error(f"Error loading TTR model: {e}")
        ttr_pipeline = None
        ttr_feature_names_out = None
        return False

# --- TTR Prediction Function ---
def predict_ttr(features: dict) -> float:
    """Estimate the time-to-resolution (in hours) for a single ticket.

    The fitted pipeline selects its input columns by their *original* names,
    so the feature dict is turned into a one-row DataFrame carrying exactly
    those columns. The transformed names stored in ``ttr_feature_names_out``
    (e.g. ``'cat__priority'``) are only used as a fallback to recover the
    original names when the pipeline does not expose ``feature_names_in_``.

    Args:
        features: Mapping of raw feature name -> value. Missing features are
            filled with ``0.0`` (numerical) or ``'Unknown'`` (everything else)
            and a warning is logged; extra keys are ignored.

    Returns:
        float: Predicted hours (clamped to be non-negative), or ``-1.0`` when
        the model is not loaded or prediction fails.
    """
    # Explicit None/length checks: the feature names are usually a NumPy
    # array, and `not array` raises for arrays with more than one element.
    if (ttr_pipeline is None or ttr_feature_names_out is None
            or len(ttr_feature_names_out) == 0):
        logger.warning("TTR estimator model or feature names not loaded. Returning -1.")
        return -1.0

    try:
        # Prefer the column names recorded at fit time (keeps training order).
        expected_cols = getattr(ttr_pipeline, 'feature_names_in_', None)
        if expected_cols is None:
            # Fallback: strip the transformer prefix ('cat__', 'num__', ...).
            expected_cols = [str(name).split('__')[-1] for name in ttr_feature_names_out]
        expected_cols = [str(col) for col in expected_cols]

        row = dict(features)  # copy so the caller's dict is never mutated
        for col in expected_cols:
            if col not in row:
                logger.warning(f"Missing feature '{col}' in input dict for TTR. Adding default.")
                # Sensible defaults (could be refined per feature type).
                row[col] = 0.0 if col in TTR_NUMERICAL_FEATURES else 'Unknown'

        # One row, original column names, in the order the pipeline expects.
        input_df = pd.DataFrame([row], columns=expected_cols)

        # The model was trained on log1p(TTR); invert with expm1.
        log_ttr_pred = ttr_pipeline.predict(input_df)
        predicted_ttr = max(0.0, float(np.expm1(log_ttr_pred)[0]))  # Ensure non-negative

        logger.info(f"Predicted TTR (hours): {predicted_ttr:.2f}")
        return predicted_ttr

    except Exception as e:
        logger.error(f"Error during TTR prediction: {e}", exc_info=True)
        return -1.0

In [None]:

# Reuse a previously saved TTR estimator when one can be loaded;
# otherwise fall back to training a fresh one.
if load_ttr_model():
    logger.info("TTR estimator model loaded successfully.")
else:
    logger.info("TTR estimator model not found, attempting to train...")
    train_ttr_estimator()

In [None]:

# Smoke-test the TTR estimator with one hand-written feature set.
if not ttr_pipeline:
    print("\nTTR Estimator model not available for prediction example.")
else:
    # Example features (match what the model expects)
    test_features_ttr = dict(
        predicted_queue='Networking',     # From router output
        priority='High',                  # From ticket metadata
        day_of_week='Tue',                # Calculated from timestamp
        sentiment_score=-0.5,             # From sentiment analysis (dummy here)
        solution_similarity_score=0.85,   # From retriever output (dummy here)
        agent_load_at_open=6.0,           # From system data (dummy here)
        business_hours_flag='True',       # Calculated from timestamp
    )
    predicted_ttr_result = predict_ttr(test_features_ttr)
    print(f"\nInput Features: {test_features_ttr}")
    print(f"Predicted TTR: {predicted_ttr_result:.2f} hours")


In [None]:
def process_ticket_notebook(ticket_id: str, conversation: str, priority: str = "Medium", timestamp: datetime.datetime = None):
    """Run all pipeline stages on one conversation and gather their outputs.

    Stages, in order: summarization, entity extraction, queue routing,
    solution retrieval and TTR estimation. A stage that fails or whose model
    is unavailable adds a message to the ``warnings`` list and a neutral
    placeholder to the result instead of aborting the run.

    Args:
        ticket_id: Identifier echoed back in the result and the log lines.
        conversation: Raw conversation text.
        priority: Ticket priority passed to the TTR feature set.
        timestamp: Ticket open time; defaults to ``datetime.datetime.now()``.

    Returns:
        dict with the keys ``ticket_id``, ``summary``, ``entities``,
        ``predicted_queue``, ``recommended_solutions``,
        ``predicted_ttr_hours``, ``processing_time_ms`` and ``warnings``.
    """
    started_at = time.time()
    logger.info(f"--- Processing Ticket {ticket_id} ---")
    if timestamp is None:
        timestamp = datetime.datetime.now()
    issues = []
    out = {"ticket_id": ticket_id}

    # 1. Summarization - the raw output is always reported; downstream stages
    #    get only the text between "Summary:" and "Actions:" when it can be found.
    summary_output = get_summary(conversation)
    out['summary'] = summary_output
    if "Error:" in summary_output:
        issues.append("Summarization failed.")
        downstream_text = ""  # nothing usable to pass on
    else:
        found = re.search(r"Summary:\n(.*?)\nActions:", summary_output, re.DOTALL | re.IGNORECASE)
        downstream_text = summary_output if found is None else found.group(1).strip()

    # 2. Entity extraction works on the full conversation, not the summary.
    entities = extract_entities(conversation)
    extraction_failed = "error" in entities
    if extraction_failed:
        issues.append("Entity extraction failed.")
    out['entities'] = {} if extraction_failed else entities

    # 3. Task routing - fall back to the last default queue without a router.
    if not router_pipeline:
        issues.append("Router model not loaded.")
        out['predicted_queue'] = DEFAULT_QUEUES[-1]
    else:
        out['predicted_queue'] = predict_queue(downstream_text)

    # 4. Resolution recommendation - empty list unless retrieval succeeds.
    solutions = []
    if not index_ready:
        issues.append("Retriever index not ready.")
    else:
        retrieved = retrieve_solutions(downstream_text)
        if any("error" in candidate for candidate in retrieved):
            issues.append("Resolution retrieval failed.")
        else:
            solutions = retrieved
    out['recommended_solutions'] = solutions

    # 5. TTR estimation - NEEDS REAL DATA MAPPING; sentiment and agent load
    #    are random placeholders.
    if not ttr_pipeline:
        issues.append("TTR Estimator model not loaded.")
        out['predicted_ttr_hours'] = -1.0
    else:
        sentiment_score = round(random.uniform(-1, 1), 2)  # Placeholder
        ranked = out['recommended_solutions']
        top_similarity = ranked[0]['score'] if ranked else 0.0
        agent_load = random.randint(1, 10)  # Placeholder
        within_business_hours = 8 <= timestamp.hour < 18 and timestamp.weekday() < 5
        weekday_label = timestamp.strftime('%a')

        ttr_features = {
            'predicted_queue': out['predicted_queue'],
            'priority': priority,
            'sentiment_score': sentiment_score,
            'solution_similarity_score': top_similarity,
            'business_hours_flag': str(within_business_hours),
            'day_of_week': weekday_label,
            'agent_load_at_open': float(agent_load),
        }
        estimate = predict_ttr(ttr_features)
        if estimate < 0:
            issues.append("TTR prediction failed.")
            out['predicted_ttr_hours'] = -1.0
        else:
            out['predicted_ttr_hours'] = estimate

    elapsed_ms = (time.time() - started_at) * 1000
    out['processing_time_ms'] = elapsed_ms
    out['warnings'] = issues

    logger.info(f"--- Finished Processing Ticket {ticket_id} in {elapsed_ms:.2f} ms ---")
    return out


In [None]:

# End-to-end demo: run the full pipeline on one sample conversation and
# print a human-readable report of every stage's output.
sample_ticket_id = "NB-9876"
sample_conversation = """
User: Hey, my internet seems really slow today, especially when streaming video. It keeps buffering.
Agent: Hi there, let's check that out. Have you already tried restarting your modem and router?
User: Yes, I did that first thing. Didn't seem to change anything. My plan is supposed to be 500 Mbps.
Agent: Okay, thanks. Can you run a speed test at speedtest.net and tell me the download and upload speeds?
User: Sure, running it now... Hmm, getting about 50 Mbps down and 15 Mbps up. Way lower than usual.
Agent: Right, that's definitely low for your plan. Let me check the line from our end... I'm seeing some signal noise upstream. Could you check the coaxial cable connection at the back of the modem and at the wall outlet? Make sure they are finger-tight.
User: Let me look... The one at the modem was a little loose. I tightened it up.
Agent: Great. Could you please reboot the modem one more time now that the cable is secure? Just unplug the power for 30 seconds.
User: Okay, rebooting... [waits a minute] ...It's back online.
Agent: Perfect. Can you run that speed test again, please?
User: Running now... Wow, much better! 480 Mbps down, 22 Mbps up. Looks like that loose cable was the issue!
Agent: Excellent! Glad we could sort that out. That noise I saw on the line is gone now too. Anything else I can help with?
User: Nope, that fixed it. Thanks so much!
Agent: You're welcome! Have a good day.
"""

pipeline_results = process_ticket_notebook(sample_ticket_id, sample_conversation, priority="Medium")

# The '.2f' format spec only works on numbers; a missing key (default 'N/A')
# or a None value would otherwise crash the report, so format defensively.
try:
    predicted_ttr_display = f"{pipeline_results.get('predicted_ttr_hours', 'N/A'):.2f}"
except (TypeError, ValueError):
    predicted_ttr_display = "N/A"

# Print the results nicely
print("\n--- Pipeline Results ---")
print(f"Ticket ID: {pipeline_results.get('ticket_id')}")
print(f"\nSummary & Actions:\n{pipeline_results.get('summary', 'N/A')}")
print(f"\nEntities: {pipeline_results.get('entities', {})}")
print(f"\nPredicted Queue: {pipeline_results.get('predicted_queue', 'N/A')}")
print(f"\nPredicted TTR (hours): {predicted_ttr_display}")
print("\nRecommended Solutions:")
for sol in pipeline_results.get('recommended_solutions', []):
    print(f"  - Ticket ID: {sol['ticket_id']}, Score: {sol['score']:.4f}, Resolution: {sol['resolution']}")
if not pipeline_results.get('recommended_solutions'): print("  (None)")
print(f"\nProcessing Time (ms): {pipeline_results.get('processing_time_ms', 0):.2f}")
if pipeline_results.get('warnings'):
    print("\nWarnings:")
    for w in pipeline_results['warnings']: print(f"  - {w}")
print("--- End Results ---")


In [None]:

# Simulated agent feedback on the retriever output: the top-ranked solution
# is marked helpful and, when present, the runner-up is marked not helpful.
if pipeline_results.get('recommended_solutions'):
    feedback_data_helpful = dict(
        feedback_type="solution_helpful",
        solution_ticket_id=pipeline_results['recommended_solutions'][0]['ticket_id'],
        agent_id="agent007",
    )
    log_feedback_notebook(sample_ticket_id, "retriever", feedback_data_helpful)

    if len(pipeline_results['recommended_solutions']) >= 2:
        feedback_data_not_helpful = dict(
            feedback_type="solution_not_helpful",
            solution_ticket_id=pipeline_results['recommended_solutions'][1]['ticket_id'],
            reason="Issue was different",
            agent_id="agent007",
        )
        log_feedback_notebook(sample_ticket_id, "retriever", feedback_data_not_helpful)

# Simulated feedback on the routing decision; it is logged whether or not any
# solutions were retrieved. For an "incorrect_queue" entry, also supply a
# "correct_queue" value (e.g. "Networking").
feedback_data_queue = dict(
    feedback_type="correct_queue",
    predicted_queue=pipeline_results.get('predicted_queue'),
    agent_id="agent007",
)
log_feedback_notebook(sample_ticket_id, "router", feedback_data_queue)
