In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
# --- 1. Load Data and Create the Knowledge Base ---

# Load the full processed dataset
df_full = pd.read_parquet('processed_customer_support_data.parquet')

# Create the combined_text feature we'll use for queries
df_full['combined_text'] = df_full['Ticket Subject'] + " | " + df_full['Cleaned_Description']

# Create our "Knowledge Base" by filtering for closed tickets that have a resolution text
# We also drop duplicates to ensure our knowledge base is clean
knowledge_base_df = df_full[df_full['Ticket Status'] == 'Closed'].dropna(subset=['Resolution']).drop_duplicates(subset=['Resolution']).copy()

# Keep only the essential columns for the knowledge base
knowledge_base_df = knowledge_base_df[['Ticket ID', 'combined_text', 'Resolution']].reset_index(drop=True)

print("Knowledge Base created successfully.")
print(f"Number of unique, solved tickets in the Knowledge Base: {len(knowledge_base_df)}")

# Display some examples from our knowledge base
print("\nExample entries from the Knowledge Base:")
print(knowledge_base_df.head())

Knowledge Base created successfully.
Number of unique, solved tickets in the Knowledge Base: 2769

Example entries from the Knowledge Base:
   Ticket ID                                      combined_text  \
0          3  Network problem | facing problem productpurcha...   
1          4  Account access | issue productpurchased please...   
2          5  Data loss | issue productpurchased please assi...   
3         11  Data loss | issue productpurchased please assi...   
4         12  Software bug | issue productpurchased please a...   

                                          Resolution  
0       Case maybe show recently my computer follow.  
1      Try capital clearly never color toward story.  
2                        West decision evidence bit.  
3              Measure tonight surface feel forward.  
4  Measure there house management pick knowledge ...  


In [3]:
# --- 2. Load the Fine-Tuned Transformer Model ---

# We'll use the same model as before to generate embeddings
model_name = "best-gatekeeper-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode and on the correct device
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"\nModel loaded and moved to {device}.")


Model loaded and moved to cpu.


In [4]:
# --- 3. Generate Embeddings for the Knowledge Base ---

# Re-use the embedding function from the previous notebook
def get_embeddings(texts, batch_size=32):
    all_embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        embs = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embs.append(embs)
    return np.vstack(all_embs)

# Generate embeddings for the RESOLUTIONS (this is our searchable index)
print("Generating embeddings for the resolutions...")
resolution_embeddings = get_embeddings(knowledge_base_df['Resolution'].tolist())
print("Resolution embeddings generated.")
print(f"Shape of the resolution embedding matrix: {resolution_embeddings.shape}")

# Generate embeddings for the original PROBLEMS (for testing purposes)
print("\nGenerating embeddings for the original problems...")
problem_embeddings = get_embeddings(knowledge_base_df['combined_text'].tolist())
print("Problem embeddings generated.")
print(f"Shape of the problem embedding matrix: {problem_embeddings.shape}")

Generating embeddings for the resolutions...
Resolution embeddings generated.
Shape of the resolution embedding matrix: (2769, 768)

Generating embeddings for the original problems...
Problem embeddings generated.
Shape of the problem embedding matrix: (2769, 768)


In [5]:
# --- 4. Build the Semantic Search Function ---

def search_knowledge_base(query_text: str, top_n: int = 5):
    """
    Searches the knowledge base for the most relevant resolutions to a query.
    """
    # Generate the embedding for the query
    query_embedding = get_embeddings([query_text])
    
    # Calculate cosine similarity against all resolution embeddings
    similarity_scores = cosine_similarity(query_embedding, resolution_embeddings)
    
    # Get the top N results
    top_indices = np.argsort(similarity_scores[0])[-top_n:][::-1]
    top_scores = similarity_scores[0][top_indices]
    
    # Get the results from the knowledge base DataFrame
    results_df = knowledge_base_df.iloc[top_indices].copy()
    results_df['similarity_score'] = top_scores
    
    # We are interested in the original problem and the found resolution
    return results_df[['combined_text', 'Resolution', 'similarity_score']]

# --- 5. Test the Semantic Search Engine ---

# Let's take a sample problem from our knowledge base to use as a query
# We'll use the 10th entry.
sample_index = 10
query_ticket_text = knowledge_base_df['combined_text'].iloc[sample_index]
actual_resolution = knowledge_base_df['Resolution'].iloc[sample_index]

print(f"--- Searching for solutions to the following ticket ---")
print(f"Query Text: {query_ticket_text}\n")
print(f"--- The ACTUAL resolution for this ticket is ---")
print(f"Actual Resolution: {actual_resolution}\n")

# Now, let's use our search engine to find the best matching resolutions
search_results = search_knowledge_base(query_text=query_ticket_text)

print(f"--- Top 5 Search Results from the Knowledge Base ---")
# We'll print each result for better readability
for i, row in search_results.iterrows():
    print(f"--- Result {i+1} (Score: {row['similarity_score']:.4f}) ---")
    print(f"Original Problem: {row['combined_text']}")
    print(f"Found Resolution: {row['Resolution']}\n")

--- Searching for solutions to the following ticket ---
Query Text: Product compatibility | issue productpurchased please assist seller sold item receive replacement exchange item full refund exchange immediately upon ive already contacted customer support multiple time issue remains unresolved

--- The ACTUAL resolution for this ticket is ---
Actual Resolution: Certain myself month past tree benefit.

--- Top 5 Search Results from the Knowledge Base ---
--- Result 1914 (Score: 0.9296) ---
Original Problem: Installation support | issue productpurchased please assist price changed brandid productname productname productbrandid ive already contacted customer support multiple time issue remains unresolved
Found Resolution: Example issue behavior financial stuff record seek far.

--- Result 2711 (Score: 0.9252) ---
Original Problem: Data loss | issue productpurchased please assist issue productpurchased please assist help working issue ive performed factory reset productpurchased hoping wo