In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# 1. Load the processed data from Notebook 1
df = pd.read_parquet('processed_customer_support_data.parquet')

# 2. Create the combined text feature for finding duplicates
# We'll use the original 'Ticket Subject' for its clean signal
df['combined_text'] = df['Ticket Subject'] + " | " + df['Cleaned_Description']

# Let's also keep the original Ticket ID for easy reference
df = df[['Ticket ID', 'combined_text']].copy()

print("Data loaded and prepared.")
print(f"Total number of tickets to index: {len(df)}")
df.head()

Data loaded and prepared.
Total number of tickets to index: 8469


Unnamed: 0,Ticket ID,combined_text
0,1,Product setup | issue productpurchased please ...
1,2,Peripheral compatibility | issue productpurcha...
2,3,Network problem | facing problem productpurcha...
3,4,Account access | issue productpurchased please...
4,5,Data loss | issue productpurchased please assi...


In [9]:
# 1. Initialize and fit the TF-IDF Vectorizer
print("Fitting TF-IDF Vectorizer...")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])
print("TF-IDF matrix created.")
print(f"Shape of the matrix: {tfidf_matrix.shape}") # (num_tickets, vocab_size)

Fitting TF-IDF Vectorizer...
TF-IDF matrix created.
Shape of the matrix: (8469, 6030)


In [10]:
# 2. Create the search function
def find_tfidf_duplicates(ticket_id: int, top_n: int = 5):
    """
    Finds the most similar tickets to a given ticket_id using TF-IDF vectors.
    """
    # Get the index of the ticket from its ID
    try:
        ticket_idx = df.index[df['Ticket ID'] == ticket_id].tolist()[0]
    except IndexError:
        return f"Error: Ticket ID {ticket_id} not found."
    
    # Get the TF-IDF vector for our query ticket
    query_vector = tfidf_matrix[ticket_idx]
    
    # Calculate cosine similarity between the query and all other tickets
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)
    
    # Flatten the scores array and get the indices of the top N scores
    # We use [::-1] to sort in descending order
    top_indices = np.argsort(similarity_scores[0])[-top_n-1:-1][::-1]
    
    # Get the similarity scores for the top indices
    top_scores = similarity_scores[0][top_indices]
    
    # Get the original ticket details for the top indices
    results_df = df.iloc[top_indices].copy()
    results_df['similarity_score'] = top_scores
    
    print(f"--- Query Ticket ---")
    print(f"ID: {ticket_id}")
    print(f"Text: {df.iloc[ticket_idx]['combined_text']}\n")
    
    return results_df


In [11]:
# --- 3. Test the TF-IDF system ---

# Let's find a ticket about a common issue, for example, a "software bug"
# (We'll have to find a valid Ticket ID from our data first)
# Let's assume Ticket ID 20 is a good example (you can change this ID)
try:
    results = find_tfidf_duplicates(ticket_id=20)
    print("--- Top 5 Duplicates (TF-IDF) ---")
    print(results)
except TypeError:
    # This will happen if the ticket_id is not found.
    # Let's find a valid ID and try again.
    sample_id = df['Ticket ID'].iloc[20] # Get the ID of the 21st ticket
    print(f"Testing with a valid ID: {sample_id}\n")
    results = find_tfidf_duplicates(ticket_id=sample_id)
    print("--- Top 5 Duplicates (TF-IDF) ---")
    print(results)

--- Query Ticket ---
ID: 20
Text: Software bug | issue productpurchased please assist issue productpurchased please assist customer reviewer husband able take order apple ive checked available software update productpurchased none

--- Top 5 Duplicates (TF-IDF) ---
      Ticket ID                                      combined_text  \
730         731  Software bug | issue productpurchased please a...   
5008       5009  Payment issue | issue productpurchased please ...   
8456       8457  Payment issue | issue productpurchased please ...   
559         560  Display issue | issue productpurchased please ...   
6479       6480  Display issue | issue productpurchased please ...   

      similarity_score  
730           0.535271  
5008          0.459492  
8456          0.459492  
559           0.455732  
6479          0.455732  


In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

# --- 1. Load a fine-tuned Transformer model ---
# We'll use the base model from our gatekeeper, which has been fine-tuned on our data.
# We load it with AutoModel to get the hidden states, not the classification head.
model_name = "best-gatekeeper-model" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Model loaded and moved to {device}.")

Model loaded and moved to cpu.


In [13]:
# --- 2. Generate Embeddings for all tickets ---
# This function will take a batch of texts and return their embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()} # Move inputs to the correct device
    with torch.no_grad():
        outputs = model(**inputs)
    # We use the mean of the last hidden state as the sentence embedding
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Generate embeddings for all tickets in the dataframe
# This may take a few minutes to run
print("Generating embeddings for all tickets...")
# We'll do this in batches to manage memory
batch_size = 32
all_embeddings = np.vstack([get_embeddings(df['combined_text'][i:i+batch_size].tolist()) for i in range(0, len(df), batch_size)])
print("Embeddings generated.")
print(f"Shape of the embedding matrix: {all_embeddings.shape}")

Generating embeddings for all tickets...
Embeddings generated.
Shape of the embedding matrix: (8469, 768)


In [15]:
# --- 3. Create the new search function ---
def find_embedding_duplicates(ticket_id: int, top_n: int = 5):
    """
    Finds the most similar tickets to a given ticket_id using Transformer embeddings.
    """
    try:
        ticket_idx = df.index[df['Ticket ID'] == ticket_id].tolist()[0]
    except IndexError:
        return f"Error: Ticket ID {ticket_id} not found."
    
    query_embedding = all_embeddings[ticket_idx].reshape(1, -1)
    
    similarity_scores = cosine_similarity(query_embedding, all_embeddings)
    
    top_indices = np.argsort(similarity_scores[0])[-top_n-1:-1][::-1]
    top_scores = similarity_scores[0][top_indices]
    
    results_df = df.iloc[top_indices].copy()
    results_df['similarity_score'] = top_scores
    
    print(f"--- Query Ticket ---")
    print(f"ID: {ticket_id}")
    print(f"Text: {df.iloc[ticket_idx]['combined_text']}\n")
    
    return results_df

# --- 4. Test the new system on the same ticket ---
sample_id = df['Ticket ID'].iloc[20] # Get the same ID as before
results_embeddings = find_embedding_duplicates(ticket_id=sample_id)
print("--- Top 5 Duplicates (Transformer Embeddings) ---")
print(results_embeddings)

--- Query Ticket ---
ID: 21
Text: Payment issue | issue productpurchased please assist name microsoft surface pro version usage ive checked available software update productpurchased none

--- Top 5 Duplicates (Transformer Embeddings) ---
      Ticket ID                                      combined_text  \
3484       3485  Data loss | issue productpurchased please assi...   
3980       3981  Hardware issue | issue productpurchased please...   
7713       7714  Network problem | issue productpurchased pleas...   
4159       4160  Display issue | issue productpurchased please ...   
4717       4718  Network problem | issue productpurchased pleas...   

      similarity_score  
3484          0.997018  
3980          0.996951  
7713          0.996826  
4159          0.996800  
4717          0.996796  
