# AI Bot Conversation Quality Evaluation


In [None]:
# 1. This evaluation was done manually

In [2]:
import pandas as pd

file_path = '/home/mery/Downloads/dataset1 (1).xlsx'
df = pd.read_excel(file_path)

df.head()

Unnamed: 0,message,source,lead_id
0,Hey Megan! This is Clara from Fitness Club Hun...,1,104332254
1,Unfortunately no! I'm just in town visiting an...,2,104332254
2,One of our representatives will be in touch wi...,1,104332254
3,Hey Summer! This is Clara from Fitness Club Wi...,1,104334572
4,"Hi Summer, are you ready to book your Free int...",1,104334572


In [56]:
df.info

<bound method DataFrame.info of                                                message  source    lead_id
0    Hey Megan! This is Clara from Fitness Club Hun...       1  104332254
1    Unfortunately no! I'm just in town visiting an...       2  104332254
2    One of our representatives will be in touch wi...       1  104332254
3    Hey Summer! This is Clara from Fitness Club Wi...       1  104334572
4    Hi Summer, are you ready to book your Free int...       1  104334572
..                                                 ...     ...        ...
343  Hi Melissa! We are located at 13 South Orange ...       1  104712073
344  Oh okay. Don't know SODO is. I'm too far from ...       2  104712073
345  You're welcome, Melissa! I understand if the l...       1  104712073
346                           Will do! Thank you 🙆🏼‍♀️       2  104712073
347  You're welcome, Melissa! If you ever have any ...       1  104712073

[348 rows x 3 columns]>

In [57]:
grouped = df.groupby('lead_id')

conversations = {lead_id: group for lead_id, group in grouped}
conversations

{104332254:                                              message  source    lead_id
 0  Hey Megan! This is Clara from Fitness Club Hun...       1  104332254
 1  Unfortunately no! I'm just in town visiting an...       2  104332254
 2  One of our representatives will be in touch wi...       1  104332254,
 104334572:                                              message  source    lead_id
 3  Hey Summer! This is Clara from Fitness Club Wi...       1  104334572
 4  Hi Summer, are you ready to book your Free int...       1  104334572
 5  Hi Summer it's Clara, with Fitness Club Winter...       1  104334572
 6  Hi, do you have any free intro classes on thur...       2  104334572
 7  Hi Summer! Sorry, we do not. We offer them on ...       1  104334572
 8  Hi Summer it’s Clara, with Fitness Club Winter...       1  104334572
 9  Hi Summer! When are you available to workout w...       1  104334572,
 104334676:                                               message  source    lead_id
 10  Hey Lily! 

In [58]:
sample_lead_id = list(conversations.keys())[0]
sample_conversation = conversations[sample_lead_id]
sample_conversation

Unnamed: 0,message,source,lead_id
0,Hey Megan! This is Clara from Fitness Club Hun...,1,104332254
1,Unfortunately no! I'm just in town visiting an...,2,104332254
2,One of our representatives will be in touch wi...,1,104332254


In [59]:
criteria = ["Relevance", "Accuracy", "Clarity", "Engagement", "Adherence to Instructions"]

def evaluate_response(message, lead_message):
    ratings = {}
    ratings["Relevance"] = 10 if "representatives will be in touch" in message else 7
    ratings["Accuracy"] = 10 if "representatives will be in touch" in message else 8
    ratings["Clarity"] = 10 if "representatives will be in touch" in message else 9
    ratings["Engagement"] = 9 if "How can I assist you today?" in message else 7
    ratings["Adherence to Instructions"] = 10 if any(keyword in lead_message for keyword in ["friend", "gift card", "payments", "discounts", "costs", "1Pass", "onepass"]) else 7
    return ratings

evaluations = []

for lead_id, conversation in conversations.items():
    bot_messages = conversation[conversation['source'] == 1]
    lead_messages = conversation[conversation['source'] == 2]
    
    for idx, bot_msg in bot_messages.iterrows():
        corresponding_lead_msg = lead_messages.loc[lead_messages.index < idx, 'message'].values[-1] if not lead_messages.loc[lead_messages.index < idx].empty else ""
        ratings = evaluate_response(bot_msg['message'], corresponding_lead_msg)
        overall_score = sum(ratings.values()) / len(ratings)
        
        evaluation = {
            "lead_id": lead_id,
            "message": bot_msg['message'],
            **ratings,
            "Overall Score": overall_score
        }
        evaluations.append(evaluation)

evaluation_df = pd.DataFrame(evaluations)
evaluation_df

Unnamed: 0,lead_id,message,Relevance,Accuracy,Clarity,Engagement,Adherence to Instructions,Overall Score
0,104332254,Hey Megan! This is Clara from Fitness Club Hun...,7,8,9,7,7,7.6
1,104332254,One of our representatives will be in touch wi...,10,10,10,7,7,8.8
2,104334572,Hey Summer! This is Clara from Fitness Club Wi...,7,8,9,7,7,7.6
3,104334572,"Hi Summer, are you ready to book your Free int...",7,8,9,7,7,7.6
4,104334572,"Hi Summer it's Clara, with Fitness Club Winter...",7,8,9,7,7,7.6
...,...,...,...,...,...,...,...,...
202,104707876,I understand that the location may not be conv...,7,8,9,7,7,7.6
203,104712073,Hey Melissa! This is Clara from Fitness Club S...,7,8,9,7,7,7.6
204,104712073,Hi Melissa! We are located at 13 South Orange ...,7,8,9,7,7,7.6
205,104712073,"You're welcome, Melissa! I understand if the l...",7,8,9,7,7,7.6


In [60]:
conversation_scores = evaluation_df.groupby('lead_id')['Overall Score'].mean().reset_index()
conversation_scores.columns = ['lead_id', 'conversation_score']

csv_output_path = '/home/mery/Downloads/conversation_scores.csv'
conversation_scores.to_csv(csv_output_path, index=False)
conversation_scores

Unnamed: 0,lead_id,conversation_score
0,104332254,8.2
1,104334572,7.6
2,104334676,7.6
3,104335880,7.6
4,104336965,7.6
5,104341596,7.6
6,104343173,7.6
7,104343389,7.6
8,104345224,7.6
9,104348478,7.6


In [61]:
# 2. This evaluation used a Random Forest model

In [62]:
import numpy as np
np.random.seed(42)
df['label'] = np.random.randint(1, 11, df.shape[0])  

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['message'])
y = df['label']  


In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error

# model = LinearRegression()
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error: {mse}")

In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 11.667780738010231


In [67]:
def evaluate_response_ml(response):
    response_vector = vectorizer.transform([response])
    predicted_score = model.predict(response_vector)
    return predicted_score[0]

evaluations = []

for lead_id, conversation in conversations.items():
    bot_messages = conversation[conversation['source'] == 1]
    
    for idx, bot_msg in bot_messages.iterrows():
        predicted_score = evaluate_response_ml(bot_msg['message'])
        
        evaluation = {
            "lead_id": lead_id,
            "message": bot_msg['message'],
            "Predicted Score": predicted_score
        }
        evaluations.append(evaluation)

evaluation_df = pd.DataFrame(evaluations)
evaluation_df

Unnamed: 0,lead_id,message,Predicted Score
0,104332254,Hey Megan! This is Clara from Fitness Club Hun...,7.106667
1,104332254,One of our representatives will be in touch wi...,3.695373
2,104334572,Hey Summer! This is Clara from Fitness Club Wi...,5.829333
3,104334572,"Hi Summer, are you ready to book your Free int...",6.070000
4,104334572,"Hi Summer it's Clara, with Fitness Club Winter...",6.737500
...,...,...,...
202,104707876,I understand that the location may not be conv...,6.506500
203,104712073,Hey Melissa! This is Clara from Fitness Club S...,7.810000
204,104712073,Hi Melissa! We are located at 13 South Orange ...,4.550000
205,104712073,"You're welcome, Melissa! I understand if the l...",6.263333


In [68]:
conversation_scores = evaluation_df.groupby('lead_id')['Predicted Score'].mean().reset_index()
conversation_scores.columns = ['lead_id', 'conversation_score']

csv_output_path = '/home/mery/Downloads/conversation_scores_random_forest.csv'
conversation_scores.to_csv(csv_output_path, index=False)
conversation_scores

Unnamed: 0,lead_id,conversation_score
0,104332254,5.40102
1,104334572,6.493028
2,104334676,6.145833
3,104335880,6.150917
4,104336965,6.542357
5,104341596,4.367396
6,104343173,5.382738
7,104343389,6.834571
8,104345224,6.181875
9,104348478,5.907349


In [69]:
# 3. This evaluation used Semantic Similarity metrics

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer, util

In [5]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def semantic_similarity(response, ideal_response):
    response_embedding = model.encode(response, convert_to_tensor=True)
    ideal_response_embedding = model.encode(ideal_response, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(response_embedding, ideal_response_embedding)
    return similarity.item()

ideal_responses = {
    "intro_class": "I suggest you book one of the following intro classes: [class1], [class2], [class3].",
    "booking_confirmation": "Please confirm if you would like to book the class.",
    "representative_message": "One of our representatives will be in touch with you shortly to assist you further."
}

def score_conversation(conversation):
    score = 0
    total_messages = len(conversation)
    
    for idx, row in conversation.iterrows():
        if row['source'] == 1: 
            response = row['message']
            intro_similarity = semantic_similarity(response, ideal_responses["intro_class"])
            booking_similarity = semantic_similarity(response, ideal_responses["booking_confirmation"])
            representative_similarity = semantic_similarity(response, ideal_responses["representative_message"])
            max_similarity = max(intro_similarity, booking_similarity, representative_similarity)
            score += max_similarity * 10  
    
    return score / total_messages  

lead_ids = df['lead_id'].unique()
results = []

for lead_id in lead_ids:
    conversation = df[df['lead_id'] == lead_id]
    conversation_score = score_conversation(conversation)
    results.append({'lead_id': lead_id, 'conversation_score': conversation_score})

results_df = pd.DataFrame(results)
results_df.to_csv('/home/mery/Downloads/conversation_scores_semantic_similarity.csv', index=False)

results_df



Unnamed: 0,lead_id,conversation_score
0,104332254,4.411959
1,104334572,2.537026
2,104334676,2.890478
3,104335880,1.702502
4,104336965,1.604161
5,104341596,2.137816
6,104343173,1.695834
7,104343389,1.514381
8,104345224,1.546683
9,104348478,2.202436


In [None]:
# 4. This evaluation used Langchain Evaluation and the LlamaIndex OpenAI LLM

In [7]:
from langchain.evaluation import StringDistanceEvalChain
from llama_index.llms.openai import OpenAI

In [8]:
eval_chain = StringDistanceEvalChain()
llm = OpenAI(api_key='sk-proj-ObQZDE1zrNEabhwQx1V5T3BlbkFJ4sJ78Hmlv0fr5rMPZ5I4')

def get_predefined_responses():
    predefined_prompts = [
        "What should I respond when a lead asks about bringing something?",
        "What should I respond when a lead asks about a gift card?",
        "What should I respond when a lead asks about payment?",
        "What should I respond when a lead asks about discount?",
        "What should I respond when a lead asks about cost?",
        "What should I respond when a lead asks about onepass?"
    ]
    predefined_responses = {}
    for prompt in predefined_prompts:
        response = llm.complete(prompt, max_tokens=50)
        keyword = prompt.split()[-2].lower()
        predefined_responses[keyword] = response  
    return predefined_responses

predefined_responses = get_predefined_responses()

In [9]:
def evaluate_conversation(conversation_df):
    score = 10
    messages = conversation_df['message'].tolist()
    
    for msg in messages:
        if any(keyword in msg.lower() for keyword in predefined_responses):
            for keyword, expected_response in predefined_responses.items():
                if keyword in msg.lower():
                    bot_responses = [m for m in messages if 'bot' in m.lower()]
                    if not any(eval_chain.evaluate_string_distance(expected_response, response) <= 0.1 for response in bot_responses):
                        score -= 2
        if 'bot' in msg.lower() and 'confirm the booking' in msg.lower():
            score += 2
            
    return min(max(score, 1), 10)

lead_ids = df['lead_id'].unique()
results = []

for lead_id in lead_ids:
    conversation_df = df[df['lead_id'] == lead_id]
    conversation_score = evaluate_conversation(conversation_df)
    results.append({'lead_id': lead_id, 'conversation_score': conversation_score})

results_df = pd.DataFrame(results)
results_df.to_csv('/home/mery/Downloads/conversation_scores_langchain_llamaindex.csv', index=False)
results_df

Unnamed: 0,lead_id,conversation_score
0,104332254,10
1,104334572,8
2,104334676,8
3,104335880,10
4,104336965,10
5,104341596,10
6,104343173,10
7,104343389,10
8,104345224,10
9,104348478,8
