In [None]:
from sentence_transformers import SentenceTransformer, util
from litellm import embedding
import os
import numpy as np
import pandas as pd
import torch

# Credentials must not be hardcoded in the notebook. Keep a key that is already
# exported in the environment and only fall back to the empty placeholder when none
# is set, so re-running this cell never clobbers a real key with "".
os.environ.setdefault("OPENAI_API_KEY", "")

# Import and test run GTE-Large

In [None]:
# Load the GTE-large sentence-embedding model by its model identifier.
GTE_MODEL_NAME = "thenlper/gte-large"
model = SentenceTransformer(GTE_MODEL_NAME)

In [None]:
# Smoke test for GTE-large: embed two related sentences and print their similarity.
sentences_1 = ["My grandmother is nice"]
sentences_2 = ["I think old people are nice"]

# Both batches are encoded with normalisation switched on, so the matrix product
# below is a product of normalised embeddings.
embeddings_1, embeddings_2 = (
    model.encode(batch, normalize_embeddings=True)
    for batch in (sentences_1, sentences_2)
)
similarity = np.matmul(embeddings_1, embeddings_2.T)
print(similarity)

# Test run OpenAI Ada

In [None]:
# Smoke test for OpenAI Ada: embed the same two sentences through litellm and print
# the dot product of the first returned embedding of each request.
open_1, open_2 = (
    embedding("text-embedding-ada-002", batch).data[0]["embedding"]
    for batch in (sentences_1, sentences_2)
)

similarity = np.dot(np.asarray(open_1), np.asarray(open_2))
print(similarity)

# Import and Clean Data

In [None]:
# Load the FAQ catalogue and the labelled validation questions.
# NOTE(review): the angle-bracketed names look like placeholders -- point them at the
# real CSV locations before running.
FAQ_CSV_PATH = "<praekelt_mc_faqs.csv>"
QNA_CSV_PATH = "<validation_khumo_labelled_aaq.csv>"

faq = pd.read_csv(FAQ_CSV_PATH)
qna = pd.read_csv(QNA_CSV_PATH)

In [None]:
# Keep only the labelled questions whose "FAQ Name" exists as an FAQ title;
# a question labelled with an unknown FAQ can never be matched by retrieval.
faq_set = set(faq["faq_title"])
qna_set = set(qna["FAQ Name"])

# Labels that appear in the questions but not in the FAQ catalogue.
unknown_titles = qna_set - faq_set
has_known_title = ~qna["FAQ Name"].isin(unknown_titles)
qna = qna.loc[has_known_title]
qna

# Evaluate GTE-large

In [None]:
# Embed every question and every FAQ body with GTE-large.
# Both calls share the same options: normalised vectors, returned as tensors.
encode_options = {"normalize_embeddings": True, "convert_to_tensor": True}

question_texts = qna["Question"].tolist()
faq_texts = faq["faq_content_to_send"].tolist()

question_embeddings = model.encode(question_texts, **encode_options)
faq_embeddings = model.encode(faq_texts, **encode_options)

In [None]:
# Retrieve the 10 best-scoring FAQs for each question with sentence-transformers'
# semantic search. Change top_k to measure the match rate at a different cut-off.
result = util.semantic_search(
    query_embeddings=question_embeddings,
    corpus_embeddings=faq_embeddings,
    top_k=10,
)

In [None]:
# Build the per-question results table for GTE-large and report the top-10 match rate:
# the share of questions whose labelled FAQ title is among the retrieved FAQ titles.

# corpus_id is the position of an FAQ in faq_embeddings, which was built from the faq
# rows in order, so resolve titles/contents by position rather than by index label.
faq_titles = faq["faq_title"].tolist()
faq_contents = faq["faq_content_to_send"].tolist()

# NOTE(review): nothing in this notebook creates "question_with_instruction"; when the
# column is absent fall back to "Question", the text that was actually embedded.
question_column = (
    "question_with_instruction"
    if "question_with_instruction" in qna.columns
    else "Question"
)

# One inner list per question, ordered as returned by semantic_search.
top_10_corpus_id = [[hit["corpus_id"] for hit in hits] for hits in result]
top_10_scores = [[hit["score"] for hit in hits] for hits in result]
top_10_faq_title = [[faq_titles[idx] for idx in ids] for ids in top_10_corpus_id]
top_10_faq_content_to_send = [[faq_contents[idx] for idx in ids] for ids in top_10_corpus_id]

# The qna Series supply the row index; the lists are attached positionally.
results_df = pd.DataFrame({
    "question": qna[question_column],
    "top_10_corpus_id": top_10_corpus_id,
    "top_10_scores": top_10_scores,
    "top_10_faq_title": top_10_faq_title,
    "top_10_faq_content_to_send": top_10_faq_content_to_send,
    "correct_title": qna["FAQ Name"],
})

# A question counts as correct when its labelled title is among the retrieved titles.
results_df["correct"] = [
    title in candidates
    for title, candidates in zip(results_df["correct_title"], results_df["top_10_faq_title"])
]
results_df["correct"].mean()

# Evaluate Ada (OpenAI)

In [None]:
# Get Ada embeddings for every FAQ body in faq (one request per FAQ), then stack
# them into a single tensor.
faq_embeddings_openai = torch.tensor(
    [
        embedding("text-embedding-ada-002", faq_text).data[0]["embedding"]
        for faq_text in faq["faq_content_to_send"]
    ]
)

In [None]:
# Get Ada embeddings for every question in qna (one request per question), then
# stack them into a single tensor.
question_embeddings_openai = torch.tensor(
    [
        embedding("text-embedding-ada-002", question_text).data[0]["embedding"]
        for question_text in qna["Question"]
    ]
)

In [None]:
# Retrieve the 10 best-scoring FAQs for each question from the Ada embeddings with
# sentence-transformers' semantic search. Change top_k to measure the match rate at
# a different cut-off.
result_openai = util.semantic_search(
    query_embeddings=question_embeddings_openai,
    corpus_embeddings=faq_embeddings_openai,
    top_k=10,
)

In [None]:
# Build the per-question results table for Ada (OpenAI) and report the top-10 match
# rate: the share of questions whose labelled FAQ title is among the retrieved titles.

# corpus_id is the position of an FAQ in faq_embeddings_openai, which was built by
# iterating the faq rows in order, so resolve titles/contents by position.
faq_titles = faq["faq_title"].tolist()
faq_contents = faq["faq_content_to_send"].tolist()

# NOTE(review): nothing in this notebook creates "question_with_instruction"; when the
# column is absent fall back to "Question", the text that was actually embedded.
question_column = (
    "question_with_instruction"
    if "question_with_instruction" in qna.columns
    else "Question"
)

# One inner list per question, ordered as returned by semantic_search.
top_10_corpus_id = [[hit["corpus_id"] for hit in hits] for hits in result_openai]
top_10_scores = [[hit["score"] for hit in hits] for hits in result_openai]
top_10_faq_title = [[faq_titles[idx] for idx in ids] for ids in top_10_corpus_id]
top_10_faq_content_to_send = [[faq_contents[idx] for idx in ids] for ids in top_10_corpus_id]

# The qna Series supply the row index; the lists are attached positionally.
results_openai_df = pd.DataFrame({
    "question": qna[question_column],
    "top_10_corpus_id": top_10_corpus_id,
    "top_10_scores": top_10_scores,
    "top_10_faq_title": top_10_faq_title,
    "top_10_faq_content_to_send": top_10_faq_content_to_send,
    "correct_title": qna["FAQ Name"],
})

# Score the Ada table itself. Scoring the GTE table (results_df) here would simply
# repeat the GTE match rate under the Ada heading.
results_openai_df["correct"] = [
    title in candidates
    for title, candidates in zip(
        results_openai_df["correct_title"], results_openai_df["top_10_faq_title"]
    )
]
results_openai_df["correct"].mean()