# Question-Question matching using sentence embeddings

* Get embeddings for all test questions, `E_test`
  * get unique `question` from test data
* Get embeddings for all reference questions, `E_ref`
  * from training data for q-q, get the positive samples, and extract unique questions from `question_ref` column
* Get match scores (cosine similarity) for all combinations of `E_test` and `E_ref`
* Pool the cosine similarities per FAQ (mean and max over each FAQ's reference questions)


In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Bucket for SageMaker artifacts. Leave as None to fall back to the
# default bucket of the current session.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role used later when creating the model, then a session that is
# explicitly bound to the resolved bucket.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
import botocore
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset

# Where the question-question matching datasets live on S3.
s3_bucket = 'praekelt-static-resources'
s3_prefix = 'experiment/data/yal/question-question-matching'

# Filesystem handle passed to `datasets` when reading from S3.
s3 = S3FileSystem()

# S3 URIs of the train / test splits.
training_input_path = f's3://{s3_bucket}/{s3_prefix}/train'
test_input_path = f's3://{s3_bucket}/{s3_prefix}/test'

In [None]:
# Display the S3 URI built for the training split.
training_input_path

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

In [None]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model in the notebook kernel; it is used below
# to encode an example sentence locally.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Example input: a list holding the sentence(s) to encode.
sentence = ['This framework generates embeddings for each input sentence']

# model.encode() turns the list of sentences into their embeddings.
embedding = model.encode(sentence)

In [None]:
# Inspect the shape of the array returned by model.encode() for the example.
embedding.shape

In [None]:
# Environment passed to the Hugging Face inference container:
#   HF_MODEL_ID - model id from hf.co/models
#   HF_TASK     - NLP task to use for predictions
hub = dict(
    HF_MODEL_ID='sentence-transformers/all-MiniLM-L6-v2',
    HF_TASK='feature-extraction',
)

# Model definition only; nothing is deployed until `.deploy()` is called.
huggingface_estimator = HuggingFaceModel(
    role=role,
    env=hub,
    py_version='py38',
    pytorch_version='1.9',
    transformers_version='4.12',
)

## Realtime inference

In [None]:
# Tags attached to the resources created by the deployment.
resource_tags = [
    {"Key": 'Project', "Value": 'praekelt-skoll'},
    {"Key": 'BillingCode', "Value": 'praekelt-skoll'},
]

# Create a real-time endpoint backed by a single instance.
# NOTE(review): no cell in the visible notebook deletes this endpoint; remember
# to call `predictor.delete_endpoint()` once the experiment is finished.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    tags=resource_tags,
)

# FAQ Ranking


## Batch transform


In [None]:
from datasets import load_from_disk

# S3 locations of the untokenized train / test datasets.
untokenized_train_input_path = f's3://{s3_bucket}/{s3_prefix}/train_untokenized'
untokenized_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_untokenized'

# Read both datasets from S3 through the shared filesystem handle.
untokenized_train_dataset = load_from_disk(untokenized_train_input_path, fs=s3)
untokenized_test_dataset = load_from_disk(untokenized_test_input_path, fs=s3)

Get reference questions

In [None]:
train_df = untokenized_train_dataset.to_pandas()

# Reference questions: keep only positive pairs (label == 1), drop the columns
# that are not needed for matching, then remove exact duplicate rows.
df_faq_ref = (
    train_df.loc[train_df["label"] == 1]
    .drop(columns=["question", "faq_content_to_send", "__index_level_0__"])
    .drop_duplicates()
)

In [None]:
# Display the de-duplicated positive (label == 1) reference rows.
df_faq_ref

Get test questions

In [None]:
test_df = untokenized_test_dataset.to_pandas()

# One row per distinct test question (the first occurrence is kept), without
# the columns that are not needed for matching.
test_questions = (
    test_df.drop_duplicates(subset="question")
    .drop(columns=["question_ref", "faq_content_to_send", "__index_level_0__"])
)

In [None]:
# Display the test rows kept after removing repeated questions.
test_questions

In [None]:
# Reference questions use the same `question` column name as the test rows;
# `is_test` records which side each row came from (0 = reference, 1 = test).
reference_rows = df_faq_ref.rename(columns={"question_ref": "question"}).assign(is_test=0)
test_rows = test_questions.assign(is_test=1)

# Stack reference rows on top of test rows. The original index labels are kept,
# so labels are not guaranteed to be unique in the combined frame.
batch_df = pd.concat([reference_rows, test_rows], axis=0)

In [None]:
# Preview the combined reference + test frame.
batch_df.head()

In [None]:
# Number of distinct question texts across reference and test rows.
batch_df.question.nunique()

In [None]:
import re

# Matches any run of whitespace (spaces, tabs, newlines). Written as a raw
# string so that `\s` reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
whitespace = re.compile(r'\s+')

In [None]:
def _format_question(example):
    """Collapse whitespace runs in the row's question and wrap it in [CLS] ... [SEP]."""
    return '[CLS] ' + whitespace.sub(' ', example.question) + ' [SEP]'


# One formatted input string per row of batch_df.
# NOTE(review): Hugging Face tokenizers normally add [CLS]/[SEP] themselves, so
# these literal markers may end up duplicated in the model input - TODO confirm.
batch_inputs = batch_df.apply(_format_question, axis=1)

## Realtime prediction

In [None]:
# Send each formatted question to the endpoint and keep one vector per input.
# NOTE(review): `[0][0]` keeps only the first element of the first item of the
# response - presumably the first token's vector rather than a pooled sentence
# embedding; TODO confirm this is the intended representation.
output_array = [
    np.asarray(predictor.predict({"inputs": formatted_question})[0][0])
    for formatted_question in batch_inputs
]

In [None]:
import numpy as np

# Convert the list of per-question vectors into a single NumPy array.
output = np.asarray(output_array)

In [None]:
# Shape of the combined frame at this point.
batch_df.shape

In [None]:
# Attach one embedding vector per row of batch_df.
# NOTE(review): relies on output_array holding exactly one entry per row, in row
# order; it is built by iterating batch_inputs, which is derived row-wise from
# batch_df.
batch_df.loc[:, "embedding"] = output_array

In [None]:
# Shape of the combined frame, for comparison with the earlier shape check.
batch_df.shape

In [None]:
# Preview the first rows of the combined frame.
batch_df.head()

In [None]:
# Split the combined frame back into reference rows and test-question rows.
# Explicit copies make both frames independent of batch_df, so they can be
# modified later without SettingWithCopyWarning or ambiguous view semantics.
ref_embeddings = batch_df[batch_df.is_test == 0].copy()
q_embeddings = batch_df[batch_df.is_test == 1].copy()

# Stack the per-row embedding vectors into arrays: one row per reference
# question in `ref`, one row per test question in `q`.
ref = np.asarray(ref_embeddings.embedding.tolist())
q = np.asarray(q_embeddings.embedding.tolist())

ref.shape, q.shape

In [None]:
# L2 norm of every reference embedding, computed once up front.
ref_norm = np.linalg.norm(ref, axis=1)

# For each test-question embedding: dot product with every reference embedding
# divided by the product of the two norms, i.e. the cosine similarity.
cossim_list = [
    np.dot(query_vec, ref.T) / (np.linalg.norm(query_vec) * ref_norm)
    for query_vec in q
]

In [None]:
# Combine the per-question similarity vectors into one array
# (one entry of cossim_list per test question).
cossim_arr = np.asarray(cossim_list)

In [None]:
# Rows come from the test questions in `q`, columns from the references in `ref`.
cossim_arr.shape

Now we have a score for every (test question, reference question) pair.

For each test question, we pool (mean and max) the scores of the reference questions that belong to the same FAQ.

In [None]:
# Preview the reference rows (including the faq_id used for grouping below).
ref_embeddings.head()

In [None]:
from collections import defaultdict

# For every test question, pool the reference similarities per FAQ and record
# the 10 best FAQs, ordered best-first, under both pooling strategies.
top10_scores = defaultdict(list)
ks = [1, 3, 5, 7, 10]

# FAQ id of each reference question, positionally aligned with the columns of
# cossim_arr. A plain array is used so index labels play no role in alignment.
ref_faq_ids = ref_embeddings["faq_id"].to_numpy()

for i, a in enumerate(cossim_arr):
    # Build a small per-question frame instead of writing a column into
    # ref_embeddings on every iteration.
    scores = (
        pd.DataFrame({"faq_id": ref_faq_ids, "cossim": np.ravel(a)})
        .groupby("faq_id")["cossim"]
        .agg(["mean", "max"])
    )
    # nlargest returns the best FAQs sorted by descending score, so slicing the
    # resulting list with [:k] really yields the top-k. (argsort yields sorting
    # positions, not ranks, and boolean masking would keep faq_id order.)
    top10_by_max = scores["max"].nlargest(10).index.tolist()
    top10_by_mean = scores["mean"].nlargest(10).index.tolist()

    question_row = q_embeddings.iloc[i]
    top10_scores['question'].append(question_row.question)
    top10_scores['top10_by_max'].append(top10_by_max)
    top10_scores['top10_by_mean'].append(top10_by_mean)
    top10_scores['faq_id'].append(question_row.faq_id)

In [None]:
top10_score_df = pd.DataFrame(top10_scores)

accuracy_by_max = defaultdict(list)
accuracy_by_mean = defaultdict(list)


def _hit_rate(frame, ranked_col, k):
    """Share of rows whose faq_id appears among the first k entries of `ranked_col`."""
    return frame.apply(lambda row: row.faq_id in row[ranked_col][:k], axis=1).mean()


# Accuracy@k for both pooling strategies.
# NOTE(review): taking the first k entries only measures top-k accuracy if each
# list is ordered best-first - verify against the cell that builds the lists.
for k in ks:
    accuracy_by_max[k].append(_hit_rate(top10_score_df, "top10_by_max", k))
    accuracy_by_mean[k].append(_hit_rate(top10_score_df, "top10_by_mean", k))

In [None]:
# Show accuracy per k when FAQs are scored by mean similarity.
accuracy_by_mean

In [None]:
# Per test question (row): indices of the reference questions ordered from
# lowest to highest similarity.
cossim_argsort = np.argsort(cossim_arr, axis=-1)

# Argsort of the argsort: the rank of each reference question within its row
# (0 = lowest similarity), so the 10 best matches carry the 10 largest ranks.
cossim_argsort_argsort = np.argsort(cossim_argsort, axis=-1)

In [None]:
# Disabled code: builds `pred_results` from `batch_transform_result`, which is
# not defined anywhere in the visible part of this notebook. It also reads
# `label`, `question_ref` and `faq_content_to_send` from batch_df rows.
# pred_results = {
#     'faq_id': [],
#     'actual': [],
#     'predicted': [],
#     'question': [],
#     'question_ref': [],
#     'context': [],
# }

# for i, prediction in enumerate(batch_transform_result):
#     score = int(prediction['label'] == 'LABEL_0') * (1 - prediction['score']) + int(prediction['label'] == 'LABEL_1') * prediction['score']
#     example = batch_df.iloc[i]
#     pred_results['faq_id'].append(example['faq_id'])
#     pred_results['actual'].append(float(example['label']))
#     pred_results['predicted'].append(score)
#     pred_results['question'].append(example['question'])
#     pred_results['question_ref'].append(example['question_ref'])
#     pred_results['context'].append(example['faq_content_to_send'])

In [None]:
from sagemaker.s3 import s3_path_join

# Build one prediction row per (test question, reference question) pair from
# the cosine-similarity matrix. `pred_results` is never populated in the visible
# notebook, so the frame is derived from data that does exist here:
#   predicted - cosine similarity of the pair
#   actual    - 1.0 when the reference belongs to the test question's FAQ
# cossim_arr is flattened row by row, so test-question values are repeated and
# reference values are tiled to stay aligned with it.
n_questions, n_refs = cossim_arr.shape
pair_question_faq = np.repeat(q_embeddings["faq_id"].to_numpy(), n_refs)
pair_ref_faq = np.tile(ref_embeddings["faq_id"].to_numpy(), n_questions)

pred = pd.DataFrame(
    {
        "faq_id": pair_ref_faq,
        "actual": (pair_question_faq == pair_ref_faq).astype(float),
        "predicted": cossim_arr.ravel(),
        "question": np.repeat(q_embeddings["question"].to_numpy(), n_refs),
        "question_ref": np.tile(ref_embeddings["question"].to_numpy(), n_questions),
    }
)

# NOTE(review): `output_s3_path` is not defined in the visible notebook; this
# location under the experiment prefix is an assumption - TODO confirm.
output_s3_path = f's3://{s3_bucket}/{s3_prefix}/output'
pred.to_pickle(s3_path_join(output_s3_path, 'predictions_question_embedding.pkl'))
pred.plot.scatter(x='actual', y='predicted')

In [None]:
# Number of distinct questions present in the prediction frame.
pred.question.nunique()

Check ranking quality

In [None]:
from collections import defaultdict
# For every question, rank its candidate rows by predicted score and record,
# for each cut-off n, whether at least one true match (actual == 1.0) is among
# the n highest-scored rows.
ranking_accuracy = defaultdict(list)
top_n = [1, 3, 5, 7, 10]

for _, question_rows in pred.groupby("question"):
    ranked_labels = question_rows.sort_values(by='predicted', ascending=False)["actual"]
    for n in top_n:
        has_hit = (ranked_labels.iloc[:n] == 1.0).any()
        ranking_accuracy[f"top_{n}"].append(has_hit)

In [None]:
# Average the per-question hit flags into one accuracy value per cut-off.
ranking_acc_result = {
    cutoff: pd.Series(hits).mean() for cutoff, hits in ranking_accuracy.items()
}

print(ranking_acc_result)

In [None]:
# Same accuracies expressed as percentages, shown as a one-column table.
(pd.Series(ranking_acc_result) * 100).to_frame()

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Print each cut-off with its accuracy as a percentage (one decimal place).
for k, v in ranking_acc_result.items():
    print(f"{k}\t{v*100:.1f}%")

In [None]:
# ROC curve of the predicted scores against the binary `actual` labels.
fpr, tpr, _ = roc_curve(pred.actual, pred.predicted)
roc_auc = auc(fpr, tpr)

line_width = 2
fig, ax = plt.subplots()
ax.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=line_width,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color="navy", lw=line_width, linestyle="--")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver operating characteristic example")
ax.legend(loc="lower right")
plt.show()

In [None]:
import seaborn as sns

# Threshold the predicted scores at 0.5 and compare with the integer labels.
actual_labels = pred.actual.astype(int)
predicted_labels = pred.predicted > 0.5
cm = confusion_matrix(actual_labels, predicted_labels)

# Annotated heatmap of the confusion matrix counts.
sns.heatmap(cm, annot=True)