# Question-question matching (classifier)

* input: `[CLS] <user query> [SEP] <example question j for content i> [SEP]` for all `i, j`

In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Open a SageMaker session. No bucket name is configured explicitly here, so
# the session's default bucket is used for the rest of the notebook.
sess = sagemaker.Session()
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # Nothing was set above: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Execution role passed to the estimator and model objects created below.
role = sagemaker.get_execution_role()
# Re-create the session, this time bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
import botocore
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset

# S3 filesystem handle; passed as `fs` when the untokenized test set is loaded from S3.
s3 = S3FileSystem()
# Bucket and key prefix under which every dataset split of this experiment is stored.
s3_bucket = 'praekelt-static-resources'
s3_prefix = 'experiment/data/yal/question-question-matching'

# All splits share one root URI; only the final path component differs.
_dataset_root = 's3://' + s3_bucket + '/' + s3_prefix
training_input_path = _dataset_root + '/train'
test_short_input_path = _dataset_root + '/test_short'
test_full_input_path = _dataset_root + '/test'

# Echo the training URI as the cell output.
training_input_path

In [None]:
from sagemaker.huggingface import HuggingFace

# Settings handed to the training job as its hyperparameters.
hyperparameters = {
    'epochs': 5,
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased',
}

# Resource tags attached to the training job; both tags carry the same value.
resource_tags = [
    {"Key": tag_key, "Value": 'praekelt-skoll'}
    for tag_key in ('Project', 'BillingCode')
]
# Estimator for the fine-tuning job: runs ./scripts/train-classification.py on a
# single ml.g4dn.2xlarge instance, with the framework versions pinned below and
# the hyperparameters / resource tags defined above.
huggingface_estimator = HuggingFace(
    entry_point='train-classification.py',
    source_dir='./scripts',
    instance_type='ml.g4dn.2xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.12',
    pytorch_version='1.9',
    py_version='py38',
    hyperparameters=hyperparameters,
    tags=resource_tags
)

In [None]:
%%capture
# Start the training job: the 'train' channel reads the training split and the
# 'test' channel reads the short test split (the full test split is not passed here).
huggingface_estimator.fit({'train': training_input_path, 'test': test_short_input_path})

In [None]:
# Display the estimator's model_data (presumably the S3 URI of the trained
# model artifact, to be copied into the evaluation cell below — confirm).
huggingface_estimator.model_data

# Evaluate

In [None]:
from sagemaker.huggingface import HuggingFaceModel

# Wrap an already-trained model artifact so it can be used for batch transform
# and endpoint deployment below.
# NOTE: this rebinds `huggingface_estimator` from the training estimator to a
# HuggingFaceModel; later cells rely on the new binding.
# NOTE(review): image_uri points at a `huggingface-pytorch-training` repository
# although this object is only used for inference below — confirm that this
# image is intended rather than the matching inference image.
huggingface_estimator = HuggingFaceModel(
    role=role, 
    model_data='s3://sagemaker-af-south-1-678681925278/huggingface-pytorch-training-2022-07-27-08-03-47-876/output/model.tar.gz',
    image_uri='626614931356.dkr.ecr.af-south-1.amazonaws.com/huggingface-pytorch-training:1.9-transformers4.12-gpu-py38-cu111-ubuntu20.04'
)

# FAQ Ranking


## Batch transform


In [None]:
from datasets import load_from_disk
# Load the untokenized test split from S3 (same bucket/prefix as the other
# splits), reading through the S3 filesystem handle created earlier.
untokenized_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_untokenized'
untokenized_test_dataset = load_from_disk(untokenized_test_input_path, fs=s3)

In [None]:
# Convert the untokenized test set to a pandas DataFrame and preview its first rows.
batch_df = untokenized_test_dataset.to_pandas()
batch_df.head()

In [None]:
# Number of distinct values in the `question` column of the test frame.
batch_df.question.nunique()

In [None]:
import re

# Matches any run of whitespace (spaces, tabs, newlines); used to collapse such
# runs to a single space before building model inputs.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
whitespace = re.compile(r'\s+')

In [None]:
def _format_pair(example):
    """Build the `[CLS] question [SEP] question_ref [SEP]` input text for one row.

    Whitespace runs inside both texts are collapsed to a single space first.
    """
    query = whitespace.sub(' ', example.question)
    reference = whitespace.sub(' ', example.question_ref)
    return '[CLS] ' + query + ' [SEP] ' + reference + ' [SEP]'


# One formatted input string per row of the test frame.
batch_inputs = batch_df.apply(_format_pair, axis=1)

In [None]:
# Show the first formatted model input as a sanity check.
batch_inputs.tolist()[0]

In [None]:
import csv
import json
from sagemaker.s3 import S3Uploader,s3_path_join

# Local JSON-lines file that holds the batch-transform inputs.
dataset_jsonl_file = "question_question_pair_score.jsonl"

# One {"inputs": ...} object per line; "@" is stripped from every input text.
with open(dataset_jsonl_file, "w+") as outfile:
    outfile.writelines(
        json.dumps({'inputs': text.replace("@", "")}) + '\n'
        for text in batch_inputs.tolist()
    )

# S3 locations for the batch-transform input and output.
batch_transform_s3_prefix = 's3://praekelt-static-resources/experiment/outputs/batch-transform'
input_s3_path = s3_path_join(batch_transform_s3_prefix, "input.jsonl")
output_s3_path = s3_path_join(batch_transform_s3_prefix, "output")

# Upload the local file and keep the URI reported by the uploader.
s3_file_uri = S3Uploader.upload(dataset_jsonl_file, input_s3_path)

print(f"{dataset_jsonl_file} uploaded to {s3_file_uri}")

In [None]:
%%capture

# Tags for the batch-transform job: the project/billing tags used for training
# plus a tag identifying the model type.
resource_tags = [
    {"Key":'Project', "Value": 'praekelt-skoll'}, 
    {"Key":'BillingCode', "Value":'praekelt-skoll'},
    {"Key": 'model_type', "Value": 'question-question-pair-score'}
]

# create Transformer to run our batch job
batch_job = huggingface_estimator.transformer(
    instance_count=1,
    instance_type='ml.g4dn.2xlarge',
    output_path=output_s3_path, # results go under the "output" path, next to (not inside) the uploaded input
    strategy='SingleRecord',
    tags=resource_tags,
)

# starts batch transform job and uses s3 data as input;
# the uploaded file is split by line and each record is sent as JSON
batch_job.transform(
    data=s3_file_uri,
    content_type='application/json',    
    split_type='Line'
)

In [None]:
import json
from sagemaker.s3 import S3Downloader
from ast import literal_eval
# The result file is named after the uploaded input file with ".out" appended.
output_file = f"{dataset_jsonl_file}.out"
output_path = s3_path_join(output_s3_path, output_file)

# Download the result file into the current working directory.
S3Downloader.download(output_path, '.')

batch_transform_result = []
with open(output_file) as f:
    for line in f:
        # Each line holds one or more bracketed arrays of predictions written
        # back to back; strip the inner brackets and re-wrap the line so it
        # parses as a single flat list literal.
        line = "[" + line.replace("[", "").replace("]", ",") + "]"
        # Accumulate instead of overwriting, so predictions from every line
        # are kept (previously only the last line's predictions survived).
        batch_transform_result.extend(literal_eval(line))

# print results
print(batch_transform_result[:3])

## Real-time prediction Ranking

Real-time prediction

In [None]:
# Preview the test frame whose rows the predictions are matched to by position below.
batch_df.head()

In [None]:
# (rows, columns) of the test frame.
batch_df.shape

In [None]:
# Deploy the model to a real-time endpoint on one ml.g4dn.xlarge instance.
# NOTE(review): `predictor` is not used or deleted in the visible part of this
# notebook — confirm the endpoint is cleaned up elsewhere.
predictor = huggingface_estimator.deploy(1, instance_type='ml.g4dn.xlarge', tags=resource_tags)

In [None]:
# Column-oriented accumulator: one list per output column.
pred_results = {
    column: []
    for column in ('faq_id', 'actual', 'predicted', 'question', 'question_ref', 'context')
}

# Predictions are matched to rows of batch_df by position.
for row_pos, output in enumerate(batch_transform_result):
    label, confidence = output['label'], output['score']
    # Score for LABEL_1: the reported score when the label is LABEL_1, one
    # minus it when the label is LABEL_0, and zero for any other label.
    positive_score = (label == 'LABEL_0') * (1 - confidence) + (label == 'LABEL_1') * confidence

    row = batch_df.iloc[row_pos]
    for column, value in (
        ('faq_id', row['faq_id']),
        ('actual', float(row['label'])),
        ('predicted', positive_score),
        ('question', row['question']),
        ('question_ref', row['question_ref']),
        ('context', row['faq_content_to_send']),
    ):
        pred_results[column].append(value)

In [None]:
# Collect the per-pair results into a DataFrame, save it as a pickle next to
# the batch-transform output, and plot predicted score against the label.
pred = pd.DataFrame(pred_results)
# The closing parenthesis of to_pickle(...) was missing (SyntaxError).
pred.to_pickle(s3_path_join(output_s3_path, 'predictions_question_question_pair_score.pkl'))
pred.plot.scatter(x='actual', y='predicted')

In [None]:
# Display the S3 location used for the batch-transform output and the predictions pickle.
output_s3_path

Check ranking quality

- For each question, we want to pool (average or max) the scores across its multiple `question_ref`s.

### TODO: Pool scores across multiple...###

In [None]:
from collections import defaultdict
# For every cutoff n, one boolean per question: did any of that question's
# n highest-scored FAQs carry a positive label?
ranking_accuracy = defaultdict(list)
top_n = [1, 3, 5, 7, 10]

# Pool the pair-level scores so each (question, FAQ) gets a single score: the
# mean over all of that FAQ's example questions (`question_ref`).
# The earlier version assigned a Series indexed by (question, question_ref)
# onto this frame, which cannot align with its row index, and then ranked the
# un-pooled pairs anyway.
# `actual` uses max so a FAQ counts as relevant if any of its pairs is labelled
# positive; `context` is presumably identical for all rows of a FAQ — confirm.
pred_avged = (
    pred.groupby(["question", "faq_id"], sort=False)
    .agg(
        actual=("actual", "max"),
        predicted=("predicted", "mean"),
        context=("context", "first"),
    )
    .reset_index()
)

for question, gdf in pred_avged.groupby("question"):
    # Rank this question's FAQs from highest to lowest pooled score.
    _df = gdf.sort_values(by='predicted', ascending=False)
    for n in top_n:
        ranking_accuracy[f"top_{n}"].append((_df["actual"].iloc[:n] == 1.0).any())

In [None]:
# Top-n accuracy: the share of questions whose top-n list contained a positive.
ranking_acc_result = {
    cutoff: pd.Series(hits).mean() for cutoff, hits in ranking_accuracy.items()
}

# Show the accuracies as percentages in a one-column table.
pd.Series(ranking_acc_result).mul(100).to_frame()

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# ROC analysis of the pair-level scores against the labels.
fpr, tpr, _ = roc_curve(pred.actual, pred.predicted)
roc_auc = auc(fpr, tpr)

lw = 2
roc_label = "ROC curve (area = %0.2f)" % roc_auc

roc_fig, roc_ax = plt.subplots()
# Model curve, then the diagonal of a random classifier for reference.
roc_ax.plot(fpr, tpr, color="darkorange", lw=lw, label=roc_label)
roc_ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("Receiver operating characteristic example")
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
import seaborn as sns

# Confusion matrix of the labels against predictions thresholded at 0.5.
true_labels = pred.actual.astype(int)
predicted_positive = pred.predicted > 0.5
cm = confusion_matrix(true_labels, predicted_positive)

# Render the matrix as an annotated heatmap.
sns.heatmap(cm, annot=True)