# Ensemble QQ and QA results

1. Train a QA matching model with the same train-test split (see 1.3 for more details)
2. Combine QQ and QA model scores using various methods (see below)

## Potential methods
Given a query $q'$ we want to find top 5 $(q_{ij}, a_i)$ pairs. 

An FAQ content (document) $a_i$ can have many questions $q_{ij}, j=1, \dots, k_i$ associated with it.

### Method 1
1. Given $q'$, compute scores for all $a_i$'s using Q-A scorer
2. From top $N$ $a_i$'s, say $A_N$, get corresponding questions $Q_N = \{q_{ij}| \forall j \text{ and } i \text{ s.t. } a_i \in A_N\}$
3. Compute scores for each pair $(q', q_{ij}), q_{ij}\in Q_N$, score $s_{ij}$ using Q-Q scorer
4. $s_i = \max_j(s_{ij})$
5. Rank $a_i$'s by $s_i$'s

Why would this work?
- we narrow the pool of candidates using FAQ contents (num(FAQ contents) <= num(FAQ content questions))
- if there are questions that are very similar, we'll get a high score

### Method 2
1. Compute scores for all $(q', a_i)$ (using Q-A)
2. Compute scores for all $(q', q_{ij})$ (using Q-Q)
3. Pool scores for each $a_i$ and rank

Pooling methods:
* Average
* Max

### Method 3
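Same as Method 1, but with the two scorers swapped: narrow the candidate pool with the Q-Q scorer first, then rank the shortlisted $a_i$'s with the Q-A scorer.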

# 1. Train Q-A matching model

## 1.1 Train

In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Bootstrap session, used here only to look up the default bucket.
sess = sagemaker.Session()
# Set this to a bucket name to override the session's default bucket.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # No override given: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Execution role that is handed to the HuggingFace estimator further down.
role = sagemaker.get_execution_role()
# Recreate the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
import botocore
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset

# Bucket and key prefix of the question-answer matching data.
s3_bucket = 'praekelt-static-resources'
s3_prefix='experiment/data/yal/question-answer-matching'

# Filesystem handle reused by the `load_from_disk` calls in later cells.
s3 = S3FileSystem()

# One S3 URI per dataset split; `test_short` is the split that is passed to
# the training job as its 'test' channel.
test_full_input_path = f's3://{s3_bucket}/{s3_prefix}/test'
test_short_input_path = f's3://{s3_bucket}/{s3_prefix}/test_short'
training_input_path = f's3://{s3_bucket}/{s3_prefix}/train'

In [None]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters forwarded to the training script.
hyperparameters = dict(
    epochs=5,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

# Tags attached to the training job (and reused for the batch transform job).
resource_tags = [
    {"Key": tag_key, "Value": tag_value}
    for tag_key, tag_value in (
        ('Project', 'praekelt-skoll'),
        ('BillingCode', 'praekelt-skoll'),
        ('model_type', 'question-answer-pair-score'),
    )
]

huggingface_estimator = HuggingFace(
    # what to run
    source_dir='./scripts',
    entry_point='train-classification.py',
    hyperparameters=hyperparameters,
    # where to run it
    role=role,
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    # container versions
    py_version='py38',
    pytorch_version='1.9',
    transformers_version='4.12',
    # bookkeeping
    tags=resource_tags,
)

In [None]:
# Start the training job. The 'train' channel receives the training split and
# the 'test' channel receives the short test split (not the full test set).
huggingface_estimator.fit({'train': training_input_path, 'test': test_short_input_path})

## 1.2 Evaluate

In [None]:
from datasets import load_from_disk

# The untokenized training split could be loaded the same way (kept disabled):
# untokenized_train_input_path = f's3://{s3_bucket}/{s3_prefix}/train_untokenized'
# untokenized_train_dataset = load_from_disk(untokenized_train_input_path, fs=s3)

# Load the untokenized test split from S3 through the `s3` filesystem handle.
untokenized_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_untokenized'
untokenized_test_dataset = load_from_disk(untokenized_test_input_path, fs=s3)

# Convert to a DataFrame; a later cell indexes it by position against the
# batch-transform output, so its row order must not be changed.
batch_df = untokenized_test_dataset.to_pandas()
batch_df.head()

In [None]:
import re

# Raw string literal: in a plain literal '\s' is an invalid escape sequence,
# which Python reports as a DeprecationWarning / SyntaxWarning.
whitespace = re.compile(r'\s+')


def _as_model_input(example):
    """Join a row's question and FAQ content into one marked-up input string.

    Every run of whitespace in either text is collapsed to a single space
    before the two parts are wrapped in [CLS] / [SEP] markers.
    """
    question = whitespace.sub(' ', example.question)
    faq_content = whitespace.sub(' ', example.faq_content_to_send)
    return '[CLS] ' + question + ' [SEP] ' + faq_content + ' [SEP]'


# One input string per row of `batch_df`, in the same row order.
batch_inputs = batch_df.apply(_as_model_input, axis=1)

In [None]:
import csv
import json
from sagemaker.s3 import S3Uploader,s3_path_join

# Local JSON Lines file with one {"inputs": ...} record per test pair.
dataset_jsonl_file = "question_answer_pair_score.jsonl"

with open(dataset_jsonl_file, "w+") as outfile:
    # '@' is stripped from every input before it is serialised.
    outfile.writelines(
        json.dumps({'inputs': text.replace("@","")}) + '\n'
        for text in batch_inputs.tolist()
    )

# S3 locations of the batch-transform input file and output folder.
batch_transform_s3_prefix = 's3://praekelt-static-resources/experiment/outputs/batch-transform'
output_s3_path = s3_path_join(batch_transform_s3_prefix,"output")
input_s3_path = s3_path_join(batch_transform_s3_prefix,"input.jsonl")

# Upload the local file and keep the URI that the uploader reports.
s3_file_uri = S3Uploader.upload(dataset_jsonl_file,input_s3_path)

print(f"{dataset_jsonl_file} uploaded to {s3_file_uri}")

In [None]:
# Create a Transformer from the trained estimator to run the batch job.
batch_job = huggingface_estimator.transformer(
    instance_count=1,
    instance_type='ml.g4dn.2xlarge',
    output_path=output_s3_path, # results go to the "output" folder under the same batch-transform prefix as the input
    strategy='SingleRecord',
    tags=resource_tags,
)

# Start the batch transform job on the uploaded JSON Lines file; the input is
# split by line, so every line is sent as its own JSON request.
batch_job.transform(
    data=s3_file_uri,
    content_type='application/json',    
    split_type='Line'
)

In [None]:
# Imported in this cell because no earlier cell brings these two names into
# scope (only S3Uploader and s3_path_join are imported above).
from ast import literal_eval

from sagemaker.s3 import S3Downloader, s3_path_join

dataset_jsonl_file = 'question_answer_pair_score.jsonl'
output_file = f"{dataset_jsonl_file}.out"
output_path = s3_path_join('s3://praekelt-static-resources/experiment/outputs/batch-transform/output', output_file)

# Download the batch-transform output into the working directory.
S3Downloader.download(output_path,'.')

batch_transform_result = []
with open(output_file) as f:
    for line in f:
        # Presumably each line holds back-to-back arrays ("[{...}][{...}]");
        # rewrite them as a single list literal so one parse call suffices.
        line = "[" + line.replace("[","").replace("]",",") + "]"
        # Accumulate rather than rebind, so that a file with several lines
        # (or a trailing blank line) does not discard earlier predictions.
        batch_transform_result.extend(literal_eval(line))

In [None]:
# Column -> list of values; one entry is appended per prediction.
pred_results = {
    column: []
    for column in ('faq_id', 'actual', 'predicted', 'question', 'context')
}

for i, prediction in enumerate(batch_transform_result):
    # Express every prediction as a score for LABEL_1: keep the confidence of
    # a LABEL_1 prediction, flip that of a LABEL_0 prediction, else zero.
    label, confidence = prediction['label'], prediction['score']
    if label == 'LABEL_1':
        score = confidence
    elif label == 'LABEL_0':
        score = 1 - confidence
    else:
        score = 0.0

    # Predictions are matched to the test rows purely by position.
    example = batch_df.iloc[i]
    row = {
        'faq_id': example['faq_id'],
        'actual': float(example['label']),
        'predicted': score,
        'question': example['question'],
        'context': example['faq_content_to_send'],
    }
    for column, value in row.items():
        pred_results[column].append(value)

pred = pd.DataFrame(pred_results)
pred.to_pickle(s3_path_join(output_s3_path,'predictions_question_answer_pair_score.pkl'))
pred.plot.scatter(x='actual', y='predicted')

In [None]:
from collections import defaultdict

# Cut-offs at which a question counts as answered if any of its top-n rows
# (ranked by Q-A score) is a true match.
top_n = [1, 3, 5, 7, 10]
ranking_accuracy = defaultdict(list)
for _, per_question in pred.groupby("question"):
    ranked = per_question.sort_values(by='predicted', ascending=False)
    hits_by_rank = ranked["actual"] == 1.0
    for cutoff in top_n:
        ranking_accuracy[f"top_{cutoff}"].append(hits_by_rank.iloc[:cutoff].any())

# Share of questions with a hit, per cut-off.
ranking_acc_result = {
    name: pd.Series(hits).mean() for name, hits in ranking_accuracy.items()
}

(pd.Series(ranking_acc_result) * 100).to_frame()

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve of the Q-A scores against the true labels, and the area under it.
fpr, tpr, _ = roc_curve(pred.actual, pred.predicted)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2  # line width shared by the ROC curve and the diagonal
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()

In [None]:
import seaborn as sns

# Binarise the scores at 0.5 and compare them with the integer labels.
actual_labels = pred.actual.astype(int)
predicted_labels = pred.predicted > 0.5
cm = confusion_matrix(actual_labels, predicted_labels)

sns.heatmap(cm, annot=True)

# 2. Load predictions from Q-Q scorer

In [None]:
pred

Load the test examples again, this time from the question-question matching dataset (into `batch_qq_df`).

In [None]:
# Use a dedicated name instead of rebinding `s3_prefix`: the earlier cells
# build their question-answer paths from `s3_prefix`, so overwriting it would
# make them read question-question data if they were re-run after this cell.
qq_s3_prefix = 'experiment/data/yal/question-question-matching'
untokenized_test_qq_input_path = f's3://{s3_bucket}/{qq_s3_prefix}/test_untokenized'
untokenized_test_qq_dataset = load_from_disk(untokenized_test_qq_input_path, fs=s3)

# Convert to a DataFrame; the next cell indexes it by position against the
# Q-Q batch-transform output.
batch_qq_df = untokenized_test_qq_dataset.to_pandas()
batch_qq_df.head()

In [None]:
# Imported in this cell because no earlier cell brings these two names into
# scope (only S3Uploader and s3_path_join are imported above).
from ast import literal_eval

from sagemaker.s3 import S3Downloader, s3_path_join

dataset_jsonl_file = 'question_question_pair_score.jsonl'
output_file = f"{dataset_jsonl_file}.out"
output_path = s3_path_join('s3://praekelt-static-resources/experiment/outputs/batch-transform/output', output_file)

# Download the Q-Q batch-transform output into the working directory.
S3Downloader.download(output_path,'.')

batch_transform_result = []
with open(output_file) as f:
    for line in f:
        # Presumably each line holds back-to-back arrays ("[{...}][{...}]");
        # rewrite them as a single list literal so one parse call suffices.
        line = "[" + line.replace("[","").replace("]",",") + "]"
        # Accumulate rather than rebind, so that a file with several lines
        # (or a trailing blank line) does not discard earlier predictions.
        batch_transform_result.extend(literal_eval(line))

pred_results = {
    'faq_id': [],
    'actual': [],
    'predicted': [],
    'question': [],
    'question_ref': [],
    'context': [],
}

for i, prediction in enumerate(batch_transform_result):
    # Express every prediction as a score for LABEL_1: a LABEL_0 prediction
    # contributes 1 - score, a LABEL_1 prediction contributes score.
    score = int(prediction['label'] == 'LABEL_0') * (1 - prediction['score']) + int(prediction['label'] == 'LABEL_1') * prediction['score']
    # Predictions are matched to the Q-Q test rows purely by position.
    example = batch_qq_df.iloc[i]
    pred_results['faq_id'].append(example['faq_id'])
    pred_results['actual'].append(float(example['label']))
    pred_results['predicted'].append(score)
    pred_results['question'].append(example['question'])
    pred_results['question_ref'].append(example['question_ref'])
    pred_results['context'].append(example['faq_content_to_send'])

pred_qq = pd.DataFrame(pred_results)
pred_qq.to_pickle(s3_path_join(output_s3_path,'predictions_question_question_pair_score.pkl'))
pred_qq.plot.scatter(x='actual', y='predicted')

In [None]:
# Pool the Q-Q scores of all reference questions q_ij that belong to the same
# (q', a_i) pair: once by averaging, once by taking the maximum.
qq_scores_by_pair = pred_qq.groupby(['question', 'faq_id']).predicted
scores_avged = qq_scores_by_pair.mean().rename("predicted_qq_avg").reset_index()
scores_maxed = qq_scores_by_pair.max().rename("predicted_qq_max").reset_index()

In [None]:
# Spot check: show the individual Q-Q rows for the question in the first row
# of `scores_avged`, restricted to FAQ 85.
# NOTE(review): 85 is hard-coded; presumably it is the faq_id of that first
# row -- confirm, or read it from `scores_avged.loc[0].faq_id`.
pred_qq[(pred_qq.question == scores_avged.loc[0].question) & (pred_qq.faq_id == 85)]

In [None]:
# Distinct (question, faq_id, actual) combinations, used to put the true
# label back next to each max-pooled score.
qq_pair_labels = pred_qq[['question', 'faq_id', 'actual']].drop_duplicates()
pred_qq_maxed = scores_maxed.merge(qq_pair_labels, how="left")
pred_qq_maxed

In [None]:
# Distinct (question, faq_id, actual) combinations, used to put the true
# label back next to each average-pooled score.
qq_pair_labels = pred_qq[['question', 'faq_id', 'actual']].drop_duplicates()
pred_qq_avged = scores_avged.merge(qq_pair_labels, how="left")
pred_qq_avged

In [None]:
# NOTE(review): scratch arithmetic with no stated meaning -- presumably the
# expected number of (question, FAQ) pairs; confirm or remove.
248*104

# 3. Combine scores

## 3.1 Method 1

1. Given $q'$, compute scores for all $a_i$'s using Q-A scorer
2. From top $N$ $a_i$'s, say $A_N$, get corresponding questions $Q_N = \{q_{ij}| \forall j \text{ and } i \text{ s.t. } a_i \in A_N\}$
3. Compute scores for each pair $(q', q_{ij}), q_{ij}\in Q_N$, score $s_{ij}$ using Q-Q scorer
4. $s_i = \max_j(s_{ij})$
5. Rank $a_i$'s by $s_i$'s


In [None]:
# Step 1-2 of Method 1: keep, for every question, the 10 rows with the
# highest Q-A score, then order the result by question and score (descending).
qa_ranked = pred.sort_values(by='predicted', ascending=False)
top_10_by_qa = (
    qa_ranked
    .groupby('question')
    .head(10)
    .sort_values(by=['question', 'predicted'], ascending=False)
)
top_10_by_qa.head(20)

In [None]:
# Rename the Q-A score column in place so that it stays distinguishable from
# the Q-Q score columns that are merged in next.
top_10_by_qa.rename(columns={"predicted": "predicted_qa"}, inplace=True)

In [None]:
# Attach the max-pooled Q-Q score to the Q-A shortlist. No keys are given, so
# pandas joins on the columns the two frames share (inner join by default).
top_10_by_qa = top_10_by_qa.merge(scores_maxed)

In [None]:
top_10_by_qa.head()

In [None]:
from collections import defaultdict

def get_top_k_accuracy(df, variable, top_n=(1, 3, 5, 7, 10)):
    """Return the share of questions with a correct FAQ among their top-n rows.

    For every distinct value of the ``question`` column, the rows are ranked
    by ``variable`` in descending order. A question counts as a hit at
    cut-off ``n`` when at least one of its first ``n`` ranked rows has
    ``actual == 1.0``.

    Args:
        df: DataFrame with a ``question`` column, an ``actual`` column and
            the score column named by ``variable``.
        variable: Name of the score column to rank by.
        top_n: Cut-offs to evaluate. Defaults to the cut-offs that were
            previously hard-coded, so existing calls behave as before.

    Returns:
        A single-column DataFrame indexed by ``"top_<n>"`` whose values are
        fractions in ``[0, 1]`` (callers multiply by 100 for percentages).
    """
    ranking_accuracy = defaultdict(list)
    for _, gdf in df.groupby("question"):
        # Rank once per question, then reuse the ranking for every cut-off.
        ranked_hits = gdf.sort_values(by=variable, ascending=False)["actual"] == 1.0
        for n in top_n:
            ranking_accuracy[f"top_{n}"].append(ranked_hits.iloc[:n].any())

    ranking_acc_result = {
        name: pd.Series(hits).mean() for name, hits in ranking_accuracy.items()
    }

    return pd.Series(ranking_acc_result).to_frame()

get_top_k_accuracy(top_10_by_qa, 'predicted_qq_max') * 100

In [None]:
get_top_k_accuracy(top_10_by_qa.merge(scores_avged), 'predicted_qq_avg') * 100

In [None]:
get_top_k_accuracy(pred, 'predicted') * 100

In [None]:
get_top_k_accuracy(pred_qq_maxed, 'predicted_qq_max') * 100

In [None]:
get_top_k_accuracy(pred_qq_avged, 'predicted_qq_avg') * 100

## 3.2 Method 2
1. Compute scores for all $(q', a_i)$ (using Q-A)
2. Compute scores for all $(q', q_{ij})$ (using Q-Q)
3. Pool scores for each $a_i$ and rank

Pooling methods:
* Average
* Max

In [None]:
pred_qq_maxed.head()

In [None]:
pred.head()

In [None]:
pred.shape, pred_qq_maxed.shape

In [None]:
# Join the Q-A predictions with the max-pooled Q-Q scores. No keys are given,
# so pandas joins on the columns the two frames share (inner join by default).
pred_qa_qq_max_merged = pred.merge(pred_qq_maxed)
# Sanity check: the join must neither drop nor duplicate Q-A rows.
assert pred_qa_qq_max_merged.shape[0] == pred.shape[0]
pred_qa_qq_max_merged.head()

In [None]:
# Pool the Q-A score with the max-pooled Q-Q score, row by row: once as
# their mean and once as their maximum.
qa_and_qq_max = pred_qa_qq_max_merged[['predicted', 'predicted_qq_max']]
pred_qa_qq_max_merged["pooled_avg"] = qa_and_qq_max.mean(axis=1)
pred_qa_qq_max_merged["pooled_max"] = qa_and_qq_max.max(axis=1)

In [None]:
get_top_k_accuracy(pred_qa_qq_max_merged, 'pooled_max') * 100

In [None]:
get_top_k_accuracy(pred_qa_qq_max_merged, 'pooled_avg') * 100

In [None]:
# Join the Q-A predictions with the average-pooled Q-Q scores. No keys are
# given, so pandas joins on the columns the two frames share.
pred_qa_qq_avg_merged = pred.merge(pred_qq_avged)
# Same sanity check as for the max-pooled merge: the join must neither drop
# nor duplicate Q-A rows, otherwise the accuracies below are not comparable.
assert pred_qa_qq_avg_merged.shape[0] == pred.shape[0]

# Pool the Q-A score with the average-pooled Q-Q score, row by row.
pred_qa_qq_avg_merged.loc[:, "pooled_avg"] = pred_qa_qq_avg_merged[['predicted', 'predicted_qq_avg']].mean(axis=1)
pred_qa_qq_avg_merged.loc[:, "pooled_max"] = pred_qa_qq_avg_merged[['predicted', 'predicted_qq_avg']].max(axis=1)

get_top_k_accuracy(pred_qa_qq_avg_merged, 'pooled_max') * 100

In [None]:
get_top_k_accuracy(pred_qa_qq_avg_merged, 'pooled_avg') * 100