In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Optional bucket override; leave as None to fall back to the session default.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # No override given: use the default bucket of the session just created.
    sagemaker_session_bucket = sess.default_bucket()

# Rebuild the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Execution role that is later handed to the estimator.
role = sagemaker.get_execution_role()

In [None]:
import botocore
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset

s3_prefix = 'experiment/data/automodel_classification_split'
s3 = S3FileSystem()

# Both splits are stored under the same prefix of the session's default bucket.
data_bucket = sess.default_bucket()
training_input_path = f's3://{data_bucket}/{s3_prefix}/train_with_neg'
test_input_path = f's3://{data_bucket}/{s3_prefix}/test_with_neg'

In [None]:
# Display the S3 URI of the training split.
training_input_path

In [None]:
from sagemaker.huggingface import HuggingFace

In [None]:
from transformers import AutoModelForSequenceClassification
# Load distilbert-base-uncased with a 2-label sequence-classification head in the
# notebook kernel. In the cells shown here this object is only used to display
# its config; the actual training happens in the SageMaker job.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
# Display the configuration of the locally loaded model.
model.config

In [None]:
# Hyperparameters forwarded to the training job
hyperparameters = dict(
    epochs=5,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

# Resource tags (project and billing code share the same value)
resource_tags = [
    {"Key": tag_key, "Value": 'praekelt-skoll'}
    for tag_key in ('Project', 'BillingCode')
]

# Estimator settings: training script location, hardware and framework versions
estimator_settings = dict(
    entry_point='train-classification.py',
    source_dir='./scripts',
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.12',
    pytorch_version='1.9',
    py_version='py38',
    hyperparameters=hyperparameters,
    tags=resource_tags,
)
huggingface_estimator = HuggingFace(**estimator_settings)

In [None]:
# Start the training job; each channel name maps to the S3 URI of one split.
training_channels = {
    'train': training_input_path,
    'test': test_input_path,
}
huggingface_estimator.fit(inputs=training_channels)

In [None]:
# Display the estimator's `model_data` attribute (presumably the S3 URI of the
# trained model artifact — confirm against the SageMaker SDK docs).
huggingface_estimator.model_data

# Evaluate

In [None]:
# Resource tags for the endpoint (project and billing code share the same value)
resource_tags = [
    {"Key": tag_key, "Value": 'praekelt-skoll'}
    for tag_key in ('Project', 'BillingCode')
]

# Deploy the trained model behind a single-instance real-time endpoint.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    tags=resource_tags,
)

In [None]:
from datasets import load_from_disk
from datasets.filesystems import S3FileSystem

# S3 filesystem handle; no credentials are passed explicitly
s3 = S3FileSystem()

# Prefix under which the classification split is stored
s3_prefix = 'experiment/data/automodel_classification_split'

# Load the test split from the session's default bucket
simple_test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test_with_neg'
simple_tokenized_test_dataset = load_from_disk(
    simple_test_input_path,
    fs=s3,
)

In [None]:
# Display the dataset loaded from `simple_test_input_path`.
simple_tokenized_test_dataset

In [None]:
# Prefix under which the untokenized splits are stored
s3_prefix = 'experiment/data/untokenized_split'

# S3 locations of the untokenized test and train splits
untokenized_test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test_with_neg'
untokenized_train_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train_with_neg'

# Load both splits from S3 (nothing is written here)
untokenized_test_dataset = load_from_disk(untokenized_test_input_path, fs=s3)
untokenized_train_dataset = load_from_disk(untokenized_train_input_path, fs=s3)

In [None]:
# Leakage check: list the test questions that also occur in the training split.
test_questions = pd.Series(untokenized_test_dataset['question'])
seen_in_training = test_questions.isin(untokenized_train_dataset['question'])
test_questions.loc[seen_in_training]

In [None]:
# Display the untokenized test dataset.
untokenized_test_dataset

In [None]:
# One endpoint payload per test example: question and FAQ content joined
# with explicit [CLS]/[SEP] marker strings.
untokenized_inputs = []
for example in untokenized_test_dataset:
    text = ' '.join(
        ('[CLS]', example['question'], '[SEP]', example['faq_content_to_send'], '[SEP]')
    )
    untokenized_inputs.append({'inputs': [text]})

In [None]:
# Send one payload (index 12; no reason for this index is given in the notebook)
# to the deployed endpoint.
preds = predictor.predict(untokenized_inputs[12])

In [None]:
# Display the response returned by `predictor.predict`.
preds

In [None]:
# Display the dataset row at index 12, the same index used for the endpoint request.
untokenized_test_dataset[12]

# FAQ Ranking


## Batch transform


The test data must not contain questions that also appear in the training data.
Negative samples should be selected only from FAQs that occur in the training data.

In [None]:
# FAQ catalogue; for repeated titles only the first row is kept.
faqs = pd.read_csv(
    "s3://praekelt-static-resources/experiment/data/yal_faqmatches.csv"
).drop_duplicates(subset='faq_title')

train_df = pd.read_csv("s3://praekelt-static-resources/experiment/data/train_dataset_untokenized.csv")

# Restrict the catalogue to FAQs referenced by the training data.
train_faq_ids = train_df.faq_id.unique()
train_faqs = faqs.loc[faqs.faq_id.isin(train_faq_ids)]

In [None]:
# Sanity check: every FAQ referenced by a test question must be among the
# FAQs available for ranking (`train_faqs`).
# The check runs over the whole test set instead of `qdf`, which is only
# defined by the loop in a later cell and therefore does not exist on a
# fresh kernel.
untokenized_test_dataset.to_pandas().faq_id.isin(train_faqs.faq_id.unique()).all()

In [None]:
# Build the ranking frame: for every test question, its own row followed by
# one label-0 row for each other FAQ in `train_faqs`.
faqs_keep_cols = ['faq_id', 'faq_title', 'faq_content_to_send']
batch_df_list = []
test_pairs_df = untokenized_test_dataset.to_pandas()

for question, qdf in test_pairs_df.groupby('question'):
    if len(qdf) > 1:
        # More than one row for the same question: skip it.
        print(f"Following question occurred multiple times and hence will be skipped: {question}")
        continue

    faq_id = qdf.faq_id.iloc[0]
    is_other_faq = train_faqs.faq_id != faq_id
    other_faqs = train_faqs.loc[is_other_faq, faqs_keep_cols].assign(question=question, label=0)

    batch_df_list += [qdf.drop(columns=['__index_level_0__']), other_faqs]

batch_df = pd.concat(batch_df_list, axis=0)

In [None]:
# Preview the first rows of the ranking frame.
batch_df.head()

In [None]:
# Number of distinct questions in `batch_df`.
batch_df.question.nunique()

In [None]:
# One input text per row of `batch_df`, in row order: question and FAQ content
# joined with explicit [CLS]/[SEP] marker strings.
batch_inputs = [
    ' '.join(('[CLS]', question_text, '[SEP]', faq_text, '[SEP]'))
    for question_text, faq_text in zip(batch_df.question, batch_df.faq_content_to_send)
]

In [None]:
import csv
import json
from sagemaker.s3 import S3Uploader,s3_path_join

# Local dataset file
dataset_jsonl_file = "batch_test_data.jsonl"

# Write one JSON object per line; "@" characters are removed from every text.
with open(dataset_jsonl_file, "w+") as outfile:
    outfile.writelines(
        json.dumps({'inputs': text.replace("@", "")}) + '\n'
        for text in batch_inputs
    )

# Input and output prefixes for the batch transform job
batch_bucket = sess.default_bucket()
input_s3_path = s3_path_join("s3://", batch_bucket, "batch_transform/input")
output_s3_path = s3_path_join("s3://", batch_bucket, "batch_transform/output")

# Upload the local file to the input prefix.
s3_file_uri = S3Uploader.upload(dataset_jsonl_file, input_s3_path)

print(f"{dataset_jsonl_file} uploaded to {s3_file_uri}")

In [None]:
# Create the Transformer that runs the batch job.
# NOTE(review): 'MultiRecord' presumably packs several input lines into one
# request — confirm that the inference container accepts such payloads for
# content type application/json.
batch_job = huggingface_estimator.transformer(
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    strategy='MultiRecord',
    output_path=output_s3_path,  # results go to the output prefix, not next to the input file
    tags=resource_tags,
)

# Start the batch transform job on the uploaded file, splitting the input by line.
batch_job.transform(
    data=s3_file_uri,
    split_type='Line',
    content_type='application/json',
)

In [None]:
import json
from sagemaker.s3 import S3Downloader
from ast import literal_eval
# S3 URI of the result file: output prefix + input file name + ".out"
output_file = f"{dataset_jsonl_file}.out"
output_path = s3_path_join(output_s3_path, output_file)

# Download the result file into the current working directory.
S3Downloader.download(output_path, '.')

batch_transform_result = []
with open(output_file) as f:
    for line in f:
        # A line presumably holds one or more back-to-back arrays ("[...][...]");
        # removing the inner brackets lets it parse as a single flat list.
        line = "[" + line.replace("[", "").replace("]", ",") + "]"
        # Accumulate across lines. Plain assignment here kept only the records
        # of the last line whenever the file had more than one line.
        batch_transform_result.extend(literal_eval(line))

# Print the first few results.
print(batch_transform_result[:3])

## Real-time prediction Ranking

Real-time prediction

In [None]:
# Preview the ranking frame that is scored row by row against the endpoint.
batch_df.head()

In [None]:
# (rows, columns) of the ranking frame; the row count equals the number of endpoint requests made by the prediction loop.
batch_df.shape

In [None]:
# Deploy the trained model to a real-time endpoint (one ml.m5.xlarge instance).
# NOTE(review): an identical deploy call already assigned `predictor` earlier in
# this notebook and no endpoint is ever deleted; running both cells presumably
# leaves two endpoints up — confirm and clean up unused endpoints.
predictor = huggingface_estimator.deploy(1, instance_type='ml.m5.xlarge', tags=resource_tags)

In [None]:
# Column-wise accumulator for the per-row predictions
pred_results = {
    'faq_id': [],
    'actual': [],
    'predicted': [],
    'question': [],
    'context': [],
}

# Score every row of `batch_df` with one endpoint request each.
for i, (_row_label, example) in enumerate(batch_df.iterrows()):
    prediction = predictor.predict({'inputs': batch_inputs[i]})[0]
    label, confidence = prediction['label'], prediction['score']

    # Express the result as a score for LABEL_1, whichever label was returned.
    if label == 'LABEL_1':
        score = confidence
    elif label == 'LABEL_0':
        score = 1 - confidence
    else:
        score = 0.0

    row = {
        'faq_id': example['faq_id'],
        'actual': float(example['label']),
        'predicted': score,
        'question': example['question'],
        'context': example['faq_content_to_send'],
    }
    for column, value in row.items():
        pred_results[column].append(value)

In [None]:
# Collect the predictions into a frame and plot predicted score against the true label.
pred = pd.DataFrame(data=pred_results)
pred.plot(kind='scatter', x='actual', y='predicted')

In [None]:
# Inspect the positive (actual == 1.0) rows of one specific question.
pred[(pred["question"] == "What's the best way to get over a break up?") & (pred["actual"] == 1.0)]

In [None]:
# Inspect every row of this question; note the trailing space inside the literal.
pred[pred.question == "What does it mean if my HIV count is very low "]

Drop this question because it maps to two FAQs in the yal dataset.

In [None]:
# Remove every row whose question equals this exact string (including its
# trailing space). `pred` is rebound, so later cells see the filtered frame.
pred = pred[pred.question != "What does it mean if my HIV count is very low "]

In [None]:
# Save the filtered predictions to a local pickle file.
pred.to_pickle("predictions.pkl")

In [None]:
# (rows, columns) of the filtered predictions.
pred.shape

In [None]:
# Number of distinct questions left in `pred`.
pred.question.nunique()

Check ranking quality

In [None]:
from collections import defaultdict
# For every question, rank its rows by predicted score (highest first) and
# record, per cut-off n, whether a row with actual == 1.0 is within the top n.
top_n = [1, 3, 5, 7, 10]
ranking_accuracy = defaultdict(list)
for question, gdf in pred.groupby("question"):
    ranked_actual = gdf.sort_values(by='predicted', ascending=False)["actual"]
    for n in top_n:
        hit = ranked_actual.head(n).eq(1.0).any()
        ranking_accuracy[f"top_{n}"].append(hit)

In [None]:
# Share of questions with a hit, per top-n cut-off
ranking_acc_result = {
    metric: pd.Series(hits).mean()
    for metric, hits in ranking_accuracy.items()
}

print(ranking_acc_result)

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Print each top-n accuracy as a percentage, tab-separated from its name.
for metric, accuracy in ranking_acc_result.items():
    print(metric, f"{accuracy*100:.1f}%", sep="\t")

In [None]:
# ROC curve of the predicted scores against the true labels, and its area
fpr, tpr, _ = roc_curve(pred.actual, pred.predicted)
roc_auc = auc(fpr, tpr)

line_width = 2
plt.figure()

# Model curve
plt.plot(fpr, tpr, color="darkorange", lw=line_width, label="ROC curve (area = %0.2f)" % roc_auc)
# Diagonal reference line
plt.plot([0, 1], [0, 1], color="navy", lw=line_width, linestyle="--")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()

In [None]:
import seaborn as sns

# Confusion matrix at a 0.5 decision threshold on the predicted score
actual_labels = pred.actual.astype(int)
predicted_positive = pred.predicted > 0.5
cm = confusion_matrix(actual_labels, predicted_positive)

sns.heatmap(cm, annot=True)

In [None]:
# Display the raw confusion-matrix array.
cm

In [None]:
# Show one randomly sampled prediction row as a dict (no seed is set, so the
# row differs between runs).
pred.sample().iloc[0].to_dict()

## Save data to praekelt s3

In [None]:
# Distinct test questions (first occurrence kept), written to S3 as a one-column CSV.
questions_df = pred[["question"]].drop_duplicates()
questions_df.to_csv(
    "s3://praekelt-static-resources/experiment/data/test_questions.csv",
    index=False,
)

In [None]:
# Positive rows whose question matches this text once surrounding whitespace is stripped.
mask = (pred.question.apply(lambda x: x.strip()) == "What does it mean if my HIV count is very low") & (pred.actual == 1.0)

# An earlier cell removes the trailing-space variant of this question from
# `pred`, so the mask may select nothing; guard the positional lookup so the
# cell does not raise IndexError in that case.
if mask.any():
    print(pred.question[mask].iloc[0])
else:
    print("No matching question found in `pred`.")
pred[mask]

In [None]:
# List the distinct questions remaining in `pred`.
pred.question.unique()

In [None]:
import numpy as np

In [None]:
# Check that `questions_df` holds exactly the unique questions of `pred`, in the same order.
np.array_equal(questions_df.question.values, pred.question.unique())

In [None]:
# Convert the untokenized training dataset to a pandas DataFrame.
train_untokenized_df = untokenized_train_dataset.to_pandas()

In [None]:
# Preview the first rows of the training frame.
train_untokenized_df.head()

In [None]:
# Write the training frame, without its `__index_level_0__` column, to S3 as CSV.
# NOTE(review): an earlier cell reads this same S3 path into `train_df`, so on a
# first top-to-bottom run the file must already exist — confirm the intended order.
train_untokenized_df.drop(columns=["__index_level_0__"]).to_csv("s3://praekelt-static-resources/experiment/data/train_dataset_untokenized.csv", index=False)

In [None]:
# Number of training rows per label value.
train_untokenized_df.groupby('label').size()

In [None]:
# NOTE(review): bare literal with no surrounding context; its meaning cannot be
# determined from this notebook — confirm what it records or remove the cell.
1623