In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Bucket used for SageMaker artefacts; leave as None to fall back to the
# session's default bucket.
sagemaker_session_bucket = None

# A throwaway session is only needed to discover the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()

# Re-create the session so that it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
# FAQ catalogue: keep the first row for every FAQ title.
faqs = pd.read_csv("s3://praekelt-static-resources/experiment/data/yal_faqmatches.csv")
faqs = faqs.drop_duplicates(subset="faq_title")

# Validation questions are stored as two batch files; stack them into one
# frame, drop the exported index column and renumber the rows.
question_batches = [
    pd.read_csv(f"s3://praekelt-static-resources/yal_validation/yal_validation_questions_batch_{n}.csv")
    for n in [1, 2]
]
df = pd.concat(question_batches, axis=0)
df = df.drop(columns=["Unnamed: 0"]).reset_index(drop=True)

# Keep the first occurrence of every question text.
df = df.drop_duplicates(subset="question")

# Attach the FAQ details to each question via the shared title, then remove
# rows that are exact duplicates across all columns.
df_merged = df.merge(faqs, on="faq_title", how="inner").drop_duplicates()

In [None]:
# Preview the first rows of the merged question/FAQ table.
df_merged.head()

In [None]:
# Count rows whose question text repeats an earlier row after the merge.
df_merged.question.duplicated().sum()

ADDING ANOTHER SYNTHETIC QUESTION

In [None]:
# Distinct answer texts attached to FAQ id 119.
df_merged[df_merged.faq_id == 119].faq_content_to_send.unique()

In [None]:
# All rows currently mapped to FAQ id 119; the first one serves as the
# template (title and content) for the synthetic question added next.
faq_119_rows = df_merged[df_merged.faq_id == 119]
faq_to_add_question = faq_119_rows.iloc[0]
faq_119_rows

In [None]:
# Add one hand-written question for FAQ 119, copying the FAQ's title and
# content from the template row selected above.
synthetic_row = {
    "question": "how to tell my viral load",
    "faq_title": faq_to_add_question.faq_title,
    "faq_id": 119,
    "faq_content_to_send": faq_to_add_question.faq_content_to_send,
}

# DataFrame.append was removed in pandas 2.0, so build the row as a one-row
# frame and concatenate it. The membership check keeps the cell idempotent:
# re-running it must not add the same question twice, because later cells
# assert that question texts are unique.
if not (df_merged.question == synthetic_row["question"]).any():
    df_merged = pd.concat([df_merged, pd.DataFrame([synthetic_row])])

# Equivalent to the original ignore_index=True: renumber rows 0..n-1.
df_merged = df_merged.reset_index(drop=True)

# Data Split

We need 3 question groups
1. Questions to use in representing FAQ contents (each content is represented by (q, a) pair(s) -- there could be multiple questions for an answer)
2. Questions for training - train to match these questions to 1
3. Questions for testing - test how well the model matches these questions to 1

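Negative samples: for each FAQ, pick unrelated questions (questions that belong to other FAQs) and label the pairs as 0. The plan was 8~10 per FAQ; the code below samples 5 unrelated questions per question of each FAQ. These questions can come from 1&2 combined.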

* For Q-Q matching
  * Train: positive + negative samples
    * (+) For each `Q` in 2, create `(Q, q)` pairs labelled 1 if `Q` and `q` have the same `faq_id` (`q` in 1)
    * (-) as described above
  * Test:
    * (+) For each `Q` in 3, create `(Q, q)` pairs labelled 1 if `Q` and `q` have the same `faq_id` (`q` in 1)
    * (-) For each `Q` in 3, create `(Q, q-)` pairs labelled 0, for each `q- != q` in 1
* For Q-A relevance scoring
    - Train: Use 1 & 2 & negative samples
    - Test: 3

In [None]:
from sklearn.model_selection import train_test_split

# Only these four columns are used from here on.
keep_columns = ['question', 'faq_title', 'faq_id', 'faq_content_to_send']
df_merged = df_merged.loc[:, keep_columns]

# Group 1 (reference questions): draw two distinct questions per FAQ title.
# Sampling without replacement raises if a title has fewer than two questions.
questions_by_title = df_merged.groupby('faq_title')
df_faq_ref = questions_by_title.sample(n=2, replace=False, random_state=42)

In [None]:
# Preview the sampled reference questions.
df_faq_ref.head()

In [None]:
# (rows, columns) of the full merged table.
df_merged.shape

In [None]:
# (rows, columns) of the reference-question table.
df_faq_ref.shape

In [None]:
# Rows of df_merged whose index label was not sampled as a reference question.
df_merged[~df_merged.index.isin(df_faq_ref.index)]

In [None]:
# Everything that was not picked as a reference question stays available for
# the train/test split.
is_reference_row = df_merged.index.isin(df_faq_ref.index)
df_merged_remaining = df_merged.loc[~is_reference_row]

In [None]:
# Rename the question column so reference questions can sit next to ordinary
# questions in the same row once the frames are merged.
df_faq_ref = df_faq_ref.rename(columns={'question': 'question_ref'})
df_faq_ref.head()

In [None]:
# Split the non-reference questions 60/40 into training and test questions,
# stratified by FAQ title so the two splits follow the same title distribution.
positive_train_df, test_df = train_test_split(
    df_merged_remaining,
    test_size=0.4,
    random_state=42,
    stratify=df_merged_remaining["faq_title"],
)

In [None]:
# Histogram of the number of non-reference questions per FAQ title.
df_merged_remaining.groupby('faq_title').size().hist(bins=16)

In [None]:
# Smallest number of questions attached to any single FAQ id.
df_merged.groupby("faq_id").size().min()

In [None]:
# FAQ ids that have exactly three questions in total.
df_merged.groupby("faq_id").size().loc[lambda counts: counts == 3]

In [None]:
# True would mean a remaining question text also appears among the reference questions.
df_merged_remaining.question.isin(df_faq_ref.question_ref).any()

In [None]:
# Leakage check: True would mean a test question text also appears in the training split.
test_df.question.isin(positive_train_df.question).any()

## Positive samples

In [None]:
# Preview the training questions (each row is a question with its correct FAQ).
positive_train_df.head()

In [None]:
# Positive question-question pairs: the merge joins on every column the two
# frames share, so each training question is matched with the reference
# questions of its own FAQ. All of these pairs get label 1.
positive_train_qq_df = positive_train_df.merge(df_faq_ref).assign(label=1)
positive_train_qq_df.head()

In [None]:
# Each training question should pair with exactly two reference questions
# (two references were sampled per FAQ title).
assert positive_train_df.shape[0] * 2 == positive_train_qq_df.shape[0]

## Negative samples

!!! Remember to set the labels as integers!

Create a dataframe of all questions for each FAQ (including reference questions)

In [None]:
# Put the reference questions back under the plain "question" column name so
# they can be stacked with the training questions.
reference_as_questions = df_faq_ref.rename(columns={'question_ref': 'question'})

# Every (question, FAQ) row in this frame is a correct match, hence label 1.
positive_df = pd.concat([positive_train_df, reference_as_questions], axis=0).assign(label=1)

# A question text must not occur twice across training and reference questions.
assert not positive_df.question.duplicated().any()
positive_df.head()

In [None]:
# Spot check: all positive questions recorded for FAQ id 8.
positive_df[positive_df.faq_id == 8]

In [None]:
# Build negative question-question pairs: for every FAQ, pair each of its own
# questions (used as question_ref) with questions that belong to other FAQs.
negative_qq_samples = []
for cur_id, _df in positive_df.groupby('faq_id'):
    cur_faq_rows = faqs.faq_id == cur_id

    # All questions whose FAQ ID is not this one (wrong questions for this FAQ),
    # re-stamped with this FAQ's id/title/content. .assign() returns a new
    # frame, so nothing is written through a filtered slice of positive_df
    # (the previous .loc writes on the slice raised SettingWithCopyWarning).
    cur_negative_questions = positive_df[positive_df.faq_id != cur_id].assign(
        faq_id=cur_id,
        faq_title=faqs.loc[cur_faq_rows, "faq_title"].iloc[0],
        faq_content_to_send=faqs.loc[cur_faq_rows, "faq_content_to_send"].iloc[0],
    )

    # Merge with this FAQ's own questions -- creates all possible negative
    # samples for this FAQ. The join uses every shared column (the FAQ columns
    # and the still-positive "label"), which are constant on both sides here.
    all_possible_negative_samples = cur_negative_questions.merge(
        _df.rename(columns={"question": "question_ref"})  # question_ref holds correct questions for this FAQ
    )

    # Sample 5 negative samples per reference question.
    cur_negative_samples = all_possible_negative_samples.groupby("question_ref").sample(5, random_state=42)
    negative_qq_samples.append(cur_negative_samples)

# The label column was carried over as 1 from positive_df; overwrite it with
# the integer 0 for the negative pairs.
negative_train_qq_df = pd.concat(negative_qq_samples, axis=0).assign(label=0)

train_df = pd.concat([negative_train_qq_df, positive_train_qq_df], axis=0)

In [None]:
# Number of distinct reference-side questions in the Q-Q training pairs.
train_df.question_ref.nunique()

In [None]:
# Distinct training questions plus distinct reference questions
# (presumably compared by eye with the count in the previous cell).
positive_train_df.question.nunique() + df_faq_ref.question_ref.nunique()

Create dataset for Q-A

In [None]:
# Negative question-answer pairs: take the negative question-question pairs,
# discard the reference question and collapse the rows that become identical.
negative_without_ref = negative_train_qq_df.drop(columns=["question_ref"])
negative_train_df = negative_without_ref.drop_duplicates()

# Q-A training set = every positive pair (reference questions included, since
# they are valid questions for their FAQ) followed by the negative pairs.
qa_train_df = pd.concat([positive_df, negative_train_df], axis=0)

In [None]:
# Inspect the combined Q-A training frame.
qa_train_df

In [None]:
# Snapshot of the raw test questions.
# NOTE(review): qa_test_df is reassigned further down from the Q-Q test frame,
# so this copy only matters for the cells in between -- confirm it is needed.
qa_test_df = test_df.copy()

In [None]:
# Row counts of every intermediate dataset, printed as "<caption> <count>".
dataset_sizes = {
    "FAQs:": faqs.shape[0],
    "Merged FAQs:": df_merged.faq_id.nunique(),
    "Reference Q-A pairs:": df_faq_ref.shape[0],
    "Positive training data:": positive_df.shape[0],
    "Negative training data:": negative_train_df.shape[0],
    "Positive training Q-Q data:": positive_train_qq_df.shape[0],
    "Negative training Q-Q data:": negative_train_qq_df.shape[0],
    "Training data for Q-Q:": train_df.shape[0],
    "Training data for Q-A:": qa_train_df.shape[0],
    "Test data:": test_df.shape[0],
}
for caption, size in dataset_sizes.items():
    print(caption, size)

In [None]:
# Histogram of the number of test questions per FAQ id.
test_df.groupby('faq_id').size().hist(bins=16)

In [None]:
# Class balance of the Q-Q training pairs (rows per label).
train_df.groupby("label").size()

### Test DF for q-q matching

* Test:
  * (+) For each Q in 3, create (Q, q) pairs labelled 1 if Q and q have the same faq_id (q in 1)
  * (-) For each Q in 3, create (Q, q-) pairs labelled 0, for each q- != q in 1

Might as well create entire test set..

In [None]:
# Test questions with their correct FAQ, labelled 1. assign() returns a new
# frame, so test_df itself is left untouched here.
# Note: test_df is reassigned to the pair-level frame a few cells below, so
# this cell must run before that one.
positive_test_df = test_df.assign(label=1)

# Pair each test question with the reference questions of its own FAQ.
positive_test_qq_df = positive_test_df.merge(df_faq_ref)
positive_test_qq_df.head()

In [None]:
# Each test question text must occur only once.
assert not positive_test_df.question.duplicated().any()

In [None]:
# For each test question, pair it with the reference questions of every FAQ
# other than its own; none of those FAQs answers the question.
negative_qq_test_samples = [
    df_faq_ref[df_faq_ref.faq_id != test_row["faq_id"]].assign(question=test_row["question"])
    for _, test_row in positive_test_df.iterrows()
]

# All of these pairs are mismatches, hence label 0.
negative_test_qq_df = pd.concat(negative_qq_test_samples, axis=0).assign(label=0)

# Full Q-Q test set: negatives followed by positives.
test_df = pd.concat([negative_test_qq_df, positive_test_qq_df])

In [None]:
# Average number of reference questions per FAQ id.
df_faq_ref.groupby("faq_id").size().mean()

In [None]:
# Average number of (question, question_ref) pairs per test question.
test_df.groupby("question").size().mean()

In [None]:
# Every test question is paired with the reference questions of all FAQs
# (its own as positives, the others as negatives); the factor 2 assumes each
# FAQ id contributed exactly two reference questions.
assert test_df.shape[0] == positive_test_df.shape[0] * 2 * df_faq_ref.faq_id.nunique()

In [None]:
# Preview the pair-level Q-Q test set.
test_df.head()

### Test DF for q-a matching

Drop the `question_ref` column and then remove the duplicate rows that result, leaving one row per (question, FAQ) pair.

In [None]:
# Q-A test set: the Q-Q test pairs without the reference question, with the
# rows that became identical collapsed into one.
qa_test_df = test_df.drop(columns=['question_ref']).drop_duplicates()

In [None]:
# Inspect the Q-A test frame.
qa_test_df

In [None]:
# Expect exactly one row per (test question, FAQ id) combination.
assert qa_test_df.shape[0] == df_faq_ref.faq_id.nunique() * positive_test_df.shape[0]

# Tokenize

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

# Tokenizer shared by the Q-Q and Q-A preprocessing functions below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Question-question matching data as HuggingFace datasets.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

Training data for question-answer pairing

In [None]:
# Question-answer matching data as HuggingFace datasets.
qa_matching_train_dataset = Dataset.from_pandas(qa_train_df)
qa_matching_test_dataset = Dataset.from_pandas(qa_test_df)

# Preprocess

In [None]:
def _tokenize_pairs(examples, second_column):
    """Tokenize (question, <second_column>) text pairs for a batch of examples.

    Only the second sequence is truncated; text that does not fit into 384
    tokens is emitted as additional overlapping windows (stride 128), so the
    result can contain more rows than the input batch.

    Args:
        examples: batch mapping column name -> list of values. Must contain
            'question' and ``second_column``; may contain 'label'.
        second_column: name of the column holding the second text of the pair.

    Returns:
        The tokenizer output. When the batch has a 'label' column and the
        tokenizer reports 'overflow_to_sample_mapping', a 'label' entry is
        added with one label per produced row.
    """
    encoded = tokenizer(
        examples['question'],
        examples[second_column],
        max_length=384,
        padding='max_length',
        truncation="only_second",
        return_overflowing_tokens=True,
        stride=128,
    )

    # Overflow windows make the output longer than the input batch. The input
    # 'label' column would then no longer line up with the tokenized rows and
    # Dataset.map would fail on the length mismatch, so repeat each example's
    # label for every window it produced.
    if "label" in examples and "overflow_to_sample_mapping" in encoded:
        source_labels = examples["label"]
        encoded["label"] = [source_labels[i] for i in encoded["overflow_to_sample_mapping"]]

    return encoded


def custom_tokenize_qq(examples):
    """Tokenize (question, question_ref) pairs for question-question matching."""
    return _tokenize_pairs(examples, 'question_ref')


def custom_tokenize_qa(examples):
    """Tokenize (question, faq_content_to_send) pairs for question-answer matching."""
    return _tokenize_pairs(examples, 'faq_content_to_send')

In [None]:
# Raw text/metadata columns are dropped after tokenization; 'label' is kept.
remove_columns = ['question', 'faq_id', 'faq_title', 'faq_content_to_send', '__index_level_0__', 'question_ref', ]

# Tokenize the Q-Q train and test splits with identical settings.
simple_tokenized_train_dataset, simple_tokenized_test_dataset = [
    split.map(custom_tokenize_qq, batched=True, batch_size=1000, remove_columns=remove_columns)
    for split in (train_dataset, test_dataset)
]

In [None]:
# Raw text/metadata columns are dropped after tokenization; 'label' is kept.
remove_columns = ['question', 'faq_id', 'faq_title', 'faq_content_to_send', '__index_level_0__', ]

# Tokenize the Q-A train and test splits with identical settings.
tokenized_qa_matching_train_dataset, tokenized_qa_matching_test_dataset = [
    split.map(custom_tokenize_qa, batched=True, batch_size=1000, remove_columns=remove_columns)
    for split in (qa_matching_train_dataset, qa_matching_test_dataset)
]

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# S3 destination for the question-question matching datasets.
s3 = S3FileSystem()
s3_bucket = 'praekelt-static-resources'
s3_prefix='experiment/data/yal/question-question-matching'

simple_training_input_path = f's3://{s3_bucket}/{s3_prefix}/train'
simple_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test'

# Upload the tokenized train split, then the tokenized test split.
simple_tokenized_train_dataset.save_to_disk(simple_training_input_path, fs=s3)
simple_tokenized_test_dataset.save_to_disk(simple_test_input_path, fs=s3)

In [None]:
from datasets import Dataset

# Build a small, balanced Q-Q test subset: shuffle, bring the positives to the
# front, then keep the first 2 * n_positive rows (all positives plus an equally
# sized random set of negatives).
# n_positive is counted on the test set itself (labels are 0/1, so the sum is
# the number of positive rows); it was previously taken from the training
# pairs, which does not describe the dataset being sliced.
n_positive = sum(simple_tokenized_test_dataset["label"])

# A fixed seed keeps the chosen negatives reproducible across runs, matching
# the random_state=42 used for the other sampling steps in this notebook.
tokenized_qq_matching_test_dataset_short = Dataset.from_dict(
    simple_tokenized_test_dataset.shuffle(seed=42).sort("label", reverse=True)[:2*n_positive]
)

# save short test_dataset to s3
short_simple_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_short'
tokenized_qq_matching_test_dataset_short.save_to_disk(short_simple_test_input_path,fs=s3)

Save the question-answer (Q-A) matching datasets

In [None]:
# From here on s3_prefix points at the question-answer matching location.
s3_prefix='experiment/data/yal/question-answer-matching'

qa_training_input_path = f's3://{s3_bucket}/{s3_prefix}/train'
# Note: this reuses (and overwrites) the variable that held the Q-Q test path.
simple_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test'

# Upload the tokenized train split, then the tokenized test split.
tokenized_qa_matching_train_dataset.save_to_disk(qa_training_input_path, fs=s3)
tokenized_qa_matching_test_dataset.save_to_disk(simple_test_input_path, fs=s3)

In [None]:
# Build a small, balanced Q-A test subset: shuffle, bring the positives to the
# front, then keep the first 2 * n_positive rows (all positives plus an equally
# sized random set of negatives).
# n_positive is counted on the test set itself (labels are 0/1, so the sum is
# the number of positive rows); it was previously taken from the training
# positives, which does not describe the dataset being sliced.
n_positive = sum(tokenized_qa_matching_test_dataset["label"])

# A fixed seed keeps the chosen negatives reproducible across runs, matching
# the random_state=42 used for the other sampling steps in this notebook.
tokenized_qa_matching_test_dataset_short = Dataset.from_dict(
    tokenized_qa_matching_test_dataset.shuffle(seed=42).sort("label", reverse=True)[:2*n_positive]
)

# save short test_dataset to s3
short_simple_test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_short'
tokenized_qa_matching_test_dataset_short.save_to_disk(short_simple_test_input_path,fs=s3)

Also save untokenized data

In [None]:
# Back to the question-question matching location for the raw-text datasets.
s3_prefix='experiment/data/yal/question-question-matching'

training_input_path = f's3://{s3_bucket}/{s3_prefix}/train_untokenized'
test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_untokenized'

# Upload the untokenized train split, then the untokenized test split.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

In [None]:
# Question-answer matching location for the raw-text datasets.
s3_prefix='experiment/data/yal/question-answer-matching'

training_input_path = f's3://{s3_bucket}/{s3_prefix}/train_untokenized'
test_input_path = f's3://{s3_bucket}/{s3_prefix}/test_untokenized'

# Upload the untokenized train split, then the untokenized test split.
qa_matching_train_dataset.save_to_disk(training_input_path, fs=s3)
qa_matching_test_dataset.save_to_disk(test_input_path, fs=s3)