# Preprocessing MomConnect Data


1. Clean FAQs --> the "user generated" questions will be our training data
2. Clean Validation data --> these will be our test data

For QA+QQ combined, we need

1. Reference questions -- random split from 1
2. Training questions -- rest split from 1
3. Test questions -- from 2

In [None]:
import sagemaker

# Set this to a bucket name to override the session default; None means
# "use whatever bucket the SageMaker session reports as its default".
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role of the current environment (not referenced again in the
# visible part of this notebook).
role = sagemaker.get_execution_role()

# Re-create the session so it is bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
import numpy as np
import pandas as pd

# FAQ sheet (CSV export). Its columns are inspected and renamed in the
# following cells.
faqs = pd.read_csv('s3://praekelt-static-resources/experiment/data/[Sam] Helpdesk Q&A _ MOMZA _ FAQ Content.xlsx - FAQs.csv')
# Labelled validation data, phase 1 and phase 2. The two frames are
# concatenated into a single `df` in the "Clean validation data" section.
df_phase_1 = pd.read_csv('s3://praekelt-static-resources/validation_aaq/validation_khumo_labelled.csv')
df_phase_2 = pd.read_csv('s3://praekelt-static-resources/validation_aaq/validation_khumo_labelled_phase2.csv')

# Clean data

## 1. Clean FAQs sheet

In [None]:
faqs.columns

In [None]:
# Map the sheet's original headers to short snake_case column names.
column_map = {
    'Validation questions - USER GENERATED': 'questions_usr',
    'Validation questions - SYNTHETIC': 'questions_syn',
    'FAQ Content': 'faq_content',
    'FAQ Name': 'faq_name',
    'FAQ title': 'faq_title',
}
faqs = faqs.rename(columns=column_map)
# Keep only the renamed columns. The selector is materialised as a list
# (rather than a dict view) so the column selection is unambiguous and
# matches how the validation columns are selected further down.
faqs = faqs[list(column_map.values())]
faqs.head()

In [None]:
faqs.questions_usr.iloc[0].split('\n')

In [None]:
faqs[faqs.questions_usr.isnull()]

In [None]:
# Rows whose faq_name is literally the header text 'FAQ Name'
# (presumably repeated header rows inside the sheet -- verify).
is_header_row = faqs.faq_name == 'FAQ Name'
# Rows that actually have user-generated questions.
has_user_questions = faqs.questions_usr.notnull()

faqs = faqs[~is_header_row & has_user_questions]
faqs.head()

I'm dropping FAQs with no "user generated" questions for now.

In [None]:
def _to_question_array(raw):
    """Split a newline-separated cell into a numpy array of questions."""
    return np.asarray(raw.split('\n'))


# Arrays (not lists) are used so the questions can be fancy-indexed with
# index arrays when the reference/training split is applied later.
faqs.loc[:, "questions_usr"] = faqs.questions_usr.apply(_to_question_array)

In [None]:
# Normalise the name of this one FAQ.
# NOTE(review): presumably so that it matches the faq_name used in the
# validation data when the two are merged -- confirm.
faqs.loc[faqs.faq_name == "Preg - ANAEMIA", 'faq_name'] = "Preg - Anemia"

Drop rows with too few questions

In [None]:
faqs.questions_usr.apply(lambda x: len(x)).hist(bins=16)

In [None]:
faqs[faqs.questions_usr.apply(lambda x: len(x)) < 4]

These are all Covid-related, so it seems OK to drop them.

Keep only FAQs with >=4 questions.

In [None]:
# Number of user-generated questions per FAQ; keep FAQs with at least 4.
n_questions = faqs.questions_usr.apply(len)
faqs = faqs[n_questions >= 4]

It's easier to keep the FAQ questions in an "unexploded" form.

Split the user-generated questions (`questions_usr`) into reference questions (which will be used to represent
an FAQ in question-question matching) and training questions.
To pick the reference questions, we randomly sample 2 indices from the questions of each FAQ; the remaining questions become training questions.

In [None]:
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

# Single seeded random state shared by the notebook so that the
# reference/training split is reproducible on a top-to-bottom run.
rs = RandomState(MT19937(SeedSequence(123456789)))


def get_ref_split(l, n_ref=2):
    """Randomly split the positions of ``l`` into reference and rest.

    Only ``len(l)`` is used; the elements themselves are not inspected.
    The shuffle draws from the module-level ``rs``, so successive calls
    advance the same random stream.

    Args:
        l: Any sized sequence (here: the array of questions of one FAQ).
        n_ref: How many positions to reserve as reference questions.
            Defaults to 2, the value used throughout this notebook.

    Returns:
        A tuple ``(ref_idx, rest_idx)`` of integer index arrays. Together
        they form a permutation of ``range(len(l))``. If ``len(l)`` is
        smaller than ``n_ref``, ``ref_idx`` holds every position and
        ``rest_idx`` is empty.
    """
    r = np.arange(len(l))
    rs.shuffle(r)  # in-place shuffle driven by the seeded state
    return r[:n_ref], r[n_ref:]

# For every FAQ, draw a (reference indices, remaining indices) pair.
faqs.loc[:, "_splits"] = faqs.questions_usr.apply(get_ref_split)
# questions_usr is a numpy array per row, so it can be indexed directly with
# the index arrays stored in _splits.
# question_ref: the questions at the reference indices (_splits[0]).
faqs.loc[:, "question_ref"] = faqs.apply(lambda x: x.questions_usr[x._splits[0]], axis=1)
# question: the remaining questions (_splits[1]), used as training questions.
faqs.loc[:, "question"] = faqs.apply(lambda x: x.questions_usr[x._splits[1]], axis=1)

In [None]:
faqs.head()

In [None]:
# Convert the per-row numpy arrays back into plain Python lists.
list_columns = ('question', 'question_ref', 'questions_usr')
for col in list_columns:
    faqs[col] = faqs[col].apply(list)

In [None]:
faqs.shape

## 2. Clean validation data

In [None]:
df_phase_1.info()

In [None]:
df_phase_2.info()

In [None]:
df_phase_1.head()

In [None]:
# Identifier / timestamp columns that are carried through unchanged.
df_ref_cols = ['question_msg_id', '_vnd_v1_chat_owner_anon', 'question_inserted_at', 'answer_msg_id', 'answer_inserted_at']
# Columns that get renamed to match the naming used for `faqs`.
df_column_map = {
    'FAQ Name': 'faq_name',
    'Question': 'question',
}

keep_cols = df_ref_cols + list(df_column_map)
df = (
    pd.concat([df_phase_1, df_phase_2])[keep_cols]
    .rename(columns=df_column_map)
)
# Rows without an FAQ label cannot be used for evaluation.
df = df.dropna(subset=['faq_name'])
df

In [None]:
# Parse both timestamp columns into datetimes, one column at a time.
cols = ['question_inserted_at', 'answer_inserted_at']
for ts_col in cols:
    df[ts_col] = pd.to_datetime(df[ts_col])

In [None]:
df[['question_inserted_at', 'answer_inserted_at']].describe()

In [None]:
import matplotlib.pyplot as plt

# Number of validation questions per FAQ, drawn as a horizontal bar chart.
faq_counts = df.groupby('faq_name').size()
fig, ax = plt.subplots(1, 1, figsize=(10, 16))
faq_counts.plot(kind='barh', ax=ax)

In [None]:
# Left-join the validation rows onto the FAQ sheet and count, per faq_name,
# the rows that found no matching FAQ (their questions_usr stays null).
faq_sheet = faqs.drop(columns=['question'])
merged = df.merge(faq_sheet, how="left")
unmatched = merged.questions_usr.isnull()
merged[unmatched].faq_name.value_counts()

Some FAQ names in the validation data don't match any FAQ in the FAQ sheet; we'll just drop those rows.

In [None]:
# Inner join: validation rows whose faq_name has no counterpart in `faqs`
# are dropped. `question` is removed from `faqs` first so that the
# validation question is the only `question` column in the result.
# The join key is spelled out so that adding a column with a shared name to
# either frame later cannot silently change the join.
df_merged = df.merge(faqs.drop(columns=['question']), on='faq_name')
df_merged.head()

In [None]:
# Keep the first occurrence of each question text, then drop rows whose
# question is missing.
df_merged = df_merged.drop_duplicates(subset='question')
df_merged = df_merged.dropna(subset=['question'])

In [None]:
df.info()

In [None]:
df.faq_name.value_counts()

Note: the `questions_usr` column hasn't been exploded yet.

# Create Q-A relevance scoring dataset

* Train (base: `faqs`)
  * Use (`question_ref`, `faq_content`), explode `question_ref`
  * Use (`question`, `faq_content`), explode `question`
  * For each unique `faq_content`, sample 5~6 wrong `question`/`question_ref`'s
* Test (base: `df_merged`)
  * Use (`question`, `faq_content`)
  * Use (`question`, wrong `faq_content`) to get ranks

### Train

In [None]:
cols = ['question', 'faq_content', 'faq_name']

# Reference questions: one row per reference question, exposed under the
# common column name `question`.
ref_exploded = faqs.drop(columns='question').explode('question_ref')
qa_train_ref = ref_exploded.rename(columns={'question_ref': 'question'})[cols]

# Remaining (training) questions: one row per question.
qa_train_rest = faqs.explode('question')[cols].copy()

# All correct (question, FAQ) pairs.
qa_train_pos = pd.concat([qa_train_ref, qa_train_rest])

In [None]:
qa_train_rest[qa_train_rest.question.apply(lambda x: isinstance(x, list))]

In [None]:
qa_train_pos

In [None]:
negative_samples = []

for faq_name, qdf in qa_train_pos.groupby('faq_name'):
    # Questions that belong to any *other* FAQ are "wrong" for this one.
    neg_df = qa_train_pos[qa_train_pos.faq_name != faq_name]
    # Sample first, then overwrite the FAQ columns on just the 6 sampled
    # rows. Sampling uses the notebook's seeded `rs` so that the negative
    # set is reproducible on a top-to-bottom run.
    neg_df_sampled = neg_df.sample(6, random_state=rs).assign(
        faq_content=qdf['faq_content'].iloc[0],
        faq_name=faq_name,
    )
    negative_samples.append(neg_df_sampled)

qa_train_neg = pd.concat(negative_samples)

In [None]:
qa_train_neg.head()

In [None]:
# label=1 for correct (question, FAQ) pairs, label=0 for sampled wrong pairs.
labelled_pos = qa_train_pos.assign(label=1)
labelled_neg = qa_train_neg.assign(label=0)
qa_train = pd.concat([labelled_pos, labelled_neg])
qa_train.shape

### Test

In [None]:
# Every merged validation row is a correct (question, FAQ) pair.
qa_test_pos = df_merged.copy()
qa_test_pos['label'] = 1
qa_test_pos.head()

In [None]:
qa_test_pos[qa_test_pos.question.duplicated()]

In [None]:
negative_samples = []

# Loop-invariant: the FAQ sheet without its training `question` column.
faq_candidates = faqs.drop(columns=['question'])

for question, qdf in qa_test_pos.groupby("question"):
    # For each question we want to infer on all possible FAQs, so pair it
    # with every FAQ other than its labelled one.
    faq_name = qdf.faq_name.iloc[0]
    neg_df = faq_candidates[faq_candidates.faq_name != faq_name]
    neg_df = neg_df.assign(question=question, label=0)
    negative_samples.append(neg_df)

qa_test_neg = pd.concat(negative_samples)

qa_test = pd.concat([qa_test_pos, qa_test_neg])
# Small subset for quick evaluation. Seeded via `rs` for reproducibility and
# capped at the frame size so it cannot fail on a small test set.
qa_test_short = qa_test.sample(min(1000, len(qa_test)), random_state=rs)

In [None]:
qa_test_pos.shape, qa_test_neg.shape

In [None]:
qa_test.faq_name.nunique()

Check each unique question has 150 rows

In [None]:
# Rows per question = 1 positive + one negative per other FAQ.
# NOTE(review): 150 is presumably the number of FAQs left in `faqs` -- confirm
# against qa_test.faq_name.nunique().
num_rows_per_q = qa_test.groupby('question').size()
# Any question listed here deviates from the expected 150 rows.
num_rows_per_q[num_rows_per_q != 150]

In [None]:
qa_test_short.shape

# Create Q-Q semantic matching dataset

* Train (base: `faqs`)
  * Use (`question`, `question_ref`), explode both
  * For each unique `question_ref`, sample 5~6 wrong `question` / `question_ref`'s
* Test (base: `df_merged`)
  * Use (`question`, `question_ref`), explode `question_ref`
  * Use (`question`, wrong `question_ref`) to get ranks

In [None]:
# Cross every training question with every reference question of the same
# FAQ; each such pair is a positive example.
qq_train_pos = (
    faqs.explode('question')
    .explode('question_ref')
    .drop(columns='_splits')
    .assign(label=1)
)
# Discard pairs whose reference question is an empty string.
has_ref_text = qq_train_pos.question_ref != ''
qq_train_pos = qq_train_pos[has_ref_text].reset_index(drop=True)
qq_train_pos.head()

In [None]:
negative_samples = []

for question_ref, qdf in qq_train_pos.groupby("question_ref"):
    # FAQ that this reference question belongs to (first match if the same
    # text appears under several FAQs).
    faq_name = qdf.faq_name.iloc[0]
    faq_content = qdf.faq_content.iloc[0]
    # Candidate negatives: training questions from every other FAQ.
    other_faq_rows = qq_train_pos[qq_train_pos.faq_name != faq_name]
    # Sample first, then overwrite the FAQ/reference columns on just the 5
    # sampled rows. Seeded via the notebook's `rs` for reproducibility.
    neg_df = other_faq_rows.sample(5, random_state=rs).assign(
        label=0,
        faq_name=faq_name,
        faq_content=faq_content,
        question_ref=question_ref,
    )
    negative_samples.append(neg_df)

qq_train_neg = pd.concat(negative_samples).reset_index(drop=True)
qq_train_neg

In [None]:
# Restrict positives and negatives to the same columns before stacking them.
cols = ['question', 'faq_name', 'faq_content', 'question_ref', 'label']
qq_train = pd.concat([part[cols] for part in (qq_train_pos, qq_train_neg)])

In [None]:
qq_train

### test

In [None]:
# One row per (validation question, reference question of its FAQ); rows
# with an empty reference question are discarded.
qq_test_pos = df_merged.explode('question_ref')
qq_test_pos = qq_test_pos[qq_test_pos.question_ref != ''].assign(label=1)

In [None]:
qq_test_pos

In [None]:
df.question_msg_id.duplicated().any()

In [None]:
negative_samples = []

# Loop-invariant: one row per (FAQ, reference question). Exploding once and
# filtering per question yields the same rows as filtering then exploding.
faq_refs = faqs.explode('question_ref')

for question, qdf in qq_test_pos.groupby("question"):
    # Pair the question with the reference questions of every other FAQ.
    # NOTE(review): unlike qq_test_pos, empty-string reference questions are
    # not filtered out here -- confirm whether that is intended.
    neg_df = faq_refs[faq_refs.faq_name != qdf.faq_name.iloc[0]]
    neg_df = neg_df.assign(question=question, label=0)
    negative_samples.append(neg_df)

qq_test_neg = pd.concat(negative_samples)

In [None]:
# Stack positive and negative test pairs row-wise (original indices are kept).
qq_test = pd.concat([qq_test_pos, qq_test_neg], axis=0)

In [None]:
qq_test.shape

In [None]:
# Small subset for quick evaluation. Seeded via the notebook's `rs` for
# reproducibility and capped at the frame size so it cannot fail on a small
# test set.
qq_test_short = qq_test.sample(min(1000, len(qq_test)), random_state=rs)

In [None]:
# Questions whose number of (question, question_ref) rows is not 300.
qq_rows_per_q = qq_test.groupby('question').size()
qq_rows_per_q[qq_rows_per_q != 300]

In [None]:
# True only if every question has exactly as many rows as there are
# (FAQ, reference question) pairs in `faqs`, empty reference strings included.
(qq_test.groupby('question').size() == faqs.explode('question_ref').shape[0]).all()

Some `question_ref`'s were empty strings.

In [None]:
# Force `question` to str in every dataset frame, in place.
# NOTE(review): presumably needed so the tokenizer / Arrow conversion only
# ever sees strings -- confirm which rows were non-string.
for _df in [qa_train, qa_test, qa_test_short, qq_train, qq_test, qq_test_short]:
    _df.loc[:, "question"] = _df.question.astype(str)

# Preprocess

## Tokenize

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

# Shared tokenizer; read as a global by the custom_tokenize_* functions below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def _tokenize_pair(examples, second_key, max_length=384):
    """Tokenize (question, ``examples[second_key]``) as a sequence pair.

    Uses the module-level ``tokenizer``. Every example is padded to
    ``max_length``; the ``"only_second"`` truncation strategy is requested so
    that the question is kept intact and only the second text is shortened.

    Args:
        examples: Batch mapping of column name to list of values, as passed
            by ``Dataset.map(..., batched=True)``.
        second_key: Column holding the second text of each pair.
        max_length: Padded/truncated length of each encoded pair.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch.
    """
    return tokenizer(
        examples['question'],
        examples[second_key],
        max_length=max_length,
        padding='max_length',
        truncation="only_second",
        return_overflowing_tokens=False,
    )


def custom_tokenize_qq(examples):
    """Tokenize (question, question_ref) pairs for question-question matching."""
    return _tokenize_pair(examples, 'question_ref')


def custom_tokenize_qa(examples):
    """Tokenize (question, faq_content) pairs for question-answer matching."""
    return _tokenize_pair(examples, 'faq_content')

### QA

In [None]:
# Untokenized QA training set as a `datasets.Dataset`.
qa_train_dataset = Dataset.from_pandas(qa_train)

In [None]:
# Untokenized QA test sets: the full set and the sampled "short" subset.
qa_test_dataset = Dataset.from_pandas(qa_test)
qa_test_dataset_short = Dataset.from_pandas(qa_test_short)

In [None]:
# Raw text columns (and the carried-over pandas index) are dropped once the
# tokenized features have been produced.
remove_columns = ['question', 'faq_content', '__index_level_0__',]

qa_map_kwargs = dict(batched=True, batch_size=1000, remove_columns=remove_columns)
qa_tkn_train_dataset = qa_train_dataset.map(custom_tokenize_qa, **qa_map_kwargs)
qa_tkn_test_dataset_short = qa_test_dataset_short.map(custom_tokenize_qa, **qa_map_kwargs)

In [None]:
qa_tkn_train_dataset

In [None]:
qa_tkn_test_dataset_short

### QA Upload to S3

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# Destination bucket and the per-task key prefix template.
s3_bucket = 'praekelt-static-resources'
s3_prefix_fmt ='experiment/data/mc/{task_type}'

# Filesystem handle passed to Dataset.save_to_disk below.
s3 = S3FileSystem()

# Prefix for the question-answer matching datasets.
task_type = 'question-answer-matching'
s3_prefix = s3_prefix_fmt.format(task_type=task_type)

In [None]:
# save train_dataset to s3
qa_training_input_path = f's3://{s3_bucket}/{s3_prefix}/train'
qa_tkn_train_dataset.save_to_disk(qa_training_input_path,fs=s3)

# save test_dataset to s3
qa_test_short_input_path = f's3://{s3_bucket}/{s3_prefix}/test_short'
qa_tkn_test_dataset_short.save_to_disk(qa_test_short_input_path,fs=s3)

# save untokenized train_dataset to s3
qa_training_input_path = f's3://{s3_bucket}/{s3_prefix}/train_untokenized'
qa_train_dataset.save_to_disk(qa_training_input_path,fs=s3)

In [None]:
# save untokenized test_dataset to s3
qa_s3_root = f's3://{s3_bucket}/{s3_prefix}'

# Untokenized full test set.
qa_test_input_path = f'{qa_s3_root}/test_untokenized'
qa_test_dataset.save_to_disk(qa_test_input_path, fs=s3)

# Untokenized short test set.
qa_test_short_input_path = f'{qa_s3_root}/test_untokenized_short'
qa_test_dataset_short.save_to_disk(qa_test_short_input_path, fs=s3)

In [None]:
# Drop the names of the tokenized QA datasets now that they are uploaded
# (presumably to release memory before building the QQ datasets).
del qa_tkn_train_dataset, qa_tkn_test_dataset_short

In [None]:
# Drop the full untokenized QA test dataset as well (already uploaded above).
del qa_test_dataset

### QQ

In [None]:
# Ensure `question` is str before converting to a Dataset.
# NOTE(review): this repeats the cast already applied to qq_test_short in the
# loop over all dataset frames earlier in the notebook.
qq_test_short.question = qq_test_short.question.astype(str)

In [None]:
# Ensure `question` is str before converting to a Dataset.
# NOTE(review): this repeats the cast already applied to qq_test in the loop
# over all dataset frames earlier in the notebook.
qq_test.question = qq_test.question.astype(str)

In [None]:
# Untokenized QQ datasets.
qq_train_dataset = Dataset.from_pandas(qq_train)
qq_test_dataset = Dataset.from_pandas(qq_test)

# Raw text columns (and the carried-over pandas index) are dropped once the
# tokenized features have been produced.
remove_columns = ['question', 'question_ref', '__index_level_0__',]
qq_tkn_train_dataset = qq_train_dataset.map(
    custom_tokenize_qq,
    batched=True,
    batch_size=1000,
    remove_columns=remove_columns,
)

In [None]:
qq_test_dataset_short = Dataset.from_pandas(qq_test_short)

# The test frame still carries the validation metadata and FAQ sheet columns;
# all of these are removed after tokenization.
remove_columns = [
    'question_msg_id',
    '_vnd_v1_chat_owner_anon',
    'question_inserted_at',
    'answer_msg_id',
    'answer_inserted_at',
    'question',
    'questions_usr',
    'questions_syn',
    'faq_content',
    'faq_title',
    '_splits',
    'question_ref',
    '__index_level_0__',
]
qq_tkn_test_dataset_short = qq_test_dataset_short.map(
    custom_tokenize_qq,
    batched=True,
    batch_size=1000,
    remove_columns=remove_columns,
)

In [None]:
# Switch the S3 prefix to the question-question matching task.
task_type = 'question-question-matching'
s3_prefix = s3_prefix_fmt.format(task_type=task_type)
qq_s3_root = f's3://{s3_bucket}/{s3_prefix}'

# Tokenized training set.
qq_training_input_path = f'{qq_s3_root}/train'
qq_tkn_train_dataset.save_to_disk(qq_training_input_path, fs=s3)

# Tokenized short test set.
qq_test_short_input_path = f'{qq_s3_root}/test_short'
qq_tkn_test_dataset_short.save_to_disk(qq_test_short_input_path, fs=s3)

# Untokenized training set (note: this rebinds qq_training_input_path).
qq_training_input_path = f'{qq_s3_root}/train_untokenized'
qq_train_dataset.save_to_disk(qq_training_input_path, fs=s3)

# Untokenized full test set.
qq_test_input_path = f'{qq_s3_root}/test_untokenized'
qq_test_dataset.save_to_disk(qq_test_input_path, fs=s3)

# Untokenized short test set (note: this rebinds qq_test_short_input_path).
qq_test_short_input_path = f'{qq_s3_root}/test_untokenized_short'
qq_test_dataset_short.save_to_disk(qq_test_short_input_path, fs=s3)