In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Leave as None to fall back to the session's default bucket; put a bucket name here to override it.
sagemaker_session_bucket = None

# A provisional session is opened first because it is needed to look up the default bucket.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role reported by SageMaker for this environment.
role = sagemaker.get_execution_role()

# Re-create the session so that its default bucket is the one resolved above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
# Load the FAQ table and keep only the first row for each distinct faq_title.
faqs = pd.read_csv(
    "s3://praekelt-static-resources/experiment/data/yal_faqmatches.csv"
).drop_duplicates(subset="faq_title", keep="first")

In [None]:
faqs

In [None]:
# Read both validation-question batches and stack them row-wise, then discard the
# leftover "Unnamed: 0" column and replace the per-file index with a fresh one.
df = pd.concat(
    [
        pd.read_csv(f"s3://praekelt-static-resources/yal_validation/yal_validation_questions_batch_{n}.csv")
        for n in (1, 2)
    ],
    axis=0,
)
df = df.drop(columns=["Unnamed: 0"])
df = df.reset_index(drop=True)

In [None]:
# Histogram of how many rows (validation questions) each faq_title has in df.
df.groupby("faq_title").size().hist(bins=12)

In [None]:
df.head()

In [None]:
# Number of distinct FAQ titles in the validation questions vs. in the FAQ table.
df.faq_title.nunique(), faqs.faq_title.nunique()

Merge

In [None]:
# Attach the FAQ columns to every validation question that has a matching faq_title.
# Inner join: questions whose title is missing from faqs are left out.
df_merged = pd.merge(df, faqs, on="faq_title", how="inner")

In [None]:
# Compare (rows, columns) before and after the merge.
df.shape, df_merged.shape

In [None]:
# Drop rows that are exact duplicates across all columns.
df_merged = df_merged.drop_duplicates()

Which rows got dropped during merging?

In [None]:
# Rows of df whose faq_title does not occur in faqs; the inner merge excludes these.
df[~df.faq_title.isin(faqs.faq_title)]

In [None]:
import torch

torch.cuda.is_available()

In [None]:
df_merged.head()

In [None]:
# Approximate the length of each FAQ body by its number of whitespace-separated tokens,
# plot the distribution, and report the extremes.
faq_token_lengths = df_merged["faq_content_to_send"].map(
    lambda content: len(content.split())
)
faq_token_lengths.hist(bins=10);
print(f"FAQ token lengths: {faq_token_lengths.min()} ~ {faq_token_lengths.max()}")

In [None]:
# Show every row whose question text occurs more than once
# (keep=False marks all occurrences, not just the repeats), sorted by question.
df_merged[df_merged.question.duplicated(keep=False)].sort_values(by='question')

# For Question-Answer Pairs

## Split

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the columns used by the rest of the notebook.
keep_columns = ['question', 'faq_title', 'faq_id', 'faq_content_to_send']
df_merged = df_merged[keep_columns]

# 70/30 split, stratified on faq_title so both splits have the same FAQ proportions.
# random_state is fixed so that Restart & Run All reproduces the same split
# (and therefore the same datasets written to S3 further down).
positive_train_df, test_df = train_test_split(
    df_merged,
    test_size=0.3,
    stratify=df_merged.faq_title,
    random_state=42,
)

In [None]:
# (rows, columns) of the training and test splits.
positive_train_df.shape, test_df.shape

In [None]:
# Histogram of how many test rows each faq_title has.
test_df.groupby("faq_title").size().hist(bins=4)

## Negative Sampling

Negative samples must be drawn only from the training split, never from the test split.

In [None]:
# Build negative (question, FAQ) pairs from the training split only.
# For every FAQ, questions that belong to *other* FAQs are drawn and re-attached to this
# FAQ's id / title / content, producing mismatched pairs.
N_NEGATIVES_PER_FAQ = 10
# One shared, seeded generator: the run is reproducible, yet each FAQ gets a different draw.
negative_rng = np.random.RandomState(42)

negative_samples = []
for cur_id, cur_positives in positive_train_df.groupby("faq_id"):
    # Exclude this FAQ's own rows AND any row whose question text is also a positive for
    # this FAQ; otherwise the same (question, FAQ) pair could end up labelled both 1 and 0.
    is_candidate = (positive_train_df.faq_id != cur_id) & ~positive_train_df.question.isin(
        cur_positives.question
    )
    candidates = positive_train_df[is_candidate]

    # DataFrame.sample raises when asked for more rows than exist, so cap the draw.
    n_draw = min(N_NEGATIVES_PER_FAQ, len(candidates))

    # Look the FAQ row up once instead of once per overwritten column.
    cur_faq = faqs.loc[faqs.faq_id == cur_id].iloc[0]

    # assign() returns a new frame, so the sampled rows are never modified in place.
    cur_negative_samples = candidates.sample(n_draw, random_state=negative_rng).assign(
        faq_id=cur_id,
        faq_title=cur_faq["faq_title"],
        faq_content_to_send=cur_faq["faq_content_to_send"],
    )
    negative_samples.append(cur_negative_samples)

In [None]:
# Stack the per-FAQ frames into a single DataFrame.
# NOTE(review): this rebinds negative_samples from a list to a DataFrame, so the cell is not
# idempotent — re-running it without re-running the sampling cell passes a DataFrame to pd.concat.
negative_samples = pd.concat(negative_samples, axis=0)

In [None]:
# Count negative rows whose question text already appeared in an earlier negative row.
negative_samples.question.duplicated().sum()

In [None]:
negative_samples

In [None]:
# Every mismatched pair gets label 0.
negative_samples = negative_samples.assign(label=0)

In [None]:
# Every true (question, FAQ) pair gets label 1.
# assign() builds a new frame; a plain alias followed by column assignment would also
# modify positive_train_df, which is itself a slice produced by train_test_split.
positive_samples = positive_train_df.assign(label=1)

In [None]:
# Training set = negatives followed by positives; the original row indices are kept as-is.
train_df = pd.concat((negative_samples, positive_samples), axis=0)

In [None]:
train_df.sample(10)

## Tokenize

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the 'distilbert-base-uncased' checkpoint; both preprocessing functions
# in this notebook (custom_tokenize, prepare_train_features) read it as a global.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

from datasets import load_dataset, Dataset

In [None]:
train_dataset = Dataset.from_pandas(train_df)

# test_df comes straight out of train_test_split and therefore has no 'label' column, but
# the question-answering preprocessing further down reads examples["label"] and removes a
# 'label' column from the test dataset as well. Every test row is a true (question, FAQ)
# pair, exactly like the training positives, so it is labelled 1 here.
test_dataset = Dataset.from_pandas(test_df.assign(label=1))

In [None]:
train_dataset[0]

In [None]:
# NOTE(review): scratch arithmetic — the result is neither assigned nor used by any other cell.
94*2

## Preprocessing for Sequence Classification

In [None]:
def custom_tokenize(examples):
    """Tokenize a batch of (question, FAQ content) pairs for sequence classification.

    Only the FAQ content is truncated (``truncation="only_second"``). Content that does not
    fit into 384 tokens is split into several overlapping features (``stride=128``), so the
    output may contain more rows than the input batch.

    Args:
        examples: batch mapping with list-valued ``"question"`` and
            ``"faq_content_to_send"`` entries, and optionally ``"label"``.

    Returns:
        The tokenizer output. When the batch has a ``"label"`` entry, the output also
        carries ``"label"`` with one value per produced feature.
    """
    encoded = tokenizer(
        examples['question'],
        examples['faq_content_to_send'],
        max_length=384,
        padding='max_length',
        truncation="only_second",
        return_overflowing_tokens=True,
        stride=128,
    )

    # One input row can yield several features. Repeat each row's label for all of its
    # features so the label column stays the same length as the tokenized columns.
    if "label" in examples:
        labels = examples["label"]
        encoded["label"] = [labels[sample_idx] for sample_idx in encoded["overflow_to_sample_mapping"]]

    return encoded

# The raw text columns (and the pandas index column) are dropped once tokenized.
remove_columns = ['question', 'faq_id', 'faq_title', 'faq_content_to_send', '__index_level_0__']

# Same mapping for both splits, train first and then test.
simple_tokenized_train_dataset, simple_tokenized_test_dataset = (
    split.map(custom_tokenize, batched=True, batch_size=1000, remove_columns=remove_columns)
    for split in (train_dataset, test_dataset)
)

In [None]:
simple_tokenized_train_dataset

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()
s3_prefix = 'experiment/data/automodel_classification_split'

# Destinations of the tokenized sequence-classification splits in the session's default bucket.
simple_training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train_with_neg'
simple_test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test_with_neg'

# Write the train split, then the test split.
simple_tokenized_train_dataset.save_to_disk(simple_training_input_path, fs=s3)
simple_tokenized_test_dataset.save_to_disk(simple_test_input_path, fs=s3)

In [None]:
s3_prefix = 'experiment/data/untokenized_split'

# Destinations of the raw (untokenized) splits in the session's default bucket.
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train_with_neg'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test_with_neg'

# Write the train split, then the test split.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

## Preprocessing for Question Answering

In [None]:
# True when the tokenizer pads on the right. prepare_train_features uses this flag to decide
# whether the question or the FAQ content is passed as the first sequence.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
pad_on_right

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of question/FAQ pairs and attach extractive-QA span labels.

    FAQ content that does not fit into 384 tokens is split into several overlapping
    features (``stride=128``), so the output may contain more rows than the input batch.
    Reads the notebook globals ``tokenizer`` and ``pad_on_right``.

    Labelling rule, per feature:
      * ``label == 0`` (negative pair): start and end both point at the CLS token.
      * otherwise the answer is the whole FAQ content, ignoring leading and trailing
        whitespace. If the feature contains all of it, start/end are the feature's first
        and last context tokens; if the content was split across features, or is empty,
        the feature is labelled with the CLS token.

    Args:
        examples: batch mapping with list-valued ``"question"``,
            ``"faq_content_to_send"`` and ``"label"`` entries. It is not modified.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, plus ``start_positions`` and ``end_positions`` (one token
        index per feature).
    """
    # Leading whitespace on a question wastes token budget and can make truncation of the
    # context fail, so it is stripped. A local list keeps the caller's batch untouched.
    questions = [q.lstrip() for q in examples["question"]]
    contexts = examples["faq_content_to_send"]
    labels = examples["label"]

    # Tokenize with truncation and padding, keeping the overflow as extra features whose
    # context overlaps the previous feature's context by `stride` tokens.
    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the input row it was produced from
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # per feature: (start_char, end_char) of every token within its source text
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Sequence id carried by the FAQ-content tokens (it is the second sequence when padding right).
    context_sequence_id = 1 if pad_on_right else 0

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sample_index = sample_mapping[i]

        # Default: "no answer in this feature", encoded as the CLS token.
        start_position, end_position = cls_index, cls_index

        if labels[sample_index] != 0:
            context = contexts[sample_index]
            # Character extent of the answer. Surrounding whitespace is excluded because it
            # produces no tokens; counting it would make the containment check below fail.
            start_char = len(context) - len(context.lstrip())
            end_char = len(context.rstrip())

            context_tokens = [
                idx
                for idx, seq_id in enumerate(tokenized_examples.sequence_ids(i))
                if seq_id == context_sequence_id
            ]

            # start_char < end_char rules out empty / whitespace-only content.
            if context_tokens and start_char < end_char:
                first_token, last_token = context_tokens[0], context_tokens[-1]
                whole_context_in_feature = (
                    offsets[first_token][0] <= start_char
                    and offsets[last_token][1] >= end_char
                )
                # The answer is the entire context, so when it fits it spans exactly the
                # feature's context tokens. Using them directly avoids scanning into the
                # (0, 0) offsets of special and padding tokens.
                if whole_context_in_feature:
                    start_position, end_position = first_token, last_token

        start_positions.append(start_position)
        end_positions.append(end_position)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
# For question answering the label is encoded in start/end positions, so 'label' is dropped
# together with the raw text columns and the pandas index column.
remove_columns = ['question', 'faq_id', 'label', 'faq_title', 'faq_content_to_send', '__index_level_0__']

# Same mapping for both splits, train first and then test.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(prepare_train_features, batched=True, batch_size=1000, remove_columns=remove_columns)
    for split in (train_dataset, test_dataset)
)

In [None]:
s3_prefix = 'experiment/data/automodel_split'

# Destinations of the tokenized question-answering splits in the session's default bucket.
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train_with_neg'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test_with_neg'

# Write the train split, then the test split.
tokenized_train_dataset.save_to_disk(training_input_path, fs=s3)
tokenized_test_dataset.save_to_disk(test_input_path, fs=s3)

In [None]:
# Sanity check: evaluates to True only when the training path built above equals this literal.
# NOTE(review): the literal hard-codes one specific bucket name, so the comparison is False
# in any environment whose default bucket differs.
training_input_path == 's3://sagemaker-af-south-1-678681925278/experiment/data/automodel_split/train_with_neg'