# Create RASA config files

## 1. Load data

In [None]:
from pathlib import Path

import pandas as pd

# S3 bucket holding the experiment data (used for the `s3://` reads further down).
validation_bucket = 'praekelt-static-resources'
# Key of the FAQ sheet export; the local read below only uses its last path component (the file name).
validation_faqs_prefix = 'experiment/data/[Sam] Helpdesk Q&A _ MOMZA _ FAQ Content.xlsx - FAQs.csv'

In [None]:
# Read the FAQ sheet export from the current user's Downloads folder.
# The folder is resolved from the home directory instead of a hard-coded
# `/Users/<name>/...` path so the cell is not tied to one machine.
faqs_csv_path = Path.home() / "Downloads" / validation_faqs_prefix.split('/')[-1]
faqs = pd.read_csv(faqs_csv_path)

In [None]:
# Preview the first rows of the FAQ sheet as loaded.
faqs.head()

Follows preprocessing from `experiments/bert_sagemaker/notebooks/1.1-sy-MC-preprocess_mc_data.ipynb`:

In [None]:
import numpy as np
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

# Clean column names in FAQs file
column_map = {
    'Validation questions - USER GENERATED': 'questions_usr',
    'Validation questions - SYNTHETIC': 'questions_syn',
    'FAQ Content': 'faq_content',
    'FAQ Name': 'faq_name',
    'FAQ title': 'faq_title',
}
faqs = faqs.rename(columns=column_map)

# Keep only the columns we need (explicit list of the renamed columns)
faqs = faqs[list(column_map.values())]

# Drop rows we can't use: rows whose FAQ name is the header text 'FAQ Name',
# and rows without user-generated questions.
# `.copy()` gives us an independent frame, so the column assignments below do not
# write into a filtered slice (which triggers pandas' SettingWithCopyWarning).
faqs = faqs[faqs.faq_name != 'FAQ Name']
faqs = faqs[faqs.questions_usr.notnull()].copy()

# Parse example questions column so each element is an array of questions
# (we use numpy arrays so we can index them with the split indices below)
faqs["questions_usr"] = faqs.questions_usr.apply(lambda x: np.asarray(x.split('\n')))

# Clean Anaemia FAQ name
faqs.loc[faqs.faq_name == "Preg - ANAEMIA", 'faq_name'] = "Preg - Anemia"

# Keep FAQs with at least 4 example questions
faqs = faqs[faqs.questions_usr.apply(len) >= 4].copy()

# (Only relevant for question-question matching)
# Split into reference questions (tied to the FAQ) and example questions for training
rs = RandomState(MT19937(SeedSequence(123456789)))

def get_ref_split(l):
    """Randomly split the positions of `l` into reference and training indices.

    Returns a tuple `(ref_idx, train_idx)`: the first two shuffled positions and
    the remaining ones. Uses the seeded module-level `rs`, so the result depends
    on how many times it has been called before (rows must be processed in order).
    """
    r = np.arange(len(l))
    rs.shuffle(r)
    return r[:2], r[2:]

faqs["_splits"] = faqs.questions_usr.apply(get_ref_split)
faqs["question_ref"] = faqs.apply(lambda x: x["questions_usr"][x["_splits"][0]], axis=1)
faqs["question"] = faqs.apply(lambda x: x["questions_usr"][x["_splits"][1]], axis=1)

# Cast numpy arrays into lists
for col in ['question', 'question_ref', 'questions_usr']:
    faqs[col] = faqs[col].apply(list)

In [None]:
# Inspect the preprocessed FAQs, including the `question_ref` / `question` split columns.
faqs.head()

We can't have `/` in RASA FAQ names, so first list the FAQ names that contain one, then strip it out when building the RASA name.

In [None]:
# Show the FAQ names that contain a "/" character.
name_has_slash = faqs.faq_name.map(lambda name: "/" in name)
faqs.loc[name_has_slash, "faq_name"]

In [None]:
def to_rasa_faq_name(faq_name):
    """Turn an FAQ sheet name into a name usable after `faq/` in RASA.

    Lower-cases the name, tightens " - " to "-", removes every "/" together with
    the whitespace around it, and joins the remaining words with "_".
    E.g. "Preg - A / B c" -> "preg-ab_c".
    """
    name = faq_name.lower().replace(" - ", "-")
    # Previously only " / " (with a space on both sides) was removed, so a name
    # like "a/b" kept its slash. Splitting on "/" handles every spacing variant
    # and gives the same result as before for " / ".
    name = "".join(part.strip() for part in name.split("/"))
    return "_".join(name.split())

faqs["faq_name_rasa"] = faqs.faq_name.apply(to_rasa_faq_name)

## FAQs data
We need the data in this format:

```yaml
responses:
  utter_chitchat/ask_name:
  - image: "https://i.imgur.com/zTvA58i.jpeg"
    text: Hello, my name is Retrieval Bot.
  - text: I am called Retrieval Bot!
  utter_chitchat/ask_weather:
  - text: Oh, it does look sunny right now in Berlin.
    image: "https://i.imgur.com/vwv7aHN.png"
  - text: I am not sure of the whole week but I can see the sun is out today.

```

In [None]:
faq_content_fmt = """  utter_faq/{name}:
  - text: \"{content}\"
"""

yaml_text = """responses:
"""

def escape_yaml_double_quoted(text):
    """Escape `text` so it can be placed inside a double-quoted YAML scalar.

    Backslashes are escaped first (otherwise the escapes added afterwards would be
    doubled), then double quotes, then line breaks. Raw line breaks inside a
    double-quoted scalar are folded by YAML parsers, so they are written as `\\n`
    to keep the FAQ content's line structure.
    """
    return (
        text.replace("\\", "\\\\")
        .replace("\"", "\\\"")
        .replace("\r\n", "\\n")
        .replace("\n", "\\n")
        .replace("\r", "\\n")
    )

# One `utter_faq/<name>` response per FAQ row, appended under the `responses:` key.
faq_yaml_text = yaml_text + "\n".join(
    faqs.apply(
        lambda faq: faq_content_fmt.format(
            name=faq.faq_name_rasa,
            content=escape_yaml_double_quoted(faq.faq_content)
        ),
        axis=1
    ).tolist()
)

In [None]:
# Preview the first 1000 characters of the generated responses YAML.
print(faq_yaml_text[:1000])

## Training/test data

```yaml
nlu:
  - intent: chitchat/ask_name
    examples: |
      - What is your name?
      - May I know your name?
      - What do people call you?
      - Do you have a name for yourself?
  - intent: chitchat/ask_weather
    examples: |
      - What's the weather like today?
      - Does it look sunny outside today?
      - Oh, do you mind checking the weather for me please?
      - I like sunny days in Berlin.
```

In [None]:
# List the columns available on each FAQ row.
faqs.columns

In [None]:
nlu_element_fmt = """  - intent: faq/{name}
    examples: |
{questions}"""

nlu_yaml_text = """nlu:
"""

def render_faq_questions(faq):
    """Render one FAQ row as a RASA NLU `intent` entry.

    `faq` must expose `question` and `question_ref` (lists of example strings)
    and `faq_name_rasa`. Both lists are concatenated, so every question of the
    FAQ is emitted as an example under `faq/<faq_name_rasa>`.
    """
    all_questions = faq.question + faq.question_ref

    formatted_questions_str = '\n'.join([f"      - {q}" for q in all_questions])

    return nlu_element_fmt.format(
        name=faq.faq_name_rasa,
        questions=formatted_questions_str
    )

nlu_yaml_text += "\n".join(faqs.apply(render_faq_questions, axis=1).tolist())

# Preview the rendered entry for the first FAQ. This call used to sit in a cell
# *before* the definition above, which raised NameError on a fresh kernel.
print(render_faq_questions(faqs.iloc[0]))

In [None]:
# Preview the first 1000 characters of the generated NLU training YAML.
print(nlu_yaml_text[:1000] + "...")

## Format validation data

In [None]:
# Keys of the two labelled validation exports inside `validation_bucket`.
validation_phase_1_data_prefix = "validation_aaq/validation_khumo_labelled.csv"
validation_phase_2_data_prefix = "validation_aaq/validation_khumo_labelled_phase2.csv"

# Read both phases straight from S3, phase 1 first.
df_phase_1, df_phase_2 = (
    pd.read_csv(f's3://{validation_bucket}/{phase_prefix}')
    for phase_prefix in (validation_phase_1_data_prefix, validation_phase_2_data_prefix)
)

# Stack the two phases into a single validation frame.
df = pd.concat([df_phase_1, df_phase_2])

In [None]:
# Preview the combined phase 1 + phase 2 validation data.
df.head()

In [None]:
# Column dtypes and non-null counts of the combined validation data.
df.info()

Also taken from `experiments/bert_sagemaker/notebooks/1.1-sy-MC-preprocess_mc_data.ipynb`:

In [None]:
# Identifier column(s) kept under their original name.
df_ref_cols = ['question_msg_id']

# Sheet column -> short name used in the rest of the notebook.
df_column_map = {'FAQ Name': 'faq_name', 'Question': 'question'}

# Keep the reference columns plus the mapped ones, then apply the short names.
kept_columns = [*df_ref_cols, *df_column_map]
df = df.loc[:, kept_columns].rename(columns=df_column_map)

# Rows without an FAQ label cannot be used.
df = df.dropna(subset=['faq_name'])

In [None]:
# Left-join the validation rows onto the FAQ sheet (without its `question` column),
# then count, per FAQ name, the rows that found no matching FAQ.
faqs_without_question = faqs.drop(columns=['question'])
merged = df.merge(faqs_without_question, how="left")
has_no_faq_match = merged.questions_usr.isnull()
merged.loc[has_no_faq_match, "faq_name"].value_counts()

Some FAQ names don't match any entry in the FAQ sheet; we'll just drop those rows (the inner merge below does this).

In [None]:
# Inner join: validation rows whose FAQ has no match in the FAQ sheet are dropped.
df_merged = pd.merge(df, faqs.drop(columns=['question']), how="inner")
df_merged.head()

In [None]:
# Keep the first occurrence of each question, then discard rows with a missing question.
df_merged = (
    df_merged
    .drop_duplicates(subset="question")
    .dropna(subset=["question"])
)

In [None]:
# Preview the validation set after removing duplicated and missing questions.
df_merged.head()

In [None]:
# Sanity check: this is False only if every row has a `faq_name_rasa`.
df_merged.faq_name_rasa.isnull().any()

In [None]:
# (rows, columns) of the cleaned validation set.
df_merged.shape

In [None]:
def render_validation_questions(faq_name, group):
    """Render the questions of one FAQ group as a RASA NLU `intent` entry.

    `group.question` is iterated as strings; newlines inside a question are
    replaced by spaces so each question stays on one example line.
    """
    example_lines = []
    for raw_question in group.question:
        single_line_question = raw_question.replace("\n", " ")
        example_lines.append(f"      - {single_line_question}")

    return nlu_element_fmt.format(name=faq_name, questions='\n'.join(example_lines))

# One NLU entry per RASA FAQ name, built from that FAQ's validation questions.
rendered_test_entries = [
    render_validation_questions(faq_name_rasa, group_df)
    for faq_name_rasa, group_df in df_merged.groupby("faq_name_rasa")
]
nlu_test_yaml_text = "nlu:\n" + "\n".join(rendered_test_entries)

# Preview the first 1000 characters of the test YAML.
print(nlu_test_yaml_text[:1000] + "...")

In [None]:
# Write the test NLU data into the local RASA project (resolved under the user's
# home directory rather than a hard-coded `/Users/<name>` path).
# The encoding is explicit so non-ASCII questions are written as UTF-8 regardless
# of the platform's default encoding.
nlu_test_path = Path.home() / "IDinsight" / "praekelt" / "rasa_test" / "data" / "nlu_test.yml"
with open(nlu_test_path, "w", encoding="utf-8") as fp:
    n = fp.write(nlu_test_yaml_text)

In [None]:
# `n` is the return value of `fp.write`, i.e. the number of characters written.
print(n)

# Evaluate

In [None]:
import json

# Load the response-selection report from the local RASA project (resolved under
# the user's home directory). Read explicitly as UTF-8 so decoding does not
# depend on the platform's default encoding.
report_path = (
    Path.home() / "IDinsight" / "praekelt" / "rasa_test" / "results" / "response_selection_report.json"
)
with open(report_path, "r", encoding="utf-8") as fp:
    report = json.load(fp)

In [None]:
# One row per top-level key of the report, indexed as `faq_name_rasa`.
metric_columns = ["precision", "recall", "f1-score", "support"]
report_df = pd.DataFrame(report).T.rename_axis("faq_name_rasa")

# Make sure the metric columns are numeric.
report_df = report_df.assign(
    **{metric: report_df[metric].astype(float) for metric in metric_columns}
)

In [None]:
# Show the first 12 rows of the report.
report_df.head(12)

```python
recall_i = tp_i / (tp_i + fn_i) = tp_i / support_i

top-1 accuracy = sum_i(tp_i) / sum_i(support_i) = sum_i(recall_i * support_i) / sum_i(support_i)
```

In [None]:
# Top-1 accuracy = sum_i(tp_i) / sum_i(support_i), with tp_i = recall_i * support_i.
# sklearn-style classification reports also contain aggregate rows; if any are
# present they must not be summed as if they were classes, so drop them first.
# NOTE(review): confirm which aggregate keys the report file actually contains.
aggregate_rows = ["accuracy", "micro avg", "macro avg", "weighted avg"]
per_faq_report = report_df.drop(index=aggregate_rows, errors="ignore")
(per_faq_report.recall * per_faq_report.support).sum() / per_faq_report.support.sum()

In [None]:
# Load the response-selection errors from the local RASA project (resolved under
# the user's home directory). Read explicitly as UTF-8 so non-ASCII text is
# decoded the same way on every platform.
errors_path = (
    Path.home() / "IDinsight" / "praekelt" / "rasa_test" / "results" / "response_selection_errors.json"
)
with open(errors_path, "r", encoding="utf-8") as fp:
    err = json.load(fp)

In [None]:
# Size of the validation set, for comparison with the number of errors below.
df_merged.shape

In [None]:
# Number of entries loaded from the errors file.
len(err)

In [None]:
# Tabulate the loaded errors for easier inspection.
err_df = pd.DataFrame(err)

In [None]:
# Preview the first error rows.
err_df.head()

In [None]:
# Look at the first prediction entry to see its structure.
err_df["intent_response_key_prediction"].iloc[0]

In [None]:
# Python type of each prediction entry.
err_df["intent_response_key_prediction"].map(type)

In [None]:
# Pull the predicted name and its confidence out of each prediction entry
# into flat columns.
predictions = err_df["intent_response_key_prediction"]
err_df["pred_faq_name"] = [prediction["name"] for prediction in predictions]
err_df["pred_proba"] = [prediction["confidence"] for prediction in predictions]

In [None]:
# Preview the errors with the new `pred_faq_name` / `pred_proba` columns.
err_df.head()

Compare with BERT

In [None]:
# S3 locations of the BERT experiment data and its batch-transform outputs.
s3_bucket = 'praekelt-static-resources'
s3_prefix = 'experiment/data/mc/question-answer-matching'
batch_transform_s3_prefix = f's3://{s3_bucket}/experiment/outputs/batch-transform/mc'
output_s3_path = f"{batch_transform_s3_prefix}/output_4xlarge"

# NOTE: unpickling executes code embedded in the file; only load pickles from a trusted bucket.
predictions_pickle_uri = f"{output_s3_path}/predictions_question_answer_pair_score.pkl"
pred = pd.read_pickle(predictions_pickle_uri)