In [None]:
import pandas as pd

# S3 bucket holding the validation resources.
validation_bucket = 'praekelt-static-resources'
# Key (inside the bucket) of the labelled validation questions CSV.
validation_data_prefix = 'mc_validation/validation_khumo_labelled_aaq.csv'
# Path of the FAQ sheet export; only its final path component (the file name)
# is used when the FAQs are loaded further down.
validation_faqs_prefix = 'experiment/data/[Sam] Helpdesk Q&A _ MOMZA _ FAQ Content.xlsx - FAQs.csv'

In [None]:
# Load the labelled validation questions from S3 using the bucket/key set above.
df = pd.read_csv(f"s3://{validation_bucket}/{validation_data_prefix}")

In [None]:
# Preview the first rows of the validation data.
df.head()

In [None]:
# The FAQ sheet is read from a local CSV export in the current user's
# ~/Downloads folder. The home directory is resolved at run time so the cell
# is not tied to one specific user account.
# NOTE(review): validation_faqs_prefix looks like a key inside
# validation_bucket -- if the export is also on S3, reading it from there
# would remove the local-file dependency entirely. Confirm and switch.
from pathlib import Path

faqs_csv_path = Path.home() / "Downloads" / validation_faqs_prefix.split('/')[-1]
faqs = pd.read_csv(faqs_csv_path)

In [None]:
# Preview the raw FAQ sheet as loaded.
faqs.head()

The preprocessing below follows `experiments/bert_sagemaker/notebooks/1.1-sy-MC-preprocess_mc_data.ipynb`:

In [None]:
import numpy as np

In [None]:
# Short snake_case names for the spreadsheet's verbose column headers.
column_map = {
    'Validation questions - USER GENERATED': 'questions_usr',
    'Validation questions - SYNTHETIC': 'questions_syn',
    'FAQ Content': 'faq_content',
    'FAQ Name': 'faq_name',
    'FAQ title': 'faq_title',
}
# Rename, then keep only the mapped columns. list(...) hands pandas a plain
# list of labels rather than a dict view.
faqs = faqs.rename(columns=column_map)
faqs = faqs[list(column_map.values())]

# Drop rows whose faq_name repeats the header text and rows that have no user
# questions. .copy() makes the result an independent frame, so the column
# assignments below do not write into a filtered slice.
faqs = faqs[(faqs.faq_name != 'FAQ Name') & faqs.questions_usr.notnull()].copy()

# Each cell holds one question per line -> array of question strings.
faqs["questions_usr"] = faqs.questions_usr.apply(lambda x: np.asarray(x.split('\n')))
# Normalise the one FAQ name that is spelled differently.
faqs.loc[faqs.faq_name == "Preg - ANAEMIA", 'faq_name'] = "Preg - Anemia"

# Keep only FAQs with at least 4 user questions.
faqs = faqs[faqs.questions_usr.apply(len) >= 4].copy()

from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

# Fixed-seed random state (MT19937 seeded with 123456789); get_ref_split
# below shuffles with it.
rs = RandomState(MT19937(SeedSequence(123456789)))

def get_ref_split(l, n_ref=2, rng=None):
    """Randomly split the positions of ``l`` into reference and remaining indices.

    Args:
        l: Any sized collection; only ``len(l)`` is used.
        n_ref: Number of shuffled positions to return as the reference part.
            Defaults to 2, the value that was previously hard-coded.
        rng: Object exposing ``shuffle`` (e.g. ``numpy.random.RandomState``).
            When omitted, the notebook-level ``rs`` is used, as before.

    Returns:
        A tuple ``(ref_idx, rest_idx)`` of integer index arrays: the first
        ``n_ref`` entries of a random permutation of ``range(len(l))`` and the
        entries after them. If ``len(l) < n_ref`` the first array is simply
        shorter and the second is empty.
    """
    if rng is None:
        # Fall back to the shared seeded generator defined in the notebook.
        rng = rs
    order = np.arange(len(l))
    rng.shuffle(order)  # in-place permutation
    return order[:n_ref], order[n_ref:]

def _take_questions(row, part):
    """Return the questions of ``row`` selected by the index array ``row._splits[part]``."""
    return row.questions_usr[row._splits[part]]


# Draw one (reference, remaining) index split per FAQ, then materialise the
# two question subsets from it.
faqs.loc[:, "_splits"] = faqs.questions_usr.apply(get_ref_split)
faqs.loc[:, "question_ref"] = faqs.apply(_take_questions, axis=1, part=0)
faqs.loc[:, "question"] = faqs.apply(_take_questions, axis=1, part=1)

# Turn the per-row arrays into plain Python lists.
for list_col in ('question', 'question_ref', 'questions_usr'):
    faqs[list_col] = faqs[list_col].apply(list)

In [None]:
# Inspect the FAQ frame after preprocessing.
faqs.head()

In [None]:
# Show the FAQ names that contain a "/" character.
has_slash = faqs.faq_name.apply(lambda name: "/" in name)
faqs.faq_name[has_slash]

In [None]:
def _to_rasa_name(faq_name):
    """Build the identifier used for a FAQ in the generated YAML.

    Lower-cases the name, turns " - " into "-", removes " / ", and joins the
    remaining whitespace-separated words with "_".
    """
    # NOTE(review): " / " is removed without a replacement, so the words on
    # either side are fused, and a "/" without surrounding spaces is left in
    # the name -- confirm both are intended.
    lowered = faq_name.lower()
    compact = lowered.replace(" - ", "-").replace(" / ", "")
    return '_'.join(compact.split())


faqs["faq_name_rasa"] = faqs.faq_name.apply(_to_rasa_name)

## FAQs data
The FAQ answers need to be written in the following Rasa `responses` format:

```yaml
responses:
  utter_chitchat/ask_name:
  - image: "https://i.imgur.com/zTvA58i.jpeg"
    text: Hello, my name is Retrieval Bot.
  - text: I am called Retrieval Bot!
  utter_chitchat/ask_weather:
  - text: Oh, it does look sunny right now in Berlin.
    image: "https://i.imgur.com/vwv7aHN.png"
  - text: I am not sure of the whole week but I can see the sun is out today.

```

In [None]:
# Template for one response entry; {content} is placed inside a YAML
# double-quoted scalar, so it must be escaped accordingly.
faq_content_fmt = """  utter_faq/{name}:
  - text: \"{content}\"
"""

yaml_text = """responses:
"""


def _yaml_double_quoted(text):
    """Escape ``text`` for use inside a YAML double-quoted scalar.

    Backslashes and double quotes are backslash-escaped, and line breaks are
    written as the ``\\n`` escape so multi-line FAQ content stays on one
    physical line instead of being folded by the YAML parser.
    """
    return (
        text.replace("\\", "\\\\")  # first, so the escapes added below are not doubled
        .replace("\"", "\\\"")
        .replace("\r\n", "\\n")
        .replace("\n", "\\n")
    )


# One entry per FAQ row, appended after the "responses:" header.
faq_yaml_text = yaml_text + "\n".join(
    faqs.apply(
        lambda faq: faq_content_fmt.format(
            name=faq.faq_name_rasa,
            content=_yaml_double_quoted(faq.faq_content),
        ),
        axis=1,
    ).tolist()
)

In [None]:
# Print the generated responses YAML as plain text.
print(faq_yaml_text)

## Training data

The NLU training examples need to be written in the following format:

```yaml
nlu:
  - intent: chitchat/ask_name
    examples: |
      - What is your name?
      - May I know your name?
      - What do people call you?
      - Do you have a name for yourself?
  - intent: chitchat/ask_weather
    examples: |
      - What's the weather like today?
      - Does it look sunny outside today?
      - Oh, do you mind checking the weather for me please?
      - I like sunny days in Berlin.
```

In [None]:
# List the columns available on the FAQ frame.
faqs.columns

In [None]:
# Template for one intent entry; {questions} receives the pre-indented
# example lines.
training_data_fmt = """  - intent: faq/{name}
    examples: |
{questions}"""


def _intent_block(faq):
    """Render one FAQ row as an intent entry listing all of its questions."""
    example_lines = [f"      - {q}" for q in faq.question + faq.question_ref]
    return training_data_fmt.format(
        name=faq.faq_name_rasa,
        questions='\n'.join(example_lines),
    )


nlu_header = """nlu:
"""
intent_blocks = faqs.apply(_intent_block, axis=1).tolist()
nlu_yaml_text = nlu_header + "\n".join(intent_blocks)

In [None]:
# Print the generated NLU training YAML as plain text.
print(nlu_yaml_text)

In [None]:
import os
def addToClipBoard(text):
    """Copy ``text`` (with surrounding whitespace stripped) to the system clipboard.

    The text is piped to the platform's clipboard tool on stdin instead of
    being spliced into a shell command line, so quotes, newlines, ``|``, ``&``
    and other shell metacharacters in the text are copied literally.

    Args:
        text: String to copy.

    Raises:
        RuntimeError: On platforms other than Windows/macOS when none of
            ``wl-copy``, ``xclip`` or ``xsel`` is found on PATH.
        subprocess.CalledProcessError: If the clipboard tool exits non-zero.
    """
    import shutil
    import subprocess
    import sys

    if sys.platform.startswith("win"):
        # NOTE(review): clip.exe is fed UTF-16 (with BOM) so non-ASCII text
        # is preserved -- confirm on a Windows machine.
        command, encoding = ["clip"], "utf-16"
    elif sys.platform == "darwin":
        command, encoding = ["pbcopy"], "utf-8"
    else:
        candidates = (
            ["wl-copy"],
            ["xclip", "-selection", "clipboard"],
            ["xsel", "--clipboard", "--input"],
        )
        command = next((c for c in candidates if shutil.which(c[0])), None)
        if command is None:
            raise RuntimeError(
                "No clipboard tool found; install wl-copy, xclip or xsel."
            )
        encoding = "utf-8"

    subprocess.run(command, input=text.strip().encode(encoding), check=True)

In [None]:
# Put the generated responses YAML on the system clipboard.
addToClipBoard(faq_yaml_text)