# Data preparation

Preprocess synthetic MomConnect data for testing OpenAI embeddings.

We use synthetic questions in order to abide by our data sharing agreement.

Ideally, we would split synthetic questions into the following:

* reference questions: for question-question matching
* training questions: for training BERT
* test questions: for evaluating BERT or OpenAI

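However, many FAQs have only 4 synthetic questions, which is too few for a three-way split. Below we therefore only set aside 2 reference questions per FAQ and keep the remaining ones as example questions.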

In [None]:
import pandas as pd
import s3fs


# Location of the raw FAQ sheet (CSV export) on S3
faqs_csv_uri = (
    "s3://praekelt-static-resources/experiment/data/"
    "[Sam] Helpdesk Q&A _ MOMZA _ FAQ Content.xlsx - FAQs.csv"
)

fs = s3fs.S3FileSystem()
with fs.open(faqs_csv_uri) as f:
    faqs = pd.read_csv(f)

In [None]:
import numpy as np
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

# Clean column names in FAQs file
column_map = {
    'Validation questions - USER GENERATED': 'questions_usr',
    'Validation questions - SYNTHETIC': 'questions_syn',
    'FAQ Content': 'faq_content',
    'FAQ Name': 'faq_name',
    'FAQ title': 'faq_title',
    'IDinsight Tags': 'faq_tags',
}
faqs = faqs.rename(columns=column_map)

# Keep only the columns we need (select with a plain list of labels)
faqs = faqs[list(column_map.values())]

# Drop rows we can't use: rows whose name is the literal header text 'FAQ Name',
# and rows without any user-generated questions.
# NOTE(review): the notebook intro says only synthetic questions are used, yet
# everything below is built from `questions_usr` and `questions_syn` is never
# read -- confirm which column the data sharing agreement allows.
faqs = faqs[faqs.faq_name != 'FAQ Name']
# Take an explicit copy: the assignments below should modify an independent
# frame, not a filtered slice (avoids SettingWithCopyWarning when pandas
# copy-on-write is not enabled).
faqs = faqs[faqs.questions_usr.notnull()].copy()

# Parse example questions column so each element is an array of questions
# (we use numpy arrays so we can index them with arrays of positions)
faqs["questions_usr"] = faqs.questions_usr.apply(lambda x: np.asarray(x.split('\n')))

# Clean Anaemia FAQ name
faqs.loc[faqs.faq_name == "Preg - ANAEMIA", 'faq_name'] = "Preg - Anemia"

# Keep FAQs with at least 4 example questions; copy again because later cells
# add columns to this frame.
faqs = faqs[faqs.questions_usr.apply(len) >= 4].copy()
# (Only relevant for question-question matching)
# Split into reference questions (tied to the FAQ) and example questions for training
rs = RandomState(MT19937(SeedSequence(123456789)))


def get_ref_split(l, n_ref=2):
    """Randomly split the positions of ``l`` into reference and remaining indices.

    The shuffle draws from the shared, seeded ``rs`` generator, so a given call
    is reproducible only if the same sequence of calls has been made since
    ``rs`` was created.

    Args:
        l: Sized collection of questions; only its length is used.
        n_ref: Number of positions to assign to the reference set. Defaults to
            2, the value that was previously hard-coded.

    Returns:
        Tuple ``(reference_idx, remaining_idx)`` of integer numpy arrays that
        together contain every position ``0 .. len(l) - 1`` exactly once. When
        ``l`` has fewer than ``n_ref`` items, all positions are reference
        positions and the second array is empty.

    Raises:
        ValueError: If ``n_ref`` is negative (a negative slice bound would
            silently produce overlapping splits).
    """
    if n_ref < 0:
        raise ValueError(f"n_ref must be non-negative, got {n_ref}")
    shuffled_positions = np.arange(len(l))
    rs.shuffle(shuffled_positions)
    return shuffled_positions[:n_ref], shuffled_positions[n_ref:]

# Draw a random (reference, remaining) pair of index arrays for every FAQ.
# get_ref_split shuffles with the seeded `rs` generator, so the splits depend
# on that generator's state when this cell runs.
faqs.loc[:, "_splits"] = faqs.questions_usr.apply(get_ref_split)
# Use the two index arrays to pick each FAQ's reference questions and its
# remaining questions out of the numpy array in `questions_usr`.
faqs.loc[:, "question_ref"] = faqs.apply(lambda x: x.questions_usr[x._splits[0]], axis=1)
faqs.loc[:, "question"] = faqs.apply(lambda x: x.questions_usr[x._splits[1]], axis=1)

# Cast numpy arrays into lists
for col in ['question', 'question_ref', 'questions_usr',]:
    faqs[col] = faqs[col].apply(lambda x: list(x))

In [None]:
faqs.head()

In [None]:
faqs.faq_content.nunique()

In [None]:
faqs.faq_name.nunique()

In [None]:
faqs.isnull().any()

In [None]:
# NOTE(review): `faq_content_embedding` is only added to `faqs` in a later cell
# (after the embeddings are computed), so this cell fails when the notebook is
# run top to bottom.
faqs[faqs.faq_content_embedding.isnull()]

Distribution of the number of synthetic questions per FAQ (excluding the 2 reference questions, which we'll use for validation)

In [None]:
faqs._splits.apply(lambda x: len(x[1])).hist(bins=10)

In [None]:
!pip install plotly

In [None]:
!pip install scipy

In [None]:
!pip install scikit-learn

In [None]:
import tiktoken

from openai.embeddings_utils import get_embedding, get_embeddings


# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002

In [None]:
max_tokens = 600  # the maximum for text-embedding-ada-002 is 8191

In [None]:
# Tokenise every FAQ body with the embedding model's encoding and count the tokens
encoding = tiktoken.get_encoding(embedding_encoding)

faqs_n_tokens = faqs.faq_content.map(encoding.encode).map(len)
faqs_n_tokens.describe()

In [None]:
faqs_n_tokens.sum()

OpenAI rate limits for pay-as-you-go (cf. [here](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb)):

* 60 requests / minute
* 250,000 davinci tokens / minute (and proportionally more for cheaper models)

Pricing

* \$0.0004  / 1K tokens for Ada
* \$0.0005  / 1K tokens for Babbage
* \$0.0020  / 1K tokens for Curie
* \$0.0200  / 1K tokens for Davinci

In [None]:
# Listed prices in USD per 1K tokens, converted below to USD per single token
usd_per_1k_tokens = {
    "Ada": 0.0004,
    "Babbage": 0.0005,
    "Curie": 0.002,
    "Davinci": 0.02,
}
pricing = {name: usd / 1000 for name, usd in usd_per_1k_tokens.items()}

# Cost of sending every FAQ once = per-token rate * total token count
print("Estimated cost for all FAQs")
for model, rate in pricing.items():
    print(f"{model}: ${rate * faqs_n_tokens.sum():.2f}")

In [None]:
import time

MAX_REQUESTS_PER_MIN = 60
# Seconds to pause before each request so that at most MAX_REQUESTS_PER_MIN are sent per minute
WAIT_SECONDS_PER_REQUEST = 60 / MAX_REQUESTS_PER_MIN


def delayed_get_embedding(text):
    """Embed ``text`` with ``embedding_model`` after a rate-limiting pause.

    Sleeps ``WAIT_SECONDS_PER_REQUEST`` seconds, then returns whatever
    ``get_embedding`` returns for ``text``.
    """
    time.sleep(WAIT_SECONDS_PER_REQUEST)
    embedding = get_embedding(text, engine=embedding_model)
    return embedding

In [None]:
faq_embeddings = get_embeddings(faqs.faq_content.tolist(), engine=embedding_model)

In [None]:
print(len(faq_embeddings))
print(len(faq_embeddings[0]))

In [None]:
!pip install pyarrow

In [None]:
# Attach each embedding to the FAQ it was computed for. `faq_embeddings` is in
# the row order of `faqs`, but pandas aligns a Series on index *labels* when it
# is assigned as a column, and the row filtering earlier in the notebook can
# leave `faqs` with gaps in its index. A Series with a default 0..n-1 index
# would then put embeddings on the wrong FAQs and leave NaN where a label is
# missing. Building the Series on `faqs.index` keeps the pairing positional
# (and raises if the two lengths differ).
faqs["faq_content_embedding"] = pd.Series(faq_embeddings, index=faqs.index).apply(np.asarray)
faqs[["faq_name", "faq_title", "faq_content_embedding"]].to_parquet("../data/faq_embeddings.parquet")

In [None]:
faqs.faq_content_embedding[faqs.faq_content_embedding.isnull()]

In [None]:
# NOTE(review): no cell leaves a `current_query_cossim` column on `faqs`, so
# this lookup raises KeyError when the notebook is run top to bottom; it looks
# like leftover debugging.
faqs["current_query_cossim"]

In [None]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# search through the FAQs
def get_top_k_faqs(query, k=10):
    """Return the names of the ``k`` FAQs whose content best matches ``query``.

    The query text is embedded with ``embedding_model`` and compared by cosine
    similarity against every entry of ``faqs.faq_content_embedding``. FAQs
    without an embedding get a NaN score, which sorts after all real scores.

    Args:
        query: Question text to search for.
        k: Maximum number of FAQ names to return.

    Returns:
        List of ``faq_name`` values ordered from most to least similar.
    """
    query_embedding = get_embedding(
        query,
        engine=embedding_model
    )
    embeddings = faqs.faq_content_embedding
    # Score only the rows that actually have an embedding; `assign` aligns the
    # scores on the index and leaves NaN for every other row.
    similarities = embeddings[embeddings.notnull()].apply(
        lambda emb: cosine_similarity(emb, query_embedding)
    )
    # Rank on a copy so the shared `faqs` frame is never modified, not even
    # temporarily (a failure during scoring cannot leave a stray column behind).
    ranked = faqs.assign(current_query_cossim=similarities).sort_values(
        "current_query_cossim", ascending=False
    )
    return ranked.faq_name.head(k).tolist()

In [None]:
top_10_faq_names = get_top_k_faqs("Is it normal to vomit every day for a week when I'm 16 weeks pregnant?", k=10)

In [None]:
top_10_faq_names

In [None]:
validation_data = faqs[faqs.faq_content_embedding.notnull()].explode("question_ref").reset_index()

In [None]:
validation_data.head()

In [None]:
validation_data.loc[validation_data.question_ref.apply(len) > 250, "question_ref"]

In [None]:
validation_data.loc[249].question_ref

In [None]:
validation_data.question_ref.apply(len).hist()

In [None]:
validation_data_questions = validation_data.question_ref.tolist()

In [None]:
# Embed the validation questions one request at a time. A failing request is
# printed and its position recorded instead of stopping the loop, so
# `query_embeddings_list` is shorter than `validation_data_questions` whenever
# something fails.
query_embeddings_list, failed_indices = [], []
for i, query in enumerate(validation_data_questions):
    try:
        query_embedding = get_embedding(query, engine=embedding_model)
    except Exception as e:
        print(f"{i}: {e}")
        failed_indices.append(i)
    else:
        query_embeddings_list.append(query_embedding)

In [None]:
validation_data_questions[142]

In [None]:
# Drop exactly the questions whose embedding request failed, so the remaining
# rows line up one-to-one with `query_embeddings_list`. This used to hard-code
# position 142, which is only right for a run in which that single request
# fails. The mask is positional, matching how `failed_indices` was recorded.
keep_rows = ~np.isin(np.arange(len(validation_data)), failed_indices)
# Copy so that columns added later go onto an independent frame, not a slice.
validation_data_drop_weird = validation_data[keep_rows].copy()

In [None]:
def get_top_k_faqs_for_embedding(query_embedding, k=10):
    """Return the names of the ``k`` FAQs closest to a precomputed query embedding.

    Each entry of ``faqs.faq_content_embedding`` is compared with
    ``query_embedding`` by cosine similarity. FAQs without an embedding get a
    NaN score, which sorts after all real scores.

    Args:
        query_embedding: Embedding vector of the query.
        k: Maximum number of FAQ names to return.

    Returns:
        List of ``faq_name`` values ordered from most to least similar.
    """
    embeddings = faqs.faq_content_embedding
    # Score only the rows that actually have an embedding; `assign` aligns the
    # scores on the index and leaves NaN for every other row.
    similarities = embeddings[embeddings.notnull()].apply(
        lambda emb: cosine_similarity(emb, query_embedding)
    )
    # Rank on a copy so the shared `faqs` frame is never modified, not even
    # temporarily (a failure during scoring cannot leave a stray column behind).
    ranked = faqs.assign(current_query_cossim=similarities).sort_values(
        "current_query_cossim", ascending=False
    )
    return ranked.faq_name.head(k).tolist()

In [None]:
validation_data_drop_weird["top10_pred"] = list(map(get_top_k_faqs_for_embedding, query_embeddings_list))

In [None]:
def _is_hit_within(top_n):
    """Build a row predicate: is the row's FAQ name among its first ``top_n`` predictions?"""
    def _predicate(row):
        return row.faq_name in row.top10_pred[:top_n]
    return _predicate


# One boolean column per cut-off: did the true FAQ appear in the top-k predictions?
for k in [1, 3, 5, 7, 10]:
    is_hit = _is_hit_within(k)
    validation_data_drop_weird[f"isin_top{k}"] = validation_data_drop_weird.apply(is_hit, axis=1)

In [None]:
for k in [1, 3, 5, 7, 10]:
    acc=validation_data_drop_weird[f'isin_top{k}'].mean()
    print(f"Top {k} accuracy: {acc:.1%}")

In [None]:
validation_data_drop_weird.head()

In [None]:
validation_data_drop_weird[["question_ref", "faq_name", "faq_title", "faq_content"]].to_csv("../data/synthetic_validation_data.csv", index=False)

In [None]:
faqs.columns

In [None]:
# faqs["faq_tags"]=faqs.faq_tags.str.strip().str.split(", ")

In [None]:
def _to_braced_tags(raw_tags):
    """Replace the first and last character of a tags value with curly braces.

    For example ``"[a, b]"`` becomes ``"{a, b}"``.
    NOTE(review): this assumes every non-missing value is already wrapped in a
    one-character delimiter pair -- confirm against the source sheet.

    Missing values are returned as ``"{}"``: ``str(nan)`` is ``"nan"``, so the
    plain slicing would otherwise turn a missing value into the bogus tag
    ``"{a}"``.
    """
    if raw_tags is None or (isinstance(raw_tags, float) and np.isnan(raw_tags)):
        return "{}"
    return "{" + str(raw_tags)[1:-1] + "}"


faqs["faq_tags"] = faqs.faq_tags.apply(_to_braced_tags)

In [None]:
faqs.faq_tags

In [None]:
# Export the FAQs that have an embedding, with the content column renamed for the consumer
export_columns = ["faq_name", "faq_title", "faq_content", "faq_tags", "questions_usr", "question_ref"]
has_embedding = faqs.faq_content_embedding.notnull()

faqs_export = faqs.loc[has_embedding, export_columns]
faqs_export = faqs_export.rename(columns={"faq_content": "faq_content_to_send"})
faqs_export.to_csv("../data/faqs_with_synthetic_questions.csv", index=False)

Validation with custom embeddings + WMD + scoring on the entire content:

* Top 1 accuracy: 0.28
* Top 5 accuracy: 0.59
* Top 10 accuracy: 0.72


With Davinci?

In [None]:
model='text-davinci-003'