Relies on 1.2

In [None]:
import pandas as pd
import s3fs


# S3 location of the cleaned ("_clean") validation CSV.
bucket = "s3://praekelt-static-resources/"
val_uri = "validation_aaq/validation_khumo_labelled_phase1and2_with_dem_context_clean.csv"

# Build the full object path first so it can be inspected before loading.
clean_validation_path = bucket + val_uri
df = pd.read_csv(clean_validation_path)

In [None]:
# Same file name as above but without the "_clean" suffix; df2 is the frame
# the remaining cells work on.
raw_validation_path = bucket + "validation_aaq/validation_khumo_labelled_phase1and2_with_dem_context.csv"
df2 = pd.read_csv(raw_validation_path)

In [None]:
df2.shape

In [None]:
df2.columns

Remove PII

In [None]:
# Take the question text of message q31593 (first matching row) and drop its
# last two whitespace-separated tokens; the remaining tokens are re-joined
# with single spaces.
rows_for_message = df2[df2.question_msg_id == "q31593"]
original_text = rows_for_message.iloc[0].Question
question_tokens = original_text.split()
pii_removed_text = " ".join(question_tokens[:-2])

In [None]:
pii_removed_text

In [None]:
# Write the scrubbed text back into df2.
# The previous form, df2[mask].iloc[0].Question = ..., assigned to a temporary
# copy produced by the boolean filter, so df2 itself was never modified and the
# PII stayed in the data. A single .loc assignment targets df2 directly.
# Only the first matching row is updated, mirroring the .iloc[0] used to read
# the original text.
pii_row_labels = df2.index[df2.question_msg_id == "q31593"][:1]
df2.loc[pii_row_labels, "Question"] = pii_removed_text

Add labels indicating whether the chosen FAQ is the correct answer

In [None]:
# Phase 1 labelling sheet; the "Notes from Khumo: " column is used as the
# correct-answer label in a later cell.
phase1_labels_path = "s3://praekelt-static-resources/experiment/data/MC_ FAQ labelling for messages (For Khumo) - Phase 1.csv"
df3 = pd.read_csv(phase1_labels_path)

In [None]:
df3

In [None]:
df3.columns

In [None]:
# Reduce the labelling sheet to (question_msg_id, correct_answer) pairs:
# rename the notes column, keep the two columns of interest, drop rows where
# either value is missing, and renumber the index from zero.
renamed_labels = df3.rename(columns={"Notes from Khumo: ": "correct_answer"})
label_pairs = renamed_labels[["question_msg_id", "correct_answer"]]
correct_answer_labels = label_pairs.dropna().reset_index(drop=True)

In [None]:
# Fill gaps in df2.correct_answer with the labels from the labelling sheet.
#
# A left merge emits one output row per matching right-hand row, so a
# question_msg_id that appears more than once in the labels would make the
# merged frame longer than df2 and the fill would land on the wrong rows.
# Keeping one label per id guarantees the merged frame has exactly one row per
# df2 row, in df2's order.
unique_answer_labels = correct_answer_labels.drop_duplicates(subset="question_msg_id")
merged_answers = df2.merge(unique_answer_labels, on="question_msg_id", how="left")

# correct_answer_x is df2's existing column, correct_answer_y the merged label;
# the existing value wins and the label only fills missing entries.
filled_answers = merged_answers.correct_answer_x.fillna(merged_answers.correct_answer_y)

# Assign by position so the result does not depend on df2's index matching the
# merge result's fresh 0..n-1 index.
df2["correct_answer"] = filled_answers.to_numpy()

# 2. Evaluate OpenAI Embeddings model

### 2.2 Get embeddings for FAQ

In [None]:
from openai.embeddings_utils import get_embedding, get_embeddings

# Embedding model parameters.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002

In [None]:
# Pre-computed FAQ embeddings. Later cells read the `faq_name` and
# `faq_content_embedding` columns of this frame.
# NOTE(review): presumably this parquet file is the output of step 1.2 that the
# notebook header refers to — confirm.
faq_embeddings = pd.read_parquet("../data/faq_embeddings_updated.parquet")

### 2.3 Get embeddings for queries

#### Check query data

In [None]:
df2.Question

#### Get embeddings for queries

In [None]:
# Build the evaluation frame: rows of df2 that have a question, restricted to
# the id, question text, correctness label and labelled FAQ name, with
# snake_case column names.
validation_df = (
    df2.loc[
        df2.Question.notnull(),
        ["question_msg_id", "Question", "correct_answer", "FAQ Name"],
    ]
    .rename(columns={"FAQ Name": "faq_name", "Question": "question"})
    .copy()
)

# Normalise the spelling of one FAQ name.
# This uses a single .loc call on the frame; the earlier chained form
# (validation_df.faq_name.loc[...] = ...) assigns through an intermediate
# Series, which pandas warns about and which does not update the frame when
# Copy-on-Write is enabled.
validation_df.loc[validation_df.faq_name == "Preg - Anemia", "faq_name"] = "Preg - Anaemia"

In [None]:
validation_questions = validation_df.question.tolist()

In [None]:
# Embed every validation question in batches.
#
# The earlier version made exactly two calls split at row 2048. That fails
# once there are more than 4096 questions (the second batch grows past the
# per-call size) and sends an empty batch when there are 2048 or fewer.
# Batching over the whole list handles any number of questions.
EMBEDDING_BATCH_SIZE = 2048  # same per-call size as the original split

questions_to_embed = validation_df.question.tolist()
embedding_batches = [
    get_embeddings(
        questions_to_embed[start:start + EMBEDDING_BATCH_SIZE],
        engine=embedding_model,
    )
    for start in range(0, len(questions_to_embed), EMBEDDING_BATCH_SIZE)
]

# Keep the two names used by the following cells: the first batch, and all
# remaining batches flattened in order, so that
# question_embeddings1 + question_embeddings2 is still the full list in row order.
question_embeddings1 = embedding_batches[0] if embedding_batches else []
question_embeddings2 = [
    embedding for batch in embedding_batches[1:] for embedding in batch
]

In [None]:
len(question_embeddings1) + len(question_embeddings2)

In [None]:
validation_df.shape[0]

In [None]:
# Attach the embeddings to their questions. The concatenated list is assigned
# positionally, so its length must equal the number of rows in validation_df
# (the two preceding cells display both numbers for comparison).
validation_df["embeddings"] = question_embeddings1+question_embeddings2

In [None]:
# numpy is needed here; elsewhere in this notebook it is only imported in a
# much later cell, so running the cells top to bottom raised NameError on `np`.
import numpy as np

# Convert each embedding (a Python list) into a numpy array.
validation_df["embeddings"] = validation_df.embeddings.apply(np.asarray)

In [None]:
# Save the questions together with their embeddings to a local parquet file.
validation_df.to_parquet("../data/validation_df.parquet")

In [None]:
# Check for missing FAQ embeddings.
# NOTE(review): `faqs` is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh run. A later cell runs the same check on
# `faq_embeddings`; confirm whether `faqs` should be loaded somewhere or this
# cell should use `faq_embeddings`.
faqs.faq_content_embedding.isnull().any()

### 2.4 Compute top K accuracies

In [None]:
faq_embeddings

In [None]:
# FAQ names used as labels in the validation data that have no entry in the
# FAQ embeddings table (an empty set means every label can be predicted).
# `validation_embeddings` is not defined anywhere in this notebook;
# `validation_df` holds the `faq_name` column being checked.
# NOTE(review): confirm validation_df is the intended frame.
set(validation_df.faq_name.unique()) - set(faq_embeddings.faq_name.unique())

In [None]:
# Normalise the spelling of one FAQ name so it matches the FAQ table.
# `validation_embeddings` is not defined anywhere in this notebook, so the fix
# is applied to `validation_df` (NOTE(review): confirm). A single .loc call on
# the frame replaces the chained Series assignment, which does not update the
# frame when pandas Copy-on-Write is enabled.
validation_df.loc[validation_df.faq_name == "Preg - Anemia", "faq_name"] = "Preg - Anaemia"

In [None]:
faq_embeddings.faq_name.unique()

In [None]:
from openai.embeddings_utils import cosine_similarity


def get_top_k_faqs_for_embedding(query_embedding, k=10, faqs=None, similarity=None):
    """Return the names of the k FAQs most similar to a query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        k: Number of FAQ names to return (fewer if the table has fewer rows).
        faqs: DataFrame with `faq_name` and `faq_content_embedding` columns.
            Defaults to the notebook-level `faq_embeddings` frame.
        similarity: Callable `(faq_embedding, query_embedding) -> score`.
            Defaults to the notebook-level `cosine_similarity`.

    Returns:
        List of `faq_name` values ordered from highest to lowest similarity.
    """
    if faqs is None:
        faqs = faq_embeddings
    if similarity is None:
        similarity = cosine_similarity

    scores = faqs.faq_content_embedding.apply(
        lambda faq_embedding: similarity(faq_embedding, query_embedding)
    )

    # Rank on a small scratch frame instead of adding and then deleting a
    # column on the shared FAQ table: the table is never mutated (so an
    # interrupted call cannot leave a stray column behind) and the sort does
    # not have to move the embedding column around.
    ranked = (
        faqs[["faq_name"]]
        .assign(current_query_cossim=scores)
        .sort_values("current_query_cossim", ascending=False)
    )
    return ranked.head(k).faq_name.tolist()

In [None]:
# For every query embedding, store the ranked list of predicted FAQ names
# (the helper's default k of 10 is used).
query_vectors = validation_df.embeddings.tolist()
validation_df["top10_pred"] = [
    get_top_k_faqs_for_embedding(query_vector) for query_vector in query_vectors
]

In [None]:
# For each cutoff k, add a boolean column isin_top{k} telling whether the
# labelled FAQ name is among the first k entries of top10_pred.
def _labelled_faq_in_first(cutoff):
    def _check(row):
        return row["faq_name"] in row["top10_pred"][:cutoff]

    return _check


for k in (1, 3, 5, 7, 10):
    validation_df[f"isin_top{k}"] = validation_df.apply(
        _labelled_faq_in_first(k), axis=1
    )

In [None]:
validation_df.isnull().any()

In [None]:
# Rows that have a labelled FAQ name; the accuracy cells below are restricted
# to these rows.
valid_mask = validation_df.faq_name.notnull()

In [None]:
# Top-k accuracy over all rows with a labelled FAQ, one labelled line per k.
print("For all answers")
labelled_rows = validation_df.loc[valid_mask]
for k in (1, 3, 5, 7, 10):
    acc = labelled_rows[f'isin_top{k}'].mean()
    print(f"Top {k} accuracy: {acc:.1%}")

In [None]:
# Same accuracies as the previous cell, printed as bare percentages.
print("For all answers")
for k in (1, 3, 5, 7, 10):
    hits_at_k = validation_df[f'isin_top{k}'][valid_mask]
    acc = hits_at_k.mean()
    print(f"{acc:.1%}")

In [None]:
# Top-k accuracy restricted to rows whose correct_answer label is "yes"
# (case-insensitive), one labelled line per k.
print("For correct answers")
answered_yes = validation_df.correct_answer.str.lower() == "yes"
correct_rows = valid_mask & answered_yes
for k in (1, 3, 5, 7, 10):
    acc = validation_df.loc[correct_rows, f'isin_top{k}'].mean()
    print(f"Top {k} accuracy: {acc:.1%}")

In [None]:
# Same "yes"-only accuracies as the previous cell, printed as bare percentages.
print("For correct answers")
is_yes = validation_df.correct_answer.str.lower().eq("yes")
for k in (1, 3, 5, 7, 10):
    selected_hits = validation_df.loc[valid_mask & is_yes, f'isin_top{k}']
    acc = selected_hits.mean()
    print(f"{acc:.1%}")

In [None]:
# Distribution of the (lower-cased) correct_answer labels.
# `validation_embeddings` is not defined anywhere in this notebook;
# `validation_df` holds the `correct_answer` column used by the accuracy cells.
# NOTE(review): confirm validation_df is the intended frame.
validation_df.correct_answer.str.lower().value_counts()

## 3. Save validation data to upload to S3

Add demographic context columns

In [None]:
# Demographic-context columns to carry over from df2.
dem_ctx_cols = [
    'fields_edd',
    'calculated_age',
    'calculated_weeks_preg',
    'num_babies',
    'most_recent_baby_dob',
    'most_recent_baby_age_weeks',
]

# Select the same rows validation_df was built from (those with a question)
# and append the columns side by side; concat with axis=1 aligns on the index.
rows_with_question = df2.Question.notnull()
dem_ctx = df2.loc[rows_with_question, dem_ctx_cols]
validation_df = pd.concat([validation_df, dem_ctx], axis=1)

In [None]:
bucket

In [None]:
# Upload the full validation frame (questions, embeddings, predictions,
# accuracy flags and demographic columns) to S3 as parquet, without the index.
validation_df.to_parquet("s3://praekelt-static-resources/experiment/data/mc/mc_openai_validation.parquet", index=False)

In [None]:
# Upload the same frame as CSV, without the index.
# NOTE(review): the `embeddings` column holds numpy arrays (see the np.asarray
# conversion above) and `top10_pred` holds lists; CSV stores their string
# form, and numpy may abbreviate long arrays with "..." — confirm the CSV is
# not expected to round-trip the embeddings.
validation_df.to_csv("s3://praekelt-static-resources/experiment/data/mc/mc_openai_validation.csv", index=False)

In [None]:
faqs.info()

In [None]:
faq_embeddings.faq_content_embedding.isnull().any()

In [None]:
faq_embeddings.info()

In [None]:
faq_embeddings.merge(faqs)

In [None]:
# Join the FAQ embeddings with `faqs` (merge on their shared columns, pandas'
# default) and upload the result to S3 as CSV, without the index.
# NOTE(review): `faqs` is not defined in any cell visible in this notebook.
# NOTE(review): `faq_content_embedding` values are written in string form;
# confirm the CSV is not expected to round-trip the embeddings.
faq_embeddings.merge(faqs).to_csv("s3://praekelt-static-resources/experiment/data/mc/mc_openai_faqs.csv", index=False)

In [None]:
# pickle was used below without ever being imported, which raised NameError.
import pickle

import numpy as np

# Load the rank of the true FAQ produced by the custom (AAQ) model.
# NOTE(review): this is an absolute path on one developer's machine; the cell
# cannot run elsewhere without changing it.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load files from a trusted source. The file has a .npy extension; if it was
# written with np.save, np.load would be the matching reader — confirm.
with open("/Users/suzinyou/IDinsight/praekelt/templates/aaq_core_template/validation/true_faq_rank.npy", "rb") as fp:
    aaq_rank = pickle.load(fp)

In [None]:
validation_df.shape

In [None]:
valid_mask.sum()

In [None]:
len(aaq_rank)

In [None]:
# Default every row to an infinite rank so rows without a labelled FAQ never
# count as a hit for any k.
validation_df["aaq_rank"] = np.inf
# Fill in the loaded ranks for the labelled rows. The assignment is positional
# within the masked rows, so len(aaq_rank) must equal valid_mask.sum() (the
# preceding cells display both values).
validation_df.loc[valid_mask, "aaq_rank"] = aaq_rank

In [None]:
# Share of "yes"-labelled rows whose true FAQ is ranked within the top k by
# the custom embeddings model.
print("Custom embeddings model")
print("For correct answers")
labelled_yes = valid_mask & (validation_df.correct_answer.str.lower() == "yes")
ranks_for_correct = validation_df.loc[labelled_yes, "aaq_rank"]
for k in (1, 3, 5, 7, 10):
    acc = (ranks_for_correct <= k).mean()
    print(f"{acc:.1%}")

In [None]:
# Share of all labelled rows whose true FAQ is ranked within the top k by the
# custom embeddings model.
print("Custom embeddings model")
print("For all answers")
ranks_for_labelled = validation_df.loc[valid_mask, "aaq_rank"]
for k in (1, 3, 5, 7, 10):
    acc = (ranks_for_labelled <= k).mean()
    print(f"{acc:.1%}")