# Preprocessing: SalesBuddy

The SalesBuddy corpus is spread across multiple source files, so it requires a few extra preprocessing steps.

In [None]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import csv
# Project root comes from the environment; a missing PROJECT_ROOT raises KeyError.
root = os.environ["PROJECT_ROOT"]

# All SalesBuddy inputs and outputs live under <root>/data/salesbuddy.
datapath = os.path.join(root, "data", "salesbuddy")

# Output directory for the human-written questions; created up front if absent.
human_questions_path = os.path.join(root, "data", "salesbuddy", "human")
os.makedirs(human_questions_path, exist_ok=True)

In [None]:
# Name of the column that holds the plain-text content in each source file,
# keyed by the source file's name prefix.
file_to_content_key = dict(
    academy_kbint="plain_content",
    docs_kbint="topic_plain",
    pegacom_files="Content",
    pegacom_pages="rendered_item_plain",
    saleshub_files="Content",
    saleshub_pages="rendered_item_plain",
)


In [None]:
corpus_path = os.path.join(datapath, "source_documents", "corpus_excel")

# Maps document title -> plain-text content, merged across all recognised
# source files. Duplicate titles are overwritten by whichever file is read last.
corpus = {}

# os.listdir returns entries in arbitrary order; sorting makes the merge
# (and therefore which duplicate title wins) reproducible between runs.
for f in sorted(os.listdir(corpus_path)):
    # Identify the source by filename prefix *before* parsing, so entries that
    # are not one of the known source files are skipped instead of being
    # handed to read_excel (which would fail on non-Excel files).
    matched_key = next((k for k in file_to_content_key if f.startswith(k)), None)
    if matched_key is None:
        continue
    print(f"key of {f}: {matched_key}")

    source = pd.read_excel(os.path.join(corpus_path, f), verbose=False)
    source_contents = source[file_to_content_key[matched_key]]
    source_titles = source['title']

    # Keep only rows that actually have content; the same mask is applied to
    # the titles so the two lists stay aligned row-for-row.
    has_content = source_contents.notna()
    source_contents_list = list(source_contents[has_content])
    source_titles_list = list(source_titles[has_content])

    source_subcorpus = dict(zip(source_titles_list, source_contents_list))
    corpus.update(source_subcorpus)

    


In [None]:
# Word count per document. split() with no argument splits on any run of
# whitespace, so repeated spaces do not produce empty "words" and newlines
# and tabs are treated as separators as well.
doc_lengths = [len(d.split()) for d in corpus.values()]


In [None]:

# Histogram of document lengths (in words) over 50 bins.
plt.hist(x=doc_lengths, bins=50)
plt.ylabel(ylabel="# Documents")
plt.xlabel(xlabel="# Words")

In [None]:
# Persist the merged title -> content mapping as indented JSON.
corpus_json_path = os.path.join(datapath, "corpus.json")
with open(corpus_json_path, "w", encoding="utf8") as f:
    f.write(json.dumps(corpus, indent=3))

Now for the questions

In [None]:
# Read the human-written questions. The encoding is given explicitly so the
# result does not depend on the platform's default codec (matching the utf8
# used for every file written here), and newline="" is what the csv module
# requires so that quoted fields containing line breaks are parsed correctly.
questions_csv_path = os.path.join(datapath, "source_documents", "SalesBuddyQuestions.csv")
with open(questions_csv_path, "r", encoding="utf8", newline="") as f:
    csv_reader = csv.DictReader(f)
    data = list(csv_reader)

# Keep only the prompt and its reference answer, renamed to the keys written
# to questions.json.
questions = [
    {"question": q["Prompt"], "reference": q["Answer"]}
    for q in data
]

with open(os.path.join(human_questions_path, "questions.json"), "w", encoding="utf8") as f:
    json.dump(questions, f, indent=3)