In [3]:
import os
root = os.environ['PROJECT_ROOT']
import json
from tqdm import tqdm
import copy
import pdstools.infinity.client as client
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Authenticated Pega Infinity API client; the ingestion cell sends its requests through it.
# Only the user name is passed explicitly here; the remaining connection settings
# are presumably resolved by pdstools itself (e.g. from the environment) — TODO confirm.
cl = client.Infinity.from_basic_auth(
    user_name=os.environ["PEGA_USERNAME"],
    pega_version="24.2",
    timeout=1000000,  # NOTE(review): units not visible here, presumably seconds — confirm
)

# Ingestion

This notebook ingests a corpus into a specified data source. We first load a template JSON request file and then, for every document, fill in the necessary variables and attributes. The structure of such an ingestion request is described in the [Pega documentation](https://docs.pega.com/bundle/knowledge-buddy/page/knowledge-buddy/implementation/ingestion-api.html). Before running this notebook, make sure the preprocessing documented in the `pp_template.ipynb` notebook has been completed.

These are the only variables you need to change to customise your experiment.

In [4]:
# Experiment configuration: the only values that need editing per run.
dataset_name = "Sales"  # corpus sub-folder under data/ and the "dataSource" of every request
collection = "Experiments"  # written to the "collection" field of every request
# Built with os.path.join (like the corpus path) instead of a hard-coded "/" so the
# separator is always the right one for the host OS.
ingestion_template_path = os.path.join(root, "data", "ingestion", "ingestion_single_doc.json")
chunk_overlap = 200  # characters; sent as "chunkOverlap"
chunk_size = 1000  # characters; sent as "chunkSize"

## Create requests
Using the corpus and ingestion template, we create a request for every document in the corpus.

In [None]:
# Read the preprocessed corpus: a JSON object mapping document title -> document text.
corpus_path = os.path.join(root, "data", dataset_name, "corpus.json")
with open(corpus_path, "r", encoding="utf8") as corpus_file:
    corpus = json.load(corpus_file)

# Drop "title only" documents, i.e. entries whose text is identical to their title.
corpus = {
    doc_title: doc_text
    for doc_title, doc_text in corpus.items()
    if doc_title != doc_text
}

# Read the request template that is copied and filled in for each document.
with open(ingestion_template_path, "rb") as template_file:
    template = json.load(template_file)

# Container for the per-document ingestion requests.
ingestions = []

In [None]:
# Start from an empty list every time this cell runs. Appending to the list created
# in an earlier cell would add a second copy of every request when this cell is
# re-run, and with it duplicate documents at ingestion time.
ingestions = []

for title, content in corpus.items():
    # Deep copy so the nested "text" entries of one request are never shared with
    # the template or with any other request.
    ingestion = copy.deepcopy(template)
    ingestion["text"][0]["content"] = content
    ingestion["dataSource"] = dataset_name
    ingestion["title"] = title
    ingestion["collection"] = collection
    # The document title doubles as the object identifier of the request.
    ingestion["objectId"] = title
    ingestion["chunkSize"] = chunk_size
    ingestion["chunkOverlap"] = chunk_overlap

    ingestions.append(ingestion)

Create a quick plot to get some information about the data. If any documents are particularly large (anecdotally, more than 15,000 words), the ingestion may fail. In that case, you may want to split these documents across multiple requests.

In [None]:
# Descriptive statistics of document size, counted in whitespace-separated words.
texts_lengths = [len(text.split()) for text in corpus.values()]
m = np.mean(texts_lengths)
sd = np.std(texts_lengths)  # np.std with its defaults

print(
    f"Mean words per doc: {m}",
    f"SD words per doc: {sd}",
    f"Total docs: {len(corpus)}",
    f"Total words in corpus: {sum(texts_lengths)}",
    sep="\n",
)

# Histogram of the per-document word counts; the label goes on the axes seaborn drew on.
ax = sns.histplot(data=texts_lengths)
ax.set_xlabel("Document length (words)");

## Ingestion - ONLY RUN ONCE
This code ingests the corpus into the specified dataSource. Only do this once to prevent duplicate documents!

In [None]:
# ONLY RUN THIS ONCE! Every execution sends one PUT per prepared request.

# Build the endpoint once instead of on every iteration, and normalise the base URL
# so the result is correct whether or not PEGA_BASE_URL ends with a slash.
indexes_endpoint = os.environ["PEGA_BASE_URL"].rstrip("/") + "/indexes"

# Keep every response (previously only the last one survived the loop) so that
# failed ingestions can be identified afterwards.
responses = []
for request in tqdm(ingestions):
    response = cl.put(
        data=request,
        endpoint=indexes_endpoint,
    )
    responses.append(response)