In [None]:
%pip install -r requirements.txt -q

In [None]:
%pip install tiktoken -q

# Create Vectorstore
In this notebook we create a vector store in our existing PostgreSQL Aurora Serverless database.
We will store the Zendesk articles from our knowledge base in it. We will also build a way to track which articles have already been stored.

## Database Connection

In [None]:
from utils import db

In [None]:
# Database engine exposed by the project's `utils.db` module; it is handed to
# pandas `to_sql` when the tracking table is written further down.
engine = db.engine

## Get Zendesk articles

In [None]:
from utils.zendesk_articles import loader
import json
import pandas as pd

In [None]:
# Alias for the loader object imported from `utils.zendesk_articles`.
zendesk_loader = loader

In [None]:
# Load the knowledge-base articles through the loader's `load()` method
# (presumably a call to the Zendesk API — confirm in `utils.zendesk_articles`).
# `documents` is the input for the tracking table and the embeddings below.
documents = zendesk_loader.load()

In [None]:
# Report how many articles were loaded, then preview the first two.
n_articles = len(documents)
print(f"Nr of articles in the knowledge base {n_articles}")
documents[:2]

## Create a log in the database of existing articles and their latest updates
1. Extract the article ID and the latest update date
2. Store in a simple tracking table in RDS

In [None]:
# Collect the `metadata` attribute of every loaded document and preview the
# first two entries.
metadata_list = [d.metadata for d in documents]
metadata_list[:2]

In [None]:
# Build the tracking frame from the article metadata and parse `updated_at`
# into real datetimes.
md_df = pd.DataFrame(metadata_list)
md_df["updated_at"] = pd.to_datetime(md_df["updated_at"])
# `DataFrame.info()` prints its summary and returns None, so bundling it in a
# tuple with `head()` displayed `(None, <plain-text frame>)`. Call it on its own
# and leave `head()` as the last expression so it gets the rich display.
md_df.info()
md_df.head()

In [None]:
# Append the article metadata to the `zendesk_kb_tracker` table and report the
# outcome either way; a failure is printed instead of stopping the notebook.
# NOTE(review): with if_exists="append", re-running this cell will presumably
# insert the same rows again — confirm whether the table enforces uniqueness.
try:
    md_df.to_sql("zendesk_kb_tracker", con=engine, index=False, if_exists="append")
except Exception as err:
    print(f"An error occurred: {err}")
else:
    print("Data inserted successfully.")

## Work with embeddings
How long are our articles, and do we need to chunk them?

In [None]:
# Flatten every document into one record (id, title, updated_at, text) and add
# the article length in whitespace-separated words as `text_len`.
docs = [d.to_dict() for d in documents]
doc_list = [
    {
        "id": entry["metadata"].get("id", ""),
        "title": entry["metadata"].get("title", ""),
        "updated_at": entry["metadata"].get("updated_at", ""),
        "text": entry.get("text", ""),
    }
    for entry in docs
]
doc_df = pd.DataFrame(doc_list)
doc_df["text_len"] = doc_df["text"].apply(lambda body: len(body.split()))
doc_df
                                          

In [None]:
# Length of the longest article, measured in whitespace-separated words.
doc_df["text_len"].max()

# Embeddings

In [None]:
# Number of loaded documents that are converted and embedded below.
len(documents)

In [None]:
# Convert every loaded document with its `to_langchain_format()` method.
# This rebinds `docs`, which an earlier cell used for the dict form of the
# documents. Print the count and show the first converted document as a check.
docs = [doc.to_langchain_format() for doc in documents]
print(len(docs))
docs[0]

In [None]:
import boto3
import json
import os
import sys

# Put the parent directory on sys.path before importing from `utils` below.
module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock

# Bedrock client that is passed to both the embedding model and the LLM later
# in the notebook. The role to assume is read from the BEDROCK_ASSUME_ROLE
# environment variable (None when unset); the region is fixed to eu-central-1.
bedrock_client = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region="eu-central-1",
    runtime=True # Default. Needed for invoke_model() from the data plane
)

In [None]:
from utils.TokenCounterHandler import TokenCounterHandler

# Callback handler instance; it is registered on the RetrievalQA chain through
# its `callbacks` argument further down.
token_counter = TokenCounterHandler()

In [None]:
# Import from `langchain_community`, the package this notebook already uses for
# PGVector; the `langchain.embeddings` path is only a deprecated re-export.
from langchain_community.embeddings import BedrockEmbeddings

# Titan text-embedding model, called through the Bedrock client created above.
embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1",
    client=bedrock_client,
)

In [None]:
from langchain_community.vectorstores.pgvector import PGVector

In [None]:
# Name of the PGVector collection the articles are written to and loaded from.
COLLECTION_NAME = "zendesk_articles"

## Create Vector Database

In [None]:
# `db` was imported as the project's database module at the top of the notebook,
# and this cell rebinds `db` to the vector store. Import the module under a
# separate alias so the cell can be re-run: after the first run the previous
# code looked up `connection_str` on the PGVector instance instead of the module.
from utils import db as db_module

# Embed every document and store it in the `COLLECTION_NAME` collection.
db = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=db_module.connection_str,
)

## If you already have a database, load it instead

In [None]:
# `conn_string` is not defined anywhere in the visible notebook, so this cell
# raised a NameError. Take the connection string from the project's db module
# instead; the module is imported under an alias because `db` itself is bound
# to the vector store here.
from utils import db as db_module

db = PGVector.from_existing_index(
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=db_module.connection_str,
)

## Test the retrieval mechanism

In [None]:
# Sample question used for the similarity search and the first QA run below.
query ="What do I need to do when I create a new case?"

In [None]:
# Fetch the two best-matching documents (k=2) together with their scores.
docs_with_score = db.similarity_search_with_score(query,k=2)

In [None]:
# Print each retrieved document with its score, framed by separator lines.
separator = "-" * 80
for hit, hit_score in docs_with_score:
    print(separator)
    print("Score: ", hit_score)
    print(hit.page_content)
    print(separator)

# Try Question answering
Let's try a chain.

## First load an LLM

In [None]:
# Import from `langchain_community`, the package this notebook already uses for
# PGVector; the `langchain.llms.bedrock` path is only a deprecated re-export.
from langchain_community.llms import Bedrock

# Generation settings forwarded to the model: at most 512 sampled tokens, no
# custom stop sequences, temperature 0 and top_p 0.5.
model_kwargs_claude = {
    "max_tokens_to_sample": 512,
    "stop_sequences": [],
    "temperature": 0,
    "top_p": 0.5,
}

# Anthropic Claude model, called through the Bedrock client created above.
llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=bedrock_client,
    model_kwargs=model_kwargs_claude,
)

In [None]:
# Smoke test: send a trivial prompt to check that the model responds.
llm.invoke("Whats up")

## Set retriever from the db

In [None]:
# Expose the vector store as a retriever; `k=2` is passed as the search setting.
retriever =db.as_retriever(search_kwargs={"k": 2})

In [None]:
from langchain.chains import RetrievalQA

# Question-answering chain over the retriever: the source documents are returned
# with each answer and `token_counter` is registered as a callback.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    callbacks=[token_counter],
    return_source_documents=True,
)

In [None]:
# Run the chain on the current query; print the answer, then the source documents.
result = qa.invoke({"query": query})
answer, sources = result["result"], result["source_documents"]
print(answer)

print(f"\n{sources}")

In [None]:
# Display the answer as the cell's output value instead of printing it.
result["result"]

In [None]:
# Print the stored text of the first article whose title is "Log in Issues".
login_rows = doc_df["title"] == "Log in Issues"
print(doc_df.loc[login_rows, "text"].values[0])

In [None]:
# Second example question, run through the same chain and printed the same way.
query = "I have problems with my registration, what are the steps I need to follow?"
result = qa.invoke({"query": query})
answer, sources = result["result"], result["source_documents"]
print(answer)
print(f"\n{sources}")