In [1]:
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader

DATA_PATH = "data"


def loadDocuments(data_path=None):
    """Load the PDF files found in a directory with ``PyPDFDirectoryLoader``.

    Args:
        data_path: Directory to read from. When omitted (``None``), the
            notebook-level ``DATA_PATH`` is used; it is looked up at call
            time, so calling with no argument behaves as before.

    Returns:
        The value returned by ``PyPDFDirectoryLoader(data_path).load()``.
    """
    if data_path is None:
        data_path = DATA_PATH
    loader = PyPDFDirectoryLoader(data_path)
    return loader.load()

In [2]:
# Load the PDFs from DATA_PATH using the helper defined in the first cell;
# `docs` is the input to the splitting step further down.
docs = loadDocuments()

In [3]:
# Number of loaded documents (the recorded output is 30).
len(docs)

30

## Split the text into chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
# Splitter configuration: chunks of at most 1000 units as measured by `len`,
# with 500 units shared between neighbouring chunks. `add_start_index=True`
# is what produces the `start_index` entry seen in the chunk metadata below.
textSplitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=500,
    chunk_size=1000,
)

In [6]:
# Split the loaded documents with the splitter configured above. Each chunk
# exposes `.metadata` and `.page_content`, which later cells read.
chunks = textSplitter.split_documents(docs)

In [7]:
# Preview the first chunk: its metadata dict, then its text.
for label, value in (
    ("metadata: \n", chunks[0].metadata),
    ("page_content: \n", chunks[0].page_content),
):
    print(label, value)

metadata: 
 {'source': 'data/uno.pdf', 'page': 0, 'start_index': 0}
page_content: 
 UNO
RULES
RULES .ORGUNOTHE OFFICIAL
THE RULES OF UNO ARE SIMPLE.
HOWEVER, WE HAVE MADE IT EVEN
EASIER TO SURVEY THE RULES AND
ADDED AN FAQ AT THE END!
 
CHECK OUT UNORULES.ORG FOR MORE!RULES
.ORGUNO
PRESENTS


In [8]:
# NOTE(review): this repeats the `len(docs)` check from cell In [3] and shows
# the same 30. Right after splitting, `len(chunks)` was probably the intended
# check -- confirm and update.
len(docs)

30

In [10]:
import chromadb

# Chroma client that owns the collection created further down.
# NOTE(review): presumably `chromadb.Client()` is an in-memory client, so the
# embeddings would be rebuilt after every kernel restart -- confirm, and
# consider a persistent client if that cost matters.
chroma_client = chromadb.Client()

## Load API Key

In [11]:
from dotenv import load_dotenv
# Presumably reads a local .env file into the process environment so that
# OPENAI_API_KEY can be fetched from os.environ in the next cell (the recorded
# output is True).
load_dotenv()

True

In [12]:
import os

# Fetch the OpenAI key from the process environment; the value is None when
# the variable is not set.
API_KEY = os.getenv('OPENAI_API_KEY')

In [13]:
import os
from langchain_openai import OpenAIEmbeddings
from chromadb.utils import embedding_functions
import chromadb.utils.embedding_functions as embedding_functions

# Embedding function that Chroma calls for stored documents and for query
# texts; it is given the OpenAI key read from the environment earlier.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002",
    api_key=API_KEY,
)

# Reuse the collection when the client already has one with this name,
# otherwise create it.
collection = chroma_client.get_or_create_collection(
    embedding_function=openai_ef,
    name="my_collection",
)

In [14]:
# Prepare the data for insertion: three parallel lists with one entry per
# chunk, in chunk order.
documents = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]

# Sequential integer ids 0..n-1 (same values the previous running counter
# produced); they are converted to strings right before insertion.
ids = list(range(len(documents)))

In [15]:
# Sanity check on the element type of `documents` (the recorded output is str).
type(documents[0])

str

In [16]:
# The ids are passed to Chroma as strings (the query output shows them as
# '65', '27', ...), so convert the integer ids first. A comprehension with its
# own variable name avoids rebinding the builtin `id` in the notebook
# namespace.
ids_str = [str(chunk_id) for chunk_id in ids]

# Insert the data into the collection
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids_str,
)

In [45]:
# Example question used by the retrieval and answer-generation cells below.
query_txt = "ice cream"

In [46]:
# Ask the collection for 5 results for the query text. In the recorded output
# the result is a dict with 'ids', 'distances', 'metadatas' and 'documents',
# each holding one inner list per query text.
results = collection.query(query_texts=[query_txt], n_results=5)

In [47]:
# Display the raw query result for inspection.
results

{'ids': [['65', '27', '29', '30', '64']],
 'distances': [[0.5465304851531982,
   0.5557506084442139,
   0.5559098720550537,
   0.5587649941444397,
   0.5590863227844238]],
 'metadatas': [[{'page': 7, 'source': 'data/risk.pdf', 'start_index': 1592},
   {'page': 2, 'source': 'data/monopoly.pdf', 'start_index': 524},
   {'page': 2, 'source': 'data/monopoly.pdf', 'start_index': 1560},
   {'page': 2, 'source': 'data/monopoly.pdf', 'start_index': 2037},
   {'page': 7, 'source': 'data/risk.pdf', 'start_index': 1110}]],
 'embeddings': None,
 'documents': [['own, or connected to it by a dashed line. Examples: Greenland mayattack the Northwest Territory, Ontario, Quebec and Iceland. North\nAfrica may attack Egypt, Western Europe and Brazil. At the western and\neastern edges of the board, Alaska is considered adjacent to, and mayattack, Kamchatka.\nYou must always have at least two armies in the territory you ’re\nattacking from.\nYou may continue attacking one territory until you have eliminated

In [48]:
# Largest distance at which the best match is still treated as relevant.
MAX_RELEVANT_DISTANCE = 0.7

# `results` is a dict, so len(results) counts its keys and is never 0. Look at
# the hits for the first (only) query text instead. The recorded output lists
# distances in ascending order, so index 0 is the closest match; indexing [1]
# tested the second-best hit and raised IndexError with a single hit.
top_distances = (results.get('distances') or [[]])[0]
if not top_distances or top_distances[0] > MAX_RELEVANT_DISTANCE:
    print("no relevant results found")

In [49]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [50]:
# Chat model that answers the question from the retrieved context (it is
# invoked in the next cell).
model = ChatOpenAI(model="gpt-3.5-turbo")

In [51]:
# Prompt for the chat model: the retrieved context first, then the question.
# Fixed the doubled "only" and "based in" from the earlier wording.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
---
Answer the question based on the above context: {query}
"""

# results['documents'] holds one list of chunk texts per query text; join the
# chunks for the single query with a visible separator.
context_text = "\n\n--\n\n".join(results['documents'][0])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# format_messages() keeps the prompt as chat messages. format() would flatten
# it to one string with a "Human: " role prefix that is then sent as content.
prompt = prompt_template.format_messages(context=context_text, query=query_txt)

# Despite the name, this is the model's message object; the answer text is
# its `.content`.
response_text = model.invoke(prompt)

print("###########-BOT-############")
print(response_text.content)

###########-BOT-############
There is no information in the provided context about ice cream.


In [43]:
from termcolor import colored

# Quick check of termcolor output: the recorded output shows the ANSI colour
# codes wrapped around each word.
print(colored('hello', 'red'), colored('world', 'green'))

[31mhello[0m [32mworld[0m


In [44]:
# Read the question interactively, using a coloured prompt.
# NOTE(review): the execution counter (In [44]) shows this ran before the cell
# that hard-codes query_txt (In [45]). In the visible part of the notebook it
# is the last cell, so on a top-to-bottom run nothing uses the typed value. It
# also blocks an unattended "Run All" -- confirm where it belongs.
query_txt = input(colored("Enter Prompt: ", "blue", "on_white", attrs=['reverse', 'blink']))