# Installing dependencies

In [14]:
# One-time setup: uncomment the lines below to install the packages this
# notebook imports (`-q` keeps pip's output quiet). They are left commented
# out so that re-running the notebook does not reinstall anything.
# !pip install langchain -q
# !pip install openai -q
# !pip install PyPDF2 -q
# !pip install faiss-cpu -q
# !pip install tiktoken -q
# !pip install python-dotenv -q

# API keys

In [None]:
# Read key/value pairs from a local `.env` file into the process environment,
# so credentials do not have to be written into the notebook itself.
# NOTE(review): presumably this provides the OpenAI API key used by the
# embedding and LLM cells further down — confirm the expected variable name.
from dotenv import load_dotenv

load_dotenv()

# PDF Reading

In [2]:
from PyPDF2 import PdfReader

In [4]:
# Directory that holds the source documents. The trailing slash matters:
# the full path is built by concatenating SOURCES and the file name directly.
SOURCES = "./sources/"
# File name (inside SOURCES) of the PDF that gets read and indexed.
PRINCIPLES = "NHRC-PDF-Principles_Of_Harm_Reduction.pdf"

In [5]:
# Build the PDF's path from the directory prefix and file name, then open it.
principles_path = f"{SOURCES}{PRINCIPLES}"
pdf_reader = PdfReader(principles_path)

In [None]:
# Collect the extracted text of every page, skipping pages for which
# extract_text() returns nothing (e.g. image-only pages).
page_texts = [
    page_text
    for page_text in (page.extract_text() for page in pdf_reader.pages)
    if page_text
]

# Join pages with a newline rather than concatenating them directly: without a
# separator the last word of one page is glued to the first word of the next,
# which corrupts tokens before the text is chunked and embedded.
principles_text = "\n".join(page_texts)

# Fail early with a clear message: an empty document would otherwise only
# surface later as an obscure error when the (empty) chunk list is indexed.
if not principles_text.strip():
    raise ValueError(
        "No text could be extracted from the PDF; "
        "it may be scanned or image-only and require OCR."
    )

# Chunking

In [9]:
from langchain.text_splitter import  RecursiveCharacterTextSplitter

In [10]:
# Chunking parameters. Lengths are measured with `len`, i.e. in characters.
CHUNK_SIZE = 512    # upper bound on the length of a single chunk
CHUNK_OVERLAP = 32  # characters shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Break the extracted document text into a list of overlapping chunks.
principles_chunks = text_splitter.split_text(principles_text)

# Vector DB

In [16]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [None]:
# Embedding model used to turn each text chunk into a vector.
embeddings = OpenAIEmbeddings()

# Vector store: embed every chunk and index the vectors in FAISS so they can
# be searched by similarity.
docsearch = FAISS.from_texts(
    texts=principles_chunks,
    embedding=embeddings,
)

# Chain

In [17]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Question-answering chain of type "stuff", backed by an OpenAI LLM
# created with its default settings.
llm = OpenAI()
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Question to ask about the indexed document.
query = "who are the authors of the article?"

# Retrieve the chunks most similar to the question from the vector store.
docs = docsearch.similarity_search(query=query)

# Run the QA chain over the retrieved chunks; the answer is the cell's output.
chain.run(
    input_documents=docs,
    question=query,
)