In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.callbacks import StdOutCallbackHandler ## By using this line we can see how langchain works

In [2]:
# Chat model name handed to ChatOpenAI when the LLM is created.
MODEL = "gpt-4o-mini"
# Directory passed to Chroma as persist_directory for the vector store.
db_name = "vector_db"

In [3]:
# Read variables from a local .env file; override=True lets values from the
# file replace variables that are already set in the environment.
load_dotenv(override=True)

openai_api_key = os.getenv('OPENAI_API_KEY')
# Report presence only: cell output is saved with the notebook, so no part of
# the key is echoed.
if openai_api_key:
    print("OpenAI API Key exists")
else:
    print("OpenAI API Key not set")

OpenAI API Key exists and begins sk-proj-


#### Read in documents using LangChain's loaders

In [5]:
## We can use this code to read markdown files
## NOTE: the PDF cell below also rebinds `documents`, so whichever loader cell
## runs last decides what gets split and indexed.
# glob order depends on the file system, so sort for reproducible runs; keep
# directories only, because DirectoryLoader expects a directory path.
folders = sorted(path for path in glob.glob("heathprocact_docs/*") if os.path.isdir(path))
text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    # The sub-folder name doubles as the document-type tag.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [4]:
## We can use this code to read pdf files
## NOTE: this cell starts from an empty `documents` list, so anything loaded
## by the markdown cell above is replaced, not extended.
# glob order depends on the file system, so sort for reproducible runs; keep
# directories only, because DirectoryLoader expects a directory path.
folders = sorted(path for path in glob.glob("heathprocact_docs/*") if os.path.isdir(path))
# Not passed to the PDF loader below; kept so the name stays defined.
text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    # The sub-folder name doubles as the document-type tag.
    doc_type = os.path.basename(folder)

    # Load PDF files
    pdf_loader = DirectoryLoader(
        folder,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader
    )
    pdf_docs = pdf_loader.load()
    for doc in pdf_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [5]:
# Split the loaded documents into chunks (chunk_size 1000, chunk_overlap 200).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [6]:
# Number of chunks produced by the splitter.
len(chunks)

21

In [7]:
# Collect the distinct doc_type tags carried by the chunks' metadata.
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
print(f"Document types found: {', '.join(doc_types)}")

Document types found: References


In [8]:
# Embedding client; handed to Chroma in the cells below.
embeddings = OpenAIEmbeddings()

In [9]:
## If a persisted store is already on disk, drop its collection first
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

### Create vectorstore

In [10]:
# Build the Chroma store from the chunks and persist it under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 21 documents


In [11]:
## Get one vector and find how many dimensions it has
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


### Chat Application (RAG pipeline with LangChain)

In [12]:
## create a new Chat with OpenAI (model)
llm = ChatOpenAI(temperature = 0.7, 
                 model_name = MODEL)

## set up the conversation memory for the chat (Memory)
memory = ConversationBufferMemory(
    memory_key = 'chat_history', 
    return_messages = True)

## the retriever is an abstraction over the VectorStore that will be used during RAG (Retreiver)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm = llm, 
    retriever = retriever, 
    memory = memory,
    callbacks = [StdOutCallbackHandler()]) ## By using this line, we can see how langchain works

  memory = ConversationBufferMemory(


In [13]:
## Try the chain with a single question and print the answer it returns
query = "Can you describe Heathprocact in a few sentences"
result = conversation_chain.invoke({"question": query})
print(result["answer"])



[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
20 
 
Hasekamp, O.; Lorente, A.; Hu, H.; Butz, A.; aan de Brugh, J. ; Landgraf, J.: Algorithm Theoretical Baseline Document for 
Sentinel-5 Precursor Methane Retrieval, Sron-S5P-Lev2-Rp-001, (v1.10), 1–67, 2019. 
Horel, J.; Splitt, M.; Dunn, L.; Pechmann, J.; White, B.; Ciliberti, C.; Lazarus, S.; Slemmer, J.; Zaff, D.; Burks, J. MesoWest: 
Cooperative Mesonets in the Western United States. Bull. Am. Met eorol. Soc. , 83, 211 – 225,  DOI: 10.1175/1520 -
0477(2002)083<0211:MCMITW>2.3.CO;2, 2002. 485 
Hu, H.; Landgraf, J.; Detmers, R.; Borsdorff, T.; Aan de Brugh, J.; Aben, I.; Butz, A.; Hasekamp, O.: Toward glob

In [14]:
def chat(message, history):
    """Answer one chat message through the conversational retrieval chain.

    Args:
        message: The user's latest message; sent to the chain as "question".
        history: Accepted as the second positional argument but not read
            here; the chain is invoked with the message only.

    Returns:
        The value stored under the "answer" key of the chain's result.
    """
    response = conversation_chain.invoke({"question": message})
    return response["answer"]

In [15]:
# Wrap chat() in a Gradio chat UI and launch it, opening a browser tab.
view = gr.ChatInterface(
    chat,
    type="messages",
).launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Can you describe Heathprocact in a few sentences
Assistant: I don't know.
Follow Up Input: what this research is about
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
18 
 
Author contribution 
D.J.V., D.J., J.M., I.S., and D.J.J. contributed to study conceptualization. D.J.V., D.J., J.M., I.S., and D.G. contributed to 435 
methods development and data analysis.

### If the RAG pipeline does not give correct answers, we can try the following

* Change the chunk size and the chunk overlap
* Instead of passing individual chunks as context, pass the full document
* Set the number of chunks retrieved by the RAG pipeline, as follows (recommended)