# RAG Retrieval System

## Imports

In [1]:
# System
import os
os.environ['USER_AGENT'] = 'JimYin88'

# LLM Models
from langchain_openai import ChatOpenAI
from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama

# Templates
from langchain_core.prompts import ChatPromptTemplate

# Document Loaders
from langchain_community.document_loaders import Docx2txtLoader
# from langchain_community.document_loaders import WebBaseLoader
# from langchain.document_loaders import BSHTMLLoader
# from langchain_community.document_loaders import UnstructuredRTFLoader

# Document Splitters
from langchain.text_splitter import TokenTextSplitter
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_openai import OpenAIEmbeddings

# Vector Stores
from langchain_chroma import Chroma
# from langchain_community.vectorstores.faiss import FAISS

# LangChain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# OutputParsers
from langchain.schema.output_parser import StrOutputParser

## Loading Environment Variables

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

## Load documents

In [3]:
loader = Docx2txtLoader(".\\data\\JimYin-Resume.docx")

docs = loader.load()

## Split documents into chunks

In [4]:
text_splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=20)
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)

splitDocs = text_splitter.split_documents(docs)

## Embedding and Vector Store Documents

In [5]:
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [6]:
db = Chroma.from_documents(documents=splitDocs, 
                           embedding=embedding_function, 
                           collection_name="Resume",
                           persist_directory="./resume")

## ChromaDB methods

### Adding Documents

In [7]:
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"}
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"}
)

In [8]:
db.add_documents(documents=[document_1, document_2])

['8ec35247-d014-4403-a37f-53080ae0c268',
 '6cbc4b0c-5250-44d1-a946-6867002c9f46']

### Viewing Documents in Chroma DB

In [9]:
db.get()

{'ids': ['3cf3dd25-ac8c-4571-ba45-58cd8b42920d',
  'fc383c02-df17-4d79-9e76-a2f0d32c2bae',
  'f647d286-bff9-416d-bdac-1a67934c6eba',
  '124caf9e-6a81-41f3-8dd9-dbc265babdec',
  '8ec35247-d014-4403-a37f-53080ae0c268',
  '6cbc4b0c-5250-44d1-a946-6867002c9f46'],
 'embeddings': None,
 'documents': ['Jim C. Yin\n\n101 Darling Ave.\n\nBloomfield, NJ 07003\n\n Cell (626) 675-1990 • jimyin88@gmail.com\n\n\n\n\n\nSUMMARY\tHighly skilled data scientist with extensive expertise in the financial markets, possessing more than 20 years of experience in data science and finance.\n\n\t\t\n\nEXPERIENCE\n\n\n\n\t\t\t1/2023 – 9/2024\tPrudential Financial, Inc.\tNewark, NJ\n\n\t\t\t\t\tSenior Data Scientist\n\n\t\t\t• \tDeveloped various software programs to assist clients in retirement planning, such as a Monte Carlo simulation that accurately projects their life savings and recommends optimal asset allocations based on factors including risk tolerance.\n\n\t\t\t• \tConducted thorough analyses of annuity

In [11]:
db.get_by_ids(['8ec35247-d014-4403-a37f-53080ae0c268'])

[Document(id='8ec35247-d014-4403-a37f-53080ae0c268', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]

### Updating Documents in Chroma DB

In [12]:
updated_document_1 = Document(
    page_content="I had chocolate chip pancakes and fried eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

updated_document_2 = Document(
    page_content="The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.",
    metadata={"source": "news"},
    id=2,
)

db.update_document(document_id='8ec35247-d014-4403-a37f-53080ae0c268', document=updated_document_1)

# You can also update multiple documents at once
# vector_store.update_documents(ids=uuids[:2], documents=[updated_document_1, updated_document_2])

In [13]:
db.get()['documents'][-2]

'I had chocolate chip pancakes and fried eggs for breakfast this morning.'

### Deleting Documents from Chroma DB

In [14]:
db.delete(ids=['8ec35247-d014-4403-a37f-53080ae0c268',
               '6cbc4b0c-5250-44d1-a946-6867002c9f46'])

In [15]:
db.get()

{'ids': ['3cf3dd25-ac8c-4571-ba45-58cd8b42920d',
  'fc383c02-df17-4d79-9e76-a2f0d32c2bae',
  'f647d286-bff9-416d-bdac-1a67934c6eba',
  '124caf9e-6a81-41f3-8dd9-dbc265babdec'],
 'embeddings': None,
 'documents': ['Jim C. Yin\n\n101 Darling Ave.\n\nBloomfield, NJ 07003\n\n Cell (626) 675-1990 • jimyin88@gmail.com\n\n\n\n\n\nSUMMARY\tHighly skilled data scientist with extensive expertise in the financial markets, possessing more than 20 years of experience in data science and finance.\n\n\t\t\n\nEXPERIENCE\n\n\n\n\t\t\t1/2023 – 9/2024\tPrudential Financial, Inc.\tNewark, NJ\n\n\t\t\t\t\tSenior Data Scientist\n\n\t\t\t• \tDeveloped various software programs to assist clients in retirement planning, such as a Monte Carlo simulation that accurately projects their life savings and recommends optimal asset allocations based on factors including risk tolerance.\n\n\t\t\t• \tConducted thorough analyses of annuity purchase applications to identify and address potential issues, utilizing similarit

### Loading and Connecting an existing Chroma DB

In [16]:
db_new_connection = Chroma(collection_name="SEC_filings",
                           embedding_function=embedding_function,
                           persist_directory="./sec_filings")

## Retrieving Most Relevant Documents

In [17]:
question = "What was Jim C. Yin last job title?"

In [18]:
similar_docs = db.similarity_search(query = question, 
                                    k = 2)

In [19]:
similar_docs

[Document(id='3cf3dd25-ac8c-4571-ba45-58cd8b42920d', metadata={'source': '.\\data\\JimYin-Resume.docx'}, page_content='Jim C. Yin\n\n101 Darling Ave.\n\nBloomfield, NJ 07003\n\n Cell (626) 675-1990 • jimyin88@gmail.com\n\n\n\n\n\nSUMMARY\tHighly skilled data scientist with extensive expertise in the financial markets, possessing more than 20 years of experience in data science and finance.\n\n\t\t\n\nEXPERIENCE\n\n\n\n\t\t\t1/2023 – 9/2024\tPrudential Financial, Inc.\tNewark, NJ\n\n\t\t\t\t\tSenior Data Scientist\n\n\t\t\t• \tDeveloped various software programs to assist clients in retirement planning, such as a Monte Carlo simulation that accurately projects their life savings and recommends optimal asset allocations based on factors including risk tolerance.\n\n\t\t\t• \tConducted thorough analyses of annuity purchase applications to identify and address potential issues, utilizing similarity searches on advisors with comparable characteristics. Contributed to cost savings for the co

## Injecting the documents into the prompt context window

In [21]:
prompt = ChatPromptTemplate.from_template("""
    Answer the user's question.
    Context: {context}
    Question: {input}
    """)

## Instantiate LLM model and Running the Chain

### Using ChatGPT 4o-mini

In [20]:
chat_model = ChatOpenAI(model="gpt-4o-mini-2024-07-18",
                        max_completion_tokens=1028,
                        temperature=0.2)

In [22]:
chain = prompt | chat_model | StrOutputParser()

In [23]:
response1 = chain.invoke({"context": similar_docs,
                          "input": question})

print(response1)

Jim C. Yin's last job title was Senior Data Scientist at Prudential Financial, Inc.


### Using llama3.2

In [25]:
ollama_model = OllamaLLM(model="llama3.2", 
               num_ctx = 4096, 
               num_predict = 256,
               temperature = 0.7,
               system = 'You are a helpful assistant.')

In [28]:
ollama_chain = prompt | ollama_model | StrOutputParser()

In [29]:
response2 = ollama_chain.invoke({"context": similar_docs,
                                 "input": question})

print(response2)

Jim C. Yin's last job title was Senior Data Scientist at Prudential Financial, Inc., a position he held from January 2023 to September 2024.


## Gradio Interface

In [30]:
def rag_retrieval_question(question):

    db_new_connection = Chroma(collection_name="Resume",
                           embedding_function=embedding_function,
                           persist_directory="./resume")

    similar_docs = db.similarity_search(query = question,
                                        k = 2)

    prompt = ChatPromptTemplate.from_template("""
    Answer the user's question.
    Context: {context}
    Question: {input}
    """)

    ollama_model = OllamaLLM(model="llama3.2",
                             num_ctx = 4096, 
                             num_predict = 256,
                             temperature = 0.7,
                             system = 'You are a helpful assistant.')

    ollama_chain = prompt | ollama_model | StrOutputParser()

    response2 = ollama_chain.invoke({"context": similar_docs,
                                 "input": question})

    return response2

In [32]:
print(rag_retrieval_question("What was Jim Yin's last job title?"))

Jim Yin's last job title mentioned in his resume is Senior Data Scientist at Prudential Financial, Inc. from 1/2023 to 9/2024.


In [33]:
import gradio as gr

In [35]:
interface = gr.Interface(fn = rag_retrieval_question,
                         inputs=[gr.Text(label="Question")],
                         outputs=[gr.Text(label="Answer")],
                         flagging_mode="never"
                        )

In [36]:
interface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://3d6fd19a0d9491ccc7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


