# RAG Retrieval System

## Imports

In [1]:
# System
import os
os.environ['USER_AGENT'] = 'JimYin88'

# LLM Models
from langchain_openai import ChatOpenAI

# Templates
from langchain_core.prompts import ChatPromptTemplate

# Document Loaders
from langchain_community.document_loaders import Docx2txtLoader
# from langchain_community.document_loaders import WebBaseLoader
# from langchain.document_loaders import BSHTMLLoader
# from langchain_community.document_loaders import UnstructuredRTFLoader

# Document Splitters
from langchain.text_splitter import TokenTextSplitter
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_openai import OpenAIEmbeddings

# Vector Stores
from langchain_chroma import Chroma
# from langchain_community.vectorstores.faiss import FAISS

# LangChain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# OutputParsers
from langchain.schema.output_parser import StrOutputParser

## Loading Environment Variables

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

## Instantiate LLM model

In [3]:
chat_model = ChatOpenAI(model="gpt-4o-mini-2024-07-18",
                        max_completion_tokens=1028,
                        temperature=0.2)

## Load documents

In [4]:
loader = Docx2txtLoader(".\\data\\JimYin-Resume.docx")

docs = loader.load()

## Split documents into chunks

In [5]:
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, 
#                                                chunk_overlap=20)

splitDocs = text_splitter.split_documents(docs)

## Embedding and Vector Store Documents

In [6]:
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [7]:
db = Chroma.from_documents(documents=splitDocs, 
                           embedding=embedding_function, 
                           collection_name="Resume",
                           persist_directory="./resume")

## ChromaDB methods

### Adding Documents

In [36]:
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"}
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"}
)

In [37]:
db.add_documents(documents=[document_1, document_2])

['7b1ea25c-e6d1-491f-aaac-a8a03fb34efa',
 '14b95f9c-4216-4377-9896-320b7fdfcb2c']

### Viewing Documents in Chroma DB

In [38]:
db.get()

{'ids': ['c3dd2869-4d97-4048-863d-51fcda6f2d67',
  '0b872d74-194a-49f7-bd4d-433caa6a2055',
  '67d5d8ba-8ac7-4cac-8963-77408889275a',
  '3d0a0127-0ec6-4ec2-96ac-0e3bff9e04bd',
  '7b1ea25c-e6d1-491f-aaac-a8a03fb34efa',
  '14b95f9c-4216-4377-9896-320b7fdfcb2c'],
 'embeddings': None,
 'documents': ["• \tCreated Excel pivot tables and Tableau dashboards to display summary results, enabling companies to make critical business decisions.\n\n\t\t\t\n\n\t\t\t8/2006 – 4/2014\tS&P Capital IQ\tNew York, NY\n\n\t\t\t\t\tIndustry Analyst\n\n\tCovered companies in computer hardware/software, IT services, video games, restaurant, and household durables industries. Developed detailed financial models, projecting future incomes and cash flows based on fundamental analysis, macroeconomics factors, meetings with management, and industry data. Conducted proprietary research, allowing me to make timely sector calls against prevailing market views.\n\n\n\nWrote research notes and stock reports. Published bia

In [43]:
db.get_by_ids(['7b1ea25c-e6d1-491f-aaac-a8a03fb34efa'])

[Document(id='7b1ea25c-e6d1-491f-aaac-a8a03fb34efa', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]

### Updating Documents in Chroma DB

In [44]:
updated_document_1 = Document(
    page_content="I had chocolate chip pancakes and fried eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

updated_document_2 = Document(
    page_content="The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.",
    metadata={"source": "news"},
    id=2,
)

db.update_document(document_id='7b1ea25c-e6d1-491f-aaac-a8a03fb34efa', document=updated_document_1)

# You can also update multiple documents at once
# vector_store.update_documents(ids=uuids[:2], documents=[updated_document_1, updated_document_2])

In [48]:
db.get()['documents'][-2]

'I had chocolate chip pancakes and fried eggs for breakfast this morning.'

### Deleting Documents from Chroma DB

In [49]:
db.delete(ids=['7b1ea25c-e6d1-491f-aaac-a8a03fb34efa',
               '14b95f9c-4216-4377-9896-320b7fdfcb2c'])

In [50]:
db.get()

{'ids': ['c3dd2869-4d97-4048-863d-51fcda6f2d67',
  '0b872d74-194a-49f7-bd4d-433caa6a2055',
  '67d5d8ba-8ac7-4cac-8963-77408889275a',
  '3d0a0127-0ec6-4ec2-96ac-0e3bff9e04bd'],
 'embeddings': None,
 'documents': ["• \tCreated Excel pivot tables and Tableau dashboards to display summary results, enabling companies to make critical business decisions.\n\n\t\t\t\n\n\t\t\t8/2006 – 4/2014\tS&P Capital IQ\tNew York, NY\n\n\t\t\t\t\tIndustry Analyst\n\n\tCovered companies in computer hardware/software, IT services, video games, restaurant, and household durables industries. Developed detailed financial models, projecting future incomes and cash flows based on fundamental analysis, macroeconomics factors, meetings with management, and industry data. Conducted proprietary research, allowing me to make timely sector calls against prevailing market views.\n\n\n\nWrote research notes and stock reports. Published biannual industry surveys on the computer data storage, restaurant, and household durab

### Loading an existing Chroma DB

In [51]:
db_new_connection = Chroma(collection_name="Resume",
                           embedding_function=embedding_function,
                           persist_directory="./resume")

## Retrieving Most Relevant Documents

In [8]:
question = "What was Jim C. Yin last job title?"

In [9]:
similar_docs = db.similarity_search(query = question, 
                                    k = 2)

In [10]:
similar_docs

[Document(id='67d5d8ba-8ac7-4cac-8963-77408889275a', metadata={'source': '.\\data\\JimYin-Resume.docx'}, page_content='Jim C. Yin\n\n101 Darling Ave.\n\nBloomfield, NJ 07003\n\n Cell (626) 675-1990 • jimyin88@gmail.com\n\n\n\n\n\nSUMMARY\tHighly skilled data scientist with extensive expertise in the financial markets, possessing more than 20 years of experience in data science and finance.\n\n\t\t\n\nEXPERIENCE\n\n\n\n\t\t\t1/2023 – 9/2024\tPrudential Financial, Inc.\tNewark, NJ\n\n\t\t\t\t\tSenior Data Scientist\n\n\t\t\t• \tDeveloped various software programs to assist clients in retirement planning, such as a Monte Carlo simulation that accurately projects their life savings and recommends optimal asset allocations based on factors including risk tolerance.\n\n\t\t\t• \tConducted thorough analyses of annuity purchase applications to identify and address potential issues, utilizing similarity searches on advisors with comparable characteristics. Contributed to cost savings for the co

## Injecting the documents into the prompt context window

In [11]:
prompt = ChatPromptTemplate.from_template("""
    Answer the user's question.
    Context: {context}
    Question: {input}
    """)

In [12]:
chain = prompt | chat_model | StrOutputParser()

In [13]:
response = chain.invoke({"context": similar_docs,
                         "input": question})

print(response)

Jim C. Yin's last job title was Senior Data Scientist at Prudential Financial, Inc.
