# RAG Retrieval System

## Imports

In [1]:
import os
os.environ['USER_AGENT'] = 'JimYin88'

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_community.document_loaders import WebBaseLoader
# from langchain.document_loaders import BSHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_chroma import Chroma
# from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import Docx2txtLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# from langchain_community.document_loaders import UnstructuredRTFLoader
from langchain.schema.output_parser import StrOutputParser

## Loading Environment Variables

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

## Instantiate LLM model

In [3]:
chat_model = ChatOpenAI(model="gpt-4o-mini-2024-07-18",
                        max_completion_tokens=1028,
                        temperature=0.2)

## Load documents

In [4]:
loader = Docx2txtLoader(".\\data\\JimYin-Resume.docx")

docs = loader.load()

## Split documents into chunks

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, 
                                               chunk_overlap=20)

splitDocs = text_splitter.split_documents(docs)

In [6]:
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [7]:
db = Chroma.from_documents(documents=splitDocs, 
                           embedding=embedding_function, 
                           collection_name="Resume",
                           persist_directory="./resume")

In [8]:
question = "What was Jim C. Yin last job title?"

In [9]:
similar_docs = db.similarity_search(query = question, 
                                    k = 2)

In [10]:
similar_docs

[Document(id='fb8c3f41-7e1a-4eed-aa0e-a5910d40389f', metadata={'source': '.\\data\\JimYin-Resume.docx'}, page_content='Jim C. Yin\n\n101 Darling Ave.\n\nBloomfield, NJ 07003\n\n Cell (626) 675-1990 • jimyin88@gmail.com\n\n\n\n\n\nSUMMARY\tHighly skilled data scientist with extensive expertise in the financial markets, possessing more than 20 years of experience in data science and finance.\n\n\t\t\n\nEXPERIENCE\n\n\n\n\t\t\t1/2023 – 9/2024\tPrudential Financial, Inc.\tNewark, NJ\n\n\t\t\t\t\tSenior Data Scientist\n\n\t\t\t• \tDeveloped various software programs to assist clients in retirement planning, such as a Monte Carlo simulation that accurately projects their life savings and recommends optimal asset allocations based on factors including risk tolerance.\n\n\t\t\t• \tConducted thorough analyses of annuity purchase applications to identify and address potential issues, utilizing similarity searches on advisors with comparable characteristics. Contributed to cost savings for the co

In [11]:
prompt = ChatPromptTemplate.from_template("""
    Answer the user's question.
    Context: {context}
    Question: {input}
    """)

In [12]:
chain = prompt | chat_model | StrOutputParser()

In [13]:
response = chain.invoke({"context": similar_docs,
                         "input": question})

print(response)

Jim C. Yin's last job title was Senior Data Scientist at Prudential Financial, Inc.
