# Rag From Scratch: Overview

### The process of building RAG app(s) from scratch.

## Enviornment

In [None]:
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain pypdf sentence-transformers langchainhub

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loading the PDF file using PyPDFLoader
loder = PyPDFLoader(r'c:\Users\imran\Downloads\BAHRAIN CBB Rulebook Appendix - 2 ESG Reporting Frameworks and Global Initiatives.pdf')
pages = loder.load()
# pages

# Combining all the pages into a single text
document_text = "\n".join([page.page_content for page in pages])
document_text


' Central Bank of Bahrain  \nRulebook  Common Volume  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nENVIRONMENTAL, SOCIAL AND \nGOVERNANCE REQUIREMENTS \nMODULE  \n \n \n \n \n \n \n \n \n \n \n Central Bank of Bahrain  \nRulebook  Common Volume   \n \n \n MODULE:  ESG  (Environmental, Social and Governance  Requ irements )  \n                                                Table of Contents  \n \n \n \n Date \nLast \nChanged     \nESG -A Introduction   \n ESG -A.1 Introduction and Scope  XX/2023  \n ESG -A.2 Module History  XX/2023  \n    \nESG -1 Reporting Requirements    \n ESG -1.1 ESG Key Performance Indicators  XX/2023  \n ESG -1.2 ESG Reporting Process  XX/2023  \n    \n    \n    \nAPPENDICES     \nAppendix 1  ESG KPIs and Guidance   \nAppendix 2 ESG Reporting Frameworks and Global Initiatives   \n    \n \n \n \n Central Bank of Bahrain  \nRulebook  Common Volume   \n \n \n MODULE  ESG :  Environmental, Social and Governance  Requirements  \nCHAPTER  ESG -A:   Introduction   \n

In [3]:

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(document_text)
texts

['Central Bank of Bahrain  \nRulebook  Common Volume  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nENVIRONMENTAL, SOCIAL AND \nGOVERNANCE REQUIREMENTS \nMODULE  \n \n \n \n \n \n \n \n \n \n \n Central Bank of Bahrain  \nRulebook  Common Volume   \n \n \n MODULE:  ESG  (Environmental, Social and Governance  Requ irements )  \n                                                Table of Contents  \n \n \n \n Date \nLast \nChanged     \nESG -A Introduction   \n ESG -A.1 Introduction and Scope  XX/2023  \n ESG -A.2 Module History  XX/2023  \n    \nESG -1 Reporting Requirements    \n ESG -1.1 ESG Key Performance Indicators  XX/2023  \n ESG -1.2 ESG Reporting Process  XX/2023  \n    \n    \n    \nAPPENDICES     \nAppendix 1  ESG KPIs and Guidance   \nAppendix 2 ESG Reporting Frameworks and Global Initiatives   \n    \n \n \n \n Central Bank of Bahrain  \nRulebook  Common Volume   \n \n \n MODULE  ESG :  Environmental, Social and Governance  Requirements  \nCHAPTER  ESG -A:   Introduction   \n

In [4]:
embeddings = HuggingFaceEmbeddings()
db = Chroma.from_texts(texts, embeddings)
db

  warn_deprecated(


<langchain_community.vectorstores.chroma.Chroma at 0x1f8c4595390>

In [14]:
retriever = db.as_retriever(search_kwargs={"k": 1})


In [15]:
docs = retriever.get_relevant_documents("What is ESG?")

In [16]:
len(docs)

1

In [17]:
# from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))])

In [18]:
model_id = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=100,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

llm = HuggingFacePipeline(pipeline=pipe)
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x000001F8C92FBA10>)

In [19]:
# Chain
chain = prompt | llm

In [20]:
# Run
chain.invoke({"context":docs,"question":"What is ESG?"})

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'Human: Answer the question based only on the following context:\n[Document(page_content=\'Central Bank of Bahrain  \\nRulebook  Common Volume  \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nENVIRONMENTAL, SOCIAL AND \\nGOVERNANCE REQUIREMENTS \\nMODULE  \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Central Bank of Bahrain  \\nRulebook  Common Volume   \\n \\n \\n MODULE:  ESG  (Environmental, Social and Governance  Requ irements )  \\n                                                Table of Contents  \\n \\n \\n \\n Date \\nLast \\nChanged     \\nESG -A Introduction   \\n ESG -A.1 Introduction and Scope  XX/2023  \\n ESG -A.2 Module History  XX/2023  \\n    \\nESG -1 Reporting Requirements    \\n ESG -1.1 ESG Key Performance Indicators  XX/2023  \\n ESG -1.2 ESG Reporting Process  XX/2023  \\n    \\n    \\n    \\nAPPENDICES     \\nAppendix 1  ESG KPIs and Guidance   \\nAppendix 2 ESG Reporting Frameworks and Global Initiatives   \\n    \\n \\n \\n \\n Central Bank of Bah

In [24]:
from langchain import hub
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [25]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")



'Human: Answer the question based only on the following context:\n[Document(page_content=\'Central Bank of Bahrain  \\nRulebook  Common Volume  \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nENVIRONMENTAL, SOCIAL AND \\nGOVERNANCE REQUIREMENTS \\nMODULE  \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Central Bank of Bahrain  \\nRulebook  Common Volume   \\n \\n \\n MODULE:  ESG  (Environmental, Social and Governance  Requ irements )  \\n                                                Table of Contents  \\n \\n \\n \\n Date \\nLast \\nChanged     \\nESG -A Introduction   \\n ESG -A.1 Introduction and Scope  XX/2023  \\n ESG -A.2 Module History  XX/2023  \\n    \\nESG -1 Reporting Requirements    \\n ESG -1.1 ESG Key Performance Indicators  XX/2023  \\n ESG -1.2 ESG Reporting Process  XX/2023  \\n    \\n    \\n    \\nAPPENDICES     \\nAppendix 1  ESG KPIs and Guidance   \\nAppendix 2 ESG Reporting Frameworks and Global Initiatives   \\n    \\n \\n \\n \\n Central Bank of Bah

In [5]:
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100
)

llm = HuggingFacePipeline(pipeline=pipe)
llm

  warn_deprecated(


HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x000001F8C4488CD0>)