In [2]:
!pip install pypdf
!pip install langchain_community
!pip install chromadb
!pip install rank_bm25
!pip install bitsandbytes
!pip install accelerate


Collecting chromadb
  Downloading chromadb-0.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.chains import RetrievalQA


In [4]:
# Absolute path of the PDF that is loaded, split and indexed by the cells below.
doc_path = "/content/rag_for_nlp.pdf"

In [5]:
# Read the PDF at `doc_path` into LangChain documents.
loader = PyPDFLoader(doc_path)
docs = loader.load()


In [6]:
# Text splitter configured with chunk_size=200 and chunk_overlap=30.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
)

In [7]:
# Split the loaded PDF documents into chunks; these are indexed by both the
# vector store and the BM25 retriever further down.
chunks = splitter.split_documents(docs)

In [8]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
chunks[:5]

[Document(metadata={'source': '/content/rag_for_nlp.pdf', 'page': 0}, page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,'),
 Document(metadata={'source': '/content/rag_for_nlp.pdf', 'page': 0}, page_content='Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research; ‡University College London; ⋆New York University;\nplewis@fb.com\nAbstract'),
 Document(metadata={'source': '/content/rag_for_nlp.pdf', 'page': 0}, page_content='plewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-'),
 Document(metadata={'source': '/content/rag_for_nlp.pdf', 'page': 0}, page_content='stream NLP tasks. However, their ability to access and precisely manipulate knowl-\nedge is still limit

In [9]:
import os
from getpass import getpass

# SECURITY: never store the Hugging Face token in the notebook itself. A token
# that was ever committed here must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise the user is asked for it through a hidden (non-echoing) prompt.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

In [10]:
# Embedding wrapper configured with the Hugging Face token and the
# "BAAI/bge-base-en-v1.5" model name; used to build the vector store.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN,
    model_name="BAAI/bge-base-en-v1.5",
)

In [11]:
# Build a Chroma vector store from the chunks using the embedding wrapper.
vectorstore = Chroma.from_documents(chunks, embeddings)

In [12]:
# Dense (embedding-based) retriever over the Chroma store, configured with k=3.
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [13]:
# Show the retriever's repr to confirm its configuration (search_kwargs).
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7db3d4035240>, search_kwargs={'k': 3})

In [14]:
# Sparse (keyword) retriever: BM25 built over the same chunks as the vector store.
keyword_retriever = BM25Retriever.from_documents(chunks)

In [15]:
# Set k to 3 on the keyword retriever, the same value used for the vector retriever.
keyword_retriever.k = 3

In [16]:
# Hybrid retriever: combines the vector-store retriever (weight 0.3) with the
# BM25 keyword retriever (weight 0.7); weights are listed in retriever order.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retreiver, keyword_retriever],
    weights=[0.3, 0.7],
)

# Mixing vector search and keyword search for Hybrid search

## hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [17]:
# Hugging Face model id; passed to both initialize_tokenizer() and
# load_quantized_model() below.
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [18]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

In [19]:
# Loader for a causal language model quantized to 4 bits
def load_quantized_model(model_name: str):
    """
    Load a causal LM with a 4-bit bitsandbytes quantization config.

    The quantization config enables 4-bit loading with the "nf4" quant type,
    double quantization, and bfloat16 as the compute dtype.

    model_name: Name or path of the model to be loaded.
    return: The model returned by AutoModelForCausalLM.from_pretrained.
    """
    quant_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_settings,
        torch_dtype=torch.bfloat16,
    )

In [20]:
# Tokenizer factory
def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer that matches the given model.

    The tokenizer is loaded with return_token_type_ids=False and its
    beginning-of-sentence token id is then set to 1.

    model_name: Name or path of the model for tokenizer initialization.
    return: Initialized tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(
        model_name,
        return_token_type_ids=False,
    )
    # Hard-coded BOS id -- NOTE(review): presumably correct for the model used
    # in this notebook; confirm before reusing with another model family.
    tok.bos_token_id = 1
    return tok

In [21]:
# Instantiate the tokenizer for `model_name`.
tokenizer = initialize_tokenizer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [22]:
# Load the 4-bit quantized model for `model_name`; the output below shows this
# fetches eight checkpoint shards on first use.
model = load_quantized_model(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [23]:
import transformers

# Build the text-generation pipeline around the already-loaded model and
# tokenizer.
#
# The factory is called as `transformers.pipeline` instead of through the bare
# `pipeline` name: this cell rebinds `pipeline` to the object it creates, so the
# original `pipeline = pipeline(...)` only worked on the first execution and
# broke when the cell was run again. The variable name itself is kept because a
# later cell passes `pipeline=pipeline` to HuggingFacePipeline.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,  # upper bound on sequence length (prompt + generated tokens)
    do_sample=True,  # sampling is enabled, so answers can differ between runs
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

Device set to use cuda:0


In [24]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be
# passed as `llm` to the RetrievalQA chains below.
llm = HuggingFacePipeline(pipeline=pipeline)

  llm = HuggingFacePipeline(pipeline=pipeline)


In [25]:
# Baseline QA chain: "stuff" chain type over the vector-store retriever only.
normal_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_retreiver,
)

In [26]:
# Hybrid QA chain: same LLM and "stuff" chain type, but fed by the ensemble
# (vector + BM25) retriever.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ensemble_retriever,
)

In [27]:
# Ask the vector-only chain; the returned dict has "query" and "result" keys
# (see the displayed output below).
response1 = normal_chain.invoke("What is Jeopardy Question Generation?")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [28]:
# Inspect the raw response dict from the vector-only chain.
response1

{'query': 'What is Jeopardy Question Generation?',
 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nQuestion Answering:\nAnswer GenerationRetriever pη \n(Non-Parametric) \nz 4 \nz 3 \nz 2 \nz 1 \nd(z) \nJeopardy Question\nGeneration:\nAnswer Query\n\nRAG can rely on parametric knowledge to generate reasonable responses.\n3.3 Jeopardy Question Generation\nTo evaluate RAG’s generation abilities in a non-QA setting, we study open-domain question gen-\n\neration. Rather than use questions from standard open-domain QA tasks, which typically consist\nof short, simple questions, we propose the more demanding task of generating Jeopardy questions.\n\nQuestion: What is Jeopardy Question Generation?\nHelpful Answer: Jeopardy Question Generation is a task in natural language generation where the system generates questions in the style of Jeopardy clues, which are typ

In [29]:
# Print the "result" text with newlines rendered; as the output shows, it
# contains the full prompt (instructions + retrieved context) before the answer.
print(response1.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question Answering:
Answer GenerationRetriever pη 
(Non-Parametric) 
z 4 
z 3 
z 2 
z 1 
d(z) 
Jeopardy Question
Generation:
Answer Query

RAG can rely on parametric knowledge to generate reasonable responses.
3.3 Jeopardy Question Generation
To evaluate RAG’s generation abilities in a non-QA setting, we study open-domain question gen-

eration. Rather than use questions from standard open-domain QA tasks, which typically consist
of short, simple questions, we propose the more demanding task of generating Jeopardy questions.

Question: What is Jeopardy Question Generation?
Helpful Answer: Jeopardy Question Generation is a task in natural language generation where the system generates questions in the style of Jeopardy clues, which are typically open-ended and require a broad knowledge of various topics. This task challenges 

In [33]:
# Ask the hybrid chain exactly the same question as the vector-only chain so
# the two answers are directly comparable (the stray second "?" was a typo).
response2 = hybrid_chain.invoke("What is Jeopardy Question Generation?")

In [34]:
# Inspect the raw response dict from the hybrid chain.
response2

{'query': 'What is Jeopardy Question Generation??',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nQuestion Answering:\nAnswer GenerationRetriever pη \n(Non-Parametric) \nz 4 \nz 3 \nz 2 \nz 1 \nd(z) \nJeopardy Question\nGeneration:\nAnswer Query\n\ncorrect text more often than BART. Later, we also show that RAG generations are more diverse than\nBART generations (see §4.5).\n4.3 Jeopardy Question Generation\n\nTriviaQA 78786 8838 11314\nWebQuestions 3418 362 2033\nCuratedTrec 635 134 635\nJeopardy Question Generation 97392 13714 26849\nMS-MARCO 153726 12468 101093*\nFEVER-3-way 145450 10000 10000\n\nRAG can rely on parametric knowledge to generate reasonable responses.\n3.3 Jeopardy Question Generation\nTo evaluate RAG’s generation abilities in a non-QA setting, we study open-domain question gen-\n\neration. Rather than use questions from standard open-dom

In [35]:
# Print the hybrid chain's "result" text with newlines rendered; as the output
# shows, it contains the full prompt (instructions + retrieved context).
print(response2.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question Answering:
Answer GenerationRetriever pη 
(Non-Parametric) 
z 4 
z 3 
z 2 
z 1 
d(z) 
Jeopardy Question
Generation:
Answer Query

correct text more often than BART. Later, we also show that RAG generations are more diverse than
BART generations (see §4.5).
4.3 Jeopardy Question Generation

TriviaQA 78786 8838 11314
WebQuestions 3418 362 2033
CuratedTrec 635 134 635
Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000

RAG can rely on parametric knowledge to generate reasonable responses.
3.3 Jeopardy Question Generation
To evaluate RAG’s generation abilities in a non-QA setting, we study open-domain question gen-

eration. Rather than use questions from standard open-domain QA tasks, which typically consist
of short, simple questions, we propose the more demandi