In [None]:
!pip install -qU langchain accelerate bitsandbytes transformers sentence-transformers faiss-gpu

In [None]:
!pip install -U langchain-community

## RAG(Retrieval-Augmented Generation) with LangChain
Rag typically has two main components:
1. Indexing, basically pipeline to ingest data (Usually done Offline).
2. Retrieival + Generation, this is the actual part of the RAG, receive user query and retrieve relevant data from index and passing the model. <br>
### Indexing
- Loading Document, using document loaders, can be from Google Drive, Notion, Slack, but in this case I am using kaggle database(or basically local kaggle notebook).
- Split, using text splitter to break documents into smaller chunks, useful for indexing and feeding the model.
- Store, place to store and index the splits (VectorDB and Embedding model are here), the VectorDB I am using is FAISS and sentence-transformer for embedding.

### Retrieval + Generation
- Retrieve, given the user input, retrieve relevant splits from the VectorDB.
- Generate, using chatmodel/LLM(in this case Mistral) to produce answer using prompt and retrieval data

In [4]:
import os
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS #VectorDB
from langchain.document_loaders import PyPDFLoader
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from glob import glob

In [5]:
#Reducing Model size to save memory and increasing speed
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

In [6]:
# Model Pipeline and Embedding Initialization 
model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1", 
    quantization_config = bnb_config,
    do_sample=True,
)

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    temperature=0.7,    
    task="text-generation",
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=2000,    
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
2024-07-04 07:16:05.276686: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 07:16:05.276807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 07:16:05.450848: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
Instruction: Answer the question based on the following context:
{context}

Question:
{question} 
 """

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

  warn_deprecated(
  warn_deprecated(


In [8]:
paper_paths = glob("/kaggle/input/10-transformative-llm-research-papers-of-2023/LLM Research Papers of 2023/*.pdf")
pages = []

for path in paper_paths:
    try:
        loader = PyPDFLoader(path)
        doc = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=500, 
                                      chunk_overlap=0)
        chunked_documents = text_splitter.split_documents(doc)
        
        pages.extend(chunked_documents)
    except Exception as e:
        print('Skipping', path, e)

In [9]:
# Load chunked documents into the FAISS index
db = FAISS.from_documents(
    pages,
    HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
)

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
retriever = db.as_retriever()

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

response = rag_chain.invoke("Why are vision-language task more diverse than NLP task?")

print ("Question:", response["question"])
print (response['text'])



Question: Why are vision-language task more diverse than NLP task?

Instruction: Answer the question based on the following context:
[Document(metadata={'source': '/kaggle/input/10-transformative-llm-research-papers-of-2023/LLM Research Papers of 2023/2303.12712.pdf', 'page': 12}, page_content='Figure 2.1: The ﬁrst image is Composition 8, art by Wassily Kandinsky, the second and the third\nare produced by GPT-4 and ChatGPT respectively with the prompt “Produce Javacript code that\ncreates a random graphical image that looks like a painting of Kandinsky”.\n2 Multimodal and interdisciplinary composition\nA key measure of intelligence is the ability to synthesize information from diﬀerent domains or modalities\nand the capacity to apply knowledge and skills across diﬀerent contexts or disciplines. In this section we will\nsee that, not only does GPT-4 demonstrate a high level of proﬁciency in diﬀerent domains such as literature,\nmedicine, law, mathematics, physical sciences, and programm