In [1]:
# Install Pytorch & other libraries
!pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
!pip install  --upgrade \
  "transformers==4.38.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  "trl==0.7.11" \
  "peft==0.8.2" \
    "langchain" \
"sentence-transformers" \
"faiss-cpu"
!pip install unstructured
!pip install pdfminer
!pip install pdfminer.six
!pip install -U langchain-community==0.2.4

Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting evaluate==0.4.1
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting bitsandbytes==0.42.0
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting trl==0.7.11
  Downloading trl-0.7.11-py3-none-any.whl.metadata (10 kB)
Collecting peft==0.8.2
  Downloading peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting langchain
  Downloading langchain-0.2.3-py3-none-any.whl.metadata (6.9 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloadin

In [3]:
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from Kaggle's secrets client so the
# token is never hardcoded in the notebook.
user_secrets = UserSecretsClient()
# The secret is looked up under the label "hf_token".
hf_token = user_secrets.get_secret("hf_token")

In [4]:
import torch
from IPython.display import display_markdown
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
import transformers
import time
from langchain.document_loaders import UnstructuredPDFLoader,PDFMinerLoader,TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
 

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"


# Text-generation pipeline for the model above, authenticated with the Kaggle-stored
# token and loaded with a 4-bit quantization config and float16 dtype.
# NOTE: this rebinds the name `pipeline` (imported as a function from transformers
# above) to the pipeline *instance*; later cells rely on that instance.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    token=hf_token,
    model_kwargs={
        "torch_dtype": torch.float16,
        "quantization_config": {"load_in_4bit": True},
        "low_cpu_mem_usage": True,
    },
)

# Token ids that stop generation: the tokenizer's EOS token plus the
# "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# The semantic-cache vector store is created in a later cell, after FAISS has
# been imported and an index/docstore exist. Constructing `FAISS()` here would
# fail on a fresh kernel (FAISS not yet imported, required arguments missing).

2024-06-08 06:05:40.505038: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-08 06:05:40.505154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-08 06:05:40.616144: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [222]:
from langchain_community.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
import faiss
# Embedding model shared by the semantic-cache vector store and the generator class.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
# Initialize an empty FAISS index
# The index must be created with the embedding width, so ask the model for it
# (presumably `client` is the underlying sentence-transformers model — confirm).
dimension = embeddings.client.get_sentence_embedding_dimension()
# NOTE(review): a flat L2 index presumably returns L2 distances as scores
# (lower = more similar), which is what the cache threshold compares against — confirm.
index = faiss.IndexFlatL2(dimension)



In [223]:
# Empty in-memory document store; passed to the FAISS vector store built in the next cell.
docstore = InMemoryDocstore()

In [224]:
# Empty FAISS vector store used as the semantic cache: it is handed to
# Llama3_8B_gen, which adds each query with its response stored in the metadata.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={}  # starts empty: nothing has been indexed yet
)

In [36]:
### Pdf file Path for RAG
# This path is passed to Langchain_RAG, which loads, chunks and indexes the PDF.
pdf_file_path = "/kaggle/input/deep-learning-ian-goodfellow/DeepLearningBook.pdf"

In [197]:
### This class loads a PDF, splits it into chunks, indexes the chunks in FAISS
### and returns the chunks most relevant to a query.
class Langchain_RAG:
    """PDF-backed retriever: load, chunk, embed, index, then look up context.

    Calling an instance with a query string returns the text of the most
    relevant chunks as a single string.
    """

    def __init__(self, pdf_file_path, chunk_size=1000, chunk_overlap=0, k=4):
        """Load the PDF at ``pdf_file_path`` and build the retriever.

        Args:
            pdf_file_path: Path of the PDF file to index.
            chunk_size: Maximum chunk size passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
            k: Number of chunks the retriever returns per query.
        """
        self.embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
        self.pdf_file_path = pdf_file_path
        print("Loading PDF file, this may take time to process...")
        self.loader = PDFMinerLoader(self.pdf_file_path)
        self.data = self.loader.load()
        print("PDF file loaded.")
        print("Chunking...")
        # NOTE(review): the splitter presumably tries separators in list order,
        # so with " " first the "," and "\n" entries are likely never reached;
        # consider a coarse-to-fine order such as ["\n\n", "\n", " ", ""] — confirm.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" ", ",", "\n"],
        )
        self.texts = text_splitter.split_documents(self.data)
        print("Chunking completed.")
        self.get_vec_value = FAISS.from_documents(self.texts, self.embeddings)
        print("Vector values saved.")
        self.retriever = self.get_vec_value.as_retriever(search_kwargs={"k": k})

    def __call__(self, query):
        """Return the retrieved chunks for ``query`` joined into one context string."""
        relevant_docs = self.retriever.invoke(query)
        # Separate chunks with a blank line: joining with "" would fuse the last
        # word of one passage onto the first word of the next, unrelated one.
        return "\n\n".join(doc.page_content for doc in relevant_docs)


In [229]:
import time

# This class is used to generate responses from an LLM model, with a
# semantic cache (vector store of past queries) in front of the model.
class Llama3_8B_gen:
    """Answer queries with a text-generation pipeline, reusing cached answers.

    Each query is embedded and compared against previously answered queries in
    ``vector_store``. If the closest stored query scores below ``threshold``,
    its stored response is returned; otherwise the pipeline generates a new
    response, which is then added to the store.
    """

    def __init__(self, pipeline, embeddings, vector_store, threshold):
        """Store the collaborators.

        Args:
            pipeline: Text-generation pipeline exposing ``tokenizer`` and callable on a prompt.
            embeddings: Object exposing ``embed_documents(list_of_texts)``.
            vector_store: Store exposing ``similarity_search_with_score_by_vector``
                and ``add_texts``.
            threshold: A cached answer is reused only when the best match's
                score is strictly below this value.
        """
        self.pipeline = pipeline
        self.embeddings = embeddings
        self.vector_store = vector_store
        self.threshold = threshold

    @staticmethod
    def generate_prompt(query, retrieved_text, tokenizer=None):
        """Build the chat-formatted prompt for ``query`` given the retrieved context.

        Args:
            query: The user's question.
            retrieved_text: Context text inserted into the system message.
            tokenizer: Tokenizer whose chat template is applied. When omitted,
                falls back to the notebook-global ``pipeline.tokenizer`` so
                existing two-argument calls keep working.

        Returns:
            The prompt string produced by the tokenizer's chat template.
        """
        if tokenizer is None:
            # Backward-compatible fallback to the notebook-level pipeline.
            tokenizer = pipeline.tokenizer
        messages = [
            {"role": "system", "content": "Answer the Question for the Given below context and information and not prior knowledge, only give the output result \n\ncontext:\n\n{}".format(retrieved_text) },
            {"role": "user", "content": query},]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    def _eos_token_ids(self):
        """Return the token ids that end generation for this instance's pipeline."""
        tokenizer = self.pipeline.tokenizer
        return [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]

    def semantic_cache(self, query, prompt):
        """Return a cached response for ``query`` or generate and cache a new one.

        Args:
            query: The user's question; this (not the prompt) is the cache key.
            prompt: Full prompt sent to the pipeline on a cache miss.

        Returns:
            The response text, either from the cache or freshly generated.
        """
        # embed_documents is used for the lookup as well, matching how add_texts
        # is given the query text below.
        query_embedding = self.embeddings.embed_documents([query])
        similar_docs = self.vector_store.similarity_search_with_score_by_vector(query_embedding[0], k=1)

        if similar_docs:
            cached_doc, score = similar_docs[0]
            # A stored entry without a response is treated as a miss, not an error.
            cached_response = cached_doc.metadata.get('response')
            if score < self.threshold and cached_response is not None:
                self.print_bold_underline("---->> From Cache")
                return cached_response

        self.print_bold_underline("---->> From LLM")
        output = self.pipeline(
            prompt,
            max_new_tokens=512,
            eos_token_id=self._eos_token_ids(),
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

        # The pipeline output includes the prompt; keep only the newly generated part.
        response = output[0]["generated_text"][len(prompt):]
        self.vector_store.add_texts(texts=[query], metadatas=[{'response': response}])

        return response

    def generate(self, query, retrieved_context):
        """Answer ``query`` using ``retrieved_context`` and print the elapsed time.

        Returns:
            The response text (cached or newly generated).
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative or jump.
        start_time = time.perf_counter()

        prompt = self.generate_prompt(query, retrieved_context, tokenizer=self.pipeline.tokenizer)
        res = self.semantic_cache(query, prompt)

        execution_time = time.perf_counter() - start_time
        self.print_bold_underline(f"LLM generated in {execution_time:.6f} seconds")

        return res

    @staticmethod
    def print_bold_underline(text):
        """Print ``text`` wrapped in ANSI bold + underline escape codes."""
        print(f"\033[1m\033[4m{text}\033[0m")

 


In [230]:
# Generator with the semantic cache in front of it; threshold=0.1 is the score
# below which a stored answer is reused instead of calling the LLM.
text_gen = Llama3_8B_gen(pipeline=pipeline,embeddings=embeddings,
                         vector_store=vector_store,threshold=0.1)
# Building the retriever loads, chunks and indexes the PDF (all done in __init__).
retriever = Langchain_RAG(pdf_file_path=pdf_file_path)

In [226]:
def Rag_qa(query):
    """Answer ``query``: fetch context with ``retriever``, then generate via ``text_gen``.

    Relies on the notebook-level ``retriever`` and ``text_gen`` objects.
    """
    context = retriever(query)
    return text_gen.generate(query, context)

In [227]:
# First time this question is asked: the output below shows it answered by the LLM.
Rag_qa("What is Deep learning ?")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 6.349944 seconds[0m


'According to the given context, Deep Learning is an approach to machine learning that has drawn heavily on our knowledge of the human brain, statistics and applied math. Specifically, it is a type of machine learning that allows computer systems to improve with experience and data.'

In [231]:
# Same question again: the output below shows it served from the cache.
Rag_qa("What is Deep learning ?")

[1m[4m---->> From Cache[0m
[1m[4mLLM generated in 0.014279 seconds[0m


'According to the given context, Deep Learning is an approach to machine learning that has drawn heavily on our knowledge of the human brain, statistics and applied math. Specifically, it is a type of machine learning that allows computer systems to improve with experience and data.'

In [232]:
# A new question: the output below shows it answered by the LLM.
Rag_qa("Explain back propagation algorithm.")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 40.006118 seconds[0m


"The back-propagation algorithm is a strategy for efficiently computing the gradient of the loss function with respect to the model's parameters in a neural network. It's a key component of the training process for neural networks.\n\nThe algorithm works by performing a forward pass through the network, computing the output for a given input, and then performing a backward pass, computing the gradient of the loss function with respect to the model's parameters.\n\nHere's a step-by-step breakdown of the back-propagation algorithm:\n\n1. Forward Pass: The algorithm starts by computing the output of the network for a given input. This is done by propagating the input through the network, layer by layer, using the network's weights and biases.\n2. Loss Computation: The output of the network is compared to the target output, and the loss function is computed. This is typically done using a mean squared error (MSE) or cross-entropy loss function.\n3. Backward Pass: The algorithm then perform

In [234]:
# A shortened rephrasing of the previous question: per the output below it was
# still answered by the LLM rather than the cache (threshold is 0.1).
Rag_qa("back propagation algorithm.")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 26.812080 seconds[0m


'The back-propagation algorithm is used to compute the gradient of the loss function with respect to the parameters of a neural network. The algorithm involves two passes:\n\n1. Forward propagation: This involves propagating the input through the network to compute the output.\n2. Backward propagation: This involves propagating the error backward through the network to compute the gradient of the loss function with respect to the parameters.\n\nThe algorithm is as follows:\n\nAlgorithm 6.3:\n\n```\nRequire: x, the input to the network\nRequire: y, the target output\nRequire: ˆy, the output of the neural network\n\n1. Forward Propagation:\n    ˆy = f(x, θ)\n    where f is the neural network and θ are the parameters\n\n2. Compute the loss:\n    L = (ˆy - y)^2\n\n3. Backward Propagation:\n    Compute the gradient of the loss with respect to the output:\n        ∂L/∂ˆy = -2*(ˆy - y)\n\n    Compute the gradient of the output with respect to the hidden states:\n        ∂ˆy/∂h = ∂f/∂h\n\n    

In [235]:
# Exact repeat of the rephrased question: the output below shows a cache hit.
Rag_qa("back propagation algorithm.")

[1m[4m---->> From Cache[0m
[1m[4mLLM generated in 0.013826 seconds[0m


'The back-propagation algorithm is used to compute the gradient of the loss function with respect to the parameters of a neural network. The algorithm involves two passes:\n\n1. Forward propagation: This involves propagating the input through the network to compute the output.\n2. Backward propagation: This involves propagating the error backward through the network to compute the gradient of the loss function with respect to the parameters.\n\nThe algorithm is as follows:\n\nAlgorithm 6.3:\n\n```\nRequire: x, the input to the network\nRequire: y, the target output\nRequire: ˆy, the output of the neural network\n\n1. Forward Propagation:\n    ˆy = f(x, θ)\n    where f is the neural network and θ are the parameters\n\n2. Compute the loss:\n    L = (ˆy - y)^2\n\n3. Backward Propagation:\n    Compute the gradient of the loss with respect to the output:\n        ∂L/∂ˆy = -2*(ˆy - y)\n\n    Compute the gradient of the output with respect to the hidden states:\n        ∂ˆy/∂h = ∂f/∂h\n\n    

### Explanation
When text is generated directly by the Large Language Model (LLM), a request can take tens of seconds (roughly 6 to 40 seconds in the runs above). By caching the generated text, subsequent requests for the same question return in about 0.014 seconds. The cache stores each query together with its generated response in a FAISS vector store and looks up new queries by embedding similarity, so a previously generated answer can be retrieved quickly without regenerating it. Note that with the strict threshold used here (0.1), a rephrased question ("back propagation algorithm." versus "Explain back propagation algorithm.") was still treated as new and sent to the LLM. By leveraging this cache, the system improves performance and user experience by minimizing wait times for repeated requests.