# RAG system using Llama2 with Hugging Face

In [1]:
import pypdf 
import transformers
import accelerate
import langchain
import torch
import bitsandbytes
import einops
import llama_index


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## for Embedding 
import sentence_transformers

In [3]:
import torch
torch.cuda.is_available()

True

# load all the pdf's 

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
# SErvice context combines llama2 model with the prompt 
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt









In [5]:
documents=SimpleDirectoryReader('data').load_data()
documents

[Document(id_='d76cf665-6a4e-45fd-97d7-b76a9d054f06', embedding=None, metadata={'page_label': '1', 'file_name': 'Neural Networks and AI Advances.pdf', 'file_path': 'd:\\pythonProjects\\RAG_finetuning\\data\\Neural Networks and AI Advances.pdf', 'file_type': 'application/pdf', 'file_size': 773786, 'creation_date': '2024-11-06', 'last_modified_date': '2024-10-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Convolutional Neural Networks (CNN)  \nA Convolutional Neural Network (CNN)  is a type of neural network that is specifically designed  to work \nwell with images and spatial data . It makes certain assumptions about the structure of the input (like an \nimage being made up of pixels in a grid) and uses specialized layers to process this kind of da

In [6]:
system_prompt=""" 
You are a Q&A assistant. Your goal is to answer questions as accurately 
as possible based on the instructions and context provided.
"""
## Default format supportable by Llama2 
query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [7]:
#login for llama 2 models 
import os
import dotenv
from huggingface_hub import login
dotenv.load_dotenv()

login(token=os.getenv('HUGGINGFACEHUB_API_TOKEN'),add_to_git_credential=True)


Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\26amr\.cache\huggingface\token
Login successful


In [27]:
# call llama2 model from HuggingFace
import torch
llm=HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={'temperature':0.0,'do_sample':False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map='auto',
    model_kwargs={'torch_dtype':torch.float16,'load_in_8bit':True, 'llm_int8_enable_fp32_cpu_offload': True }
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.51s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [37]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding

In [38]:
embed_model=LangchainEmbedding(HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

  embed_model=LangchainEmbedding(HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# service context
- combine all the techniques and bundle them together


In [51]:
from llama_index.core import Settings

Settings.chunk_size=1024
Settings.embed_model=embed_model
Settings.llm=llm


In [52]:
# use vector store index and conver this entire data into indexes
index=VectorStoreIndex.from_documents(documents=documents, embed_model=Settings.embed_model)

In [53]:
index 

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x18419f69b40>

In [54]:
query_engine=index.as_query_engine()

In [59]:
response=query_engine.query('how do we utilize neural network')



In [60]:
response.response

'utilizing neural networks can be done by first inputting data into layers of neurons hooked together forming layers stacked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked together layers hooked t

# chromadb