In [None]:
! pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.30.2 langchain chromadb --quiet
! pip install pypdf --quiet
! pip install tiktoken --quiet
! pip install sentence_transformers lark auto-gptq optimum --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In this notebook we will perform the following steps:


1. Load the LLaMA paper PDF using LangChain document loaders.
2. Create text chunks.
3. Create Embeddings on the text chunks.
4. Save the embeddings in a vector store using Chroma.
5. Add additional documents to the vector store.
6. Perform question answering on the document with Retrieval-Augmented Generation using an LLM (Llama-2).


In [None]:
# arXiv URL of the PDF that the loader cells below read.
llama2_paper_path = 'https://arxiv.org/pdf/2302.13971.pdf' # "LLaMA: Open and Efficient Foundation Language Models"

### **Loading the document**

In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
# Point the PDF loader at the paper URL, then load it and let the loader apply
# its default splitting.
loader = PyPDFLoader(llama2_paper_path)
pages = loader.load_and_split()

### **Creating text chunks for Document**

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Re-load the paper; the raw pages are kept under their own name so the
# splitter input is not overwritten by its output.
loader = PyPDFLoader(file_path=llama2_paper_path)
raw_pages = loader.load()

# CharacterTextSplitter only cuts on its single separator. The previously
# recorded run of this cell printed 27 chunks with a 4056-character first
# chunk, i.e. far above chunk_size=1000 (the chunks look like whole pages).
# The recursive splitter falls back to finer separators, so chunk_size is
# actually honoured and the retrieved context stays small.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(raw_pages)

# Sanity check: number of chunks and the character length of the first one.
print(len(documents))

print(len(documents[0].page_content))

27
4056


### **Creation of Embeddings**

We will use open-source sentence-transformer embeddings to create the embeddings; you can also use OpenAI embeddings instead.

Update:

We will try out the SoTA BGE Embeddings

Embedding MTEB Leaderboard: https://huggingface.co/spaces/mteb/leaderboard

Model we will use: https://huggingface.co/BAAI/bge-base-en-v1.5

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# BGE checkpoint from the Hugging Face Hub, placed on the GPU.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = dict(device='cuda')
# Normalised vectors are requested so that similarity is cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.2k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

### **Vector Store**

In [None]:
from langchain.vectorstores import Chroma

# Embed the chunks in `documents` with `embeddings` and build a Chroma store
# whose persistence directory is ./llama-db.
db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory='./llama-db',
)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

survey_of_llms = 'https://arxiv.org/pdf/2303.18223.pdf'

# Load the second paper as raw pages; splitting happens once, below.
loader = PyPDFLoader(file_path=survey_of_llms)
pages = loader.load()

# Split the survey pages themselves. Splitting `documents` here (as this cell
# used to) only re-splits the first paper's chunks, so the survey would never
# reach the vector store and the first paper would be queued a second time.
# The recursive splitter is used so that chunk_size is honoured.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(pages)

In [None]:
# Append the chunks currently held in `documents` to the existing store; the
# return value is deliberately discarded.
_ = db.add_documents(documents=documents)

In [None]:
import torch
import transformers
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from transformers import AutoTokenizer, AutoModelForCausalLM , AutoModel

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig, pipeline
from langchain.llms import HuggingFacePipeline

# Pre-quantised (GPTQ) chat checkpoint used for both tokenizer and model.
MODEL_ID = "TheBloke/Llama-2-7b-Chat-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Use the EOS token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

# `use_exllama=False` is the replacement named by the deprecation warning that
# the recorded output of this cell shows for `disable_exllama=True`.
quantization_config_loading = GPTQConfig(bits=4, use_exllama=False, tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config_loading,
    device_map="auto",
)

# The model object is already loaded and placed by from_pretrained above, so
# dtype / device_map are not repeated on the pipeline.
# Greedy decoding is requested directly with do_sample=False instead of being
# emulated through do_sample=True + top_k=1 together with a temperature of 0.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,
)

# LangChain wrapper around the transformers pipeline; generation settings are
# the ones configured on `pipe`.
llm = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. ['use_cuda_fp16', 'use_exllama', 'max_input_length', 'exllama_config', 'disable_exllama']) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Baseline without retrieval: send the question straight to the LLM.
# `invoke` is used instead of calling the LLM object directly, which
# presumably is what raised the deprecation warning in the recorded output.
print(llm.invoke("What are the Scaling Laws for LLM?"))

  warn_deprecated(




Scaling laws provide a way to estimate how certain quantities change as the size of an LLM system increases. These scaling laws can be used to make predictions about the behavior of LLM systems in different scenarios, and they have important implications for the design and operation of these systems. Here are some common scaling laws that apply to LLM:
1. **Number of layers**: The number of layers in an LLM system grows exponentially with the size of the system. Specifically, the number of layers scales like O(n^2), where n is the number of users or tasks. This means that as the size of the system increases, the number of layers required to process requests also increases rapidly.
2. **Request volume**: The volume of requests handled by an LLM system grows linearly with the size of the system. Specifically, the number of requests processed per second (or other time interval) scales like O(n). This means that as the size of the system increases, more requests will be handled over time

In [None]:
from langchain.chains import RetrievalQA

# The 'stuff' chain puts every retrieved chunk into one prompt, so the number
# of retrieved chunks directly drives prompt length and GPU memory. The
# recorded run of the RAG query in this notebook ended in a CUDA out-of-memory
# error, so the retrieval depth is capped explicitly instead of relying on the
# retriever's default.
RETRIEVER_TOP_K = 2

rag = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=db.as_retriever(search_kwargs={'k': RETRIEVER_TOP_K}),
)

In [None]:
# Same un-augmented question once more, for side-by-side comparison with the
# retrieval-augmented answer. `invoke` replaces the deprecated direct call.
print(llm.invoke('What are the Scaling Laws for LLM?'))



Scaling laws provide a way to estimate how certain quantities change as the size of an LLM system increases. These scaling laws can be used to make predictions about the behavior of LLM systems in different scenarios, and they have important implications for the design and operation of these systems. Here are some common scaling laws that apply to LLM:
1. **Number of layers**: The number of layers in an LLM system grows exponentially with the size of the system. Specifically, the number of layers scales like O(n^2), where n is the number of users or tasks. This means that as the size of the system increases, the number of layers required to process requests also increases rapidly.
2. **Request volume**: The volume of requests handled by an LLM system grows linearly with the size of the system. Specifically, the number of requests processed per second (or other time interval) scales like O(n). This means that as the size of the system increases, more requests will be handled over time

In [None]:
# Ask the same question through the retrieval chain. The question is passed
# under the chain's 'query' input key and the answer is read from 'result';
# `invoke` replaces the deprecated direct call on the chain.
print(rag.invoke({'query': 'What are the Scaling Laws for LLM?'})['result'])

  warn_deprecated(


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.67 GiB. GPU 0 has a total capacty of 14.75 GiB of which 1.48 GiB is free. Process 3837 has 13.27 GiB memory in use. Of the allocated memory 11.19 GiB is allocated by PyTorch, and 1.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

**With RAG we can get much improved performance in the output.**