# Creating a Retrieval Augmented Generation (RAG) Service Chatbot using Llama 2 and LangChain

In [1]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

Collecting transformers==4.33.0
  Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.22.0
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.0.300
  Downloading langchain-0.0.300-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers==0.0.21
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl (167.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma


# Load and Initialize Model, set Quantization Configuration

In [3]:
# Report which accelerator PyTorch can see.
# The human-readable name is kept in its own variable so it is not confused
# with the `device` string ("cuda:N" / "cpu") that is defined further down.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
else:
    print("GPU is not available. Using CPU.")


GPU: Tesla T4


In [4]:
# Hugging Face Hub identifier of the checkpoint loaded in the next cell.
model_id = 'meta-llama/Llama-2-7b-hf'

# Target device string: the current CUDA device when one is present, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Use bfloat16 for the de-quantized matmuls only when the GPU reports support for it;
# otherwise fall back to float16, which is also the dtype the generation pipeline uses.
compute_dtype = (
    bfloat16
    if cuda.is_available() and cuda.is_bf16_supported()
    else torch.float16
)

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [5]:
# Fetch the checkpoint's configuration so it can be handed to the model loader explicitly.
model_config = transformers.AutoConfig.from_pretrained(model_id)

# Load the causal LM with the 4-bit quantization settings defined above and let
# accelerate place the weights (device_map='auto').
# trust_remote_code is intentionally not enabled: it would allow Python code shipped
# in the Hub repository to execute locally, and this checkpoint does not need it.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Defining a Query Pipeline and Testing it with a Query

In [6]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
query_pipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [8]:
def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [9]:
# Baseline check: ask about the service before any retrieval is wired in,
# i.e. the model has only its own weights to answer from.
test_model(
    tokenizer=tokenizer,
    pipeline=query_pipeline,
    prompt_to_test="What is ConnectTel? Keep it under 100 Words.",
)

Result: What is ConnectTel? Keep it under 100 Words. kwieta 2020 roku. 2. The ConnectTel app allows you to: - View your bills - Pay your bills - View your usage - View your data usage - Check your data usage in real-time - Check your data usage in the last 7 days - Check your data usage in the last 30 days - Check your data usage in the last 90 days - Check your data usage in the last 180 days - Check your data usage in the last 365 days - Check your data usage in the last 12 months - Check your data usage in the last 24 months - Check your data usage in the last 36 months - Check your data usage in the last 48 months - Check your data usage in the last 60 months - Check your data usage in the last 72 months - Check


# Retrieval Augmented Generation

In [17]:
# Expose the Hugging Face pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=query_pipeline)

# Ask the question directly, without retrieval, to record the pre-RAG answer.
llm("What is ConnectTel?")

'\n kwietnia 2022\nWhat is ConnectTel?\nConnectTel is a new telephone service that allows you to make and receive calls and messages from your mobile phone.\nConnectTel is a new telephone service that allows you to make and receive calls and messages from your mobile phone. The service is available to everyone in the UK, and it’s free to use.\nConnectTel is a new telephone service that allows you to make and receive calls and messages from your mobile phone. The service is available to everyone in the UK, and it’s free to use. ConnectTel is a new telephone service that allows you to make and receive calls and messages from your mobile phone. The service is available to everyone in the UK, and it’s free to use.\nConnectTel is a new telephone service that allows you to make and receive calls and messages from your mobile phone. The service is available to everyone in the UK, and it’s free to use. ConnectTel is a new telephone service that allows you to make and receive calls and messages

## Loading and Ingesting the Service Data from a .txt File

In [18]:
# Read the FAQ text file (decoded as UTF-8) into LangChain documents.
loader = TextLoader(
    file_path="sample_data/ConnectTelFAQ.txt",
    encoding="utf8",
)
documents = loader.load()

In [19]:
# Break the loaded documents into overlapping chunks for embedding
# (chunk_size=1000, chunk_overlap=20).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(documents=documents)

In [20]:
# Create the sentence-embedding model used to index and query the chunks.

model_name = "sentence-transformers/all-mpnet-base-v2"
# Run the embedder on the GPU when one is visible; fall back to the CPU otherwise
# instead of failing on machines without CUDA.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [21]:
# Initialize ChromaDB to store the chunk embeddings (persisted under "chroma_db").

# Give every chunk a stable id derived from its position. Without explicit ids the
# store assigns fresh ones on each call, so re-running this cell against the
# persisted directory would add a second copy of every chunk.
# NOTE(review): a "chroma_db" directory left over from earlier runs may already
# contain duplicates; delete it once if retrieval returns repeated passages.
split_ids = [f"connecttel-faq-{index}" for index in range(len(all_splits))]

vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    ids=split_ids,
    persist_directory="chroma_db",
)

## Initializing the LangChain RetrievalQA Chain

In [22]:
# Turn the vector store into a retriever the chain can query.
retriever = vectordb.as_retriever()

# Build a question-answering chain that feeds the retrieved chunks to the LLM
# using the "stuff" chain type; verbose=True logs chain entry and exit.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    verbose=True,
    chain_type="stuff",
)

## Testing the RAG with a Function

In [23]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    result = qa.run(query)
    print("\nResult: ", result)

In [24]:
# Repeat the earlier service question, this time through the retrieval chain.

query = "What is ConnectTel. Keep it under 100 words."
test_rag(qa=qa, query=query)

Query: What is ConnectTel. Keep it under 100 words.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Result:   ConnectTel is a leading telecommunications provider in Pakistan, offering a range of innovative and reliable services for individuals and businesses. The company specializes in providing cutting-edge telecom services, including mobile network solutions, broadband internet services, VoIP (Voice over Internet Protocol) solutions, unified communications platforms, data center services, and network security solutions. ConnectTel operates in three major cities: Islamabad, Lahore, and Karachi, providing convenient access to its telecom services across Pakistan. The company's mobile network service boasts extensive coverage, high-speed data, and reliable connectivity, ensuring that customers stay connected wherever they go. ConnectTel's broadband internet services offer lightning-fast speeds, unlimited data usage, and exceptional reliability, empowering user

In [25]:
# Ask for a specific fact that should be present in the ingested FAQ.

query = "Where are ConnectTel Offices Located?"
test_rag(qa=qa, query=query)

Query: Where are ConnectTel Offices Located?



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Result:   ConnectTel has offices in three major cities in Pakistan, including:

Islamabad

Lahore

Karachi

ConnectTel operates in three major cities: Islamabad, Lahore, and Karachi, providing convenient access to its telecom services across Pakistan.
ConnectTel's mobile network service boasts extensive coverage, high-speed data, and reliable connectivity, ensuring that our customers stay connected wherever they go.
ConnectTel's broadband internet services offer lightning-fast speeds, unlimited data usage, and exceptional reliability, empowering users to stream, work, and connect with ease.

Yes, ConnectTel offers customizable telecom solutions tailored to the unique requirements of businesses, ensuring that they get the exact services and features they need to succeed.
Customers can reach out to ConnectTel's dedicated support team via phone, email, or the online port

## Checking the Sources Used to Answer the Previous Query

In [26]:
# Look up the chunks most similar to the last query and show where each came from.
docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(docs)}")
for doc in docs:
    # Read the Document's fields directly rather than via its JSON serialization;
    # a chunk without a recorded source is reported as "unknown" instead of raising.
    print("Source: ", doc.metadata.get("source", "unknown"))
    print("Text: ", doc.page_content, "\n")

Query: Where are ConnectTel Offices Located?
Retrieved documents: 4
Source:  sample_data/ConnectTelFAQ.txt
Text:  ConnectTel specializes in providing cutting-edge telecom services to individuals and businesses, ensuring seamless connectivity and communication solutions.
ConnectTel offers a wide range of telecom services, including:
Mobile network solutions
Broadband internet services
VoIP (Voice over Internet Protocol) solutions
Unified communications platforms
Data center services
Network security solutions
ConnectTel operates in three major cities: Islamabad, Lahore, and Karachi, providing convenient access to its telecom services across Pakistan.
ConnectTel's mobile network service boasts extensive coverage, high-speed data, and reliable connectivity, ensuring that our customers stay connected wherever they go.
ConnectTel's broadband internet services offer lightning-fast speeds, unlimited data usage, and exceptional reliability, empowering users to stream, work, and connect with ea