# RAG with LLaMa-3 70B, Mistral8x7b and gemma7b

Hugging Face Embedding Pipeline

In [57]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)
device

'cuda:0'

In [58]:
docs = [
    "this is one document",
    "and another document"
]

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")

We have 2 doc embeddings, each with a dimensionality of 384.


# Building the vector Index

In [59]:
import os
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = "8a73267f-d64d-4d53-a5ae-0a241afd5517"

# configure client
pc = Pinecone(api_key=api_key)

In [60]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [61]:
# index initialize
index_name = "final-llm"

In [62]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [63]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:5000]')
data

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 5000
})

In [64]:
import tiktoken
tiktoken.encoding_for_model('gpt2')

<Encoding 'gpt2'>

In [65]:
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    # print(tokens)
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

28

In [66]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [67]:
import os
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [68]:
from langchain import HuggingFaceHub, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings

model_name = 'sentence-transformers/all-MiniLM-L6-v2'

embed = HuggingFaceEmbeddings(
    model_name=model_name,
)

In [69]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

100%|██████████| 5000/5000 [02:29<00:00, 33.35it/s]


In [70]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 19907}},
 'total_vector_count': 19907}

# Vector store and Querying

In [71]:
import os
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [72]:
from langchain import HuggingFaceHub, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings

model_name = 'sentence-transformers/all-MiniLM-L6-v2'

embed = HuggingFaceEmbeddings(
    model_name=model_name,
)

In [73]:
from pinecone import Pinecone, ServerlessSpec


# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = "8a73267f-d64d-4d53-a5ae-0a241afd5517"

# configure client
pc = Pinecone(api_key=api_key)

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [74]:
from langchain.vectorstores import Pinecone

text_field = "text"

# switch back to normal index for langchain
index_name="final-llm"
index = pc.Index(index_name)

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)



# LLM Benchmarking

In [82]:
!unzip WikiQA.zip

Archive:  WikiQA.zip
  inflating: WikiQA.tsv              


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [83]:
import pandas as pd

# Load the TSV file
df = pd.read_csv('WikiQA.tsv', delimiter='\t')

# Remove duplicate questions
df = df.drop_duplicates(subset=['Question'])

# Reduce to desired columns
df = df[['QuestionID', 'Question', 'Sentence']]

# Select the first 100 questions
df = df.head(100)

# Save the processed file
df.to_csv('processed_WikiQA.tsv', sep='\t', index=False)

In [86]:
df = pd.read_csv('processed_WikiQA.tsv', delimiter='\t')
df

Unnamed: 0,QuestionID,Question,Sentence
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States refer...
1,Q1,how are glacier caves formed?,A partly submerged glacier cave on Perito More...
2,Q2,How are the directions of the velocity and for...,"In physics , circular motion is a movement of ..."
3,Q3,how large were early jails,"A prison (from Old French prisoun), also known..."
4,Q4,how a water pump works,"A small, electrically powered pump"
...,...,...,...
95,Q95,how long is tekken blood vengeance movie,is a 2011 Japanese 3D computer-animated film b...
96,Q96,how can hoa collect unpaid fees on property,"For a discussion of nonprofit, voluntary neigh..."
97,Q97,how big can texel guinea pigs become,A prize-winning lilac-and-white Silkie
98,Q98,how are public schools funded,State schools (also known as public schools or...


In [96]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import pandas as pd
from tqdm import tqdm 

# Load the processed dataset
df = pd.read_csv('processed_WikiQA.tsv', delimiter='\t')
df = df.sample(20, random_state=123)

print(df.shape)

# Placeholder lists for storing outputs
mistral_outs = []
llama_outs = []
gemma_outs = []
real_outs = []

chat_mistral = ChatGroq(temperature=0, groq_api_key="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", model_name="mixtral-8x7b-32768")
chat_llama = ChatGroq(temperature=0, groq_api_key="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", model_name="llama3-70b-8192")
chat_gemma = ChatGroq(temperature=0, groq_api_key="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", model_name="gemma-7b-it")

# Function for each model
def call_mistral(query, relevant_documents):
    # Mistral
    
    matched_info = ' '.join(item.page_content for item in relevant_documents)
    sources = [item.metadata for item in relevant_documents]
    context = f"Information: {matched_info} and the sources: {sources}"
    sys_prompt = f"""
    Instructions:
    - Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
    - Utilize the context provided for accurate and specific information.
    - Incorporate your preexisting knowledge to enhance the depth and relevance of your response.
    - Cite your sources
    Context: {context}
    """
    

    system = "You are a helpful assistant."
    human = "{text}"
    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

    chain = prompt | chat_mistral
    prompt_ = query
    
    return chain.invoke({"text": prompt_ + "\n" + sys_prompt}).content
    
    
    

def call_llama(query, relevant_documents):
    # Implement or simulate LLM response
    matched_info = ' '.join(item.page_content for item in relevant_documents)
    sources = [item.metadata for item in relevant_documents]
    context = f"Information: {matched_info} and the sources: {sources}"
    sys_prompt = f"""
    Instructions:
    - Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
    - Utilize the context provided for accurate and specific information.
    - Incorporate your preexisting knowledge to enhance the depth and relevance of your response.
    - Cite your sources
    Context: {context}
    """
    

    system = "You are a helpful assistant."
    human = "{text}"
    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

    chain = prompt | chat_llama
    prompt_ = query
    
    return chain.invoke({"text": prompt_ + "\n" + sys_prompt}).content

def call_gemma(query, relevant_documents):
    # Implement or simulate LLM response
    matched_info = ' '.join(item.page_content for item in relevant_documents)
    sources = [item.metadata for item in relevant_documents]
    context = f"Information: {matched_info} and the sources: {sources}"
    sys_prompt = f"""
    Instructions:
    - Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
    - Utilize the context provided for accurate and specific information.
    - Incorporate your preexisting knowledge to enhance the depth and relevance of your response.
    - Cite your sources
    Context: {context}
    """
    

    system = "You are a helpful assistant."
    human = "{text}"
    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

    chain = prompt | chat_gemma
    prompt_ = query
    
    return chain.invoke({"text": prompt_ + "\n" + sys_prompt}).content

    
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing questions"):
    query = str(row['Question'])
    
    # Store real output
    real_outs.append(str(row['Sentence']))
    
    # Simulate a vectorstore search (this function must be defined elsewhere)
    relevant_documents = vectorstore.similarity_search(
                            query,  # our search query
                            k=3  # return 3 most relevant docs
                        )
    
    # Call each model and store their outputs
    mistral_outs.append(call_mistral(query, relevant_documents))
    llama_outs.append(call_llama(query, relevant_documents))
    gemma_outs.append(call_gemma(query, relevant_documents))
    
# Optionally, print the outputs to verify
print("Real Outputs:", real_outs[:5])
print("Mistral Outputs:", mistral_outs[:5])
print("LLaMA Outputs:", llama_outs[:5])
print("GEMMA Outputs:", gemma_outs[:5])

(20, 3)


Processing questions: 100%|██████████| 20/20 [06:31<00:00, 19.58s/it]

Real Outputs: ['Cross section of sclerenchyma fibers in plant ground tissue', 'Terminator 3: Rise of the Machines (commonly abbreviated as T3) is a 2003 science fiction action film directed by Jonathan Mostow and starring Arnold Schwarzenegger , Nick Stahl , Claire Danes and Kristanna Loken .', 'TLC is an American girl group whose repertoire spanned R&B , hip hop , soul , funk , and new jack swing .', 'Radial engine timing and cam mechanism.', 'A natural arch produced by the erosion of differentially weathered rock in Jebel Kharaz, Jordan']
Mistral Outputs: ['Epithelial tissues are joined together by a variety of methods, depending on the specific type of epithelial tissue. Generally, epithelial cells adhere to each other through specialized junctions, which include tight junctions, adherens junctions, and desmosomes.\n\nTight junctions, also known as occluding junctions, are the most apically located junctions and form a virtually impermeable barrier to the passage of solutes between 




In [99]:
import evaluate as E
import numpy as np

bleu = E.load("bleu")
meteor_metric = E.load("meteor")
bertscore_metric = E.load("bertscore")

def test_metrics_2b(references, predictions):

  bleu_score = bleu.compute(predictions=predictions, references=references)
  meteor_score = meteor_metric.compute(predictions=predictions, references=references)
  bertscore_score = bertscore_metric.compute(predictions=predictions, references=references, lang="de")

  bs_pre = np.mean(np.array(bertscore_score['precision']))
  bs_recall = np.mean(np.array(bertscore_score['recall']))
  bs_f1 = np.mean(np.array(bertscore_score['f1']))


  print(f'BLEU Score: {bleu_score}')
  print(f'METEOR Score: {meteor_score}')
  print(f'BERTScore-precision: {bs_pre}')
  print(f'BERTScore-recall: {bs_recall}')
  print(f'BERTScore-f1: {bs_f1}')
  
print("Mistral")
test_metrics_2b(real_outs, mistral_outs)
print("--------------------------------------------------------------------------------------------------")

print("Llama")
test_metrics_2b(real_outs, llama_outs)
print("--------------------------------------------------------------------------------------------------")

print("Gemma")
test_metrics_2b(real_outs, gemma_outs)
print("--------------------------------------------------------------------------------------------------")


[nltk_data] Downloading package wordnet to /home/vijay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/vijay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vijay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Mistral
BLEU Score: {'bleu': 0.013220144898102472, 'precisions': [0.05640033176665745, 0.014178482068390326, 0.007548224769359799, 0.005060444194545966], 'brevity_penalty': 1.0, 'length_ratio': 8.0917225950783, 'translation_length': 3617, 'reference_length': 447}
METEOR Score: {'meteor': 0.18360866517814425}
BERTScore-precision: 0.5501263305544853
BERTScore-recall: 0.6753357648849487
BERTScore-f1: 0.6052185118198394
--------------------------------------------------------------------------------------------------
Llama
BLEU Score: {'bleu': 0.012660943619327073, 'precisions': [0.055486670799752016, 0.014971927635683094, 0.007532956685499058, 0.00410612760581175], 'brevity_penalty': 1.0, 'length_ratio': 7.217002237136465, 'translation_length': 3226, 'reference_length': 447}
METEOR Score: {'meteor': 0.17164903365354653}
BERTScore-precision: 0.5953129217028618
BERTScore-recall: 0.6720740914344787
BERTScore-f1: 0.6287255764007569
-------------------------------------------------------------

# Completed ---------------------------------------------------------------------------------

In [75]:
query = "Who was Albert Einstein?"

relevant_documents = vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)
relevant_documents

[Document(page_content="Albert Einstein (14 March 1879 – 18 April 1955) was a German-born American scientist. He worked on theoretical physics. He developed the theory of relativity. He received the Nobel Prize in Physics in 1921 for theoretical physics.\n\nHis famous equation is  (E = energy, m = mass, c = speed of light (energy = mass X speed of light²).\n\nAt the start of his career, Einstein didn't think that Newtonian mechanics was enough to bring together the laws of classical mechanics and the laws of the electromagnetic field. Between 1902–1909 he made the theory of special relativity to fix it. Einstein also thought that Isaac Newton's idea of gravity was not completely correct. So, he extended his ideas on special relativity to include gravity. In 1916, he published a paper on general relativity with his theory of gravitation.\n\nIn 1933, Einstein was visiting the United States but in Germany, Adolf Hitler and the Nazis came to power (this is before World War II). Einstein, b

In [76]:
relevant_documents[0]

Document(page_content="Albert Einstein (14 March 1879 – 18 April 1955) was a German-born American scientist. He worked on theoretical physics. He developed the theory of relativity. He received the Nobel Prize in Physics in 1921 for theoretical physics.\n\nHis famous equation is  (E = energy, m = mass, c = speed of light (energy = mass X speed of light²).\n\nAt the start of his career, Einstein didn't think that Newtonian mechanics was enough to bring together the laws of classical mechanics and the laws of the electromagnetic field. Between 1902–1909 he made the theory of special relativity to fix it. Einstein also thought that Isaac Newton's idea of gravity was not completely correct. So, he extended his ideas on special relativity to include gravity. In 1916, he published a paper on general relativity with his theory of gravitation.\n\nIn 1933, Einstein was visiting the United States but in Germany, Adolf Hitler and the Nazis came to power (this is before World War II). Einstein, be

In [77]:
matched_info = ' '.join(item.page_content for item in relevant_documents)
sources = [item.metadata for item in relevant_documents]
context = f"Information: {matched_info} and the sources: {sources}"
sys_prompt = f"""
Instructions:
- Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
- Utilize the context provided for accurate and specific information.
- Incorporate your preexisting knowledge to enhance the depth and relevance of your response.
- Cite your sources
Context: {context}
"""