# 2023.11.30 - Introduction to RAG | Practical Part

#### Model without RAG

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

###### Download generation model

In [None]:
# Hugging Face Hub id of the generation checkpoint; the download cells below
# request this same id from `from_pretrained`.
model_name = "allenai/unifiedqa-t5-small"

In [None]:
# NOTE(review): the save/load cells below pass the literal string
# "local_model_directory" rather than this variable, so this value is currently
# unused — confirm which of the two is intended.
local_model_directory="." # location where you want to save the model

In [None]:
# Download the generation model from the Hugging Face Hub and keep a local copy.
# The checkpoint id is taken from `model_name` (defined in the cell above) so it
# is declared in exactly one place.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# NOTE: the target is the directory literally named "local_model_directory";
# the "Load model from disk" cell reads from that same literal path.
model.save_pretrained("local_model_directory")

In [None]:
# Download the matching tokenizer and store it next to the model weights.
# The checkpoint id is taken from `model_name` (defined in an earlier cell) so
# model and tokenizer cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: the target is the directory literally named "local_model_directory";
# the "Load model from disk" cell reads from that same literal path.
tokenizer.save_pretrained("local_model_directory")

###### Load model from disk

In [7]:
# Load the locally saved model and tokenizer.
import torch

# Use the Apple-silicon GPU (MPS backend) when PyTorch reports it as available;
# otherwise stay on the CPU so this cell also runs on machines without MPS.
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained("local_model_directory").to(device)
tokenizer = AutoTokenizer.from_pretrained("local_model_directory")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


###### Generation

In [8]:
def generate(prompt):
    """Generate text for ``prompt`` with the globally loaded seq2seq model.

    Relies on the module-level ``tokenizer`` and ``model`` created by the
    "Load model from disk" cell.

    Args:
        prompt: Input text handed to ``tokenizer``.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``.
        Special tokens such as ``<pad>`` and ``</s>`` are kept because
        ``batch_decode`` is called with its default arguments.
    """
    # Place the inputs on whatever device the model lives on instead of
    # hard-coding "mps"; a device mismatch between inputs and weights would fail.
    token_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**token_ids)
    return tokenizer.batch_decode(generated_ids)

In [9]:
# Baseline: ask the model directly, without any supporting context in the prompt.
raw_prompt = "What did I eat on November 11th for dinner?"
generate(raw_prompt)



['<pad> dinner</s>']

#### What if we could augment our prompt with additional knowledge?

In [10]:
def get_augmented_promp(prompt, augmentation):
    """Build the prompt that places ``augmentation`` in front of the query.

    Args:
        prompt: The user question; inserted after ``Query: ``.
        augmentation: Context text; inserted between double quotes after
            ``Context information: ``.

    Returns:
        str: A prompt that starts with a newline and ends with ``"Answer: "``
        (no trailing newline).
    """
    segments = [
        "",  # leading empty segment -> the prompt starts with a newline
        f'Context information: "{augmentation}".',
        "Given the context information and not prior knowledge, answer the query.",
        f"Query: {prompt}",
        "Answer: ",
    ]
    return "\n".join(segments)

In [11]:
# Hand-written context that contains the answer to `raw_prompt`.
augmentation = "on the 11th november i ate a lovely cheesecake for dinner"
# Combine the original question with that context.
augmented_prompt = get_augmented_promp(raw_prompt, augmentation)
augmented_prompt

'\nContext information: "on the 11th november i ate a lovely cheesecake for dinner".\nGiven the context information and not prior knowledge, answer the query.\nQuery: What did I eat on November 11th for dinner?\nAnswer: '

In [12]:
# Same question as before, now with the context included in the prompt.
generate(augmented_prompt)

['<pad> cheesecake</s>']

### Retrieval-Augmented Generation: Step by Step

#### Data Ingestion

In [13]:
from sentence_transformers import SentenceTransformer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


###### Download Model

In [None]:
# Create the embedding model from the "BAAI/bge-small-en-v1.5" checkpoint id.
embd_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

In [None]:
# Save the embedding model to ./embd_model/ so the next cell can load it from disk.
embd_model.save('./embd_model/')

###### Load local model

In [14]:
# Load the embedding model from the local ./embd_model/ directory.
embd_model = SentenceTransformer('./embd_model/')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Toy "database": the raw text chunks that get embedded and searched below.
raw_data = [
    "on the 11th november i ate a lovely cheesecake for dinner and a carrotte as a breakfast",
    "the second name of my ants second chicken is miranda",
    "the eiffel tower is located in south tirol."
]

In [16]:
# Embed every chunk and keep each vector together with its source text.
# Each entry has the keys "embd" (the embedding) and "text" (the original chunk).
db = [
    {"embd": embd_model.encode(text), "text": text}
    for text in raw_data
]

db

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[{'embd': array([-1.30421510e-02,  4.59370427e-02,  5.51192909e-02, -2.30988543e-02,
         -3.15478933e-03,  2.48120166e-02,  6.08760454e-02,  5.93810342e-02,
          5.84299723e-03,  3.28274327e-03,  2.70620483e-04, -3.16813253e-02,
          2.53681242e-02,  5.53279556e-02,  2.66885404e-02, -1.70556773e-02,
          4.59962562e-02, -6.20284900e-02, -1.35608807e-01, -5.65198669e-03,
         -1.67642180e-02, -1.25390813e-02, -4.80024964e-02, -1.86989382e-02,
          2.49041207e-02,  1.18087910e-01,  1.72056872e-02, -1.40382517e-02,
         -6.39178678e-02, -1.02145687e-01,  2.56354809e-02,  9.60217975e-03,
          6.13879599e-02, -5.61945923e-02, -5.04849851e-02,  8.65331793e-04,
          1.31303845e-02,  5.91073111e-02, -4.39514890e-02,  4.12696786e-03,
          9.64904577e-02, -1.74260139e-02,  4.07980867e-02, -1.01896785e-02,
          2.15873439e-02, -1.91477723e-02, -3.92945670e-02,  3.70007604e-02,
          9.23887193e-02,  9.75113828e-03, -2.54794974e-02,  6.45726

#### Retrieval

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [25]:
# The user query to answer with retrieval-augmented generation.
prompt = "What is the second name of my ants second chicken?"

In [29]:
# Embed the query with the same model that embedded the stored chunks.
prompt_embd = embd_model.encode(prompt)

In [30]:
def calculate_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors as a single number.

    Args:
        vec1: First vector; any iterable accepted by ``list()``.
        vec2: Second vector; any iterable accepted by ``list()``.

    Returns:
        The ``[0][0]`` entry of the matrix returned by ``cosine_similarity``
        for the two inputs.
    """
    # cosine_similarity is given 2-D inputs here, so each vector is reshaped
    # into a single row first.
    row_a, row_b = (
        np.array(list(vector)).reshape(1, -1) for vector in (vec1, vec2)
    )
    score_matrix = cosine_similarity(row_a, row_b)
    # One row against one row -> only the first entry is of interest.
    return score_matrix[0][0]

In [31]:
# Score every stored chunk against the query embedding.
similarities = [calculate_similarity(node.get('embd'), prompt_embd) for node in db]
# np.argmax gives the position of the highest score; use it to pick the node.
most_similar_node = db[np.argmax(similarities)]
# The text of that node is what gets injected into the prompt as context.
augemntation_data = most_similar_node.get('text')
augemntation_data

'the second name of my ants second chicken is miranda'

#### Augmentation

In [32]:
def get_augmented_promp(prompt, augmentation):
    """Wrap a query and its supporting context into one prompt string.

    This has the same name and contract as the definition earlier in the
    notebook and replaces it when this cell is run.

    Args:
        prompt: The user question; placed after ``Query: ``.
        augmentation: Context text; placed in double quotes after
            ``Context information: ``.

    Returns:
        str: The assembled prompt, beginning with a newline and ending with
        ``"Answer: "`` (no trailing newline).
    """
    return (
        "\n"
        f'Context information: "{augmentation}".\n'
        "Given the context information and not prior knowledge, answer the query.\n"
        f"Query: {prompt}\n"
        "Answer: "
    )

In [33]:
# Build the final prompt from the query and the retrieved chunk text.
augmented_prompt = get_augmented_promp(prompt, augemntation_data)
augmented_prompt

'\nContext information: "the second name of my ants second chicken is miranda".\nGiven the context information and not prior knowledge, answer the query.\nQuery: What is the second name of my ants second chicken?\nAnswer: '

#### Generation

In [34]:
# Answer the query using the prompt that now carries the retrieved context.
generate(augmented_prompt)



['<pad> miranda</s>']