# Retrieval-Augmented Generation (RAG)

Install the Hugging Face libraries to run this notebook.

In [2]:
!pip install transformers wikipedia



In [3]:
import torch
import torch.nn.functional as F

import wikipedia
import json

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Document ingestion

In [5]:
def extract_wikipedia_pages(page_titles):
    """
    Extracts Wikipedia pages and stores them in a dictionary.

    Args:
        page_titles: A list of Wikipedia page titles to extract.

    Returns:
        A dictionary containing the text of each Wikipedia page.
    """

    page_data = {}
    for title in page_titles:
        try:
            page = wikipedia.page(title)
            content = page.content.strip()
            content = content.replace("\n", "")
            page_data[page.title] = content
        except wikipedia.exceptions.PageError:
            print(f"Page '{title}' not found.")
        except wikipedia.exceptions.DisambiguationError as e:
            print(f"Disambiguation error for '{title}': {e.options}")

    return page_data

In [6]:
page_titles = [
               "Roger Apéry",
               "Owen Willans Richardson",
               "Otto Sackur",
               "Ludvig Lorenz",
               "Klaus von Klitzing",
               "Henri Victor Regnault",
               "Erwin Madelung",
              ]

# Uncomment the next line to scroll through Wikipedia
# wikipedia_data = extract_wikipedia_pages(page_titles)

Save the dictionary using `json.dump()`:

In [7]:
# with open('wikipedia_data.json', 'w') as f:
#     json.dump(wikipedia_data, f, indent=4)

Load the dictionary using `json.load()`:

In [8]:
with open('wikipedia_data.json', 'r') as f:
    wikipedia_data = json.load(f)

In [9]:
for doc in wikipedia_data:
    print(len(wikipedia_data[doc]))

3107
3455
1683
1873
1762
3431
1487


## Document pre-processing

We load just the tokenizer:

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model_max_length = tokenizer.model_max_length
model_max_length

512

In [11]:
encoded_text = tokenizer.encode(["hello", "how are you?"])
tokenizer.decode(encoded_text)

'[CLS] hello [SEP] how are you? [SEP]'

In [12]:
def text_splitting(text, chunk_length = 300, chunk_overlap = 100):
    return [text[i:chunk_length+i] for i in range(0, len(text), chunk_length-chunk_overlap)]

text_splitting("".join([str(x) for x in range(20)]), 5, 2)

['01234',
 '34567',
 '67891',
 '91011',
 '11121',
 '21314',
 '14151',
 '51617',
 '17181',
 '819']

In [13]:
wikipedia_data_splits = {}

for doc in wikipedia_data.keys():
    wikipedia_data_splits[doc] = text_splitting(wikipedia_data[doc])
    #wikipedia_data_splits[doc] = text_splitting_paragraph(wikipedia_data[doc])

first_key = page_titles[0]
wikipedia_data_splits[first_key][:3]

["Roger Apéry (French: [apeʁi]; 14 November 1916, Rouen – 18 December 1994, Caen) was a French mathematician most remembered for Apéry's theorem, which states that ζ(3) is an irrational number. Here, ζ(s) denotes the Riemann zeta function.== Biography ==Apéry was born in Rouen in 1916 to a French moth",
 's) denotes the Riemann zeta function.== Biography ==Apéry was born in Rouen in 1916 to a French mother and Greek father. His childhood was spent in Lille until 1926, when the family moved to Paris, where he studied at the Lycée Ledru-Rollin and the Lycée Louis-le-Grand.  He was admitted  at the Écol',
 'ere he studied at the Lycée Ledru-Rollin and the Lycée Louis-le-Grand.  He was admitted  at the École normale supérieure in 1935.  His studies were interrupted at the start of World War II; he was mobilized in September 1939, taken prisoner of war in June 1940, repatriated with pleurisy in June 1941']

In [14]:
min_doc = min(len(wikipedia_data_splits[doc]) for doc in wikipedia_data_splits)
max_doc = max(len(wikipedia_data_splits[doc]) for doc in wikipedia_data_splits)
av_doc = sum(len(wikipedia_data_splits[doc]) for doc in wikipedia_data_splits) / len(wikipedia_data_splits)

min_doc,max_doc,av_doc

(8, 18, 12.571428571428571)

## Generating embeddings

Now we load the embedder:

In [15]:
from transformers import AutoModel

model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [16]:
inputs = tokenizer("Hello, world!", return_tensors="pt")
outputs = model(**inputs)

output_dim = outputs.last_hidden_state.size(2)
output_dim

384

The embedder needs to know whether the document is a document or a query.

In [17]:
def embed(chunk_list, doc_type="document"):
    encoded_docs = tokenizer(["search_{}: {}".format(doc_type, chunk) for chunk in chunk_list],
                                 padding = True,
                                 return_tensors="pt")
    output = model(**encoded_docs) # (batch, input_length, output_dim)
    token_embeddings = output.last_hidden_state
    output_embeddings = torch.sum(token_embeddings, 1)
    output_embeddings = F.normalize(output_embeddings, p=2, dim=1)
    return output_embeddings # (batch, output_dim)

In [18]:
embed(["hello", "another document", "and another one"]).shape

torch.Size([3, 384])

**Exercise 1**: chunks may lack context. The ideal of `contextual embeddings` is to ask an LLM to write some context about the chunk (given the full document and the chunk), and to embed the chunk together with the context.
Implement this idea here (choose a simple enough model and the appropriate task!).
> Pass

In [19]:
def populate_database(dic_splits, batch_size = 1):
    n_chunks = sum([len(dic_splits[doc]) for doc in dic_splits])
    vectorial_database = torch.zeros([n_chunks, output_dim], requires_grad = False).to(device)
    chunk_list = []
    n = 0
    for i, doc in enumerate(dic_splits):
        split_list = dic_splits[doc]
        print(doc, len(split_list))
        for x in range(0, len(split_list), batch_size):
            batch = split_list[x: x+batch_size]
            chunk_list.append(batch)
            vectorial_database[n:n+len(batch)] = embed(batch, "document")
            n += len(batch)
    return chunk_list, vectorial_database 

chunk_list, vectorial_database = populate_database(wikipedia_data_splits)

Roger Apéry 16
Owen Richardson 18
Otto Sackur 9
Ludvig Lorenz 10
Klaus von Klitzing 9
Henri Victor Regnault 18
Erwin Madelung 8


Save the vectorial database using `torch.save()`:

In [20]:
torch.save(vectorial_database, 'vectorial_database.pth')

with open('chunk_list.json', 'w') as f:
    json.dump(chunk_list, f, indent=4)

Load the database using `torch.load()`:

In [21]:
vectorial_database = torch.load('vectorial_database.pth')
vectorial_database.to(device)
vectorial_database.requires_grad_(False)

with open('chunk_list.json', 'r') as f:
    chunk_list = json.load(f)

In [22]:
len(chunk_list), vectorial_database.shape

(88, torch.Size([88, 384]))

In [23]:
for i, embedding_vector in enumerate(vectorial_database[:20]):
    print(embedding_vector[:5], chunk_list[i][:50])

tensor([-0.0406,  0.0437, -0.0288, -0.0089, -0.0150], device='cuda:0') ["Roger Apéry (French: [apeʁi]; 14 November 1916, Rouen – 18 December 1994, Caen) was a French mathematician most remembered for Apéry's theorem, which states that ζ(3) is an irrational number. Here, ζ(s) denotes the Riemann zeta function.== Biography ==Apéry was born in Rouen in 1916 to a French moth"]
tensor([-0.0140,  0.0815, -0.0226, -0.0117,  0.0452], device='cuda:0') ['s) denotes the Riemann zeta function.== Biography ==Apéry was born in Rouen in 1916 to a French mother and Greek father. His childhood was spent in Lille until 1926, when the family moved to Paris, where he studied at the Lycée Ledru-Rollin and the Lycée Louis-le-Grand.  He was admitted  at the Écol']
tensor([ 0.0196,  0.0287, -0.0544,  0.0098, -0.0017], device='cuda:0') ['ere he studied at the Lycée Ledru-Rollin and the Lycée Louis-le-Grand.  He was admitted  at the École normale supérieure in 1935.  His studies were interrupted at the start of

## Retrieval

In [24]:
def similarity(query_embeddings, doc_embeddings):
    return query_embeddings @ doc_embeddings.T

In [25]:
query_embeddings = embed([
    "What is TSNE?",
    "Who is Laurens van der Maaten?",
], "query")

doc_embeddings = embed([
    "TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten",
], "document")

with torch.no_grad():
    print(similarity(query_embeddings, doc_embeddings))

tensor([[0.6542],
        [0.4483]])


In [30]:
def retrieve(query, 
             vectorial_database = vectorial_database, 
             chunk_list = chunk_list, 
             topk = 5,
             verbose = False):
    query_embedding = embed([query], "query")
    similarity_scores = similarity(query_embeddings.to(device), vectorial_database.to(device))
    return torch.topk(similarity_scores, 5)

In [31]:
%%timeit
retrieve("What did Erwin Madelung study?")

The slowest run took 243.89 times longer than the fastest. This could mean that an intermediate result is being cached.
559 ms ± 576 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


**Exercise 2**: The similarity measure is based on embeddings. A completely different approach is `lexical matching`, meaning by matching keywords from the query to the documents. It is based on `TF-IDF (Term Frequency-Inverse Document Frequency)`, as follows:
* Compute TF-IDF for each chunk
* BM25 returns the 25 most relevant chunks based on their TF-IDF match to the query

Implement this approach.

> PASS

**Exercise 3**:
A `reranker` is (yet another) LLM which looks at the query and some chunks and ranks them by relevance. 
Implement this approach.

> PASS

For information: Claude combines BM25 with similarity measures as follows:
* Use BM25 to retrieve 25 chunks
* independently, use similarity measure on embeddings to retrieve 25 chunks
* Use a reranker to combine and deduplicate the obtained 50 chunks

### Alternative retrieval: SVM

Return all the closest globally

In [40]:
import numpy as np
from sklearn import svm

def retrieve_SVM(query, 
             vectorial_database = vectorial_database, 
             chunk_list = chunk_list, 
             topk = 5):
    query_embedding = embed([query], "query").to(device)
    x = np.concatenate([query_embedding.detach().cpu().numpy(), vectorial_database.detach().cpu().numpy()])
    y = np.zeros(vectorial_database.size(0) + 1)
    y[0] = 1 # we have a single positive example

    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1, dual="auto")
    clf.fit(x, y)
    similarities = clf.decision_function(x)
    sorted_ix = np.argsort(-similarities)
    for k in sorted_ix[1:topk+1]:
        print(f"Score: {similarities[k]:.4f}\nText:\n", chunk_list[k-1], "\n")
    return "\n".join([chunk_list[k-1] for k in sorted_ix[1:topk+1]])

In [41]:
retrieve_SVM("What did Erwin Madelung study?")

Score: 0.0703
Text:
 ['erlag, Berlin 1922. subsequent editions: 1925, 1936, 1950, 1953, 1957, 1964.== References ==== External links ==Works by or about Erwin Madelung at the Internet ArchiveLiterature by and about Erwin Madelung in the German National Library cataloguePortrait drawing at Frankfurt University'] 

Score: -0.2228
Text:
 ['Erwin Madelung (18 May 1881 – 1 August 1972) was a German physicist.He was born in 1881 in Bonn. His father was the surgeon Otto Wilhelm Madelung. He earned a doctorate in 1905 from the University of Göttingen, specializing in crystal structure, and eventually became a professor. It was during this '] 

Score: -0.2334
Text:
 ['erences ==== External links ==Works by or about Otto Sackur at the Internet Archive'] 

Score: -0.3141
Text:
 ['                          +                              1            64                          +        ⋯        ≠                              p            q                                {\\displaystyle 1+{\\frac {

TypeError: sequence item 0: expected str instance, list found

## Full pipeline

This model does **extractive** question answering, meaning it can only points to the answer in the provided context.

In [34]:
from transformers import AutoModelForQuestionAnswering, pipeline

model_name = "deepset/tinyroberta-squad2"

QA = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)

config.json:   0%|          | 0.00/835 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/326M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda


In [35]:
def query(prompt):
    topk_chunks = retrieve(prompt)
#     topk_chunks = retrieve_SVM(prompt)
    return QA(question=prompt, context=topk_chunks)

In [36]:
query("What did Erwin Madelung study?")

ValueError: Arguments can't be understood