# RAG

A notebook to demo how we can use Retrieval Augmented Generation (RAG) with an LLM.

In [34]:
import requests
import pandas as pd
import os 
from dotenv import load_dotenv

# Load settings (e.g. API_TOKEN) from a local .env file into the environment.
load_dotenv() 

True

In [None]:
import requests
import pandas as pd
import os 
from dotenv import load_dotenv

# Load API_TOKEN from a local .env file into the environment.
load_dotenv()

# Infomaniak endpoint that lists the available AI models.
URL = "https://api.infomaniak.com/1/ai/models"
headers = {
  'Authorization': f'Bearer {os.getenv("API_TOKEN")}',
  'Content-Type': 'application/json',
}

# Fail fast on a stalled connection or an HTTP error (e.g. 401 for a missing
# or invalid token) instead of hitting an opaque KeyError on res["data"] below.
req = requests.get(URL, headers=headers, timeout=30)
req.raise_for_status()
res = req.json()

# Show only the models whose type is "embedding".
df = pd.DataFrame(res["data"])
df[df["type"] == "embedding"]

## Chroma Demo

A notebook to demo how we can use Retrieval Augmented Generation (RAG) with an LLM.

We will use ChromaDB as the vector store for this.

In [None]:
from dotenv import load_dotenv
import chromadb
import os

# Load settings from a local .env file into the environment.
load_dotenv() 

# Chroma client created with default settings, used for the quick demo below.
chroma_client = chromadb.Client()

In [None]:
# Reuse the "main" collection if it already exists, otherwise create it,
# so this cell can be re-run safely.
collection = chroma_client.get_or_create_collection(name="main")
collection

In [None]:
# Show every collection currently known to the client.
chroma_client.list_collections()

In [None]:
# Quick demo of how a collection works: add a few toy documents.
# No embeddings are passed, so Chroma computes them itself
# (by default this uses a MiniLM L6-v2 model).
collection.add(
    ids=["id1", "id2", "id3", "id4"],
    documents=[
        "document sur un cfc / apprentissage",
        "document sur l'epfl",
        "document sur 42 Lausanne",
        # Typo fixed ("fribougr" -> "fribourg") so the indexed text is a real place name.
        "document sur université de fribourg",
    ]
)

print("Number of items in the collection:", collection.count())

In [None]:
# Retrieve the single closest document for an apprenticeship-related question.
out = collection.query(
    query_texts=["J'aimerais faire un apprentissage"],
    n_results=1,
)

# Display the full query result.
out

In [None]:
# Retrieve the two closest documents for a university-related question.
out = collection.query(
    query_texts=["J'aimerais aller au collège pour ensuite faire l'université"],
    n_results=2,
)

# The "documents" key holds the text of the matched documents.
out["documents"]

In [None]:
# Clean up: remove the demo collection.
chroma_client.delete_collection(name="main")

## Data Processing: HTML to Text

Using BeautifulSoup and a simple GET request, we collect the text content of each website.

In [1]:
import requests
from bs4 import BeautifulSoup

# Example page to scrape; stored without a scheme, which is added when requesting.
url = "www.orientation.ch/dyn/show/1900?id=152"

# Plain GET without custom headers.
# NOTE(review): the response status is not checked here — confirm whether this
# request is still needed now that url_to_string performs its own request.
res = requests.get("https://" + url)

In [2]:
# Browser-like User-Agent sent with the scraping requests so that they look
# like they come from a regular visitor.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}


def url_to_string(url: str, timeout: float = 30.0) -> str:
    """Download a web page and return its text content as a single string.

    Args:
        url: Page address, with or without a scheme. ``https://`` is assumed
            when the address starts with neither ``http://`` nor ``https://``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text fragments of the page, joined by single spaces.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Only add a scheme when none is present; blindly prefixing would turn
    # "http://host" into the invalid "https://http://host".
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # HEADERS carries a browser-like User-Agent; the timeout prevents one
    # unresponsive site from blocking the whole scraping run.
    res = requests.get(url, headers=HEADERS, timeout=timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")
    return " ".join(soup.stripped_strings)

# Scrape the example page and report how much text was extracted.
site_text = url_to_string(url)

char_count = len(site_text)
word_count = len(site_text.split())
print(f"Number of chars: {char_count}")
print(f"Number of words: {word_count}")

Number of chars: 30161
Number of words: 3883


In [3]:
import pandas as pd

# Load the list of pages to scrape; the columns used later are "title" and "link".
# NOTE(review): the output shows an "Unnamed: 0" column, which suggests the CSV
# was saved together with its index — consider index_col=0; confirm against the file.
df = pd.read_csv("../links.csv")
df.head()

Unnamed: 0,title,link
0,informaticen cfc,www.orientation.ch/dyn/show/1900?id=152
1,informaticen es,www.orientation.ch/dyn/show/1900?id=885
2,informatique université de fribourg,www.unifr.ch/inf/fr/informatique
3,informatique de gestion université de fribourg,www.unifr.ch/inf/fr/informatique-de-gestion
4,informatique université de genève,www.unige.ch/dinfo/formations/bachelor


In [4]:
# Scrape every page listed in the "link" column.
# One HTTP request is made per row, so this cell can take a while.
url_list = df["link"].to_list()
url_text_list = list(map(url_to_string, url_list))

In [5]:
# Store the scraped text next to its source link (adds a column to df in place).
df["url_text"] = url_text_list
df.head()

Unnamed: 0,title,link,url_text
0,informaticen cfc,www.orientation.ch/dyn/show/1900?id=152,Informaticien CFC / Informaticienne CFC - orie...
1,informaticen es,www.orientation.ch/dyn/show/1900?id=885,Informaticien ES / Informaticienne ES - orient...
2,informatique université de fribourg,www.unifr.ch/inf/fr/informatique,Informatique | Département d'informatique | U...
3,informatique de gestion université de fribourg,www.unifr.ch/inf/fr/informatique-de-gestion,Informatique de gestion | Département d'inform...
4,informatique université de genève,www.unige.ch/dinfo/formations/bachelor,Bachelor en sciences informatiques Présentatio...


## RAG Benchmark

The first two models are available through the Infomaniak API.
The remaining ones we will need to run ourselves, by downloading the model and hosting it on a GPU.

| Model name | Link |
|------------|------|
|MiniLM L12 v2 | https://developer.infomaniak.com/docs/api/get/1/ai/models |
|BGE Multilingual Gemma 2 | https://developer.infomaniak.com/docs/api/get/1/ai/models |
| paraphrase-multilingual-MiniLM-L12-v2 | https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 |
| paraphrase-multilingual-mpnet-base-v2 | https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2 |
| distiluse-base-multilingual-cased-v2 | https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2 |
| Alibaba-NLP/gte-Qwen2-1.5B-instruct | https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct |
| BM25 | chromadb through `fastembed` | 

In [15]:
import chromadb
from chromadb.utils.embedding_functions import Bm25EmbeddingFunction, DefaultEmbeddingFunction

# Directory where the persistent Chroma database is stored.
DB_PATH = "../data/"

# Persistent client used for the benchmark collections below.
client = chromadb.PersistentClient(path=DB_PATH)
# ensure the DB is available
client.heartbeat()

1760196401520358617

## MultiLingual Gemma 2

In [None]:
# Embed all scraped pages with a single request to the Infomaniak embeddings endpoint.
# Alternative ids: df.index.astype("str").to_list()
ids: list[str] = df["title"].to_list()
documents: list[str] = df["url_text"].to_list()


PRODUCT_ID = os.getenv("PRODUCT_ID")
API_TOKEN = os.getenv("API_TOKEN")

URL = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai/v1/embeddings"

headers = {
  'Authorization': f"Bearer {API_TOKEN}",
  'Content-Type': 'application/json',
}

payload = {
    "input": documents,
    "model": "bge_multilingual_gemma2",
    "mode": "index",
}

# The timeout keeps a stalled request from hanging the notebook.
req = requests.post(url=URL, json=payload, headers=headers, timeout=120)
print(req.status_code)
# Surface HTTP errors before trying to read the JSON body.
req.raise_for_status()
res = req.json()

# One embedding must come back per submitted document.
# (The original asserted against an undefined name `x` instead of `res`.)
assert len(documents) == len(res["data"])

In [150]:
import os

import numpy as np
import requests
from chromadb import Documents, EmbeddingFunction, Embeddings

PRODUCT_ID = os.getenv("PRODUCT_ID")
API_TOKEN = os.getenv("API_TOKEN")


class MultinligualGemma2(EmbeddingFunction):
    """Chroma embedding function backed by Infomaniak's ``bge_multilingual_gemma2`` model.

    Each call sends the documents to the Infomaniak embeddings endpoint and
    returns one vector per document. The endpoint URL and the bearer token are
    built from the ``PRODUCT_ID`` and ``API_TOKEN`` values read above.
    """

    def __init__(self) -> None:
        """Prepare the endpoint URL and the authentication headers."""
        self.url = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai/v1/embeddings"
        self.headers = {
          'Authorization': f"Bearer {API_TOKEN}",
          'Content-Type': 'application/json',
        }

    def __call__(self, input_data: Documents) -> Embeddings:
        """Embed ``input_data`` and return one vector per document.

        Args:
            input_data: The texts to embed.

        Returns:
            A list of numpy arrays, in the order returned by the API.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            ValueError: If the API returns a different number of embeddings
                than documents were sent.
        """
        payload = {
            "input": input_data,
            "model": "bge_multilingual_gemma2",
        }

        # The timeout keeps a stalled request from hanging the notebook.
        req = requests.post(url=self.url, json=payload, headers=self.headers, timeout=120)
        # Surface HTTP errors (bad token, unknown product id, ...) instead of
        # failing later with a KeyError on the missing "data" field.
        req.raise_for_status()
        data = req.json()["data"]

        if len(data) != len(input_data):
            raise ValueError(
                f"Expected {len(input_data)} embeddings, got {len(data)}"
            )

        return [np.array(item["embedding"]) for item in data]

In [153]:
# Create (or reopen) the collection whose documents are embedded through
# the MultinligualGemma2 embedding function.
multilingual_gemma2_collection = client.get_or_create_collection(
    name="multilingual-gemma2",
    embedding_function=MultinligualGemma2()
)

# Sanity check: show the collection and the embedding function attached to it.
# NOTE(review): _embedding_function is a private attribute — may change between versions.
print(multilingual_gemma2_collection)
print(multilingual_gemma2_collection._embedding_function)

Collection(name=multilingual-gemma2)
<__main__.MultinligualGemma2 object at 0x72d9b2f2b250>


In [154]:
# Use the page titles as record ids and the scraped text as documents.
# Alternative ids: df.index.astype("str").to_list()
ids: list[str] = df["title"].to_list() #df.index.astype("str").to_list()
documents: list[str] = df["url_text"].to_list()

# No embeddings are passed, so the collection's embedding function computes them.
multilingual_gemma2_collection.add(
    ids=ids,
    documents=documents,
)

In [155]:
# Number of records stored in the collection.
multilingual_gemma2_collection.count()

8

In [156]:
# Retrieve the two closest pages for an apprenticeship-related question.
results = multilingual_gemma2_collection.query(
    query_texts=["J'aimerais faire un apprentissage"],
    n_results=2,
)   

In [158]:
# Ids (page titles) of the matches, one inner list per query text.
results["ids"]

[['42 lausanne', 'informaticen cfc']]

## BM25 

In [17]:
# NOTE(review): despite its name, this collection is created with
# DefaultEmbeddingFunction rather than a BM25 embedding — confirm whether this
# is intended or a temporary stand-in.
bm_25_collection = client.get_or_create_collection(
    name="bm25",
    embedding_function=DefaultEmbeddingFunction()
)

# Sanity check: show the collection and the embedding function attached to it.
print(bm_25_collection)
print(bm_25_collection._embedding_function)

Collection(name=bm25)
<chromadb.utils.embedding_functions.DefaultEmbeddingFunction object at 0x72d9abe5f3f0>


In [18]:
# Use the page titles as record ids and the scraped text as documents.
ids: list[str] = df["title"].to_list() #df.index.astype("str").to_list()
documents: list[str] = df["url_text"].to_list()

# FIXME: this call is not working; the cause has not been identified yet.
bm_25_collection.add(
    ids=ids,
    documents=documents,
)