# RAG Pipeline

This notebook is our pipeline to transform the various websites around CS to text and put them in the vector store.

In [1]:
import chromadb
from chromadb.utils.embedding_functions import Bm25EmbeddingFunction, DefaultEmbeddingFunction, HuggingFaceEmbeddingFunction
from chromadb import Documents, EmbeddingFunction, Embeddings
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os 

load_dotenv()

True

In [2]:
URL = "https://api.infomaniak.com/1/ai/models"
headers = {
  'Authorization': f'Bearer {os.getenv("API_TOKEN")}',
  'Content-Type': 'application/json',
}
req = requests.request("GET", url = URL , headers = headers)
res = req.json()

info_df = pd.DataFrame(res["data"])
info_df[info_df["type"] == "embedding"]

Unnamed: 0,id,name,type,documentation_link,description,info_status,logo_url,last_updated_at,max_token_input,version,meta
6,12,bge_multilingual_gemma2,embedding,https://developer.infomaniak.com/docs/api/post...,Bge Multilingual Gemma2,coming_soon,https://storage4.infomaniak.com/ai-tools/publi...,2024-12-04,8000.0,1.0,{'is_beta': False}
7,13,mini_lm_l12_v2,embedding,https://developer.infomaniak.com/docs/api/post...,All MiniLM L12 v2,coming_soon,https://storage4.infomaniak.com/ai-tools/publi...,2024-12-04,128.0,2.0,{'is_beta': False}


## 1. Data Processing: Conver HTML to Text

using beautifulSoup and simple get request, we collect the data from websites.

In [3]:
# needed to pretend I am a normal user ;)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

def url_to_string(url: str) -> str:
    if not url.startswith("https://"):
        url = "https://" + url
    
    res = requests.get(url, headers=HEADERS)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")
    site_text = " ".join([text for text in soup.stripped_strings])
    return site_text


demo_url = "www.orientation.ch/dyn/show/1900?id=152"
site_text = url_to_string(demo_url)

print("Number of chars:", len(site_text))
print("Number of words:", len(site_text.split()))

Number of chars: 30161
Number of words: 3883


In [4]:
df = pd.read_csv("../links.csv")

url_list = df["link"].to_list()
url_text_list = [url_to_string(url) for url in url_list]

In [5]:
df["url_text"] = url_text_list
df.head()

Unnamed: 0,title,link,url_text
0,informaticen cfc,www.orientation.ch/dyn/show/1900?id=152,Informaticien CFC / Informaticienne CFC - orie...
1,informaticen es,www.orientation.ch/dyn/show/1900?id=885,Informaticien ES / Informaticienne ES - orient...
2,informatique université de fribourg,www.unifr.ch/inf/fr/informatique,Informatique | Département d'informatique | U...
3,informatique de gestion université de fribourg,www.unifr.ch/inf/fr/informatique-de-gestion,Informatique de gestion | Département d'inform...
4,informatique université de genève,www.unige.ch/dinfo/formations/bachelor,Bachelor en sciences informatiques Présentatio...


## 2. RAG Benchmark

First two models are on infomaniak.
The rest we will need to train ourselves by getting the model with a GPU.

|evaluate? | Model name | Link |
|:--------:|------------|------|
| |MiniLM L12 v2 | https://developer.infomaniak.com/docs/api/post/1/ai/%7Bproduct_id%7D/openai/v1/embeddings |
| X |BGE Multilingual Gemma 2 | https://developer.infomaniak.com/docs/api/post/1/ai/%7Bproduct_id%7D/openai/v1/embeddings |
| X | paraphrase-multilingual-MiniLM-L12-v2 | https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 |
| X | paraphrase-multilingual-mpnet-base-v2 | https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2 |
| X | distiluse-base-multilingual-cased-v2 | https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2 |
|  | Alibaba-NLP/gte-Qwen2-1.5B-instruct | https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct |
| X | BM25 | chromadb through `fastembed` | 

In [6]:
DB_PATH = "../data/"

client = chromadb.PersistentClient(path=DB_PATH)
# ensure the DB is available
print("hearbeat:", client.heartbeat())

ids: list[str] = df["title"].to_list() #df.index.astype("str").to_list()
documents: list[str] = df["url_text"].to_list()

hearbeat: 1760202354916504953


## Infomaniak Model: MultiLingual Gemma 2 and MiniLM-L12-v2

The latter is english only, we should not evaluate it.

In [7]:
PRODUCT_ID = os.getenv("PRODUCT_ID")
API_TOKEN = os.getenv("API_TOKEN")

URL = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai/v1/embeddings"

headers = {
  'Authorization': f"Bearer {API_TOKEN}",
  'Content-Type': 'application/json',
}

payload = {
    "input": documents,
    "model": "bge_multilingual_gemma2",
    "mode": "index",
}

req = requests.post(url=URL , json=payload, headers=headers)
res = req.json()
print(req.status_code)

assert len(documents) == len(res["data"])

200


In [8]:
PRODUCT_ID = os.getenv("PRODUCT_ID")
API_TOKEN = os.getenv("API_TOKEN")


class MultinligualGemma2(EmbeddingFunction):
    def __init__(self) -> None:
        self.model_name = "bge_multilingual_gemma2"
        self.url = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai/v1/embeddings"
        self.headers = {
          'Authorization': f"Bearer {API_TOKEN}",
          'Content-Type': 'application/json',
        }
        
    def __call__(self, input_data: Documents) -> Embeddings:
        payload = {
            "input": input_data,
            "model": self.model_name,
        }

        req = requests.post(url=self.url, json=payload, headers=self.headers)
        res = req.json()
        data = res["data"]
        embeddings = [np.array(x["embedding"]) for x in data]
        
        return embeddings


# Do not evaluate this one
class MiniLML12V2(EmbeddingFunction):
    def __init__(self) -> None:
        self.model_name = "mini_lm_l12_v2"
        self.url = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai/v1/embeddings"
        self.headers = {
          'Authorization': f"Bearer {API_TOKEN}",
          'Content-Type': 'application/json',
        }
        
    def __call__(self, input_data: Documents) -> Embeddings:
        payload = {
            "input": input_data,
            "model": self.model_name,
        }

        req = requests.post(url=self.url, json=payload, headers=self.headers)
        res = req.json()
        data = res["data"]
        embeddings = [np.array(x["embedding"]) for x in data]
        
        return embeddings

In [9]:
multilingual_gemma2_collection = client.get_or_create_collection(
    name="multilingual-gemma2",
    embedding_function=MultinligualGemma2()
)

print(multilingual_gemma2_collection)
print(multilingual_gemma2_collection._embedding_function)

Collection(name=multilingual-gemma2)
<__main__.MultinligualGemma2 object at 0x7c3b5c567b60>


In [10]:
multilingual_gemma2_collection.add(
    ids=ids,
    documents=documents,
)

In [11]:
multilingual_gemma2_collection.count()

8

In [12]:
results = multilingual_gemma2_collection.query(
    query_texts=["J'aimerais faire un apprentissage"],
    n_results=2,
)   

results["ids"]

## BM25 

In [None]:
bm_25_collection = client.get_or_create_collection(
    name="bm25",
    embedding_function=Bm25EmbeddingFunction()
)

print(bm_25_collection)
print(bm_25_collection._embedding_function)

In [None]:
ids: list[str] = df["title"].to_list() #df.index.astype("str").to_list()
documents: list[str] = df["url_text"].to_list()

# NOT WOKRING: WHY?
bm_25_collection.add(
    ids=ids,
    documents=documents,
)

## Sentence Transformer

For all the HugginFace model.

In [14]:
# model we are interested in 
mini_lm_l12 = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # 118M
mpnet_base = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" # 278M
distule_base = "sentence-transformers/distiluse-base-multilingual-cased-v2" # 135M

In [47]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(mini_lm_l12)
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [48]:
%%time

sentences = ["text1", "This is a long sentence to demo how fast the model is and see if we can run it on CPU or maybe we need GPU?"]
embeddings = model.encode(sentences)

CPU times: user 95.2 ms, sys: 14.7 ms, total: 110 ms
Wall time: 68.4 ms


In [54]:
[x.shape for x in list(embeddings)]

[(384,), (384,)]

In [66]:
class SentenceTransformerFunction(EmbeddingFunction):
    def __init__(self, model_name: str) -> None:
        self.model_name = model_name
        self.model = SentenceTransformer(self.model_name)
        
    def __call__(self, input_data: Documents) -> Embeddings:
        print(len(input_data))
        embeddings = self.model.encode(input_data)
        print(len(embeddings))
        return embeddings

In [57]:
mini_lm_l12_collection = client.get_or_create_collection(
    name="mini_lm_l12",
    embedding_function=SentenceTransformerFunction(mini_lm_l12),
)

print(mini_lm_l12_collection)
print(mini_lm_l12_collection._embedding_function)

Collection(name=mini_lm_l12)
<__main__.SentenceTransformerFunction object at 0x7c39d95b7770>


In [67]:
model_func = SentenceTransformerFunction(mini_lm_l12)

In [68]:
x1 = model_func(sentences)

2
2


In [69]:
out = model_func(documents)

8
8


In [44]:
[d[0:10] for d in documents]

['Informatic',
 'Informatic',
 'Informatiq',
 'Informatiq',
 'Bachelor e',
 "École d'in",
 'Accueil - ',
 'IDEC Ecole']

In [32]:
mini_lm_l12_collection.add(
    ids=ids,
    documents=documents,
)

8
1


InvalidArgumentError: Inconsistent number of IDs, embeddings, documents, URIs and metadatas