<h1> Data preparation



This journal contains the pipeline to download data from the GitLab handbook and store it into a LanceDB Database with embeddings to be used for RAG


In [32]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from typing import List
from pydantic_ai import Agent
from pydantic import BaseModel, Field
from pydantic_ai.models.gemini import GeminiModel
import os, datetime, requests, time

from IPython.display import display, HTML
import ipywidgets as widgets

from openai import OpenAI
from duckduckgo_search import DDGS
from __future__ import annotations as _annotations

import os

import re
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter


import pandas as pd

Download gitlab handbook contents from gitlab and store it in Data folder
This code currently stops halfway while retrieving the data, so in the mean time we will continue using a predownloaded dataset

In [None]:
from urllib.parse import quote

def download_files_in_folder(project_id, file, save_path):
    """
    Download current markdown file from gitlab project into local folder.

    args:
    project
    """
    file_path_encoded = quote(file["path"], safe='')
    file_url = f'https://gitlab.com/api/v4/projects/{project_id}/repository/files/{file_path_encoded}/raw'
    file_params = {'ref': 'main'}
    file_response = requests.get(file_url, params=file_params)
    file_response.raise_for_status()

    print(file_url)
    local_file_path = os.path.join(save_path, file['path'])
    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

    with open(local_file_path, 'wb') as f:
        f.write(file_response.content)
    print(f'Downloaded: {file["path"]} to {local_file_path}')

def download_gitlab_data(project_id, gitlab_path, save_path):
    api_url = f'https://gitlab.com/api/v4/projects/{project_id}/repository/tree'
    params = {
        'path': gitlab_path,
        'recursive': True
    }
    
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        files = response.json()

        for file in files:
            if file['type'] == 'blob' and file['name'].endswith('.md'):
                download_files_in_folder(project_id, file, save_path)
            else:
                download_gitlab_data(project_id, f'{file["path"]}', save_path)
    except Exception as e:
        print(f"Error occured at {gitlab_path}: {e}")

PROJECT_ID = "gitlab-com%2Fcontent-sites%2Fhandbook"
FOLDER_PATH = "content"
SAVE_PATH = "../data/gitlab_handbook"

download_gitlab_data(PROJECT_ID, FOLDER_PATH, SAVE_PATH)

https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fall-content%2F_index.md/raw
Downloaded: content/all-content/_index.md to ../data/gitlab_handbook2\content/all-content/_index.md
https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fhandbook%2Fabout%2Fediting-handbook%2F_index.md/raw
Downloaded: content/handbook/about/editing-handbook/_index.md to ../data/gitlab_handbook2\content/handbook/about/editing-handbook/_index.md
https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fhandbook%2Fabout%2Fediting-handbook%2Fedit-team-page.md/raw
Downloaded: content/handbook/about/editing-handbook/edit-team-page.md to ../data/gitlab_handbook2\content/handbook/about/editing-handbook/edit-team-page.md
https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fhandbook%2Fabout%2Fediting-handbook%2Fpractical-handbook-edits.m

Lets focus on 1 MD file first to test. 

In [2]:
FILE_PATH = "../data/handbook-main-content/content/handbook/communication/_index.md"
EMBEDDING_MODEL = "text-embedding-3-large"

def split_markdown_text(file_path: str, embedding_model: str, min_tokens: int = 200, max_tokens: int = 1000, overlap: int = 30):
    md_regex = r"(^#+\s*.*)" #regex which captures all levels of headers in markdown.
    tokenizer = tiktoken.encoding_for_model(embedding_model)
    text = ""
    #read text
    try:
        file = open(file_path, 'r', encoding="utf-8")
        text = file.read()
    except Exception as e:
        print(f"error while reading file: {e}")

    #split text by headers
    sections = re.split(md_regex, text, flags=re.MULTILINE)
    chunks = []
    temp_chunk = ""
    

    for i in range(1, len(sections), 2): # loop through headers and text in sections
        
        header = sections[i].strip()
        content = sections[i+1].strip() if i + 1 <= len(sections) else ""
        chunk = header + '\n' + content
        token_count = len(tokenizer.encode(chunk))

        # add chunk to chunk list or to temporary chunk to combine with other chunks
        if token_count < min_tokens:
            temp_chunk += chunk + "\n"          
        else:
            if temp_chunk:
                chunks.append(temp_chunk)
                temp_chunk = ""
            chunks.append(chunk)

    # add remaining temp chunk if it exists
    if temp_chunk:
        chunks.append(temp_chunk)

    splitter = RecursiveCharacterTextSplitter(chunk_size = max_tokens, chunk_overlap = overlap)
    split_chunks = []
    for chunk in chunks:
        split_chunks.extend(splitter.split_text(chunk))
    
    return chunks

chunks = split_markdown_text(FILE_PATH, EMBEDDING_MODEL)
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
print(len(chunks))
print(len(tokenizer.encode(chunks[2])))


72
209


split all markdown files in the handbook folder up into chunks.
Chunks are split up by headings and large sections of text will be split up further into chunks of 1000 tokens


In [None]:


def split_markdown_text(text: str, embedding_model: str, min_tokens: int = 200, max_tokens: int = 1000, overlap: int = 30):
    md_regex = r"(^#+\s*.*)" #regex which captures all levels of headers in markdown.
    tokenizer = tiktoken.encoding_for_model(embedding_model)

    #split text by headers
    sections = re.split(md_regex, text, flags=re.MULTILINE)
    chunks = []
    temp_chunk = ""
    
    #capture first text which often does not start with a header
    
    if len(tokenizer.encode(sections[0])) < min_tokens:
        temp_chunk = sections[0]
    else:
        chunks.append(sections[0])
    

    for i in range(1, len(sections), 2): # loop through headers and text in sections
        header = sections[i].strip()
        content = sections[i+1].strip() if i + 1 <= len(sections) else ""
        chunk = header + '\n' + content
        token_count = len(tokenizer.encode(chunk))

        # add chunk to chunk list or to temporary chunk to combine with other chunks
        if token_count < min_tokens:
            temp_chunk += chunk + "\n"          
        else:
            if temp_chunk:
                chunks.append(temp_chunk)
                temp_chunk = ""
            chunks.append(chunk)

    # add remaining temp chunk if it exists
    if temp_chunk:
        chunks.append(temp_chunk)

    splitter = RecursiveCharacterTextSplitter(chunk_size = max_tokens, chunk_overlap = overlap)
    split_chunks = []
    for chunk in chunks:
        split_chunks.extend(splitter.split_text(chunk))
    
    return split_chunks

def extract_text_from_data(folder_path):
    file_chunks = []
    id = 0
    # walk through all folders and subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.md'):                  #only extract text from markdown files
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding="utf-8") as f:
                    content = f.read()
                    chunks = split_markdown_text(content, EMBEDDING_MODEL, min_tokens=200)
                    for chunk in chunks:
                        file_chunks.append((id, file_path, chunk))
                        id += 1
                    
    return file_chunks

DATA_PATH = "../data/handbook-main-content"
EMBEDDING_MODEL = "text-embedding-3-large"



file_chunks = extract_text_from_data(DATA_PATH)
# def create_chunks_df(file_chunks: List[(str, List[str])]):
#     id = 0
print(len(file_chunks))


67102


In [53]:
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_count = 0
for id, file, chunk in file_chunks:
    token_count = len(tokenizer.encode(chunk))
    total_count += token_count
print(f"total token count = {total_count}")
print(f"API costs at ~0.1 dollar per million tokens equals {total_count/1000000 * 0.1} dollar")
    

total token count = 9518462
API costs at ~0.1 dollar per million tokens equals 0.9518462 dollar


Remove small chunks

In [54]:
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_count = 0
trimmed_file_chunks = []
for id, file, chunk in file_chunks:
    
    count = len(tokenizer.encode(chunk))
    if count > 100:
        trimmed_file_chunks.append((id, file, chunk))

print(len(trimmed_file_chunks))

47092


67102


In [55]:

client = OpenAI()

def get_embeddings(chunks, model=EMBEDDING_MODEL):
    try:
        
        response = client.embeddings.create(input = chunks, model=model)
        print("batch done")
        return [item.embedding for item in response.data]
    except Exception as e:
        print(f"Embedding failed with error: {e}")
    return [None] * len(chunks)  


In [57]:
db =lancedb.connect("../data/lancedb")
df = pd.DataFrame(data=trimmed_file_chunks, columns=['chunk_id','file', 'chunk'])

batch_size = 500
embeddings = []

for i in range(0, len(df), batch_size):
    batch = df["chunk"].iloc[i:i + batch_size].tolist()
    batch_embeddings = get_embeddings(batch, EMBEDDING_MODEL)
    embeddings.extend(batch_embeddings)
    time.sleep(1)   #avoid OpenAI rate limits

df.loc[:, "embedding"] = embeddings
df.head()
df.to_csv("../data/embedded_dataframe.csv", index=False)


batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done
batch done

In [52]:
trimmed_df.head()

Unnamed: 0,chunk_id,file,chunk,embedding
0,1,../data/handbook-main-content\content\handbook...,---\ntitle: The Handbook\nno_list: true\nmenu:...,"[0.002513919724151492, -0.016943028196692467, ..."
1,9,../data/handbook-main-content\content\handbook...,"## Where we are Headed\nAt GitLab, we encourag...","[-0.020637543871998787, -0.019697371870279312,..."
2,12,../data/handbook-main-content\content\handbook...,---\ntitle: Handbook Escalation\n---\n\nFor in...,"[-0.03570609539747238, -0.01383506041020155, -..."
3,16,../data/handbook-main-content\content\handbook...,### Expectations for the group\n1. Make sure y...,"[-0.0043060556054115295, -0.027220841497182846..."
4,25,../data/handbook-main-content\content\handbook...,> The biggest problem is GitLab not working ha...,"[-0.02353515662252903, -0.002803125884383917, ..."


In [None]:


class HandbookChunk(BaseModel):
    file_path: str = Field(..., description="Path to the original Markdown file")
    chunk_id: str = Field(..., description="Unique ID for the chunk")
    chunk_text: str = Field(..., description="Text content of the chunk")
    


def split_text_into_chunks(extracted_text: list[str], max_tokens = 1000, overlap = 100):
    encoding = tiktoken.encoding_for_model("text-embedding-3-large")
    regex = r"^#{1,2}\s*.*$"
    file_id = 0
    chunks = []

    for file, text in extracted_text:
        file_id += 1
        split_text = re.split(regex, text, flags=re.MULTILINE)
        
        chunk_id = 0

        for item in split_text:
            tokens = encoding.encode(item)
            if len(tokens) < 1000:
                chunk_id += 1
                chunk = HandbookChunk(
                    file_path=file,
                    chunk_id = f"{file_id}_{chunk_id}",
                    chunk_text = item
                                    )
                chunks.append(chunk)
            else:
                step_size = int(max_tokens * (1 - overlap))

                for start in range(0, len(tokens), step_size):
                    end = start + max_tokens
                    text = encoding.decode(tokens[start:end])
                    if text:
                        chunk_id += 1
                        chunk = HandbookChunk(
                        file_path=file,
                        chunk_id = f"{file_id}_{chunk_id}",
                        chunk_text = text
                                        )
                        chunks.append(chunk)

    return chunks

   

def extract_text_from_data(folder_path):
    extracted_text = []

    # walk through all folders and subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.md'):                  #only extract text from markdown files
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding="utf-8") as f:
                    content = f.read()
                    extracted_text.append({file, content})
    return extracted_text

            
extracted_text = extract_text_from_data("../data/handbook-main-content/")
chunks = split_text_into_chunks(extracted_text)




In [86]:
model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu")
uri = "../data/db"
db = lancedb.connect(uri)
data = [chunk.dict() for chunk in chunks]

class EmbeddedChunk(LanceModel):
    file_path: str = Field(..., description="Path to the original Markdown file")
    chunk_id: str = Field(..., description="Unique ID for the chunk")
    chunk_text: str = Field(..., description="Text content of the chunk")

    


C:\Users\laure\AppData\Local\Temp\ipykernel_10580\3076086659.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  data = [chunk.dict() for chunk in chunks]


In [88]:
embedded_table = db.create_table('embedded_table', data=data, schema=EmbeddedChunk, mode='overwrite')
embedded_table.head()

pyarrow.Table
file_path: string not null
chunk_id: string not null
chunk_text: string not null
----
file_path: [["---
title: All Handbook Content
layout: all-content
---
","_index.md","_index.md","_index.md","contributing.md"]]
chunk_id: [["1_1","2_1","2_2","2_3","3_1"]]
chunk_text: [["_index.md","---
title: The Handbook
no_list: true
menu:
  main:
    name: Handbook
    pre: '<i class="fa-solid fa-book"></i>'
cascade:
      type: docs
---

","

The GitLab team handbook is the central repository for how we run the company. Printed, it consists of over
[2,000 pages of text](/handbook/about/#count-handbook-pages). As part of our value of being
transparent the handbook is [open to the world](https://gitlab.com/gitlab-com/content-sites/handbook/), and we welcome
feedback. Please make a [merge request](https://gitlab.com/gitlab-com/content-sites/handbook/merge_requests) to suggest
improvements or add clarifications. Please use [issues](https://gitlab.com/gitlab-com/content-sites/handbook/is