<h1> Data preparation



This journal contains the pipeline to download data from the GitLab handbook and store it into a LanceDB Database with embeddings to be used for RAG


In [1]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from typing import List
from pydantic_ai import Agent
from pydantic import BaseModel, Field
from pydantic_ai.models.gemini import GeminiModel
import os, datetime, requests, time

from IPython.display import display, HTML
import ipywidgets as widgets

from openai import OpenAI
from duckduckgo_search import DDGS
from __future__ import annotations as _annotations

import os

import re
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter


import pandas as pd

Download gitlab handbook contents from gitlab and store it in Data folder
This code currently stops halfway while retrieving the data, so in the mean time we will continue using a predownloaded dataset

In [16]:
from urllib.parse import quote

def download_files_in_folder(project_id, file, save_path):
    """
    Download current markdown file from gitlab project into local folder.

    args:
    project
    """
    file_path_encoded = quote(file["path"], safe='')
    file_url = f'https://gitlab.com/api/v4/projects/{project_id}/repository/files/{file_path_encoded}/raw'
    file_params = {'ref': 'main'}
    file_response = requests.get(file_url, params=file_params)
    file_response.raise_for_status()

    print(file_url)
    local_file_path = os.path.join(save_path, file['path'])
    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

    with open(local_file_path, 'wb') as f:
        f.write(file_response.content)
    print(f'Downloaded: {file["path"]} to {local_file_path}')

def download_gitlab_data(project_id, gitlab_path, save_path):
    api_url = f'https://gitlab.com/api/v4/projects/{project_id}/repository/tree'
    params = {
        'path': gitlab_path,
        'recursive': True
    }
    
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        files = response.json()

        for file in files:
            if file['type'] == 'blob' and file['name'].endswith('.md'):
                download_files_in_folder(project_id, file, save_path)
            else:
                download_gitlab_data(project_id, f'{file["path"]}', save_path)
    except Exception as e:
        print(f"Error occured at {gitlab_path}: {e}")

PROJECT_ID = "gitlab-com%2Fcontent-sites%2Fhandbook"
FOLDER_PATH = "content"
SAVE_PATH = "../data/gitlab_handbook"

download_gitlab_data(PROJECT_ID, FOLDER_PATH, SAVE_PATH)

https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fall-content%2F_index.md/raw
Downloaded: content/all-content/_index.md to ../data/gitlab_handbook\content/all-content/_index.md
https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fhandbook%2Fabout%2Fediting-handbook%2F_index.md/raw
Downloaded: content/handbook/about/editing-handbook/_index.md to ../data/gitlab_handbook\content/handbook/about/editing-handbook/_index.md
https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fhandbook%2Fabout%2Fediting-handbook%2Fedit-team-page.md/raw
Downloaded: content/handbook/about/editing-handbook/edit-team-page.md to ../data/gitlab_handbook\content/handbook/about/editing-handbook/edit-team-page.md
https://gitlab.com/api/v4/projects/gitlab-com%2Fcontent-sites%2Fhandbook/repository/files/content%2Fhandbook%2Fabout%2Fediting-handbook%2Fpractical-handbook-edits.md/r

KeyboardInterrupt: 

Lets focus on 1 MD file first to test. 

In [None]:
FILE_PATH = "../data/handbook-main-content/content/handbook/communication/_index.md"
EMBEDDING_MODEL = "text-embedding-3-large"

def split_markdown_text(file_path: str, embedding_model: str, min_tokens: int = 200, max_tokens: int = 1000, overlap: int = 30):
    md_regex = r"(^#+\s*.*)" #regex which captures all levels of headers in markdown.
    tokenizer = tiktoken.encoding_for_model(embedding_model)
    text = ""
    #read text
    try:
        file = open(file_path, 'r', encoding="utf-8")
        text = file.read()
    except Exception as e:
        print(f"error while reading file: {e}")

    #split text by headers
    sections = re.split(md_regex, text, flags=re.MULTILINE)
    chunks = []
    temp_chunk = ""
    

    for i in range(1, len(sections), 2): # loop through headers and text in sections
        
        header = sections[i].strip()
        content = sections[i+1].strip() if i + 1 <= len(sections) else ""
        chunk = header + '\n' + content
        token_count = len(tokenizer.encode(chunk))

        # add chunk to chunk list or to temporary chunk to combine with other chunks
        if token_count < min_tokens:
            temp_chunk += chunk + "\n"          
        else:
            if temp_chunk:
                chunks.append(temp_chunk)
                temp_chunk = ""
            chunks.append(chunk)

    # add remaining temp chunk if it exists
    if temp_chunk:
        chunks.append(temp_chunk)

    splitter = RecursiveCharacterTextSplitter(chunk_size = max_tokens, chunk_overlap = overlap)
    split_chunks = []
    for chunk in chunks:
        split_chunks.extend(splitter.split_text(chunk))
    
    return chunks

chunks = split_markdown_text(FILE_PATH, EMBEDDING_MODEL)
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
print(len(chunks))
print(len(tokenizer.encode(chunks[2])))


72
209


split all markdown files in the handbook folder up into chunks.
Chunks are split up by headings and large sections of text will be split up further into chunks of 1000 tokens


In [26]:


def split_markdown_text(text: str, embedding_model: str, min_tokens: int = 200, max_tokens: int = 1000, overlap: int = 30):
    md_regex = r"(^#+\s*.*)" #regex which captures all levels of headers in markdown.
    tokenizer = tiktoken.encoding_for_model(embedding_model)

    #split text by headers
    sections = re.split(md_regex, text, flags=re.MULTILINE)
    chunks = []
    temp_chunk = ""
    
    #capture first text which often does not start with a header
    
    if len(tokenizer.encode(sections[0])) < min_tokens:
        temp_chunk = sections[0]
    else:
        chunks.append(sections[0])
    

    for i in range(1, len(sections), 2): # loop through headers and text in sections
        header = sections[i].strip()
        content = sections[i+1].strip() if i + 1 <= len(sections) else ""
        chunk = header + '\n' + content
        token_count = len(tokenizer.encode(chunk))

        # add chunk to chunk list or to temporary chunk to combine with other chunks
        if token_count < min_tokens:
            temp_chunk += chunk + "\n"          
        else:
            if temp_chunk:
                chunks.append(temp_chunk)
                temp_chunk = ""
            chunks.append(chunk)

    # add remaining temp chunk if it exists
    if temp_chunk:
        chunks.append(temp_chunk)

    splitter = RecursiveCharacterTextSplitter(chunk_size = max_tokens, chunk_overlap = overlap)
    split_chunks = []
    for chunk in chunks:
        split_chunks.extend(splitter.split_text(chunk))
    
    return split_chunks

def extract_text_from_data(folder_path):
    file_chunks = []
    id = 0
    HANDBOOK_ROOT_URL = "https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content"

    # walk through all folders and subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.md'):                  #only extract text from markdown files
                file_path = os.path.join(root, file)
                file_url = file_path.replace("../data/handbook-main-content\\content", HANDBOOK_ROOT_URL)
                file_url = file_url.replace('\\', '/')
               
                with open(file_path, 'r', encoding="utf-8") as f:
                    content = f.read()
                    chunks = split_markdown_text(content, EMBEDDING_MODEL, min_tokens=200)
                    for chunk in chunks:
                        file_chunks.append((id, file_url, chunk))
                        id += 1
                    
    return file_chunks

DATA_PATH = "../data/handbook-main-content"
EMBEDDING_MODEL = "text-embedding-3-large"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")




file_chunks = extract_text_from_data(DATA_PATH)
# def create_chunks_df(file_chunks: List[(str, List[str])]):
#     id = 0
print(len(file_chunks))


67102


count total tokens and estimate embedding cost

In [27]:
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_count = 0
for id, file, chunk in file_chunks:
    token_count = len(tokenizer.encode(chunk))
    total_count += token_count
print(f"total token count = {total_count}")
print(f"API costs at ~0.1 dollar per million tokens equals {total_count/1000000 * 0.1} dollar")
    

total token count = 9518462
API costs at ~0.1 dollar per million tokens equals 0.9518462 dollar


Remove small chunks

In [28]:
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_count = 0
trimmed_file_chunks = []
for id, file, chunk in file_chunks:
    
    count = len(tokenizer.encode(chunk))
    if count > 200:
        trimmed_file_chunks.append((id, file, chunk))

print(len(trimmed_file_chunks))

14056


In [69]:

client = OpenAI()

#FIX SHOULD RETURN A LIST OF FLOATS INSTEAD OF STRING
def get_embeddings(chunks, model=EMBEDDING_MODEL):
    try:
        
        response = client.embeddings.create(input = chunks, model=model)
        print("batch done")
        return [item.embedding for item in response.data]
    except Exception as e:
        print(f"Embedding failed with error: {e}")
    return [None] * len(chunks)  


In [30]:
df = pd.DataFrame(data=trimmed_file_chunks, columns=['chunk_id','source_url', 'chunk'])
df.iloc[0]['source_url']

'https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content/handbook/_index.md'

In [32]:


func = get_registry().get("openai").create(name="text-embedding-3-large")

class HandbookChunk2(LanceModel):
    chunk_id: str
    source_url: str
    chunk: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

    
db = lancedb.connect("../data/lancedb")

table = db.create_table("embedded_handbook_with_urls", schema = HandbookChunk2)
table.head()


pyarrow.Table
chunk_id: string not null
source_url: string not null
chunk: string not null
vector: fixed_size_list<item: float>[3072]
  child 0, item: float
----
chunk_id: []
source_url: []
chunk: []
vector: []

In [33]:
batch_size = 500

for i in range(0, len(df), batch_size):
    table.add(df.iloc[i:i + batch_size])

<h3>creating history table


In [141]:

class ChatHistory(LanceModel):
    session_id: str
    share_token: str
    history: str

db = lancedb.connect("../data/lancedb")
db.drop_table('history_table')
table = db.create_table("history_table", schema = ChatHistory)
table.head()


pyarrow.Table
session_id: string not null
share_token: string not null
history: string not null
----
session_id: []
share_token: []
history: []

In [138]:
import uuid
import json

history = []
history.append({'role': "user", "content": 'question'})
session_id = str(uuid.uuid4())
share_token = str(uuid.uuid4())
data = [ChatHistory(session_id = session_id, share_token = share_token, history= json.dumps(history))]
table.add(data)

check history table after testing chatbot

In [140]:
first_row = table.head(n=5)
print(table.head())

pyarrow.Table
session_id: string not null
share_token: string not null
history: string not null
----
session_id: [["e717d573-6551-4519-b0c2-57c1ffa884ad"],["cd7594a2-0f58-40a2-904f-68ea8c1c0940"],["088052f2-d864-49a9-86a0-c01b5a92a951"],["7baa2b0c-5353-402d-ac67-539a79325bb7"]]
share_token: [["test"],["6a99e047-1341-4bf1-b37a-c26d68a0a7fd"],["test"],["105a0d85-5b93-4a71-aedc-ede0442a97d8"]]
history: [["test"],["[{"role": "user", "content": "question"}]"],["test"],["[{"role": "user", "content": "question"}]"]]


In [106]:
results = table.search().where(f"session_id = '{session_id}'").limit(1).to_pydantic(ChatHistory)

if results:
    print(results[0].history)
history = results[0].history
history_list = json.loads(history)
history_list.append({'role': "assistant", "content": 'answer'})
print(history_list)


[{"role": "user", "content": "question"}, {"role": "assistant", "content": "answer"}, {"role": "assistant", "content": "answer"}]
[{'role': 'user', 'content': 'question'}, {'role': 'assistant', 'content': 'answer'}, {'role': 'assistant', 'content': 'answer'}, {'role': 'assistant', 'content': 'answer'}]


In [144]:
def update_chat_history(table, session_id: str, share_token: str, new_history: str):
        results = table.search().where(f"session_id = '{session_id}'").limit(1).to_list()
        if results:
            table.update(where=f"session_id = '{session_id}'", values={'history': new_history})
        else:
           
            table.add([{"session_id" : session_id, "share_token": share_token, "history": new_history}])

update_chat_history(table, session_id, share_token, json.dumps(history_list))

<h2> outdated code


In [22]:

class HandbookChunk(LanceModel):
    chunk_id: str
    source_url: str
    chunk: str 
    vector: Vector

In [83]:
# embedded_df = pd.read_csv("../data/embedded_dataframe.csv", index_col="chunk_id")
# trimmed_df = embedded_df.iloc[0:100]
# trimmed_df.to_csv("../data/trimmed_dataframe.csv", index="chunk_id")
# trimmed_df = pd.read_csv("../data/trimmed_dataframe.csv", index_col = "chunk_id")
db = lancedb.connect("../data/lancedb")
db.drop_database()
table = db.create_table("embedded_handbook", data=df, schema=HandbookChunk)
table.head()

TypeError: Converting Pydantic type to Arrow Type: unsupported type <function Vector at 0x00000271428BA0D0>.

In [75]:
import ast
# trimmed_df['embedding'] = trimmed_df['embedding'].apply(ast.literal_eval)
first_row = table.head(n=5)
print(first_row)

pyarrow.Table
chunk_id: string not null
file: string not null
chunk: string not null
vector: list<item: double> not null
  child 0, item: double
----
chunk_id: [["1","9","12","16","25"]]
file: [["../data/handbook-main-content\content\handbook\_index.md","../data/handbook-main-content\content\handbook\about\direction.md","../data/handbook-main-content\content\handbook\about\escalation.md","../data/handbook-main-content\content\handbook\about\escalation.md","../data/handbook-main-content\content\handbook\about\handbook-usage.md"]]
chunk: [["---
title: The Handbook
no_list: true
menu:
  main:
    name: Handbook
    pre: '<i class="fa-solid fa-book"></i>'
cascade:
      type: docs
---

## Introduction
The GitLab team handbook is the central repository for how we run the company. Printed, it consists of over
[2,000 pages of text](/handbook/about/#count-handbook-pages). As part of our value of being
transparent the handbook is [open to the world](https://gitlab.com/gitlab-com/content-sites/h

In [64]:
db = lancedb.connect("../data/lancedb")
db.drop_database()
table = db.create_table("embedded_handbook", data=trimmed_df, schema = HandbookChunk)
table.head()

pyarrow.Table
file: string not null
chunk: string not null
vector: list<item: double> not null
  child 0, item: double
----
file: [["../data/handbook-main-content\content\handbook\_index.md","../data/handbook-main-content\content\handbook\about\contributing.md","../data/handbook-main-content\content\handbook\about\direction.md","../data/handbook-main-content\content\handbook\about\direction.md","../data/handbook-main-content\content\handbook\about\direction.md"]]
chunk: [["---
title: The Handbook
no_list: true
menu:
  main:
    name: Handbook
    pre: '<i class="fa-solid fa-book"></i>'
cascade:
      type: docs
---

## Introduction
The GitLab team handbook is the central repository for how we run the company. Printed, it consists of over
[2,000 pages of text](/handbook/about/#count-handbook-pages). As part of our value of being
transparent the handbook is [open to the world](https://gitlab.com/gitlab-com/content-sites/handbook/), and we welcome
feedback. Please make a [merge request](h

In [None]:
class HandbookChunk(LanceModel):
    chunk_id: str | None = None
    file: str | None = None
    chunk: str | None = None
    vector: Vector | None = None

client = OpenAI()
query_embedding = client.embeddings.create(
input='test',
model="text-embedding-3-large"
).data[0].embedding

results = table.search(query_embedding, vector_column_name='vector').limit(5)
print(results.to_pydantic(HandbookChunk))

RuntimeError: lance error: LanceError(Index): vector is not with valid data type: Float64, C:\bld\lancedb_1739017314986\_build_env\.cargo\registry\src\index.crates.io-6f17d22bba15001f\lance-0.22.0\src\index\vector\utils.rs:73:23

In [None]:
HANDBOOK_ROOT_URL = "https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content"


df2 = df.copy()
df2 = df2[['chunk_id', 'file', 'chunk']]

df['file'] = HANDBOOK_ROOT_URL + df['file'].str.replace("../data/handbook-main-content\\content", "", regex=False)
db.create_table('embedded_handbook_2', data = df)

func = get_registry().get("openai").create(name="text-embedding-3-large")

class HandbookChunk2(LanceModel):
    chunk_id: str
    file: str
    chunk: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

    
db = lancedb.connect("../data/lancedb")
db.drop_database()
table = db.create_table("embedded_handbook", schema = HandbookChunk2)
table.head()

pyarrow.Table
chunk_id: string not null
file: string not null
chunk: string not null
vector: fixed_size_list<item: float>[3072]
  child 0, item: float
----
chunk_id: []
file: []
chunk: []
vector: []

In [91]:
batch_size = 500

for i in range(0, len(df2), batch_size):
    table.add(df2.iloc[i:i + batch_size])

In [102]:
class HandbookChunk(LanceModel):
    chunk_id: str | None = None
    file: str | None = None
    chunk: str | None = None
    

client = OpenAI()
query_embedding = client.embeddings.create(
input='test',
model="text-embedding-3-large"
).data[0].embedding

results = table.search("What is GitLab's approach to paid time off (PTO)", vector_column_name='vector').limit(5).to_pydantic(HandbookChunk)
for result in results:
    print(result.chunk)

### GitLab BV (Belgium)
Team members are entitled to at least 20 vacation days/year.  These days will be front-loaded at the beginning of each year. The days taken must be communicated to the Belgian payroll provider each month by People Ops. These days do not carry over into the next calendar year.
### GitLab IT BV (Contractors)
Contractors do not have statutory vacation requirements, but are eligible for our Flexible Time Off Policy.
### GitLab GmbH (Germany)
Team members are entitled to at least 20 vacation days. The days will accrue from the start date at a rate of 1.67 days/month. In general, team members must take their annual vacation days during the calendar year, otherwise it is forfeited. However, unused vacation days can be carried forward until the 31st of March of the next calendar year if the employee was unable to take the holiday due to operational or personal reasons.
### GitLab PTY (Australia & New Zealand)
### GitLab Inc. (USA)
The U.S. [Fair Labor Standards Act (FLS

update table to have urls to handbook pages

In [6]:
HANDBOOK_ROOT_URL = "https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content"


In [3]:
db = lancedb.connect("../data/lancedb")
print(db.table_names())

table = db.open_table('embedded_handbook')
print(table.head())

['embedded_handbook']
pyarrow.Table
chunk_id: string not null
file: string not null
chunk: string not null
vector: fixed_size_list<item: float>[3072]
  child 0, item: float
----
chunk_id: [["1","9","12","16","25"]]
file: [["../data/handbook-main-content\content\handbook\_index.md","../data/handbook-main-content\content\handbook\about\direction.md","../data/handbook-main-content\content\handbook\about\escalation.md","../data/handbook-main-content\content\handbook\about\escalation.md","../data/handbook-main-content\content\handbook\about\handbook-usage.md"]]
chunk: [["---
title: The Handbook
no_list: true
menu:
  main:
    name: Handbook
    pre: '<i class="fa-solid fa-book"></i>'
cascade:
      type: docs
---

## Introduction
The GitLab team handbook is the central repository for how we run the company. Printed, it consists of over
[2,000 pages of text](/handbook/about/#count-handbook-pages). As part of our value of being
transparent the handbook is [open to the world](https://gitlab.co

In [None]:
# transform file string to url
def update_file_column(batch):
    batch["file"] = ["new_string" + value.lstrip("../data/handbook-main-content\content") for value in batch["file"]]
    return batch



ValueError: Either updates or updates_sql must be provided

In [9]:
df = table.to_pandas()
df['file'] = HANDBOOK_ROOT_URL + df['file'].str.replace("../data/handbook-main-content\\content", "", regex=False)
db.create_table('embedded_handbook_2', data = df)

LanceTable(name='embedded_handbook_2', version=1, _conn=LanceDBConnection(uri='c:\\Saxion\\Jaar4\\Afstudeerstage\\chatbot_git\\ChatbotExperiment\\journals\\..\\data\\lancedb'))

In [23]:
table = db.open_table('embedded_handbook')
len(table)

14056