<h1> Data preparation



This journal contains the pipeline to download data from the GitLab handbook and store it into a LanceDB Database with embeddings to be used for RAG


In [1]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from typing import List
from pydantic_ai import Agent
from pydantic import BaseModel, Field
from pydantic_ai.models.gemini import GeminiModel
import os
import datetime
import requests
import time

from openai import OpenAI

import re
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from urllib.parse import quote

  from .autonotebook import tqdm as notebook_tqdm


Download gitlab handbook contents from gitlab and store it in Data folder
This code currently stops halfway while retrieving the data, so in the mean time we will continue using a predownloaded dataset

In [None]:
from urllib.parse import quote

def download_files_in_folder(project_id, file, save_path):
    """
    Download current markdown file from gitlab project into local folder.

    args:
    project
    """
    file_path_encoded = quote(file["path"], safe='')
    file_url = f'https://gitlab.com/api/v4/projects/{project_id}/repository/files/{file_path_encoded}/raw'
    file_params = {'ref': 'main'}
    file_response = requests.get(file_url, params=file_params)
    file_response.raise_for_status()

    print(file_url)
    local_file_path = os.path.join(save_path, file['path'])
    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

    with open(local_file_path, 'wb') as f:
        f.write(file_response.content)
    print(f'Downloaded: {file["path"]} to {local_file_path}')

def download_gitlab_data(project_id, gitlab_path, save_path):
    api_url = f'https://gitlab.com/api/v4/projects/{project_id}/repository/tree'
    params = {
        'path': gitlab_path,
        'recursive': True
    }
    
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        files = response.json()

        for file in files:
            if file['type'] == 'blob' and file['name'].endswith('.md'):
                download_files_in_folder(project_id, file, save_path)
            else:
                download_gitlab_data(project_id, f'{file["path"]}', save_path)
    except Exception as e:
        print(f"Error occured at {gitlab_path}: {e}")

PROJECT_ID = "gitlab-com%2Fcontent-sites%2Fhandbook"
FOLDER_PATH = "content"
SAVE_PATH = "../data/gitlab_handbook"

download_gitlab_data(PROJECT_ID, FOLDER_PATH, SAVE_PATH)

Lets focus on 1 MD file first to test. 

In [None]:
FILE_PATH = "../data/handbook-main-content/content/handbook/communication/_index.md"
EMBEDDING_MODEL = "text-embedding-3-large"

def split_markdown_text(file_path: str, embedding_model: str, min_tokens: int = 200, max_tokens: int = 1000, overlap: int = 30):
    md_regex = r"(^#+\s*.*)" #regex which captures all levels of headers in markdown.
    tokenizer = tiktoken.encoding_for_model(embedding_model)
    text = ""
    #read text
    try:
        file = open(file_path, 'r', encoding="utf-8")
        text = file.read()
    except Exception as e:
        print(f"error while reading file: {e}")

    #split text by headers
    sections = re.split(md_regex, text, flags=re.MULTILINE)
    chunks = []
    temp_chunk = ""
    

    for i in range(1, len(sections), 2): # loop through headers and text in sections
        
        header = sections[i].strip()
        content = sections[i+1].strip() if i + 1 <= len(sections) else ""
        chunk = header + '\n' + content
        token_count = len(tokenizer.encode(chunk))

        # add chunk to chunk list or to temporary chunk to combine with other chunks
        if token_count < min_tokens:
            temp_chunk += chunk + "\n"          
        else:
            if temp_chunk:
                chunks.append(temp_chunk)
                temp_chunk = ""
            chunks.append(chunk)

    # add remaining temp chunk if it exists
    if temp_chunk:
        chunks.append(temp_chunk)

    splitter = RecursiveCharacterTextSplitter(chunk_size = max_tokens, chunk_overlap = overlap)
    split_chunks = []
    for chunk in chunks:
        split_chunks.extend(splitter.split_text(chunk))
    
    return chunks

chunks = split_markdown_text(FILE_PATH, EMBEDDING_MODEL)
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
print(len(chunks))
print(len(tokenizer.encode(chunks[2])))


split all markdown files in the handbook folder up into chunks.
Chunks are split up by headings and large sections of text will be split up further into chunks of 1000 tokens


In [41]:


def split_markdown_text(text: str, embedding_model: str, min_tokens: int = 200, max_tokens: int = 1000, overlap: int = 30):
    md_regex = r"(^#+\s*.*)" #regex which captures all levels of headers in markdown.
    tokenizer = tiktoken.encoding_for_model(embedding_model)

    #split text by headers
    sections = re.split(md_regex, text, flags=re.MULTILINE)
    chunks = []
    temp_chunk = ""
    
    #capture first text which often does not start with a header
    
    if len(tokenizer.encode(sections[0])) < min_tokens:
        temp_chunk = sections[0]
    else:
        chunks.append(sections[0])
    

    for i in range(1, len(sections), 2): # loop through headers and text in sections
        header = sections[i].strip()
        content = sections[i+1].strip() if i + 1 <= len(sections) else ""
        chunk = header + '\n' + content
        token_count = len(tokenizer.encode(chunk))

        # add chunk to chunk list or to temporary chunk to combine with other chunks
        if token_count < min_tokens:
            temp_chunk += chunk + "\n"          
        else:
            if temp_chunk:
                chunks.append(temp_chunk)
                temp_chunk = ""
            chunks.append(chunk)

    # add remaining temp chunk if it exists
    if temp_chunk:
        chunks.append(temp_chunk)

    splitter = RecursiveCharacterTextSplitter(chunk_size = max_tokens, chunk_overlap = overlap)
    split_chunks = []
    for chunk in chunks:
        split_chunks.extend(splitter.split_text(chunk))
    
    return split_chunks

def extract_text_from_data(folder_path):
    file_chunks = []
    id = 0
    HANDBOOK_ROOT_URL = "https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content"

    # walk through all folders and subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.md'):                  #only extract text from markdown files
                file_path = os.path.join(root, file)
                file_url = file_path.replace("../data/handbook-main-content\\content", HANDBOOK_ROOT_URL)
                file_url = file_url.replace('\\', '/')
               
                with open(file_path, 'r', encoding="utf-8") as f:
                    content = f.read()
                    chunks = split_markdown_text(content, EMBEDDING_MODEL, min_tokens=200)
                    for chunk in chunks:

                        file_chunks.append((id, file_url, chunk))
                        id += 1
                    
    return file_chunks

DATA_PATH = "../data/handbook-main-content"
EMBEDDING_MODEL = "text-embedding-3-large"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")




file_chunks = extract_text_from_data(DATA_PATH)
# def create_chunks_df(file_chunks: List[(str, List[str])]):
#     id = 0
print(len(file_chunks))


67102


In [44]:
df = pd.DataFrame(data=file_chunks, columns=['chunk_id','source_url', 'chunk'])
df.iloc[0]['chunk']

'---\ntitle: The Handbook\nno_list: true\nmenu:\n  main:\n    name: Handbook\n    pre: \'<i class="fa-solid fa-book"></i>\'\ncascade:\n      type: docs\n---\n\n## Introduction\nThe GitLab team handbook is the central repository for how we run the company. Printed, it consists of over\n[2,000 pages of text](/handbook/about/#count-handbook-pages). As part of our value of being\ntransparent the handbook is [open to the world](https://gitlab.com/gitlab-com/content-sites/handbook/), and we welcome\nfeedback. Please make a [merge request](https://gitlab.com/gitlab-com/content-sites/handbook/merge_requests) to suggest\nimprovements or add clarifications. Please use [issues](https://gitlab.com/gitlab-com/content-sites/handbook/issues) to\nask questions.\n\nFor a very specific set of [internal](/handbook/communication/confidentiality-levels/#internal)\ninformation we also maintain an [Internal Handbook](https://internal.gitlab.com/handbook/)\n## Handbook Contents\n{{< cardpane >}}'

count total tokens and estimate embedding cost

In [None]:
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_count = 0
for id, file, chunk in file_chunks:
    token_count = len(tokenizer.encode(chunk))
    total_count += token_count
print(f"total token count = {total_count}")
print(f"API costs at ~0.1 dollar per million tokens equals {total_count/1000000 * 0.1} dollar")
    

Remove small chunks

In [45]:
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_count = 0
trimmed_file_chunks = []
for id, file, chunk in file_chunks:
    
    count = len(tokenizer.encode(chunk))
    if count > 200:
        trimmed_file_chunks.append((id, file, chunk))

print(len(trimmed_file_chunks))

14056


In [16]:

client = OpenAI()

#FIX SHOULD RETURN A LIST OF FLOATS INSTEAD OF STRING
def get_embeddings(chunks, model=EMBEDDING_MODEL):
    try:
        response = client.embeddings.create(input = chunks, model=model)
        print("batch done")
        return [item.embedding for item in response.data]
    except Exception as e:
        print(f"Embedding failed with error: {e}")
    return [None] * len(chunks)  


In [47]:
df = pd.DataFrame(data=trimmed_file_chunks, columns=['chunk_id','source_url', 'chunk'])
df.iloc[1]['chunk']

"## Where we are Headed\nAt GitLab, we encourage everyone to work [handbook first](/handbook/about/handbook-usage/#why-handbook-first) in order to promote asynchronous collaboration and documentation. Working this way has its challenges, not the least of which is the time and effort involved in making a change. While this extra investment can encourage contributors to be more considered and deliberate with their changes, at a certain point it discourages meaningful collaboration and works against our goals.\n\nOur hope is that the GitLab Handbook is something that others want to emulate. To facilitate that, we want to ensure that any user can easily use and update the handbook. Ideally, the handbook has:\n\n- Organized, and up-to-date content\n- Fast, predictable deployments\n- A clean, scalable information architecture and modern codebase\n### What's Next & Why\nWe are focused on maintaining the handbook's fast (pipeline under 10 minutes) and stable (minimal pipeline failures) state."

In [None]:
class HandbookChunk2(LanceModel):
    chunk_id: str
    source_url: str
    chunk: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()


In [None]:


func = get_registry().get("openai").create(name="text-embedding-3-large")

class HandbookChunk2(LanceModel):
    chunk_id: str
    source_url: str
    chunk: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

    
db = lancedb.connect("../data/lancedb")


table = db.create_table("embedded_handbook_with_urls", schema = HandbookChunk2)


batch_size = 500

for i in range(0, len(df), batch_size):
    table.add(df.iloc[i:i + batch_size])
    break
table.head()

In [33]:
batch_size = 500

for i in range(0, len(df), batch_size):
    table.add(df.iloc[i:i + batch_size])

In [48]:
    
db = lancedb.connect("../data/lancedb")

table = db.open_table("embedded_handbook_with_urls")
print(table.head())

pyarrow.Table
chunk_id: string not null
source_url: string not null
chunk: string not null
vector: fixed_size_list<item: float>[3072]
  child 0, item: float
----
chunk_id: [["1","9","12","16","25"]]
source_url: [["https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content/handbook/_index.md","https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content/handbook/about/direction.md","https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content/handbook/about/escalation.md","https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content/handbook/about/escalation.md","https://gitlab.com/gitlab-com/content-sites/handbook/-/tree/main/content/handbook/about/handbook-usage.md"]]
chunk: [["---
title: The Handbook
no_list: true
menu:
  main:
    name: Handbook
    pre: '<i class="fa-solid fa-book"></i>'
cascade:
      type: docs
---

## Introduction
The GitLab team handbook is the central repository for how we run the company. Printed, it consists 

In [9]:
# Get a sample embedding
sample_row = table.to_pandas().iloc[0]  # Get the first row
sample_embedding = sample_row["vector"]

# Print the dimension
print(f"Vector dimension: {len(sample_embedding)}")

Vector dimension: 3072


Sanitising text for reranker

<h3>creating history table


In [None]:

class ChatHistory(LanceModel):
    session_id: str
    share_token: str
    history: str

db = lancedb.connect("../data/lancedb")
db.drop_table('history_table')
table = db.create_table("history_table", schema = ChatHistory)
table.head()


In [138]:
import uuid
import json

history = []
history.append({'role': "user", "content": 'question'})
session_id = str(uuid.uuid4())
share_token = str(uuid.uuid4())
data = [ChatHistory(session_id = session_id, share_token = share_token, history= json.dumps(history))]
table.add(data)

check history table after testing chatbot

In [None]:
first_row = table.head(n=10)
print(table.head())
print(len(table))

In [None]:
results = table.search().where(f"session_id = '{session_id}'").limit(1).to_pydantic(ChatHistory)

if results:
    print(results[0].history)

history_list = []
history_list.append({'role': "assistant", "content": 'answer'})
print(history_list)


In [208]:
def update_chat_history(table, session_id: str, share_token: str, new_history: str):
        results = table.search().where(f"session_id = '{session_id}'").limit(1).to_list()
        if results:
            table.update(where=f"session_id = '{session_id}'", values={'history': new_history})
        else:
           
            table.add([{"session_id" : session_id, "share_token": share_token, "history": new_history}])

update_chat_history(table, session_id, share_token, json.dumps(history_list))

<h2> migrating to PostGres and SQLalchemy


In [12]:
from sqlalchemy import create_engine, Column, Integer, String, LargeBinary
from sqlalchemy.orm import declarative_base, sessionmaker, Session, scoped_session
from sqlalchemy.dialects.postgresql import UUID
from pgvector.sqlalchemy import Vector

from sqlalchemy.sql import func
import uuid
from sqlalchemy import (
    Column,
    Integer,
    String,
    Boolean,
    DateTime,
    ForeignKey,
    JSON,
    Text,
)



In [3]:
Base = declarative_base()

# declare models
class ChatSession(Base):
    __tablename__ = "chat_session"
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    share_token = Column(String, unique=True, index=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    class Config:
        orm_mode = True


class ChatMessage(Base):
    __tablename__ = "chat_message"
    id = Column(Integer, primary_key=True)
    session_id = Column(UUID(as_uuid=True), ForeignKey("chat_session.id"), index=True)
    question = Column(Text)
    answer = Column(Text)
    language = Column(String)
    message_metadata = Column(JSON)
    sources = Column(JSON)
    tools_used = Column(JSON)
    able_to_answer = Column(Boolean, default=True)
    question_classification = Column(String)
    trace_id = Column(String, index=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    class Config:
        orm_mode = True

class Chunk(Base):
    __tablename__ = "chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)
    source = Column(String, nullable=False)
    chunk = Column(String, nullable=False)
    embedding = Column(Vector(3072))  # Adjust vector dimension to match your embeddings
    
    class Config:
        orm_mode = True

In [4]:
DATABASE_URL = "postgresql+psycopg://postgres:password@localhost:5432/handbook_db"
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine)

Base.metadata.create_all(engine)

In [5]:
# connect to lancedb
db = lancedb.connect("../data/lancedb")
table = db.open_table("embedded_handbook_with_urls")
handbook_df = table.to_lance().to_table().to_pandas()

# connect to postgreSQL
session = SessionLocal()

# migrate data
for row in handbook_df.itertuples(index=False):
    new_chunk = Chunk(
        source=row.source_url,
        chunk=row.chunk,
        embedding=row.vector  # Ensure this is a NumPy array or list
    )
    session.add(new_chunk)

session.commit()
session.close()

In [6]:
handbook_df.iloc[0]['chunk']


'---\ntitle: The Handbook\nno_list: true\nmenu:\n  main:\n    name: Handbook\n    pre: \'<i class="fa-solid fa-book"></i>\'\ncascade:\n      type: docs\n---\n\n## Introduction\nThe GitLab team handbook is the central repository for how we run the company. Printed, it consists of over\n[2,000 pages of text](/handbook/about/#count-handbook-pages). As part of our value of being\ntransparent the handbook is [open to the world](https://gitlab.com/gitlab-com/content-sites/handbook/), and we welcome\nfeedback. Please make a [merge request](https://gitlab.com/gitlab-com/content-sites/handbook/merge_requests) to suggest\nimprovements or add clarifications. Please use [issues](https://gitlab.com/gitlab-com/content-sites/handbook/issues) to\nask questions.\n\nFor a very specific set of [internal](/handbook/communication/confidentiality-levels/#internal)\ninformation we also maintain an [Internal Handbook](https://internal.gitlab.com/handbook/)\n## Handbook Contents\n{{< cardpane >}}'

In [13]:
session_factory = scoped_session(SessionLocal)
session = session_factory()

In [14]:
import openai
import numpy as np
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine)
session = SessionLocal()
session_factory = scoped_session(SessionLocal)
session = session_factory()
query_embedding = openai.embeddings.create(
                input = 'gitlab core values',
                model="text-embedding-3-large"

            ).data[0].embedding

query_vector = np.array(query_embedding).tolist()
            
results = (
    session.query(Chunk)
    .order_by(Chunk.embedding.l2_distance(query_vector))  # L2 distance for similarity
    .limit(5)
    .all()
            )


In [11]:
for chunk in results:
    print(chunk.chunk)

## General Principles
We believe the [GitLab core values](/handbook/values/) should guide all our
coding decisions:

- Collaboration
  - Collaborate with teammates, review each other’s code, and share knowledge.
  - Always be open to feedback and improve your code based on it.
  - Ensure your contributions are readable and understandable to others on the
    team
- Results
  - Focus on the impact and effectiveness of the code.
  - Prioritize working solutions over perfect solutions (but try to avoid
    shortcuts that undermine quality)
- Efficiency
  - Write clear and maintainable code.
  - Strive for simplicity without sacrificing functionality.
- Transparency
  - Code should be easy to understand and accessible to all.
  - Document assumptions, choices, and design decisions clearly when needed.
- Iteration
  - Code should be flexible and easy to refactor.
  - We aim for continuous improvement and don't shy away from updating or
    reworking code when necessary.
We do this by having