### exploration of data and ingestion

In [3]:
import sys
from pathlib import Path

project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

print(f"Projektroten: {project_root}")

Projektroten: c:\Users\milou\Documents\Git\rag-lab-milou-de24


In [4]:
from pathlib import Path
from backend.constants import DATA_DIR

files = list(DATA_DIR.glob("*.md"))
print(f"Hittade {len(files)} filer")

if files:
    sample = files[0]
    text = sample.read_text(encoding="utf-8")
    print(text[:500])
else:
    print("Inga .md-filer hittades i mappen data/")

Hittade 53 filer
# An introduction to the vector database LanceDB

[00:00:00] Hello and welcome to this video where we'll go into Lance db, the fundamentals of it. Lance Db is an open source vector database designed to handle large scale ~~data, ~~vector data efficiently. It provides a robust platform for storing, indexing and querying high dimensional vector. Which is very good for working with for example, LLMs and rag applications.

Rags are retrieval, augmented generation. ~~When, ~~whenever you want to chat


### Connecta till lanceDB och skapa tabell

In [5]:
import lancedb
from backend.constants import VECTOR_DB_PATH
from backend.data_models import Article
# visar var tabellen skapas
print("VECTOR_DB_PATH:", VECTOR_DB_PATH)

db = lancedb.connect(str(VECTOR_DB_PATH))
# skapar articels - eller öppnar den om den redan existerar
if "articles" in db.table_names():
    table = db.open_table("articles")
    print("Öppnade befintlig tabell 'articles'")
else:
    table = db.create_table("articles", schema=Article)
    print("Skapade ny tabell 'articles'")

VECTOR_DB_PATH: c:\Users\milou\Documents\Git\rag-lab-milou-de24\lancedb
Skapade ny tabell 'articles'


In [6]:
from backend.constants import BASE_DIR, VECTOR_DB_PATH

print("BASE_DIR:", BASE_DIR)
print("VECTOR_DB_PATH:", VECTOR_DB_PATH)
print("VECTOR_DB_PATH (absolut):", VECTOR_DB_PATH.resolve())

BASE_DIR: c:\Users\milou\Documents\Git\rag-lab-milou-de24
VECTOR_DB_PATH: c:\Users\milou\Documents\Git\rag-lab-milou-de24\lancedb
VECTOR_DB_PATH (absolut): C:\Users\milou\Documents\Git\rag-lab-milou-de24\lancedb


#### Skapa lista med dictar för .md filerna

In [13]:
from backend.constants import DATA_DIR

files = list(DATA_DIR.glob("*.md"))
print(f"Hittade {len(files)} filer")

records = []

for path in files:
    text = path.read_text(encoding="utf-8")
    doc_id = path.stem  # filnamn utan .md

    record = {
        "doc_id": doc_id,
        "file_name": path.name,
        "content": text,
    }
    records.append(record)

print(f"Skapade {len(records)} records")
records[0]


Hittade 53 filer
Skapade 53 records


{'doc_id': 'An introduction to the vector database LanceDB',
 'file_name': 'An introduction to the vector database LanceDB.md',
 'content': "# An introduction to the vector database LanceDB\n\n[00:00:00] Hello and welcome to this video where we'll go into Lance db, the fundamentals of it. Lance Db is an open source vector database designed to handle large scale ~~data, ~~vector data efficiently. It provides a robust platform for storing, indexing and querying high dimensional vector. Which is very good for working with for example, LLMs and rag applications.\n\nRags are retrieval, augmented generation. ~~When, ~~whenever you want to chat with your data or chat with your documentation you are providing a query, ~~right? ~~A prompt or a query, a text which is then transformed into vectors. And this vector is compared. To all the other chunks or all the other documents in your database.\n\nAnd ~~you take a, ~~you find the one that is closest to it, semantically based on vector search. And

In [14]:
table.add(records)
print("Ingestion klar!")

Ingestion klar!


In [15]:
print("Antal rader i tabellen:", len(table))

Antal rader i tabellen: 159


In [16]:
table

LanceTable(name='articles', version=4, _conn=LanceDBConnection(uri='c:\\Users\\milou\\Documents\\Git\\rag-lab-milou-de24\\lancedb'))

In [17]:
print("Antal rader i tabellen:", len(table))

Antal rader i tabellen: 159


#### testar sökning

In [18]:
query = "What is LanceDB and why is it useful for analytics or RAG?"
results = table.search(query).limit(3).to_list()

print("Antal träffar:", len(results))
print("Första filen:", results[0]["file_name"])
print()
print(results[0]["content"][:500]) # printar dom första 500-tecknen

Antal träffar: 3
Första filen: An introduction to the vector database LanceDB.md

# An introduction to the vector database LanceDB

[00:00:00] Hello and welcome to this video where we'll go into Lance db, the fundamentals of it. Lance Db is an open source vector database designed to handle large scale ~~data, ~~vector data efficiently. It provides a robust platform for storing, indexing and querying high dimensional vector. Which is very good for working with for example, LLMs and rag applications.

Rags are retrieval, augmented generation. ~~When, ~~whenever you want to chat


- testar ragresponse

In [19]:
from backend.data_models import RagResponse

test = RagResponse(
    file_name="An introduction to the vector database LanceDB.md",
    file_path="/some/path/An introduction to the vector database LanceDB.md",
    answer="This is a test answer."
)
test

RagResponse(file_name='An introduction to the vector database LanceDB.md', answer='This is a test answer.')