# LanceDB - a vector database for LLM application

In [2]:
import lancedb 

db = lancedb.connect(uri= "vector_database")

db

LanceDBConnection(uri='c:\\Users\\MAER\\Documents\\STI\\dataplattform_maskininlärning_artificiell_intelligens\\AI_engineering_Marcus_Ericsson_de24\\video_alongs\\09_lancedb_vector_database\\vector_database')

In [3]:
db.uri

'c:\\Users\\MAER\\Documents\\STI\\dataplattform_maskininlärning_artificiell_intelligens\\AI_engineering_Marcus_Ericsson_de24\\video_alongs\\09_lancedb_vector_database\\vector_database'

## Create a table

In [4]:
import json 

with open("data/animals_text_embeddings.json", "r") as file:
    data = json.loads(file.read())
    
data

[{'text': 'A small brown dog running.', 'vector': [0.12, 0.85, 0.33]},
 {'text': 'A cat resting quietly on a sofa.', 'vector': [0.4, 0.91, 0.1]},
 {'text': 'A large gray elephant drinking water.',
  'vector': [0.88, 0.22, 0.55]},
 {'text': 'A fast cheetah sprinting across the savannah.',
  'vector': [0.95, 0.12, 0.72]},
 {'text': 'A colorful parrot perched on a branch.',
  'vector': [0.25, 0.66, 0.81]},
 {'text': 'A frog sitting on a lily pad.', 'vector': [0.14, 0.44, 0.27]}]

In [5]:
table_animals = db.create_table("animal_text", exist_ok= True, data= data)

table_animals

LanceTable(name='animal_text', version=3, _conn=LanceDBConnection(uri='c:\\Users\\MAER\\Documents\\STI\\dataplattform_maskininlärning_artificiell_intelligens\\AI_engineering_Marcus_Ericsson_de24\\video_alongs\\09_lancedb_vector_database\\vector_database'))

In [6]:
table_animals = db.create_table("animal_text", exist_ok= True, data= data, mode= "overwrite")

table_animals

LanceTable(name='animal_text', version=4, _conn=LanceDBConnection(uri='c:\\Users\\MAER\\Documents\\STI\\dataplattform_maskininlärning_artificiell_intelligens\\AI_engineering_Marcus_Ericsson_de24\\video_alongs\\09_lancedb_vector_database\\vector_database'))

In [7]:
table_animals.head()

pyarrow.Table
text: string
vector: fixed_size_list<item: float>[3]
  child 0, item: float
----
text: [["A small brown dog running.","A cat resting quietly on a sofa.","A large gray elephant drinking water.","A fast cheetah sprinting across the savannah.","A colorful parrot perched on a branch."]]
vector: [[[0.12,0.85,0.33],[0.4,0.91,0.1],[0.88,0.22,0.55],[0.95,0.12,0.72],[0.25,0.66,0.81]]]

In [8]:
table_animals.to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"


In [9]:
more_data = [{
    "text": "A panda eating bamboo peacefully.", "vector": [0.51, 0.37, 0.82]},
    {"text": "A lion roaring loudly on a rock", "vector": [0.93, 0.18, 0.41]}
    ]


table_animals.add(more_data)


AddResult(version=5)

In [10]:
table_animals.to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock,"[0.93, 0.18, 0.41]"


## Create an empty table and then delete it

In [11]:
from lancedb.pydantic import LanceModel


# A pydantic model base class that can be converted to a Lancedb Table
# LanceModel

class JokeSchema(LanceModel):
    joke: str
    rating: int 
    
db.create_table(name= "jokes", schema= JokeSchema, exist_ok= True)

db





LanceDBConnection(uri='c:\\Users\\MAER\\Documents\\STI\\dataplattform_maskininlärning_artificiell_intelligens\\AI_engineering_Marcus_Ericsson_de24\\video_alongs\\09_lancedb_vector_database\\vector_database')

In [12]:
db.table_names()

['animal_text', 'jokes']

In [13]:
db.drop_table("jokes")

In [14]:
db.table_names()

['animal_text']

## open existing table

In [15]:
db.open_table("animal_text").head(2)

pyarrow.Table
text: string
vector: fixed_size_list<item: float>[3]
  child 0, item: float
----
text: [["A small brown dog running.","A cat resting quietly on a sofa."]]
vector: [[[0.12,0.85,0.33],[0.4,0.91,0.1]]]

## Vector search LanceDB

In [16]:
table_animals.to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock,"[0.93, 0.18, 0.41]"


In [17]:
query_vector = [0.5, 0.2, 0.9]

table_animals.search(query_vector).limit(3).to_pandas()


Unnamed: 0,text,vector,_distance
0,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]",0.0354
1,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]",0.2413
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]",0.2673


## Embedding API

- idea: want to put in text -> and it will automagically generate vector embeddings
- put in a query and it will automagically generate vector embeddings
- calculate closest distance

In [18]:


from lancedb.pydantic import LanceModel, Vector 
from lancedb.embeddings import get_registry

model = get_registry().get("gemini-text").create(name= "gemini-embedding-001")

model






GeminiText(max_retries=7, name='gemini-embedding-001', query_task_type='retrieval_query', source_task_type='retrieval_document')

In [21]:
import numpy as np


hello_embedding = np.array(model.generate_embeddings("hello"))

hello_embedding.shape




(5, 3072)

In [22]:
model.ndims()

768

In [24]:

class JokeModel(LanceModel):
    joke: str = model.SourceField()
    vector: Vector(3072) = model.VectorField()
    


table_jokes = db.create_table("jokes", schema= JokeModel, exist_ok= True)

table_jokes




LanceTable(name='jokes', version=1, _conn=LanceDBConnection(uri='c:\\Users\\MAER\\Documents\\STI\\dataplattform_maskininlärning_artificiell_intelligens\\AI_engineering_Marcus_Ericsson_de24\\video_alongs\\09_lancedb_vector_database\\vector_database'))

In [28]:
import pandas as pd

with open("data/jokes.json", "r", encoding= "utf-8") as file:
    jokes_data = json.loads(file.read())
    
    
df_jokes = pd.DataFrame(jokes_data).rename({"jokes": "joke"}, axis=1)

In [29]:
df_jokes["joke"].iloc[0]

'Parallel lines have so much in common—it’s sad they’ll never meet.'

add data to table

In [30]:
table_jokes.add(df_jokes)

AddResult(version=2)

In [31]:
table_jokes.head()

pyarrow.Table
joke: string not null
vector: fixed_size_list<item: float>[3072]
  child 0, item: float
----
joke: [["Parallel lines have so much in common—it’s sad they’ll never meet.","ETL stands for “Extract, Transform, Leave for the next person.”","What do you call a snake that runs your scripts? A py-thon (ton of errors).","Gold walks into a bar. The bartender says, “Au, get out!”","C# devs don’t argue; they just throw exceptions."]]
vector: [[[-0.024001757,0.01247358,-0.024144737,-0.06704516,0.017059995,...,0.011402721,-0.017770408,0.011606138,0.0004488597,0.012175956],[-0.015356114,0.0211365,-0.021389864,-0.07957475,0.008829045,...,0.023671096,0.00070166244,0.003873497,0.0006493248,0.01587487],[-0.01761013,0.0031474787,-0.015632002,-0.060952485,0.003696307,...,0.015392491,-0.00025712984,0.0064440705,0.0017164479,-0.00869067],[-0.024867292,0.013314825,-0.016261652,-0.07383461,0.0085606,...,0.031690735,-0.0027016324,0.017647061,0.008275158,-0.0067860777],[-0.0068662865,-0.005415149,

In [35]:
table_jokes.to_pandas()["vector"][0]

array([-0.02400176,  0.01247358, -0.02414474, ...,  0.01160614,
        0.00044886,  0.01217596], shape=(3072,), dtype=float32)

## Perform vector search

In [37]:
table_jokes.search("data engineering jokes").limit(8).to_pandas()

Unnamed: 0,joke,vector,_distance
0,Why do data engineers hate nature? Too many un...,"[-0.027916763, 0.0047387416, -0.018934403, -0....",0.461179
1,"Data engineer motto: If it works, don’t touch ...","[-0.020296954, 0.020327171, -0.009069326, -0.0...",0.542293
2,What do you call a snake that runs your script...,"[-0.01761013, 0.0031474787, -0.015632002, -0.0...",0.655434
3,"Gold walks into a bar. The bartender says, “Au...","[-0.024867292, 0.013314825, -0.016261652, -0.0...",0.667807
4,I tried to explain async/await to my friend… n...,"[-0.016174542, 0.012602222, -0.015254588, -0.0...",0.679036
5,I asked the data lake if it had my file. It sa...,"[-0.026169129, 0.01796462, -0.013938847, -0.09...",0.680632
6,Why did the Python programmer get bitten? Beca...,"[-0.021944793, 0.0030636177, -0.019837778, -0....",0.684943
7,I told a chemistry joke… there was no reaction.,"[-0.022922393, 0.017959604, -0.029222224, -0.0...",0.70169


In [38]:
table_jokes.search("Give me some jokes of C#").limit(8).to_pandas()

Unnamed: 0,joke,vector,_distance
0,The C# compiler walked into a bar. The bartend...,"[-0.01868987, 0.018796643, -0.009748903, -0.07...",0.466637
1,Why did the C# developer go broke? He kept usi...,"[-0.02175711, 0.0073709465, -0.015515881, -0.0...",0.499808
2,Why is C# like a musical? It has so many classes.,"[-0.012926053, 0.0034431089, -0.017093537, -0....",0.532261
3,C# devs don’t argue; they just throw exceptions.,"[-0.0068662865, -0.005415149, 0.0044965413, -0...",0.618182
4,C# programmers love coffee—they're always work...,"[-0.0079965545, 0.0151333995, -0.009256328, -0...",0.63291
5,Why did the Python programmer get bitten? Beca...,"[-0.021944793, 0.0030636177, -0.019837778, -0....",0.640459
6,I tried to explain async/await to my friend… n...,"[-0.016174542, 0.012602222, -0.015254588, -0.0...",0.643772
7,What do you call a snake that runs your script...,"[-0.01761013, 0.0031474787, -0.015632002, -0.0...",0.648257


## Hybrid search

combines traditional keyword-base search with vector similarity search

In [39]:
# To enable keyword search, we need to create an index on the joke column
table_jokes.create_fts_index("joke", replace=True)

In [41]:
from lancedb import rerankers


reranker = rerankers.RRFReranker()

result = table_jokes.search(
    "give me nature related jokes",
    query_type="hybrid",
    vector_column_name="vector",
    fts_columns= "joke"  
).rerank(reranker=reranker).limit(8).to_pandas()


result

Unnamed: 0,joke,vector,_relevance_score
0,Why do data engineers hate nature? Too many un...,"[-0.027916763, 0.0047387416, -0.018934403, -0....",0.032522
1,I told a chemistry joke… there was no reaction.,"[-0.022922393, 0.017959604, -0.029222224, -0.0...",0.032018
2,"Gold walks into a bar. The bartender says, “Au...","[-0.024867292, 0.013314825, -0.016261652, -0.0...",0.016129
3,Why did the chemist ground his kids? Because t...,"[-0.023257235, 0.016145445, -0.029016329, -0.0...",0.015873
4,The C# compiler walked into a bar. The bartend...,"[-0.01868987, 0.018796643, -0.009748903, -0.07...",0.015385
5,What do you call a snake that runs your script...,"[-0.01761013, 0.0031474787, -0.015632002, -0.0...",0.015152
6,Why did the Python programmer get bitten? Beca...,"[-0.021944793, 0.0030636177, -0.019837778, -0....",0.014925
7,Why’s 6 afraid of 7? Because 7 8 9.,"[-0.040820926, 0.0074244644, -0.02972158, -0.0...",0.014706


## Rule of thumb

- for exact matching -> FTS
- meaning based matching -> vector search
- both, unpredicatble or mixed queries -> hybrid search
