# Helpful links:
Database connection:
https://docs.intersystems.com/irislatest/csp/docbook/DocBook.UI.Page.cls?KEY=BPYNAT_pyapi

SQL:
https://docs.intersystems.com/irislatest/csp/docbook/DocBook.UI.Page.cls?KEY=RSQL_createtable

Data Types:
https://docs.intersystems.com/irislatest/csp/docbook/DocBook.UI.Page.cls?KEY=RSQL_datatype

# Loading the data to a dataframe
This tutorial covers how to use IRIS as a vector database. 

For this tutorial, we will use a dataset of 2.2k online reviews of scotch (
dataset from https://www.kaggle.com/datasets/koki25ando/22000-scotch-whisky-reviews) . With our latest vector database functionality, we can leverage the latest embedding models to run semantic search on the online reviews of scotch whiskeys. In addition, we'll be able to apply filters on columns with structured data. For example, we will be able to search for whiskeys that are priced under $100, and are 'earthy, smooth, and easy to drink'. Let's find our perfect whiskey!

In [1]:
import pandas as pd
import csv


# Load the CSV file
df = pd.read_csv('../data/B-IHOK-AH_AMB-FINAL.csv', sep="|", on_bad_lines="skip")

# View the data
df.head()


Unnamed: 0,ic_amb_zad,ic_amb_karta,ic_pac,dat_zad,cas_zad,prac_od,dg_kod,i_dg_kod,text_dg,i_text_dg,poz_text,amb_zaz_text
0,25775488,3702287,257353,2023-01-09,14:47:00,41742,,C911,,Chronická lymfocytická leukemie z B-buněk ...,,Odběr:09.01.2023 10:53 -----------------------...
1,26043815,4712677,1773067,2023-01-09,13:11:00,41742,,C911,calquence ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 09.01.2023 12:53 Odběr:09.0...
2,26015332,2480460,3436,2023-01-09,15:06:00,41742,,C911,venetoklax příjem IHOK ...,Chronická lymfocytická leukemie z B-buněk ...,,"studie LOXO 20022, pacient číslo 2201-440 plat..."
3,25567942,5809559,2135408,2023-01-09,15:04:00,41742,,C911,kontrola ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 09.01.2023 14:39 Odběr:09.0...
4,25833197,6059137,86335,2023-01-09,15:09:00,41742,,C911,imbruvica ...,Chronická lymfocytická leukemie z B-buněk ...,,Odběr:09.01.2023 10:54 -----------------------...


In [2]:
# Clean data
# Remove the specified columns
#df.drop(['currency'], axis=1, inplace=True)

# Drop the first column
#df.drop(columns=df.columns[0], inplace=True)

# Remove rows without a price
#df.dropna(subset=['price'], inplace=True)

# Ensure values in 'price' are numbers
#df = df[pd.to_numeric(df['price'], errors='coerce').notna()]

# Replace NaN values in other columns with an empty string
df.fillna('', inplace=True)
df.truncate()

# View cleaned data
df.head()

Unnamed: 0,ic_amb_zad,ic_amb_karta,ic_pac,dat_zad,cas_zad,prac_od,dg_kod,i_dg_kod,text_dg,i_text_dg,poz_text,amb_zaz_text
0,25775488,3702287,257353,2023-01-09,14:47:00,41742,,C911,,Chronická lymfocytická leukemie z B-buněk ...,,Odběr:09.01.2023 10:53 -----------------------...
1,26043815,4712677,1773067,2023-01-09,13:11:00,41742,,C911,calquence ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 09.01.2023 12:53 Odběr:09.0...
2,26015332,2480460,3436,2023-01-09,15:06:00,41742,,C911,venetoklax příjem IHOK ...,Chronická lymfocytická leukemie z B-buněk ...,,"studie LOXO 20022, pacient číslo 2201-440 plat..."
3,25567942,5809559,2135408,2023-01-09,15:04:00,41742,,C911,kontrola ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 09.01.2023 14:39 Odběr:09.0...
4,25833197,6059137,86335,2023-01-09,15:09:00,41742,,C911,imbruvica ...,Chronická lymfocytická leukemie z B-buněk ...,,Odběr:09.01.2023 10:54 -----------------------...


# IRIS database operations

In [3]:
import iris
import time
import os

## Database connection settings

In [4]:
username = 'demo'
password = 'demo'
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '1972' 
namespace = 'USER'
CONNECTION_STRING = f"{hostname}:{port}/{namespace}"

In [5]:
print(CONNECTION_STRING)

localhost:1972/USER


In [6]:
# Note: Ideally conn and cursor should be used with context manager or with try-execpt-finally 
conn = iris.connect(CONNECTION_STRING, username, password)
cursor = conn.cursor()

## Creating a new table

In [7]:
tableName = "SchemaName.TableName"
tableDefinition = """(ic_amb_zad INT, ic_amb_karta INT, ic_pac INT, 
                    dat_zad DATE, cas_zad TIME, prac_od INT, dg_kod MEDIUMTEXT, i_dg_kod MEDIUMTEXT, 
                    text_dg MEDIUMTEXT, i_text_dg MEDIUMTEXT, poz_text MEDIUMTEXT, amb_zaz_text LONGTEXT)"""

In [8]:
try:
    cursor.execute(f"DROP TABLE {tableName}")  
except:
    pass
cursor.execute(f"CREATE TABLE {tableName} {tableDefinition}")

0

This created table can be viewed by going to the IRIS management portal at: http://localhost:52773/csp/sys/UtilHome.csp 

Note: When you create a table and specify the desired data fields, a RowID field is automatically created
https://docs.intersystems.com/irisforhealthlatest/csp/docbook/DocBook.UI.Page.cls?KEY=GSQL_tables#GSQL_tables_idfield 

## Deleting rows

In [9]:
sql = f"DELETE FROM {tableName} WHERE ID = 1"
cursor.execute(sql) 

0

## Adding data one row at a time by looping over the dataframe

In [10]:
try:
    cursor.execute(f"DROP TABLE {tableName}")  
except:
    pass
cursor.execute(f"CREATE TABLE {tableName} {tableDefinition}")

0

In [11]:
##looping through dataframe and adding all the data to IRIS table
sql = f"""Insert into {tableName} (ic_amb_zad, ic_amb_karta, ic_pac, 
                    dat_zad, cas_zad, prac_od, dg_kod, i_dg_kod, 
                    text_dg, i_text_dg, poz_text, amb_zaz_text) values (?,?,?,?,?,?,?,?,?,?,?,?)"""
start_time = time.time()
for index,row in df.iterrows():
    data = (row['ic_amb_zad'], row['ic_amb_karta'], row['ic_pac'], row['dat_zad'], row['cas_zad'], row['prac_od'], row['dg_kod'], row['i_dg_kod'], row['text_dg'], row['i_text_dg'], row['poz_text'], row['amb_zaz_text'])
    cursor.execute(sql, data)
end_time = time.time()
print(f"time taken to add {len(df)} entries: {end_time-start_time} seconds")

time taken to add 5572 entries: 2.5327608585357666 seconds


## Reading from the table

In [12]:
## Read only select columns
cursor.execute(f"select * from {tableName}")
fetched_data = cursor.fetchmany(3)
for row in fetched_data:
    print(row)

(25775488, 3702287, 257353, datetime.date(2023, 1, 9), datetime.time(14, 47), 41742, '', 'C911 ', '', 'Chronická lymfocytická leukemie z B-buněk                   ', '', 'Odběr:09.01.2023 10:53 ------------------------------------------------------------  Ret% 1.67 %   Reta 59.50 10^9/l   IRF 0.258    RET-He 35.3 pg   WBC 27.700 10^9/l   RBC 3.56 10^12/l   HGB 122.0 g/l   HCT 0.360 l/l MCV 101.7 fL   PLT 129.0 10^9/l   MCH 34.3 pg   MCHC 337.0 g/l   MPV 9.30 fl   RDW 14.2 %   NEU% 1.60 %   LYM% 94.40 %   MON% 3.40 % EOS% 0.30 %   BAS% 0.30 %   NEU 0.480 10^9/l   LYM 26.150 10^9/l MONO 0.930 10^9/l   EOS 0.070 10^9/l   BASO 0.070 10^9/l   NRBC 0.04 Odběr:09.01.2023 10:52 Urea S-Urea 5.5 mmol/l  (2.8-8.1) Kreat. S-Kreatinin 91 umol/l  (59-104) CKD-EPI CKD-EPI Krea 1.39 ml/s/1.73m2  (1-2.4) KM S-Kys.močová 417 umol/l  (202-417) Na S-Na 143 mmol/l  (136-145) K S-K 4.4 mmol/l  (3.5-5.1) Cl S-Cl 107 mmol/l  (98-107) Bi-celk. S-Bilirubin celk 13.1 umol/l  (2-21) ALT S-ALT 0.17 ukat/l  (0.17-0

In [13]:
##fetching all columns from database
cursor.execute(f"select * from {tableName}")
fetched_data = cursor.fetchmany(3)
for row in fetched_data:
    print(row)

(25775488, 3702287, 257353, datetime.date(2023, 1, 9), datetime.time(14, 47), 41742, '', 'C911 ', '', 'Chronická lymfocytická leukemie z B-buněk                   ', '', 'Odběr:09.01.2023 10:53 ------------------------------------------------------------  Ret% 1.67 %   Reta 59.50 10^9/l   IRF 0.258    RET-He 35.3 pg   WBC 27.700 10^9/l   RBC 3.56 10^12/l   HGB 122.0 g/l   HCT 0.360 l/l MCV 101.7 fL   PLT 129.0 10^9/l   MCH 34.3 pg   MCHC 337.0 g/l   MPV 9.30 fl   RDW 14.2 %   NEU% 1.60 %   LYM% 94.40 %   MON% 3.40 % EOS% 0.30 %   BAS% 0.30 %   NEU 0.480 10^9/l   LYM 26.150 10^9/l MONO 0.930 10^9/l   EOS 0.070 10^9/l   BASO 0.070 10^9/l   NRBC 0.04 Odběr:09.01.2023 10:52 Urea S-Urea 5.5 mmol/l  (2.8-8.1) Kreat. S-Kreatinin 91 umol/l  (59-104) CKD-EPI CKD-EPI Krea 1.39 ml/s/1.73m2  (1-2.4) KM S-Kys.močová 417 umol/l  (202-417) Na S-Na 143 mmol/l  (136-145) K S-K 4.4 mmol/l  (3.5-5.1) Cl S-Cl 107 mmol/l  (98-107) Bi-celk. S-Bilirubin celk 13.1 umol/l  (2-21) ALT S-ALT 0.17 ukat/l  (0.17-0

# Adding vector embeddings to the table

## Create embeddings for the descriptions and add to the dataframe

In [13]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence transformer model. This model's output vectors are of size 384
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [14]:
# Generate embeddings for all descriptions at once. Batch processing makes it faster
embeddings = model.encode(df['amb_zaz_text'].tolist(), normalize_embeddings=True)

# Add the embeddings to the DataFrame
df['amb_zaz_text_vector'] = embeddings.tolist()

In [25]:
df

Unnamed: 0,ic_amb_zad,ic_amb_karta,ic_pac,dat_zad,cas_zad,prac_od,dg_kod,i_dg_kod,text_dg,i_text_dg,poz_text,amb_zaz_text,amb_zaz_text_vector
0,25775488,3702287,257353,2023-01-09,14:47:00,41742,,C911,,Chronická lymfocytická leukemie z B-buněk ...,,Odběr:09.01.2023 10:53 -----------------------...,"[0.0362723171710968, 0.01610576920211315, 0.03..."
1,26043815,4712677,1773067,2023-01-09,13:11:00,41742,,C911,calquence ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 09.01.2023 12:53 Odběr:09.0...,"[0.040446750819683075, 0.12473584711551666, 0...."
2,26015332,2480460,3436,2023-01-09,15:06:00,41742,,C911,venetoklax příjem IHOK ...,Chronická lymfocytická leukemie z B-buněk ...,,"studie LOXO 20022, pacient číslo 2201-440 plat...","[0.05710926279425621, 0.013156612403690815, -0..."
3,25567942,5809559,2135408,2023-01-09,15:04:00,41742,,C911,kontrola ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 09.01.2023 14:39 Odběr:09.0...,"[0.054597243666648865, 0.07014299929141998, 0...."
4,25833197,6059137,86335,2023-01-09,15:09:00,41742,,C911,imbruvica ...,Chronická lymfocytická leukemie z B-buněk ...,,Odběr:09.01.2023 10:54 -----------------------...,"[0.0338958241045475, 0.0340065136551857, 0.037..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,27493925,6395394,19532,2023-12-28,12:42:00,41742,,C911,"LOXO, jen odběr KO ...",Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 28.12.2023 08:41 C10 Studie...,"[0.06402770429849625, 0.05141748860478401, 0.0..."
5568,27551604,5302898,2325924,2023-12-28,12:38:00,41742,,C911,pouze odběr KO a odejít ...,Chronická lymfocytická leukemie z B-buněk ...,,Počátek vyšetření: 28.12.2023 08:35 LOXO-BTK-2...,"[0.035097360610961914, -0.0036271417047828436,..."
5569,27551590,4465360,374750,2023-12-28,12:43:00,41742,,C911,pouze odběr KO a odejít ...,Chronická lymfocytická leukemie z B-buněk ...,,"C10D1 studie LOXO-BTK-20030, subj 2201-405 ---...","[0.010818377137184143, 0.03816291689872742, 0...."
5570,27565846,6849977,2100404,2023-12-29,09:09:00,41742,,C911,,Chronická lymfocytická leukemie z B-buněk ...,,Pac. s CLL/SLL léčená původně v r 2020 cestou ...,"[-0.04853430017828941, 0.03359058126807213, -0..."


## Delete and create the table in IRIS again to add embeddings
### (Note: Alternately, the Alter Table command can be used to just add the new embeddings column : https://docs.intersystems.com/irislatest/csp/docbook/DocBook.UI.Page.cls?KEY=RSQL_altertable#RSQL_altertable_synopsis )

In [18]:
try:
    cursor.execute(f"DROP TABLE {tableName}")  
except:
    pass

## note the additional description_vector VECTOR(DOUBLE, 384) field to store embeddings
tableDefinition = """(ic_amb_zad INT, ic_amb_karta INT, ic_pac INT, 
                    dat_zad DATE, cas_zad DATETIME, prac_od INT, dg_kod MEDIUMTEXT, i_dg_kod MEDIUMTEXT, 
                    text_dg MEDIUMTEXT, i_text_dg MEDIUMTEXT, poz_text MEDIUMTEXT, amb_zaz_text LONGTEXT, amb_zaz_text_vector VECTOR(DOUBLE, 384))"""
cursor.execute(f"CREATE TABLE {tableName} {tableDefinition}")

0

## Add all the rows to the table in IRIS

### Adding the entire dataframe as a single batch (faster)

In [26]:
sql = f"""Insert into {tableName} (ic_amb_zad, ic_amb_karta, ic_pac, 
                    dat_zad, cas_zad, prac_od, dg_kod, i_dg_kod, 
                    text_dg, i_text_dg, poz_text, amb_zaz_text, amb_zaz_text_vector) values (?,?,?,?,?,?,?,?,?,?,?,?, TO_VECTOR(?))"""

start_time = time.time()
# Prepare the list of tuples (parameters for each row)
data = [
    (
        row['ic_amb_zad'], row['ic_amb_karta'], row['ic_pac'], row['dat_zad'], row['cas_zad'], row['prac_od'], row['dg_kod'], row['i_dg_kod'], row['text_dg'], row['i_text_dg'], row['poz_text'], row['amb_zaz_text'],
        str(row['amb_zaz_text_vector']) 
    )
    for index, row in df.iterrows()
]

cursor.executemany(sql, data)
end_time = time.time()
print(f"time taken to add {len(df)} entries: {end_time-start_time} seconds")

time taken to add 5572 entries: 4.309087753295898 seconds


In [35]:
df.iloc[12]

ic_amb_zad                                                      26053765
ic_amb_karta                                                     4402239
ic_pac                                                           2106293
dat_zad                                                       2023-01-02
cas_zad                                                         08:46:00
prac_od                                                            41742
dg_kod                                                                  
i_dg_kod                                                           C911 
text_dg                                                                 
i_text_dg              Chronická lymfocytická leukemie z B-buněk     ...
poz_text                                                                
amb_zaz_text           Počátek vyšetření: 02.01.2023 08:45 pacient ze...
amb_zaz_text_vector    [0.01057382021099329, 0.027324587106704712, -0...
Name: 12, dtype: object

In [33]:
## note the TO_VECTOR(?) used to add vector data type to the table
## also note the "str" conversion of the embeddings from the dataframe before they are added to the table in IRIS
# sql = f"Insert into {tableName} (name, category, review_point, price, description, description_vector) values (?,?,?,?,?,TO_VECTOR(?))"
sql = f"""INSERT INTO {tableName} (ic_amb_zad, ic_amb_karta, ic_pac, 
                    dat_zad, cas_zad, prac_od, dg_kod, i_dg_kod, 
                    text_dg, i_text_dg, poz_text, amb_zaz_text, amb_zaz_text_vector) values (?,?,?,?,?,?,?,?,?,?,?,?, TO_VECTOR(?))"""

start_time = time.time()
for index,row in df.iterrows():
    data = [(row['ic_amb_zad'], 
             row['ic_amb_karta'], 
             row['ic_pac'], 
             row['dat_zad'], 
             row['cas_zad'], 
             row['prac_od'], 
             row['dg_kod'], 
             row['i_dg_kod'], 
             row['text_dg'], 
             row['i_text_dg'], 
             row['poz_text'], 
             row['amb_zaz_text'], 
             str(row['amb_zaz_text_vector']))]
    cursor.execute(sql, data)
end_time = time.time()
print(f"time taken to add {len(df)} entries: {end_time-start_time} seconds")

RuntimeError: [SQLCODE: <-104>:<Field validation failed in INSERT, or value failed to convert in DisplayToLogical or OdbcToLogical>]
[Location: <ServerLoop>]
[%msg: <Field 'SchemaName.TableName.amb_zaz_text_vector' (value '3F989CC6329BAEEFC70B9E00D6C1AD50@$vector') failed validation>]

# Using IRIS Vector Search

## Let's look for a scotch that costs less than $100, and has an earthy and creamy taste.

In [76]:
# This is our search phrase
searchPhrase = "Počátek vyšetření"

# Convert search phrase into a vector
searchVector = model.encode(searchPhrase, normalize_embeddings=True).tolist() 

In [75]:
sql = f"""
    SELECT TOP ? VECTOR_DOT_PRODUCT(amb_zaz_text_vector, TO_VECTOR(?)) AS POMOC, *
    FROM {tableName}
    WHERE ic_pac=2127268
    ORDER BY VECTOR_DOT_PRODUCT(amb_zaz_text_vector, TO_VECTOR(?)) DESC 
"""

numberOfResults = 3

# Execute the query with the number of results and search vector as parameters
cursor.execute(sql, [numberOfResults, str(searchVector), str(searchVector)])

# Fetch all results
results = cursor.fetchall()
for row in results:
    print(row[:-1])


RuntimeError: [SQLCODE: <-257>:<Cannot perform vector operation on vectors of different lengths>]
[Location: <ServerLoop - Query Open()>]
[%msg: <Cannot perform vector operation on vectors of different lengths>]