# Demonstration of the Soft-Query-Evaluation-System

## Python & Schema Imports

To ensure reproducibility, we recreate the `demo` schema and insert all records into the database on every run.

In [1]:
import json

from db.db import DBConnector

from db.operators import *
from db.criteria import *

from models import ModelMgr
from models.embedding import SentenceTransformerEmbeddingModel
from models.semantic_validation import DeepSeekValidationModel, LLaMAValidationModel

In [2]:
# Model manager constructed from "config.ini"; both models below are built on top of it.
m = ModelMgr("config.ini")
# Embedding model, passed as `em=` to the Scan and SoftEqual examples further down.
em = SentenceTransformerEmbeddingModel(m)
# Alternative validation model (DeepSeek), currently disabled:
# sv_ds = DeepSeekValidationModel(m)
# Validation model, passed as `sv=` to the Scan and SoftValidate examples further down.
sv = LLaMAValidationModel(m)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Connection used to (re)build the demo schema from the JSON fixture.
db = DBConnector("config.ini")

# The fixture maps each table name to its "schema" (column definitions) and "data" (rows).
with open("evaluation/schema.json", encoding="utf-8") as schema_file:
    schema_text = schema_file.read()
database = json.loads(schema_text)

def cast_schema_type(schema_type: str) -> str:
    """Translate a type name from the JSON schema into an SQL column type.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        schema_type: Type name as written in the schema file; one of
            "string", "text", "integer", "boolean" or "number".

    Returns:
        The SQL type used in the generated CREATE TABLE statement.

    Raises:
        ValueError: If the type name is not one of the supported names.
            ValueError is a subclass of Exception, so callers that catch
            Exception keep working.
    """
    sql_types = {
        "string": "TEXT",
        "text": "TEXT",
        "integer": "INTEGER",
        "boolean": "BOOLEAN",
        "number": "DOUBLE PRECISION",
    }
    normalized = schema_type.lower().strip()
    if normalized not in sql_types:
        raise ValueError("Unknown schema type: " + normalized)
    return sql_types[normalized]

def sanitize_table_name(table_name: str):
    """Turn a free-form name into a lower-case identifier.

    Slashes, spaces and hyphens become underscores; parentheses and dots are
    dropped; the result is lower-cased. No other characters are touched.
    """
    substitutions = str.maketrans({
        "/": "_",
        " ": "_",
        "-": "_",
        "(": None,
        ")": None,
        ".": None,
    })
    return table_name.translate(substitutions).lower()

# Rebuild the `demo` schema from scratch so that every run starts from the same state.
with db.get_cursor() as cursor:
    cursor.execute("DROP SCHEMA IF EXISTS demo CASCADE ;")
    cursor.execute("CREATE SCHEMA demo;")

    for table in database:
        table_name = f"demo.{table}"
        schema = database[table]["schema"]

        # Original column name -> sanitized SQL column name.
        column_map = {x['name']: sanitize_table_name(x['name']) for x in schema}
        columns = [column_map[x['name']] for x in schema]
        types = [cast_schema_type(x['type']) for x in schema]
        table_column = [f"{col} {col_type}" for col, col_type in zip(columns, types)]

        # Identifiers are interpolated into the statement text; only row values
        # are bound as parameters below.
        # NOTE(review): this assumes schema.json is a trusted fixture — confirm.
        cursor.execute(f"CREATE TABLE {table_name} ({', '.join(table_column)})")

        print("Creating Table ", table_name)

        # The INSERT statement is identical for every row of a table, so build it
        # once per table instead of once per row.
        param_names = [f"%({col})s" for col in columns]
        insert_sql = f"INSERT INTO {table_name} VALUES ({', '.join(param_names)})"

        for data in database[table]["data"]:
            # Re-key the row by sanitized column name to match the named placeholders.
            params = {v: data[k] for k, v in column_map.items()}
            cursor.execute(insert_sql, params)

db.conn.commit()

Creating Table  demo.language_detection
Creating Table  demo.chemicals
Creating Table  demo.elements
Creating Table  demo.elements_phase
Creating Table  demo.movies
Creating Table  demo.movies_de
Creating Table  demo.actors
Creating Table  demo.plays_in
Creating Table  demo.top_artists_2023
Creating Table  demo.diseases
Creating Table  demo.diseases_symptom
Creating Table  demo.human_vital_sign
Creating Table  demo.companies_1
Creating Table  demo.companies_2
Creating Table  demo.user_data
Creating Table  demo.random_countries
Creating Table  demo.countries


In [4]:
# Create a new connector now that the demo schema has been populated.
# NOTE(review): `load_db=True` presumably makes the connector load the freshly
# created tables for the operators below — confirm against DBConnector.
# NOTE(review): the connector from the setup cell is rebound here without an
# explicit close — confirm whether DBConnector needs cleanup.
db = DBConnector("config.ini", load_db=True)

## Soft Scan

Demonstrate the semantic search for database tables:
* $Scan(\text{actors}) \rightarrow \text{demo.actors}$
* $Scan(\text{demo movies german}) \rightarrow \text{demo.movies\_de}$

In [5]:
# Semantic table lookup: the search term "actors" carries no schema prefix.
actors_scan = Scan("actors", db=db, em=em, sv=sv, threshold=0.2)
op = actors_scan.open()
# Show which table was matched, together with its first row.
print(op.table.table_schema, op.table.table_name, op.fetch_one())

demo actors RealDictRow({'name': 'Johnny Depp', 'birth_year': 1963})


In [6]:
# Free-text table description; threshold=0 is the lowest threshold used in this notebook.
german_movies_scan = Scan("demo movies german", db=db, em=em, sv=sv, threshold=0)
op = german_movies_scan.open()
# Show which table was matched, together with its first row.
print(op.table.table_schema, op.table.table_name, op.fetch_one())

demo movies_de RealDictRow({'name': 'Fluch der Karibik', 'release': '2003', 'genres': 'Action,Adventure,Fantasy', 'description': 'Waffenschmied Will Turner tut sich mit dem exzentrischen Piraten "Captain" Jack Sparrow zusammen, um seine große Liebe, die Tochter des Gouverneurs, aus den Händen der Piraten zu retten, die früher mit Jack verbündet waren, jetzt aber Untote sind'})


## Soft Select

Demonstrates how the SoftSelect Operator can be used to capture semantic meanings.
* $\sigma_{\text{name, description} \approx_{\tau = 0.7} \text{'First pirates of the Caribbean movie'}}(Movies)$
* $\sigma_{\checkmark(\text{Is '\{name\}' the first pirates of the Caribbean movie})}(Movies)$
* $\sigma_{\checkmark(\text{The chemical \{scientific\_name\} with ph \{ph\} is neutral})}(Chemical)$
* $\sigma_{\checkmark(\text{The chemical \{scientific\_name\} with ph \{ph\} is acidic})}(Chemical)$
* $\sigma_{\checkmark(\text{The chemical \{scientific\_name\} with ph \{ph\} is base})}(Chemical)$

In [7]:
# Exact table name, so the semantic table search is switched off.
movies_scan = Scan("demo.movies", db=db, use_semantic_table_search=False)
# Soft equality between the name/description columns and the query text.
matches_query = SoftEqual(
    ["name", "description"],
    Constant("First pirates of the Caribbean movie"),
    em=em,
    threshold=0.7,
)
op = Select(movies_scan, matches_query).open()
[movie["name"] for movie in op]

['Pirates of the Caribbean: The Curse of the Black Pearl']

In [8]:
# Exact table name, so the semantic table search is switched off.
movies_scan = Scan("demo.movies", db=db, use_semantic_table_search=False)
# NOTE(review): `{name}` is presumably filled in from each row's `name` column — confirm in SoftValidate.
is_first_pirates_movie = SoftValidate("Is '{name}' the first pirates of the Caribbean movie", sv=sv)
op = Select(movies_scan, is_first_pirates_movie).open()
[movie["name"] for movie in op]

['Pirates of the Caribbean: The Curse of the Black Pearl', 'The Matrix']

In [9]:
# Exact table name, so the semantic table search is switched off.
samples_scan = Scan("demo.language_detection", db=db, use_semantic_table_search=False)
# Soft equality between the raw text column and the phrase "Is Dutch".
looks_dutch = SoftEqual(["text"], Constant("Is Dutch"), em=em, threshold=0.3)
op = Select(samples_scan, looks_dutch).open()
# The stored `language` label is shown next to each matched text.
[(sample["text"], sample["language"]) for sample in op]

[('de b begint weer in het noorden van de stad halle op een kruising met de l en de lb de bb loopt als een vierbaans stadsweg in zuidoostelijke richting door halle op een kruising bij halle-haupbahnhof slaat de b in westelijke richting terwijl de b hier begint en samen met de b naar een zuidelijker gelegen kruispunt loopt op dit bewuste kruispunt slaat de b af en loopt in zuidoostelijke richting de stad uit de weg loopt langs gröbers en großkugel waarna net voor de a de deelstaatgrens met saksen volgt',
  'Dutch'),
 ('bij de volkstelling in  werd het aantal inwoners vastgesteld op  in  is het aantal inwoners door het united states census bureau geschat op  een stijging van  ',
  'Dutch'),
 ('in  werd de gemeente kethel en spaland geannexeerd waardoor er ruimte was voor grootschalige woningbouw ten noorden van schiedam hier liggen nu de wijken tuindorp kethel groenoord woudhoek en spaland',
  'Dutch'),
 ('philocaenus jinjaensis is een vliesvleugelig insect uit de familie pteromalidae de

In [10]:
# Exact table name, so the semantic table search is switched off.
chemicals_scan = Scan("demo.chemicals", db=db, use_semantic_table_search=False)
# NOTE(review): the `{...}` placeholders are presumably filled in from the row's columns — confirm in SoftValidate.
is_neutral = SoftValidate("The chemical {scientific_name} with ph {ph} is neutral", sv=sv)
op = Select(chemicals_scan, is_neutral).open()
[(chemical["ph"], chemical["scientific_name"]) for chemical in op]

[('7.0', 'H2O'), ('Neutral', 'CH3OH'), ('7.0', 'C2H5OH'), ('14.0', 'NaOH')]

In [11]:
# Exact table name, so the semantic table search is switched off.
chemicals_scan = Scan("demo.chemicals", db=db, use_semantic_table_search=False)
# NOTE(review): the `{...}` placeholders are presumably filled in from the row's columns — confirm in SoftValidate.
is_acidic = SoftValidate("The chemical {scientific_name} with ph {ph} is acidic", sv=sv)
op = Select(chemicals_scan, is_acidic).open()
[(chemical["ph"], chemical["scientific_name"]) for chemical in op]

[('0.3', 'H2SO4'), ('1.0', 'HCl'), ('Acidic in water', 'Cl2')]

In [12]:
# Exact table name, so the semantic table search is switched off.
chemicals_scan = Scan("demo.chemicals", db=db, use_semantic_table_search=False)
# NOTE(review): the `{...}` placeholders are presumably filled in from the row's columns — confirm in SoftValidate.
is_base = SoftValidate("The chemical {scientific_name} with ph {ph} is base", sv=sv)
op = Select(chemicals_scan, is_base).open()
[(chemical["ph"], chemical["scientific_name"]) for chemical in op]

[('7.0', 'C2H5OH'), ('14.0', 'NaOH'), ('Base', 'C8H10N4O2')]