# Evaluation

To evaluate our database's performance, we analyze every operator separately and perform an integration test afterwards.

In [1]:
import configparser
import copy
import logging
import random
import pandas as pd
from jinja2.ext import debug
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import classification_report

from models import ModelMgr
from db.structure import Column, Constant, SQLColumn
from db.criteria import Negation, HardEqual, SoftEqual, SoftValidate
from db.operators import Dummy, Scan, Transform, Select, Project, Join, InnerHashJoin, InnerSoftJoin, ColumnTransform
from db.operators.Aggregate import HashAggregate, SumAggregation, DistinctAggregation
from models.text_generation.LLaMA import LLaMATextGenerationModel

from utils import CosineSimilarity, get_config

from models.semantic_validation import GeminiValidationModel, LLaMAValidationModel
from models.embedding import GenericEmbeddingModel, LLaMAEmbeddingModel, SentenceTransformerEmbeddingModel

from db.db import DBConnector

In [2]:
# Load Models
m = ModelMgr()
em = SentenceTransformerEmbeddingModel(m)
sv = LLaMAValidationModel(m)
gm = LLaMATextGenerationModel(m)

# Load DB
db = DBConnector("./config.ini", load_db=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Evaluate Select Operator

### Company Data

We used the [People Data Labs 2019 Global Company Dataset](https://www.kaggle.com/datasets/peopledatalabssf/free-7-million-company-dataset) to evaluate the `SoftEqual`- and `SoftValidate`-Operator.

#### Data Understanding

First, we evaluate the data understanding. The dataset contains a size estimate as ranges ('1 - 10', '5001 - 10000', '10001+').
We test if the `SoftValidate` is able to determine if an arbitrary test number is 1) bigger than, 2) smaller than or 3) in between a given size range.

So, we test the performance of:
* $\sigma_{\checkmark ( \text{ Is x in range \{size\_range\}? })}(Companies)$
* $\sigma_{\checkmark ( \text{ Is x lower than the range \{size\_range\}? })}(Companies)$
* $\sigma_{\checkmark ( \text{ Is x bigger than the range \{size\_range\}? })}(Companies)$

In [17]:
# Determine all size ranges
scan_companies = Scan("companies", em=em, sv=sv, db=db)
agg = HashAggregate(scan_companies, ["size_range"], [DistinctAggregation("size_range", "text")])
size_ranges = {row["size_range"] for row in agg}
size_ranges

{'1 - 10',
 '10001+',
 '1001 - 5000',
 '11 - 50',
 '201 - 500',
 '5001 - 10000',
 '501 - 1000',
 '51 - 200'}

In [36]:
gt =  [
    (8, "=", "1 - 10"), (-5, "<", "1 - 10"), (35, ">", "1 - 10"),
    (35, "=", "11 - 50"), (8, "<", "11 - 50"), (172, ">", "11 - 50"),
    (172, "=", "51 - 200"), (35, "<", "51 - 200"), (354, ">", "51 - 200"),
    (354, "=", "201 - 500"), (172, "<", "201 - 500"), (502, ">", "201 - 500"),
    (502, "=", "501 - 1000"), (354, "<", "501 - 1000"), (3581, ">", "501 - 1000"),
    (3581, "=", "1001 - 5000"), (502, "<", "1001 - 5000"), (6000, ">", "1001 - 5000"),
    (6000, "=", "5001 - 10000"), (3581, "<", "5001 - 10000"), (32100, ">", "5001 - 10000"),
    (32100, "=", "10001+"), (6000, "<", "10001+"),
]


results = []
for row in gt:
    result = {"test_no": row[0], "gt": row[1], "size_range": row[2]}
    data = Dummy("data", ["test_no", "size_range"], [(row[0], row[2])])
    sel_equal = Select(data, SoftValidate("Is {test_no} in range of {size_range}?", sv=sv, full_record=False))
    result["="] = len(list(sel_equal)) > 0
    sel_greater = Select(data, SoftValidate("Is {test_no} lower than the range {size_range}?", sv=sv, full_record=False))
    result[">"] = len(list(sel_greater)) > 0
    sel_lower = Select(data, SoftValidate("Is {test_no} bigger than the range {size_range}", sv=sv, full_record=False))
    result["<"] = len(list(sel_lower)) > 0
    results.append(result)

df_eval = pd.DataFrame(results)
df_eval.head()

Unnamed: 0,test_no,gt,size_range,=,>,<
0,8,=,1 - 10,True,False,False
1,-5,<,1 - 10,False,False,False
2,35,>,1 - 10,False,False,False
3,35,=,11 - 50,True,True,False
4,8,<,11 - 50,True,False,False


In [37]:
print("Classification for {test_no} in range of {size_range}")
print(classification_report(df_eval["gt"] == "=", df_eval["="]))

Classification for {test_no} in range of {size_range}
              precision    recall  f1-score   support

       False       1.00      0.47      0.64        15
        True       0.50      1.00      0.67         8

    accuracy                           0.65        23
   macro avg       0.75      0.73      0.65        23
weighted avg       0.83      0.65      0.65        23



In [38]:
print("Classification for {test_no} lower than range {size_range}")
print(classification_report(df_eval["gt"] == ">", df_eval[">"]))

Classification for {test_no} lower than range {size_range}
              precision    recall  f1-score   support

       False       0.60      0.19      0.29        16
        True       0.28      0.71      0.40         7

    accuracy                           0.35        23
   macro avg       0.44      0.45      0.34        23
weighted avg       0.50      0.35      0.32        23



In [39]:
print("Classification for {test_no} bigger than range {size_range}")
print(classification_report(df_eval["gt"] == "<", df_eval["<"]))

Classification for {test_no} bigger than range {size_range}
              precision    recall  f1-score   support

       False       0.56      0.60      0.58        15
        True       0.14      0.12      0.13         8

    accuracy                           0.43        23
   macro avg       0.35      0.36      0.36        23
weighted avg       0.42      0.43      0.43        23



#### Domain Knowledge

To evaluate the domain knowledge of the operators (and the LLM), we predict if a company has 'automotive' as industry.
Therefore, we collect all big companies (`size > 1000`) as ground truth and store the classification value (true for all rows with 'automotive'=industry).

To generate the predictions, we execute two other queries and classify the row with `True` if the query returns a result. To avoid data leakage, we remove the 'industry' column first.
 * `SoftEqual`-Operator: $ \sigma_{\text{(name, country, size\_range, locality, year\_founded}) \approx \text{'car company'}} (Companies)$
 * `SoftValidate`-Operator: $\sigma_{\checkmark ( \text{Is this company record:  \{name: \{name\}, country:\{country\}, size\_range: \{size\_range\}, locality: \{locality\}, year\_founded: \{year\_founded\}\} of a car company? })}(Companies)$

In [76]:
scan_companies = Scan("companies", em=em, sv=sv, db=db, sql_annex=f"  WHERE size_range IN ('10001+', '1001 - 5000')")
gt = [row for row in scan_companies]
len(gt), gt[:3]

(26632,
 [RealDictRow([('id', 1537541),
               ('name', 'british american tobacco'),
               ('domain', 'bat.com'),
               ('year_founded', 1902),
               ('industry', 'tobacco'),
               ('size_range', '10001+'),
               ('locality', 'london, greater london, united kingdom'),
               ('country', 'united kingdom'),
               ('linkedin_url',
                'linkedin.com/company/british-american-tobacco'),
               ('current_employee_estimate', '15511'),
               ('total_employee_estimate', '46381')]),
  RealDictRow([('id', 3678648),
               ('name', 'nordea'),
               ('domain', 'nordea.com'),
               ('year_founded', None),
               ('industry', 'banking'),
               ('size_range', '10001+'),
               ('locality', 'ase, vasternorrlands lan, sweden'),
               ('country', 'sweden'),
               ('linkedin_url', 'linkedin.com/company/nordea'),
               ('current_empl

In [84]:
p = Project(scan_companies, [x.column_name for x in scan_companies.table.table_structure if x.column_name != 'industry'], em=em)
sel = Select(p, SoftEqual(["name", "country", "size_range", "locality", "year_founded"], Constant('car company'), em=em, threshold=0.3))
str(sel)

σ_{name, country, size_range, locality, year_founded ≈ 'car company'} (π_{"country"≈>country, "current_employee_estimate"≈>current_employee_estimate, "domain"≈>domain, "id"≈>id, "linkedin_url"≈>linkedin_url, "locality"≈>locality, "name"≈>name, "size_range"≈>size_range, "total_employee_estimate"≈>total_employee_estimate, "year_founded"≈>year_founded} ("companies"≈>"people_data_labs.companies"))


In [None]:
prediction = [row for row in sel]
len(prediction), prediction[0:3]

In [78]:
df_gt = pd.DataFrame(gt).set_index('id')
df_prediction = pd.DataFrame(prediction).set_index('id')
df_merge = df_gt.merge(df_prediction, left_index=True, right_index=True, how='left')
df_merge["class"] = df_merge["industry"] == "automotive"
df_merge["prediction"] = df_merge["name_y"].notna()

print(classification_report(df_merge["class"], df_merge["prediction"]))

              precision    recall  f1-score   support

       False       0.99      0.89      0.94     26112
        True       0.10      0.59      0.17       520

    accuracy                           0.89     26632
   macro avg       0.55      0.74      0.56     26632
weighted avg       0.97      0.89      0.93     26632



In [85]:
p = Project(scan_companies, [x.column_name for x in scan_companies.table.table_structure if x.column_name != 'industry'], em=em)
sel = Select(p, SoftValidate("Is this company record: {{name: {name}, country: {country}, size_range: {size_range}, locality: {locality}, year_founded: {year_founded}}} of a car company? ", sv=sv, full_record=False))
str(sel)

'σ_{✓_{Is this company record: {{name: {name}, country: {country}, size_range: {size_range}, locality: {locality}, year_founded: {year_founded}}} of a car company? }} (π_{"country"≈>country, "current_employee_estimate"≈>current_employee_estimate, "domain"≈>domain, "id"≈>id, "linkedin_url"≈>linkedin_url, "locality"≈>locality, "name"≈>name, "size_range"≈>size_range, "total_employee_estimate"≈>total_employee_estimate, "year_founded"≈>year_founded} ("companies"≈>"people_data_labs.companies"))'

In [None]:
prediction = [row for row in sel]
len(prediction), prediction[0:3]

In [81]:
df_gt = pd.DataFrame(gt).set_index('id')
df_prediction = pd.DataFrame(prediction).set_index('id')
df_merge = df_gt.merge(df_prediction, left_index=True, right_index=True, how='left')
df_merge["class"] = df_merge["industry"] == "automotive"
df_merge["prediction"] = df_merge["name_y"].notna()

print(classification_report(df_merge["class"], df_merge["prediction"]))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99     26112
        True       0.94      0.10      0.18       520

    accuracy                           0.98     26632
   macro avg       0.96      0.55      0.58     26632
weighted avg       0.98      0.98      0.98     26632

