# Mixed value repair example

This notebook walks through how to use the `EmbeddingFeaturizer` to perform mixed value repair (i.e., repair on a dataset that has both numerical and categorical columns).

In [1]:
import sys
# Add the parent directory to the import path so the imports below resolve.
# Assumes the notebook is run from a direct subdirectory of the repository
# root — TODO confirm.
sys.path.append('../')

import holoclean
# NOTE(review): wildcard imports hide where names come from. The detectors
# (NullDetector, ViolationDetector) and featurizers (EmbeddingFeaturizer,
# ConstraintFeaturizer) used later presumably come from these two modules —
# consider importing them explicitly once confirmed.
from detect import *
from repair.featurize import *

## 1. Setup a HoloClean session.

In [6]:
# All session settings gathered in one place so they are easy to scan and tune.
# NOTE(review): the recorded output of this cell contains a DEBUG line that
# echoes every session parameter, including the database password — clear the
# output before sharing the notebook.
session_params = dict(
    db_name='holo',
    domain_thresh_1=0.0,
    domain_thresh_2=0.0,
    max_domain=10000,
    cor_strength=0.6,
    weight_decay=0.,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    # Evaluates to 180000; presumably milliseconds (3 minutes) — TODO confirm.
    timeout=3 * 60000,
)

# The `.session` attribute is the object every later step is called on.
hc = holoclean.HoloClean(**session_params).session

11:49:48 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'layer_sizes': [1], 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 20, 'weight_decay': 0.0, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.9, 'domain_thresh_1': 0.0, 'domain_thresh_2': 0.0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.3, 'feature_norm': False, 'weight_norm': False, 'estimator_type': 'NaiveBayes', 'estimator_epochs': 10, 'estimator_batch_size': 32, 'estimator_embedding_size': 10, 'train_attrs': None, 'infer_mode': 'dk', 'verbose': True, 'bias': False, 'print_fw': False, 'debug_mode': False}


## 2. Load training data and denial constraints

We load the hospital dataset, which contains two numerical columns: `Sample` and `Score`.

We specify which attributes are numerical.

In [8]:
# Input files and the columns to be treated as numerical.
hospital_csv = '../testdata/hospital/hospital_numerical.csv'
constraints_txt = '../testdata/hospital/hospital_constraints.txt'
numerical_columns = ['Score', 'Sample']

# Load the raw table under the name 'hospital', flagging the numerical columns.
hc.load_data('hospital', hospital_csv, numerical_attrs=numerical_columns)

# Load the denial constraints, then attach them to the dataset.
hc.load_dcs(constraints_txt)
loaded_constraints = hc.get_dcs()
hc.ds.set_constraints(loaded_constraints)

11:53:30 - [ INFO] - Loaded 1000 rows with 18000 cells
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [DEBUG] - Time to create index: 0.00 secs
11:53:30 - [ INFO] - DONE Loading hospital_n

## 3. Detect erroneous cells using these two detectors.

In [9]:
# Instantiate one detector of each kind, in the same order as before:
# NullDetector first, then ViolationDetector.
detectors = [detector_cls() for detector_cls in (NullDetector, ViolationDetector)]

# Run every detector against the loaded dataset.
hc.detect_errors(detectors)

11:54:17 - [DEBUG] - DONE with Error Detector: NullDetector in 0.08 secs
11:54:17 - [DEBUG] - Preparing to execute 15 queries.
11:54:17 - [DEBUG] - Starting to execute query SELECT t1._tid_ FROM "hospital" as t1 WHERE   EXISTS (SELECT t2._tid_ FROM "hospital" as t2 WHERE t1."Condition"=t2."Condition" AND t1."MeasureName"=t2."MeasureName" AND t1."HospitalType"<>t2."HospitalType" AND t1."Condition"!='_nan_' AND t1."MeasureName"!='_nan_' AND t1."HospitalType"!='_nan_' AND t2."Condition"!='_nan_' AND t2."MeasureName"!='_nan_' AND t2."HospitalType"!='_nan_') with id 0
11:54:18 - [DEBUG] - Time to execute query with id 0: 0.00 secs
11:54:18 - [DEBUG] - Starting to execute query SELECT t1._tid_ FROM "hospital" as t1 WHERE   EXISTS (SELECT t2._tid_ FROM "hospital" as t2 WHERE t1."HospitalName"=t2."HospitalName" AND t1."ZipCode"<>t2."ZipCode" AND t1."HospitalName"!='_nan_' AND t1."ZipCode"!='_nan_' AND t2."HospitalName"!='_nan_' AND t2."ZipCode"!='_nan_') with id 1
11:54:18 - [DEBUG] - Time to 

## 4. Quantize numerical columns to generate a better domain (optional)

Generating domains requires computing co-occurrence statistics between columns. Co-occurrence is not well defined for numerical columns, so we can optionally quantize them into bins, which allows for better domain generation from the co-occurrence statistics.

In [11]:
# Quantize each of Score and Sample into 50 bins.
# Each tuple is (number of bins, [columns quantized together as one group]).
# The trailing semicolon suppresses the cell's return-value display. The
# previous `_ = ...` form rebound IPython's special last-output variable `_`,
# after which IPython stops updating `_`/`__`/`___` for the rest of the session.
hc.quantize_numericals([(50, ['Score']), (50, ['Sample'])]);

11:56:24 - [ INFO] - DONE with quantization
11:56:24 - [DEBUG] - Time to quantize the dataset: 0.41 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00 secs
11:56:25 - [DEBUG] - Time to create index: 0.00

## 5. Generate the domain.

In [12]:
# Generate the domain for the session. Per the recorded log below, this step
# computes attribute correlations and then reports domain-size statistics and
# per-attribute domain counts.
hc.generate_domain()

11:57:32 - [DEBUG] - Computing correlations...
11:57:36 - [DEBUG] - correlations:
attr              ProviderNumber  HospitalName  Address1      City     State  \
cond_attr                                                                      
Address1                0.973058      0.977819  1.000000  0.968957  0.926332   
City                    0.927848      0.932173  0.927911  1.000000  0.928117   
Condition               0.149259      0.141563  0.156809  0.185834  0.835806   
CountyName              0.881920      0.885445  0.882335  0.923666  0.924064   
EmergencyService        0.205623      0.202773  0.214343  0.243758  0.832506   
HospitalName            0.973002      1.000000  0.972486  0.968905  0.929428   
HospitalOwner           0.537764      0.537251  0.541772  0.548234  0.864220   
HospitalType            0.119127      0.115310  0.129153  0.157915  0.832457   
MeasureCode             0.174872      0.166578  0.185173  0.214309  0.851806   
MeasureName             0.183367      

11:57:39 - [DEBUG] - generating initial set of un-pruned domain values...
100%|██████████| 1000/1000 [00:02<00:00, 425.42it/s]
11:57:42 - [DEBUG] - domain size stats: count    16744.000000
mean         5.005017
std          2.645148
min          2.000000
25%          3.000000
50%          5.000000
75%          6.000000
max         13.000000
Name: domain_size, dtype: float64
11:57:42 - [DEBUG] - domain count by attr: HospitalName        1000
State               1000
CountyName          1000
MeasureName         1000
City                1000
PhoneNumber         1000
Address1            1000
Stateavg            1000
ZipCode             1000
MeasureCode         1000
EmergencyService    1000
ProviderNumber      1000
HospitalType        1000
Condition           1000
HospitalOwner       1000
Sample               934
Score                810
Name: attribute, dtype: int64
11:57:42 - [DEBUG] - DONE generating initial set of domain values in 2.17s
11:57:46 - [DEBUG] - Time to create index: 0.00 se

## 6. Use EmbeddingFeaturizer to repair errors

The `EmbeddingFeaturizer` is an autoencoder model that supports both regression for numerical columns and classification for categorical columns.

It can be used as a replacement for the `OccurAttrFeaturizer` (co-occurrence featurizer).

One must specify the numerical groups. Usually each group contains a single numerical attribute, but for multi-dimensional numerical columns such as latitude-longitude it may be useful to specify them as one group of two attributes.

To use the embedding model as a standalone model, simply omit all other featurizers. (The example below additionally uses the `ConstraintFeaturizer`.)

In [20]:
# Each numerical column forms its own single-attribute group.
numerical_groups = [['Score'], ['Sample']]

# Embedding featurizer configured for the two numerical groups, 20 epochs.
embedding_featurizer = EmbeddingFeaturizer(
    numerical_attr_groups=numerical_groups,
    epochs=20,
)

# The embedding featurizer is combined with the constraint featurizer.
featurizers = [embedding_featurizer, ConstraintFeaturizer()]

hc.repair_errors(featurizers)

12:38:05 - [DEBUG] - Time to execute query: 0.00 secs
12:38:05 - [DEBUG] - Time to execute query: 0.00 secs
12:38:05 - [DEBUG] - EmbeddingFeaturizer: training TupleEmbedding model since it was not used as an estimator
12:38:05 - [DEBUG] - EmbeddingFeaturizer: training with 20 epochs and 32 batch size
12:38:06 - [DEBUG] - LookupDataset: init categorical attributes: ['Address1', 'City', 'Condition', 'CountyName', 'EmergencyService', 'HospitalName', 'HospitalOwner', 'HospitalType', 'MeasureCode', 'MeasureName', 'PhoneNumber', 'ProviderNumber', 'State', 'Stateavg', 'ZipCode']
12:38:06 - [DEBUG] - LookupDataset: init numerical attributes: ['Sample', 'Score']
12:38:06 - [DEBUG] - LookupDataset: train categorical attributes: ['Address1', 'City', 'Condition', 'CountyName', 'EmergencyService', 'HospitalName', 'HospitalOwner', 'HospitalType', 'MeasureCode', 'MeasureName', 'PhoneNumber', 'ProviderNumber', 'State', 'Stateavg', 'ZipCode']
12:38:06 - [DEBUG] - LookupDataset: train numerical attribut

12:41:27 - [DEBUG] - Time to execute query with id 3: 0.00 secs
12:41:27 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "hospital" as t1, "hospital" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'ZipCode'   AND  t1."HospitalName"=t2."HospitalName"   AND  t3.rv_val::TEXT <> t2."ZipCode" GROUP BY _vid_, val_id with id 4
12:41:27 - [DEBUG] - Time to execute query with id 4: 0.00 secs
12:41:27 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "hospital" as t1, "hospital" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'HospitalName'   AND  t1."PhoneNumber"<>t2."PhoneNumber"   AND  t3.rv_val::TEXT = t2."HospitalName" GROUP BY _vid_, val_id with id 5
12:41:27 - [DEBUG] - Time to execute query with id 5: 0.00 secs
12:41:27 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "h

12:41:29 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "hospital" as t1, "hospital" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'HospitalOwner'   AND  t1."HospitalName"=t2."HospitalName" AND t1."PhoneNumber"=t2."PhoneNumber" AND t1."State"<>t2."State"   AND  t3.rv_val::TEXT = t2."HospitalOwner" GROUP BY _vid_, val_id with id 23
12:41:29 - [DEBUG] - Time to execute query with id 23: 0.00 secs
12:41:29 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "hospital" as t1, "hospital" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'State'   AND  t1."HospitalName"=t2."HospitalName" AND t1."PhoneNumber"=t2."PhoneNumber" AND t1."HospitalOwner"=t2."HospitalOwner"   AND  t3.rv_val::TEXT <> t2."State" GROUP BY _vid_, val_id with id 24
12:41:30 - [DEBUG] - Time to execute query with id 24: 0.00 secs
12:41:30 - [DE

12:42:27 - [DEBUG] - Time to store repaired dataset: 0.19 secs


## 7. Evaluate the correctness of the results.

In [21]:
# Ground-truth file and the names of its tuple-id, attribute and value columns.
clean_csv = '../testdata/hospital/hospital_numerical_clean.csv'

# Compare the repaired dataset against the ground truth.
report = hc.evaluate(
    fpath=clean_csv,
    tid_col='tid',
    attr_col='attribute',
    val_col='correct_val',
)

12:44:30 - [DEBUG] - Time to create index: 0.00 secs
12:44:30 - [DEBUG] - Time to create index: 0.00 secs
12:44:30 - [ INFO] - DONE Loading hospital_numerical_clean.csv
12:44:30 - [DEBUG] - Time to evaluate repairs: 1.00 secs
12:44:30 - [DEBUG] - Time to execute query: 0.00 secs
12:44:30 - [DEBUG] - Preparing to execute 15 queries.
12:44:30 - [DEBUG] - Starting to execute query SELECT COUNT(*) FROM   (SELECT t2._tid_, t2._attribute_, t2._value_      FROM "hospital" as t1, "hospital_clean" as t2     WHERE t1._tid_ = t2._tid_       AND t2._attribute_ = 'ProviderNumber'       AND NOT t1."ProviderNumber" = ANY(string_to_array(regexp_replace(t2._value_,'[{""}]','','gi'),'|'))) as errors, "inf_values_dom" as repairs WHERE errors._tid_ = repairs._tid_   AND errors._attribute_ = repairs.attribute   AND repairs.rv_value = ANY(string_to_array(regexp_replace(errors._value_,'[{""}]','','gi'),'|')) with id 0
12:44:30 - [DEBUG] - Time to execute query with id 0: 0.00 secs
12:44:30 - [DEBUG] - Starti

12:44:31 - [DEBUG] - Time to execute query with id 12: 0.00 secs
12:44:31 - [DEBUG] - Starting to execute query SELECT COUNT(*) FROM   (SELECT t2._tid_, t2._attribute_, t2._value_      FROM "hospital" as t1, "hospital_clean" as t2     WHERE t1._tid_ = t2._tid_       AND t2._attribute_ = 'MeasureName'       AND NOT t1."MeasureName" = ANY(string_to_array(regexp_replace(t2._value_,'[{""}]','','gi'),'|'))) as errors, "inf_values_dom" as repairs WHERE errors._tid_ = repairs._tid_   AND errors._attribute_ = repairs.attribute   AND repairs.rv_value = ANY(string_to_array(regexp_replace(errors._value_,'[{""}]','','gi'),'|')) with id 13
12:44:31 - [DEBUG] - Time to execute query with id 13: 0.00 secs
12:44:31 - [DEBUG] - Starting to execute query SELECT COUNT(*) FROM   (SELECT t2._tid_, t2._attribute_, t2._value_      FROM "hospital" as t1, "hospital_clean" as t2     WHERE t1._tid_ = t2._tid_       AND t2._attribute_ = 'Stateavg'       AND NOT t1."Stateavg" = ANY(string_to_array(regexp_replace(t

In [22]:
# Display the evaluation report (precision, recall, F1, repair counts, RMSE).
# NOTE(review): in the recorded output, total_repairs_grdt_correct=8.0 and
# total_repairs_grdt_incorrect=419.0 look swapped relative to
# correct_repairs=419.0 — confirm against how EvalReport is populated.
report

EvalReport(precision=0.9812646370023419, recall=0.9208791208791208, repair_recall=0.9632183908045977, f1=0.9501133786848073, repair_f1=0.9721577726218098, detected_errors=435.0, total_errors=455.0, correct_repairs=419.0, total_repairs=427, total_repairs_grdt=427.0, total_repairs_grdt_correct=8.0, total_repairs_grdt_incorrect=419.0, rmse=0.0)