# A Step-by-Step Guide to Holofusion example

# Setup

Firstly, we import all the module from Holoclean that we will use.

In [1]:
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.errordetector import ErrorDetectors
from holoclean.featurization.featurizer import SignalInit, SignalCooccur, SignalDC, SignalSource
from holoclean.featurization.featurizer import Featurizer
from holoclean.learning.softmax import SoftMax
from holoclean.learning.accuracy import Accuracy
import time

##   Initialization
In this part, we create the Holoclean and Session object that we will use for this example.

In [2]:
holo_obj = HoloClean(mysql_driver = "../holoclean/lib/mysql-connector-java-5.1.44-bin.jar" )
session = Session("Session", holo_obj)
        

## Read Input and DC from file
Test data and the Denial Constraints will be read using the Session's ingestor.
After ingesting the test data will be loaded into MySQL tables along with entries in the a metadata table.

In [3]:
dataset = "../datasets/flights/flight_input_holo.csv"

denial_constraints = "../datasets/flights/flight_constraints.txt"

ground_truth = "../datasets/flights/flights_clean.csv"

# Ingesting Dataset and Denial Constraints
start_time = time.time()
t0 = time.time()
session.ingest_dataset(dataset)
t1 = time.time()
total = t1 - t0


print 'time for ingesting file: ' + str(total) + '\n'
session.denial_constraints(denial_constraints)
print 'Init table'
sql = holo_obj.dataengine.get_table_to_dataframe("Init", session.dataset)
sql.show()

time for ingesting file: 4.22856211662

Init table
+-----+---+---------------+--------------+------------+--------------------+------------+
|index|src|         flight|sched_dep_time|act_dep_time|      sched_arr_time|act_arr_time|
+-----+---+---------------+--------------+------------+--------------------+------------+
|    1| aa|AA-3859-IAH-ORD|     7:10 a.m.|   7:16 a.m.|           9:40 a.m.|   9:32 a.m.|
|    2| aa|AA-1733-ORD-PHX|     7:45 p.m.|   7:58 p.m.|          10:30 p.m.|        null|
|    3| aa|AA-1640-MIA-MCO|     6:30 p.m.|        null|           7:25 p.m.|        null|
|    4| aa| AA-518-MIA-JFK|     6:40 a.m.|   6:54 a.m.|           9:25 a.m.|   9:28 a.m.|
|    5| aa|AA-3756-ORD-SLC|    12:15 p.m.|  12:41 p.m.|           2:45 p.m.|   2:50 p.m.|
|    6| aa| AA-204-LAX-MCO|    11:25 p.m.|        null|12/02/2011 6:55 a.m.|        null|
|    7| aa|AA-3468-CVG-MIA|     7:00 a.m.|   7:25 a.m.|           9:55 a.m.|   9:45 a.m.|
|    8| aa| AA-484-DFW-MIA|     4:15 p.m.|   4:29

## Error Detection
In this part, we create the error detection. The output of this part is the C_dk table that contains all the noisy cells and the C_Clean table that contains the clean cells

In [4]:
t0 = time.time()
err_detector = ErrorDetectors(session.Denial_constraints, holo_obj.dataengine,
                             holo_obj.spark_session, session.dataset)
session.add_error_detector(err_detector)
session.ds_detect_errors()

t1 = time.time()
total = t1 - t0
holo_obj.logger.info('error dectection time: '+str(total)+'\n')
print 'error dectection time: '+str(total)+'\n'

Denial Constraint Queries: 
SELECT table1.index as ind,table2.index as indexT2 FROM df table1,df table2 WHERE (table1.flight=table2.flight AND table1.sched_dep_time<>table2.sched_dep_time)
SELECT table1.index as ind,table2.index as indexT2 FROM df table1,df table2 WHERE (table1.flight=table2.flight AND table1.act_dep_time<>table2.act_dep_time)
SELECT table1.index as ind,table2.index as indexT2 FROM df table1,df table2 WHERE (table1.flight=table2.flight AND table1.sched_arr_time<>table2.sched_arr_time)
SELECT table1.index as ind,table2.index as indexT2 FROM df table1,df table2 WHERE (table1.flight=table2.flight AND table1.act_arr_time<>table2.act_arr_time)
error dectection time: 294.977092981



## Domain Pruning
In this part, we prune the domain. The output of this part is the possible_values tables that contains all the possible values for each cell

In [5]:
t0 = time.time()
pruning_threshold = 0.3
session.ds_domain_pruning(pruning_threshold)

t1 = time.time()
total = t1 - t0
holo_obj.logger.info('domain pruning time: '+str(total)+'\n')
print 'domain pruning time: '+str(total)+'\n'

print 'Possible_values_clean'
sql = holo_obj.dataengine.get_table_to_dataframe("Possible_values_clean", session.dataset)
sql.show()

print 'Possible values dk'
sql = holo_obj.dataengine.get_table_to_dataframe("Possible_values_dk", session.dataset)
sql.show()

domain pruning time: 33.3899209499

Possible_values_clean
+---+----+--------------+---------------+--------+---------+
|vid| tid|     attr_name|       attr_val|observed|domain_id|
+---+----+--------------+---------------+--------+---------+
|261| 956|        flight|UA-2314-ATL-PHL|       1|        2|
|262| 956|sched_dep_time|           None|       0|        1|
|262| 956|sched_dep_time|      2:55 p.m.|       1|        2|
|263| 956|sched_arr_time|           None|       0|        1|
|263| 956|sched_arr_time|      5:00 p.m.|       1|        2|
|263| 956|sched_arr_time|      5:55 p.m.|       0|        3|
|264| 958|sched_dep_time|     12:00 p.m.|       0|        1|
|264| 958|sched_dep_time|           None|       0|        2|
|264| 958|sched_dep_time|     12:10 p.m.|       1|        3|
|265| 962|sched_dep_time|           None|       0|        1|
|265| 962|sched_dep_time|      4:00 p.m.|       0|        2|
|265| 962|sched_dep_time|      3:05 p.m.|       1|        3|
|266| 973|sched_dep_time|  

# Featurization

In this part, we implement the featurization module of holoclean. We choose the signals that we want to use and the output of this part is the featurization table that contains the factors that we will use. For fusion we will only use the Source Signal.

## Feature Signals

In [6]:
t0 = time.time()
source_signal = SignalSource(session.Denial_constraints, holo_obj.dataengine, session.dataset,
                 holo_obj.spark_session, 1, 0)

session.add_featurizer(source_signal)
t1 = time.time()
total = t1 - t0
print "Feature Signal Time:", total

Feature Signal Time: 0.571250200272


We use the signals that we choose in the previous step. The output of this part is the featurization table that contains the factors that we will use in the next step.

In [7]:
t0 = time.time()
session.ds_featurize()

t1 = time.time()

total = t1 - t0

holo_obj.logger.info('featurization time: '+str(total)+'\n')
print 'featurization time: '+str(total)+'\n'

featurization time: 5.72740602493



#  Learning
We create the X-tensor from the feature_clean table and run softmax on it, then we save results to MySQL and output the accuracy

In [8]:
t0 = time.time()
soft = SoftMax(holo_obj.dataengine, session.dataset, holo_obj.spark_session,
                       session.X_training)

soft.logreg()
t1 = time.time()
total = t1 - t0

print 'time for training model: '+str(total)+'\n'


time for training model: 8.11107587814



In this part, we use the new weight, to learn the probabilities for each value for the cells


In [9]:
t0 = time.time()
session.ds_featurize(0)
t1 = time.time()
total = t1 - t0
print 'time for test featurization: ' + str(total) + '\n'

Y = soft.predict(soft.model, session.X_testing, soft.setupMask(0, session.N, session.L))
t1 = time.time()
total = t1 - t0
print 'time for inference: ', total
soft.save_prediction(Y)

soft.repair_init()


print 'Inferred values for dk cells'
sql = holo_obj.dataengine.get_table_to_dataframe("Inferred_values", session.dataset)
sql.show()

flattening = 0

acc = Accuracy(holo_obj.dataengine, ground_truth, session.dataset,
               holo_obj.spark_session)
acc.accuracy_calculation(flattening)


endtime = time.time()
print 'total time: ', endtime - start_time

time for test featurization: 58.2881071568

time for inference:  59.1708080769
Inferred values for dk cells
+------------------+----+--------------+---------------+---+---------+
|       probability| vid|     attr_name|       attr_val|tid|domain_id|
+------------------+----+--------------+---------------+---+---------+
|0.9941208958625793| 200|sched_dep_time|      8:35 a.m.| 44|        2|
|               1.0|   3|  act_dep_time|      7:16 a.m.|  1|        1|
|               1.0| 737|        flight|AA-1733-ORD-PHX|156|        1|
|0.8649541735649109| 512|  act_arr_time|     10:45 a.m.|108|        2|
|               1.0| 756|sched_dep_time|           None|160|        1|
|0.9998480081558228| 822|        flight|UA-2314-ATL-PHL|174|        2|
|0.9591429829597473| 205|  act_dep_time|      3:38 p.m.| 45|        2|
|               1.0| 874|sched_arr_time|           None|184|        1|
|0.8586862683296204|1318|  act_dep_time|     12:41 a.m.|279|        2|
|0.9756609201431274| 525|  act_dep_time|

In [10]:
sql = holo_obj.dataengine.get_table_to_dataframe("Repairs", session.dataset)
sql.show()

+----+-----------------+-------------+---------------+-----------------+--------------+
| tid|     act_arr_time| act_dep_time|         flight|   sched_arr_time|sched_dep_time|
+----+-----------------+-------------+---------------+-----------------+--------------+
|2213|        4:23 p.m.|    1:48 p.m.| UA-664-ORD-PHL|        4:31 p.m.|     1:33 p.m.|
|  13|       11:50 a.m.|   10:19 a.m.|AA-1664-MIA-ATL|       12:10 p.m.|    10:15 a.m.|
| 218|    Not Available|Not Available| AA-649-ORD-SNA|        3:50 p.m.|          null|
| 280|        1:27 p.m.|   10:11 a.m.| AA-616-DFW-DTW|       12:35 p.m.|     9:05 a.m.|
| 148|        6:19 a.m.|    6:11 a.m.|AA-3809-PHX-LAX|        6:40 a.m.|     6:00 a.m.|
| 229|        3:36 p.m.|    2:44 p.m.|AA-3804-PHL-ORD|        4:05 p.m.|          null|
| 348|12/2/11 6:35 a.m.|   11:29 p.m.| AA-204-LAX-MCO|             None|    11:25 p.m.|
| 307|       11:44 a.m.|    8:52 a.m.| AA-431-MIA-SFO|       11:50 a.m.|     8:35 a.m.|
|1085|        9:09 a.m.|    7:56

In [11]:
sql = holo_obj.dataengine.get_table_to_dataframe("Repaired", session.dataset)
sql.show()

+-----+---+---------------+--------------+------------+--------------------+--------------------+
|index|src|         flight|sched_dep_time|act_dep_time|      sched_arr_time|        act_arr_time|
+-----+---+---------------+--------------+------------+--------------------+--------------------+
|    1| aa|AA-3859-IAH-ORD|     7:10 a.m.|   7:16 a.m.|           9:40 a.m.|           9:22 a.m.|
|    2| aa|AA-1733-ORD-PHX|     7:45 p.m.|   7:59 p.m.|          10:30 p.m.|10:30 p.m. (Estim...|
|    3| aa|AA-1640-MIA-MCO|     6:30 p.m.|        None|           7:25 p.m.|                None|
|    4| aa| AA-518-MIA-JFK|     6:40 a.m.|   6:54 a.m.|           9:25 a.m.|           9:28 a.m.|
|    5| aa|AA-3756-ORD-SLC|    12:15 p.m.|  12:42 p.m.|           2:45 p.m.|           2:52 p.m.|
|    6| aa| AA-204-LAX-MCO|    11:25 p.m.|        None|12/02/2011 6:55 a.m.|                None|
|    7| aa|AA-3468-CVG-MIA|     7:00 a.m.|   7:25 a.m.|           9:55 a.m.|           9:45 a.m.|
|    8| aa| AA-484-D