# HOLOCLEAN DEMO


In [1]:
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.errordetector import ErrorDetectors
from holoclean.featurization.featurizer import SignalInit, SignalCooccur, SignalDC
from holoclean.learning.accuracy import Accuracy
from time import time as t

## Start Holoclean demo
Create HoloClean object and Session.
The HoloClean object will log all info for the test.
Session will be used to ingest data from input files.

In [2]:
        # Create the HoloClean context, then bind a named Session to it.
        # Both objects are reused by every later cell in this notebook.
        holo_obj = HoloClean()
        session = Session("Session", holo_obj)
        # Print the wall-clock start of the run (time.time() value).
        print("Testing started :%s" % str(t()))
        

Testing started :1517285806.76


  cursor.execute('SELECT @@tx_isolation')


## Read Input and DC from file
Test data and the Denial Constraints will be read using the Session's ingestor.
After ingestion, the test data will be loaded into MySQL tables, along with corresponding entries in a metadata table.

In [3]:
        # Timing report file. The handle is kept open across cells and is
        # closed at the end of the Data Repair cell.
        fx = open('execution_time.txt', 'w')
        # Every measured duration is collected here and summed at the end.
        list_time = []

        # --- Step 1: ingest the CSV dataset ---------------------------------
        # Alternative input left by the author: "test/inputDatabase.csv"
        start_time = t()
        session.ingest_dataset("test/test.csv")
        d = t() - start_time
        list_time.append(d)
        msg = 'ingest csv time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)

        # Display the ingested "Init" table before reporting the timing.
        print('Init table')
        sql = holo_obj.dataengine.get_table_to_dataframe("Init", session.dataset)
        sql.show()
        print(msg)

        # --- Step 2: read the denial constraints ----------------------------
        # Alternative input left by the author: "test/inputConstraint.txt"
        start_time = t()
        session.denial_constraints("test/dc1.txt")
        d = t() - start_time
        list_time.append(d)
        msg = 'read denial constraints time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)


Init table
+-----+---+---+---+-------+
|index|  A|  B|  C|Number1|
+-----+---+---+---+-------+
|    1|  p|  a|  d|      1|
|    2|  p|  b|  e|      2|
|    3|  o|  b|  f|      3|
+-----+---+---+---+-------+

ingest csv time: 4.4314250946

read denial constraints time: 0.000128984451294



## Error Detection
Using the Denial Constraints read through the ingestor, apply Error Detection to the test data.

In [4]:
        # The timed span covers building the detector, registering it with the
        # session, and running the detection pass.
        start_time = t()
        err_detector = ErrorDetectors(
            session.Denial_constraints,
            holo_obj.dataengine,
            holo_obj.spark_session,
            session.dataset,
        )
        session.add_error_detector(err_detector)
        session.ds_detect_errors()
        d = t() - start_time
        list_time.append(d)

        # NOTE(review): the label is spelled 'dectection'; it is kept verbatim
        # so the logged/written output format does not change.
        msg = 'error dectection time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)

        # Show the two tables produced by detection, each under its own title.
        for title, table in (('Clean table', "C_clean"), ('Don\'t know table', "C_dk")):
            print(title)
            sql = holo_obj.dataengine.get_table_to_dataframe(table, session.dataset)
            sql.show()
        print(msg)

Clean table
+---+-------+
|ind|   attr|
+---+-------+
|  3|      C|
|  3|Number1|
|  3|      A|
|  3|      B|
|  1|      C|
|  2|      C|
+---+-------+

Don't know table
+---+-------+
|ind|   attr|
+---+-------+
|  1|      A|
|  2|Number1|
|  2|      B|
|  1|      B|
|  1|Number1|
|  2|      A|
+---+-------+

error dectection time: 50.5282518864



## Domain Pruning


In [5]:
        # Time the domain-pruning step, run with a threshold of 0.5.
        start_time = t()
        pruning_threshold = 0.5
        session.ds_domain_pruning(pruning_threshold)
        d = t() - start_time
        list_time.append(d)

        # Same message goes to the logger, the timing file and the cell output.
        msg = 'domain pruning time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)

        # Display the "Init_flat" table, then report the timing.
        print('Init Flat table')
        sql = holo_obj.dataengine.get_table_to_dataframe("Init_flat", session.dataset)
        sql.show()
        print(msg)

Init Flat table
+---+---------+--------+
|tid|attr_name|attr_val|
+---+---------+--------+
|  1|        A|       p|
|  3|        C|       f|
|  3|  Number1|       3|
|  2|  Number1|       2|
|  1|        B|       a|
|  1|  Number1|       1|
|  2|        A|       p|
|  3|        B|       b|
|  3|        A|       o|
|  2|        C|       e|
|  2|        B|       b|
|  1|        C|       d|
+---+---------+--------+

domain pruning time: 1.14676117897



## Initial Value Signal 

In [6]:
        # Time only the construction of the initial-value featurizer; it is
        # registered with the session in the following cell.
        # (The former `start_time1` timestamp was never read anywhere and has
        # been dropped.)
        start_time = t()
        initial_value_signal = SignalInit(session.Denial_constraints, holo_obj.dataengine,
                                          session.dataset)
        d = t() - start_time
        list_time.append(d)

        # Same message goes to the logger, the timing file and the cell output.
        msg = 'init signal time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)

init signal time: 0.13618183136



## Co-occurrence Signal

In [7]:
        start_time = t()
        # Registering the initial-value featurizer built earlier happens inside
        # the timed span, ahead of constructing the co-occurrence featurizer.
        session.add_featurizer(initial_value_signal)
        statistics_signal = SignalCooccur(
            session.Denial_constraints,
            holo_obj.dataengine,
            session.dataset,
        )
        d = t() - start_time
        list_time.append(d)

        # Same message goes to the logger, the timing file and the cell output.
        msg = 'cooccur signal time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)

cooccur signal time: 0.162782907486



## DC Featurization 

In [8]:
        # 1) Register the co-occurrence featurizer with the session. This span
        #    is reported under the 'dc signal time' label.
        start_time = t()
        session.add_featurizer(statistics_signal)
        d = t() - start_time
        list_time.append(d)
        msg = 'dc signal time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)

        # 2) Build the denial-constraint featurizer.
        start_time = t()
        dc_signal = SignalDC(session.Denial_constraints, holo_obj.dataengine, session.dataset)
        d = t() - start_time
        list_time.append(d)
        msg = 'dc featurize time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)

        # 3) Register the DC featurizer and run featurization.
        # NOTE(review): start_time is not reset here, so this interval starts at
        # the same instant as step 2 and that span is counted twice in
        # sum(list_time) -- confirm whether this is intended.
        session.add_featurizer(dc_signal)
        session.ds_featurize()
        d = t() - start_time
        list_time.append(d)
        msg = 'total featurization time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)

        # Display the "Feature_temp" table, then report the timing.
        print('Feature table')
        sql = holo_obj.dataengine.get_table_to_dataframe("Feature_temp", session.dataset)
        sql.show()
        print(msg)

dc signal time: 0.000458002090454

dc featurize time: 0.176409006119

adding weight_id to feature table...
adding weight_id to feature table is finished
Feature table
+---------+--------+-------+------------+---------+-------+---------+
|var_index|rv_index|rv_attr|assigned_val|  feature|   TYPE|weight_id|
+---------+--------+-------+------------+---------+-------+---------+
|        1|       1|      A|           p|   Init=p|   init|         |
|        2|       3|Number1|           3|   Init=3|   init|         |
|        3|       2|Number1|           2|   Init=2|   init|         |
|        4|       1|      B|           a|   Init=a|   init|         |
|        5|       1|Number1|           1|   Init=1|   init|         |
|        6|       2|      A|           p|   Init=p|   init|         |
|        7|       3|      B|           b|   Init=b|   init|         |
|        8|       3|      A|           o|   Init=o|   init|         |
|        9|       2|      B|           b|   Init=b|   init|    

## Data Repair

In [9]:
        # --- Inference step (session._numskull) -----------------------------
        start_time = t()
        session._numskull()
        d = t() - start_time
        list_time.append(d)
        msg = 'numbskull time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)

        # --- Repair step -----------------------------------------------------
        start_time = t()
        session.ds_repair()
        d = t() - start_time
        list_time.append(d)
        msg = 'repair time: ' + str(d) + '\n'
        holo_obj.logger.info(msg)
        fx.write(msg)
        print(msg)

        # --- Grand total: sum of every duration collected in list_time -------
        total_msg = 'Total time: ' + str(sum(list_time)) + '\n'
        holo_obj.logger.info(total_msg)
        fx.write(total_msg)
        print(total_msg)

        # Close the timing report opened in the ingestion cell.
        fx.close()

numbskull is starting
wrapper is starting
wrapper is finished
1
numbskull is finished
adding weight is finished is finished
numbskull time: 2.97180604935

starting repairs
repairs are finished
+-------+-------+--------+------------+-----------+
|rv_attr|rv_attr|rv_index|assigned_val|probability|
+-------+-------+--------+------------+-----------+
|      A|      A|       1|           p|        1.0|
|      A|      A|       2|           p|        1.0|
|      A|      A|       3|           o|        1.0|
|      B|      B|       1|           a|        1.0|
|      B|      B|       2|           b|        1.0|
|      B|      B|       3|           b|        1.0|
|Number1|Number1|       1|           1|        1.0|
|Number1|Number1|       2|           2|        1.0|
|Number1|Number1|       3|           3|        1.0|
+-------+-------+--------+------------+-----------+

repair time: 1.44347000122

Total time: 63.0538208485

