# HOLOCLEAN DEMO


In [1]:
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.errordetector import ErrorDetectors
from holoclean.featurization.featurizer import SignalInit, SignalCooccur, SignalDC
from holoclean.learning.accuracy import Accuracy
from time import time as t

## Start HoloClean demo
Create HoloClean object and Session.
The HoloClean object will log all info for the test.
Session will be used to ingest data from input files.

In [2]:
        holo_obj = HoloClean()
        session = Session("Session", holo_obj) 
        print "Testing started :"+str(t())
        

Testing started :1515439092.83


  cursor.execute('SELECT @@tx_isolation')


## Read Input and DC from file
Test data and the Denial Constraints will be read using the Session's ingestor.
After ingestion, the test data is loaded into MySQL tables, along with entries in a metadata table.

In [3]:
        fx = open('execution_time.txt', 'w')
        list_time = []
        start_time = t()
        
        #session.ingest_dataset("test/inputDatabase.csv")
        session.ingest_dataset("test/test.csv")
        d = t()-start_time
        list_time.append(d)
        holo_obj.logger.info('ingest csv time: '+str(d)+'\n')
        fx.write('ingest csv time: '+str(d)+'\n')
        print 'ingest csv time: '+str(d)+'\n'
        
        start_time = t()
        
        #session.denial_constraints("test/inputConstraint.txt")
        session.denial_constraints("test/dc1.txt")
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('read denial constraints time: '+str(d)+'\n')
        fx.write('read denial constraints time: '+str(d)+'\n')
        print 'read denial constraints time: '+str(d)+'\n'


ingest csv time: 7.58988213539

read denial constraints time: 0.0154159069061



## Error Detection
Using the Denial Constraints read through the ingestor, apply Error Detection on the test data

In [4]:
        start_time = t()
        err_detector = ErrorDetectors(session.Denial_constraints, holo_obj.dataengine,
                                      holo_obj.spark_session, session.dataset)
        session.add_error_detector(err_detector)
        session.ds_detect_errors()
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('error dectection time: '+str(d)+'\n')
        fx.write('error dectection time: '+str(d)+'\n')
        print 'error dectection time: '+str(d)+'\n'

error dectection time: 26.1363401413



## Domain Pruning


In [5]:
        start_time = t()
        pruning_threshold = 0.5
        session.ds_domain_pruning(pruning_threshold)
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('domain pruning time: '+str(d)+'\n')
        fx.write('domain pruning time: '+str(d)+'\n')
        print 'domain pruning time: '+str(d)+'\n'

domain pruning time: 1.4309990406



## Initial Value Signal 

In [6]:
        start_time = t()
        start_time1 = t()
        initial_value_signal = SignalInit(session.Denial_constraints, holo_obj.dataengine,
                                          session.dataset)
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('init signal time: '+str(d)+'\n')
        fx.write('init signal time: '+str(d)+'\n')
        print 'init signal time: '+str(d)+'\n'

init signal time: 0.000114917755127



## Co-occurrence Signal

In [7]:
        start_time = t()
        session.add_featurizer(initial_value_signal)
        statistics_signal = SignalCooccur(session.Denial_constraints, holo_obj.dataengine,
                                          session.dataset)
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('cooccur signal time: '+str(d)+'\n')
        fx.write('cooccur signal time: '+str(d)+'\n')
        print 'cooccur signal time: '+str(d)+'\n'

cooccur signal time: 0.00043511390686



## DC Featurization 

In [8]:
        start_time = t()
        session.add_featurizer(statistics_signal)
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('dc signal time: '+str(d)+'\n')
        fx.write('dc signal time: '+str(d)+'\n')
        print 'dc signal time: '+str(d)+'\n'
        start_time = t()
        dc_signal = SignalDC(session.Denial_constraints, holo_obj.dataengine, session.dataset)
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('dc featurize time: '+str(d)+'\n')
        fx.write('dc featurize time: '+str(d)+'\n')
        print 'dc featurize time: '+str(d)+'\n'
        session.add_featurizer(dc_signal)
        session.ds_featurize()
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('total featurization time: '+str(d)+'\n')
        fx.write('total featurization time: '+str(d)+'\n')
        print 'total featurization time: '+str(d)+'\n'

dc signal time: 0.00028395652771

dc featurize time: 8.39233398438e-05

INSERT INTO 00590556366891_Feature_temp SELECT * FROM (  (SELECT  @p := @p + 1 AS var_index,            init_flat.tid AS rv_index,            init_flat.attr_name AS rv_attr,            init_flat.attr_val AS assigned_val,            concat('Init=',init_flat.attr_val ) AS feature,            'init' AS TYPE,            '      ' AS weight_id            FROM 00590556366891_Init_flat AS init_flat WHERE (init_flat.attr_name = 'A' OR init_flat.attr_name = 'B'))as T_0);
INSERT INTO 00590556366891_Feature_temp SELECT * FROM (  (SELECT DISTINCT @p := @p + 1 AS var_index,cooccur.tid_first AS rv_index,cooccur.attr_first AS rv_attr,cooccur.val_first AS assigned_val,CONCAT (cooccur.attr_second , '=' , cooccur.val_second ) AS feature,'cooccur' AS TYPE,'        ' AS weight_id FROM 00590556366891_Init_flat_join AS cooccur WHERE (cooccur.attr_first = 'A' OR cooccur.attr_first = 'B'))as T_1);
INSERT INTO 00590556366891_Feature_temp SE

## Data Repair

In [9]:
        start_time = t()
        session._numskull()
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('numbskull time: '+str(d)+'\n')
        fx.write('numbskull time: '+str(d)+'\n')
        print 'numbskull time: '+str(d)+'\n'
        start_time = t()
        session.ds_repair()
        d = t() - start_time
        list_time.append(d)
        holo_obj.logger.info('repair time: '+str(d)+'\n')
        fx.write('repair time: '+str(d)+'\n')
        print 'repair time: '+str(d)+'\n'

        holo_obj.logger.info('Total time: ' + str(sum(list_time)) + '\n')
        fx.write('Total time: ' + str(sum(list_time)) + '\n')
        print 'Total time: ' + str(sum(list_time)) + '\n'

        fx.close()

numbskull is starting
wrapper is starting
wrapper is finished
1
numbskull is finished
adding weight is finished is finished
numbskull time: 9.37386894226

starting repairs
repairs are finished
repair time: 1.10969305038

Total time: 48.1495959759

