# HOLOCLEAN DEMO


In [1]:
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.errordetector import ErrorDetectors
from holoclean.featurization.featurizer import SignalInit, SignalCooccur, SignalDC
from holoclean.learning.accuracy import Accuracy
from time import time as t

## Start HoloClean demo
Create HoloClean object and Session.
The HoloClean object will log all info for the test.
Session will be used to ingest data from input files.

In [2]:
        # Create the HoloClean engine first; the Session is built on top of it.
        holo_obj = HoloClean()
        session = Session("Session", holo_obj)
        # Record the start of the run as a time.time() timestamp.
        started_at = t()
        print("Testing started :" + str(started_at))
        

Testing started :1517334591.33


  cursor.execute('SELECT @@tx_isolation')


## Read Input and DC from file
Test data and the Denial Constraints will be read using the Session's ingestor.
After ingestion, the test data is loaded into MySQL tables, along with entries in a metadata table.

In [3]:
        # Timing report: each stage appends its duration to list_time and writes one
        # line to this file. The handle is left open here and closed by the final cell.
        fx = open('execution_time.txt', 'w')
        list_time = []
        #
        # --- Stage 1: ingest the CSV dataset ---
        start_time = t()
        session.ingest_dataset("test/inputDatabase.csv")  # alternative input: "test/test.csv"
        d = t() - start_time
        list_time.append(d)
        ingest_msg = 'ingest csv time: ' + str(d) + '\n'
        holo_obj.logger.info(ingest_msg)
        fx.write(ingest_msg)
        # Preview four columns of the "Init" table before echoing the timing.
        print('Init table')
        sql = holo_obj.dataengine.get_table_to_dataframe("Init", session.dataset)
        sql.select('index', 'ProviderNumber', 'HospitalName', 'Address1').show()
        print(ingest_msg)
        #
        # --- Stage 2: read the denial constraints ---
        start_time = t()
        session.denial_constraints("test/inputConstraint.txt")  # alternative input: "test/dc1.txt"
        d = t() - start_time
        list_time.append(d)
        constraints_msg = 'read denial constraints time: ' + str(d) + '\n'
        holo_obj.logger.info(constraints_msg)
        fx.write(constraints_msg)
        print(constraints_msg)


Init table
+-----+--------------+--------------------+--------------------+
|index|ProviderNumber|        HospitalName|            Address1|
+-----+--------------+--------------------+--------------------+
|    1|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    2|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    3|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    4|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    5|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    6|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    7|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    8|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|
|    9|         10019|HELEN KELLER MEMO...|1300 SOUTH MONTGO...|
|   10|         10019|HELEN KELLER MEMO...|1300 SOUTH MONTGO...|
|   11|         10019|HELEN KELLER MEMO...|1300 SOUTH MONTGO...|
|   12|         10019|HELEN KELLER MEMO...|1300 SOUTH MONTGO...|
|   13|       

## Error Detection
Using the Denial Constraints read through the ingestor, apply Error Detection to the test data.

In [4]:
        # Time the whole detection stage: build the detector, register it, run it.
        start_time = t()
        err_detector = ErrorDetectors(
            session.Denial_constraints,
            holo_obj.dataengine,
            holo_obj.spark_session,
            session.dataset
        )
        session.add_error_detector(err_detector)
        session.ds_detect_errors()
        d = t() - start_time
        list_time.append(d)
        detection_msg = 'error dectection time: ' + str(d) + '\n'
        holo_obj.logger.info(detection_msg)
        fx.write(detection_msg)
        # Show the two tables in turn: "C_clean" first, then "C_dk".
        # After the loop, `sql` refers to the "C_dk" dataframe.
        for caption, table_name in (('Clean table', "C_clean"),
                                    ("Don't know table", "C_dk")):
            print(caption)
            sql = holo_obj.dataengine.get_table_to_dataframe(table_name, session.dataset)
            sql.show()
        print(detection_msg)

Clean table
+---+----------------+
|ind|            attr|
+---+----------------+
|104|    HospitalType|
| 10|        Address1|
|110|  ProviderNumber|
|106|EmergencyService|
|132|   HospitalOwner|
|106|        Address3|
|126|      CountyName|
|107|    HospitalName|
|111|        Address1|
|109|    HospitalType|
|140|           Score|
|165|        Stateavg|
|163|  ProviderNumber|
|147|        Address2|
|110|    HospitalName|
|167|   HospitalOwner|
|156|      CountyName|
|111|      CountyName|
|183|EmergencyService|
|152|           Score|
+---+----------------+
only showing top 20 rows

Don't know table
+---+-------+
|ind|   attr|
+---+-------+
|853|ZipCode|
|897|   City|
| 52|   City|
|658|   City|
|433|ZipCode|
|466|ZipCode|
|199|   City|
|974|   City|
|219|ZipCode|
|596|   City|
|952|   City|
|298|   City|
|360|ZipCode|
|941|ZipCode|
|228|ZipCode|
|702|   City|
|206|   City|
|643|ZipCode|
|538|   City|
|677|ZipCode|
+---+-------+
only showing top 20 rows

error dectection time: 218.3178

## Domain Pruning


In [5]:
        # Threshold handed to ds_domain_pruning; kept as a named value so it is easy to tune.
        pruning_threshold = 0.5
        # Only the pruning call itself is timed.
        start_time = t()
        session.ds_domain_pruning(pruning_threshold)
        d = t() - start_time
        list_time.append(d)
        pruning_msg = 'domain pruning time: ' + str(d) + '\n'
        holo_obj.logger.info(pruning_msg)
        fx.write(pruning_msg)
        # Display the "Possible_values" table, then echo the timing.
        print('Possible Values table')
        sql = holo_obj.dataengine.get_table_to_dataframe("Possible_values", session.dataset)
        sql.show()
        print(pruning_msg)

Possible Values table
+---+-----------+-------------+--------+---------+
|tid|  attr_name|     attr_val|observed|data_type|
+---+-----------+-------------+--------+---------+
|805|   Stateavg|     AL_AMI-3|       1|   String|
|805|   Stateavg|      AL_HF-3|       0|   String|
|574|   Stateavg|      AL_HF-1|       1|   String|
|806|       City|      GADSDEN|       1|   String|
|456|       City|    SHEFFIELD|       1|   String|
|232|       City|   FORT PAYNE|       1|   String|
|348|       City|       DOTHAN|       1|   String|
|575|       City|       VALLEY|       1|   String|
|232|      State|           AL|       1|   String|
|456|       City|   BIRMINGHAM|       0|   String|
|  1|       City|   BIRMINGHAM|       1|   String|
|806|      State|           AL|       1|   String|
|232|    ZipCode|        35968|       1|   String|
|691|    ZipCode|        35233|       1|   String|
|456|      State|           AL|       1|   String|
|348|      State|           AL|       1|   String|
|113|   S

## Initial Value Signal 

In [6]:
        # Two timestamps are taken here. `start_time` is consumed just below.
        # `start_time1` is not read by any cell visible in this notebook section;
        # presumably intended for an end-to-end featurization timing (TODO confirm).
        start_time = t()
        start_time1 = t()
        # Only the construction of the signal object is timed in this cell.
        initial_value_signal = SignalInit(
            session.Denial_constraints,
            holo_obj.dataengine,
            session.dataset
        )
        d = t() - start_time
        list_time.append(d)
        init_msg = 'init signal time: ' + str(d) + '\n'
        holo_obj.logger.info(init_msg)
        fx.write(init_msg)
        print(init_msg)

init signal time: 0.000231027603149



## Co-occurrence Signal 

In [7]:
        # The timed interval covers registering the initial-value signal with the
        # session as well as constructing the co-occurrence signal object.
        start_time = t()
        session.add_featurizer(initial_value_signal)
        statistics_signal = SignalCooccur(
            session.Denial_constraints,
            holo_obj.dataengine,
            session.dataset
        )
        d = t() - start_time
        list_time.append(d)
        cooccur_msg = 'cooccur signal time: ' + str(d) + '\n'
        holo_obj.logger.info(cooccur_msg)
        fx.write(cooccur_msg)
        print(cooccur_msg)

cooccur signal time: 0.000562906265259



## DC Featurization 

In [8]:
        # Interval 1: reported as 'dc signal time', it covers only the registration
        # of the co-occurrence signal with the session.
        start_time = t()
        session.add_featurizer(statistics_signal)
        d = t() - start_time
        list_time.append(d)
        dc_signal_msg = 'dc signal time: ' + str(d) + '\n'
        holo_obj.logger.info(dc_signal_msg)
        fx.write(dc_signal_msg)
        print(dc_signal_msg)
        # Interval 2: reported as 'dc featurize time', it covers only the
        # construction of the SignalDC object.
        start_time = t()
        dc_signal = SignalDC(
            session.Denial_constraints,
            holo_obj.dataengine,
            session.dataset
        )
        d = t() - start_time
        list_time.append(d)
        dc_featurize_msg = 'dc featurize time: ' + str(d) + '\n'
        holo_obj.logger.info(dc_featurize_msg)
        fx.write(dc_featurize_msg)
        print(dc_featurize_msg)
        # Interval 3: start_time is NOT reset, so 'total featurization time' spans
        # SignalDC construction, its registration, and ds_featurize().
        session.add_featurizer(dc_signal)
        session.ds_featurize()
        d = t() - start_time
        list_time.append(d)
        total_feat_msg = 'total featurization time: ' + str(d) + '\n'
        holo_obj.logger.info(total_feat_msg)
        fx.write(total_feat_msg)
        # Display the "Feature_temp" table, then echo the timing.
        print('Feature table')
        sql = holo_obj.dataengine.get_table_to_dataframe("Feature_temp", session.dataset)
        sql.show()
        print(total_feat_msg)

dc signal time: 0.00037693977356

dc featurize time: 0.000338077545166

adding weight_id to feature table...
adding weight_id to feature table is finished
Feature table
+---------+--------+--------------+--------------------+--------------------+----+---------+
|var_index|rv_index|       rv_attr|        assigned_val|             feature|TYPE|weight_id|
+---------+--------+--------------+--------------------+--------------------+----+---------+
|        1|     593|      Stateavg|            AL_AMI-2|       Init=AL_AMI-2|init|         |
|        2|     378|          City|            WINFIELD|       Init=WINFIELD|init|         |
|        3|     108|   MeasureName|Pneumonia Patient...|Init=Pneumonia Pa...|init|         |
|        4|     594|ProviderNumber|               10027|          Init=10027|init|         |
|        5|       1|ProviderNumber|               10018|          Init=10018|init|         |
|        6|     863|         State|                  AL|             Init=AL|init|     

## Data Repair

In [9]:
        # --- Step 1: run the numbskull step via the session ---
        start_time = t()
        session._numskull()
        d = t() - start_time
        list_time.append(d)
        numbskull_msg = 'numbskull time: ' + str(d) + '\n'
        holo_obj.logger.info(numbskull_msg)
        fx.write(numbskull_msg)
        print(numbskull_msg)
        # --- Step 2: run the repair step ---
        start_time = t()
        session.ds_repair()
        d = t() - start_time
        list_time.append(d)
        repair_msg = 'repair time: ' + str(d) + '\n'
        holo_obj.logger.info(repair_msg)
        fx.write(repair_msg)
        print(repair_msg)
        # --- Summary: sum of every duration collected in list_time so far ---
        total_msg = 'Total time: ' + str(sum(list_time)) + '\n'
        holo_obj.logger.info(total_msg)
        fx.write(total_msg)
        print(total_msg)
        # Release the timing report opened in the ingestion cell.
        fx.close()

numbskull is starting
wrapper is starting
wrapper is finished
1
numbskull is finished
adding weight is finished is finished
numbskull time: 205.241176844

starting repairs
repairs are finished
+-------+-------+--------+------------+-------------------+
|rv_attr|rv_attr|rv_index|assigned_val|        probability|
+-------+-------+--------+------------+-------------------+
|   City|   City|       1|  BIRMINGHAM|                1.0|
|   City|   City|      10|  BIRMINGHAM| 0.2688911018612754|
|   City|   City|      10|   SHEFFIELD| 0.7311088981387246|
|   City|   City|     100|         OPP|                1.0|
|   City|   City|    1000|     ONEONTA|                1.0|
|   City|   City|     101|         OPP|                1.0|
|   City|   City|     102|         OPP|                1.0|
|   City|   City|     103|         OPP|                1.0|
|   City|   City|     104|         OPP|                1.0|
|   City|   City|     105|         OPP|                1.0|
|   City|   City|     106| 