# Getting Started With HoloClean

Here we simply present the basic pipeline needed to repair a dataset. For a detailed overview of this process, please see our tutorials.

In [1]:
from holoclean.holoclean import HoloClean, Session

In [2]:
# Relative path to the MySQL connector jar that HoloClean is given as its driver.
mysql_driver_jar = "../holoclean/lib/mysql-connector-java-5.1.44-bin.jar"

# Create the HoloClean object with that driver, then wrap it in a Session;
# every later cell goes through `session`.
holo = HoloClean(mysql_driver=mysql_driver_jar)
session = Session(holo)

In [3]:
# Relative path to the hospital dataset CSV.
data_path = "data/hospital_dataset.csv"

## load_data returns the initial data as a pyspark dataframe
data = session.load_data(data_path)

In [4]:
## `data` is a pyspark dataframe, so the built-in pyspark commands are available
## on it; show(2) prints the first two rows as a text table.
data.show(2)

+-----+--------------+--------------------+--------------------+--------+--------+----------+-----+-------+----------+-----------+--------------------+--------------------+----------------+--------------------+-----------+--------------------+-----+------+--------------+
|index|ProviderNumber|        HospitalName|            Address1|Address2|Address3|      City|State|ZipCode|CountyName|PhoneNumber|        HospitalType|       HospitalOwner|EmergencyService|           Condition|MeasureCode|         MeasureName|Score|Sample|      Stateavg|
+-----+--------------+--------------------+--------------------+--------+--------+----------+-----+-------+----------+-----------+--------------------+--------------------+----------------+--------------------+-----------+--------------------+-----+------+--------------+
|    1|         10018|CALLAHAN EYE FOUN...|1720 UNIVERSITY BLVD|   Empty|   Empty|BIRMINGHAM|   AL|  35233| JEFFERSON| 2053258100|Acute Care Hospitals|Voluntary non-pro...|            

In [5]:
## add_denial_constraint checks the format of the constraint it is given: the
## malformed input "dc" is rejected with a DCFormatException. That error is the
## point of this cell, so it is caught and printed rather than left unhandled --
## an uncaught exception would stop "Restart & Run All" right here.
try:
    session.add_denial_constraint("dc")
except Exception as err:
    # NOTE(review): narrow this to DCFormatException once that class is imported
    # in this notebook; only HoloClean and Session are imported at the moment.
    print("{0}: {1}".format(type(err).__name__, err))

DCFormatException: Invalid DC: Missing Information

In [6]:
# Relative path to the text file holding the denial constraints for the
# hospital dataset.
dc_path = "data/hospital_constraints.txt"

# Load the constraints from that file into the session; the returned value
# displays below as a list of constraint strings.
dcs = session.load_denial_constraints(dc_path)

# Bare expression so the notebook shows the loaded constraints.
dcs

['t1&t2&EQ(t1.ZipCode,t2.ZipCode)&IQ(t1.City,t2.City)',
 't1&t2&EQ(t1.ZipCode,t2.ZipCode)&IQ(t1.State,t2.State)',
 't1&t2&EQ(t1.PhoneNumber,t2.PhoneNumber)&IQ(t1.ZipCode,t2.ZipCode)',
 't1&t2&EQ(t1.PhoneNumber,t2.PhoneNumber)&IQ(t1.City,t2.City)',
 't1&t2&EQ(t1.PhoneNumber,t2.PhoneNumber)&IQ(t1.State,t2.State)',
 't1&t2&EQ(t1.ProviderNumber,t2.ProviderNumber)&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.Stateavg,t2.Stateavg)',
 't1&t2&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.MeasureName,t2.MeasureName)',
 't1&t2&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.Condition,t2.Condition)',
 't1&t2&EQ(t1.State,t2.State)&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.Stateavg,t2.Stateavg)']

In [7]:
# Add one constraint by hand. The call returns the session's full constraint
# list with the new entry last. Foo and Bar are not columns of the hospital
# dataset, so this is only a placeholder; the next cell removes it again.
session.add_denial_constraint("t1&t2&EQ(t1.Foo, t2.Bar)")

['t1&t2&EQ(t1.ZipCode,t2.ZipCode)&IQ(t1.City,t2.City)',
 't1&t2&EQ(t1.ZipCode,t2.ZipCode)&IQ(t1.State,t2.State)',
 't1&t2&EQ(t1.PhoneNumber,t2.PhoneNumber)&IQ(t1.ZipCode,t2.ZipCode)',
 't1&t2&EQ(t1.PhoneNumber,t2.PhoneNumber)&IQ(t1.City,t2.City)',
 't1&t2&EQ(t1.PhoneNumber,t2.PhoneNumber)&IQ(t1.State,t2.State)',
 't1&t2&EQ(t1.ProviderNumber,t2.ProviderNumber)&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.Stateavg,t2.Stateavg)',
 't1&t2&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.MeasureName,t2.MeasureName)',
 't1&t2&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.Condition,t2.Condition)',
 't1&t2&EQ(t1.State,t2.State)&EQ(t1.MeasureCode,t2.MeasureCode)&IQ(t1.Stateavg,t2.Stateavg)',
 't1&t2&EQ(t1.Foo, t2.Bar)']

In [8]:
# Remove the constraint at position 9 (counting from 0), i.e. the placeholder
# added in the previous cell; the call returns the removed dc.
session.remove_denial_constraint(9)

't1&t2&EQ(t1.Foo, t2.Bar)'

In [9]:
# Run error detection; the call returns two values, unpacked here as `clean`
# and `dirty`.
# NOTE(review): the saved output for this cell is a KeyboardInterrupt, i.e. the
# run was stopped by hand -- presumably because detection is slow (confirm).
# The next cell loads previously saved results instead, and both cells carry
# the same execution count, so the saved run was not a clean top-to-bottom run.
clean, dirty = session.detect_errors()

KeyboardInterrupt: 

In [9]:
# we can skip error detection if it's already been done: load the previously
# saved clean and dirty results from CSV instead of recomputing them
clean = session.load_clean_data("data/clean.csv")
dirty = session.load_dirty_data("data/dirty.csv")

In [10]:
# First five entries of `clean`; each Row carries an `ind` and an `attr` field
# (presumably the row index and attribute name of one cell -- confirm).
clean.head(5)

[Row(ind=u'106', attr=u'EmergencyService'),
 Row(ind=u'106', attr=u'Address3'),
 Row(ind=u'104', attr=u'HospitalType'),
 Row(ind=u'126', attr=u'CountyName'),
 Row(ind=u'132', attr=u'HospitalOwner')]

In [11]:
# First five entries of `dirty`; each Row carries an `ind` and an `attr` field
# (presumably the row index and attribute name of one cell -- confirm).
dirty.head(5)

[Row(ind=u'466', attr=u'ZipCode'),
 Row(ind=u'596', attr=u'City'),
 Row(ind=u'643', attr=u'ZipCode'),
 Row(ind=u'438', attr=u'ZipCode'),
 Row(ind=u'390', attr=u'ZipCode')]

In [12]:
# Run the repair step on the session and keep its result in `repaired`; a
# progress bar is shown while it runs.
repaired = session.repair()

100%|██████████| 100/100 [01:23<00:00,  1.20it/s]


In [13]:
# Evaluate against the ground-truth CSV; the call prints the precision and the
# recall (presumably of the repair performed above -- confirm).
session.compare_to_truth("data/groundtruth.csv")

The precision that we have is :0.987947882736
The recall that we have is :0.672727272727
