In [1]:
import os
from string import Template

import pandas as pd
import psycopg2

In [2]:
# Dataset selection. Together these form the database name "<dataname>_<index>";
# `dataname` is also used as the name of the initial-data table and as the
# prefix of the "<dataname>_clean" ground-truth table.
dataname = "small_census_01"
index = "v1"

In [3]:
# Alternative dataset: uncomment both lines to analyse it instead of the one selected above.
# dataname = "small_adult_1"
# index = "1"

In [4]:
# Alternative dataset: uncomment both lines to analyse it instead of the one selected above.
# dataname =  "food_small"
# index = "v1"

### Connect to db

In [5]:
# Open the PostgreSQL connection shared by every query in this notebook.
# The database name is "<dataname>_<index>". Host, user and password can be
# overridden through environment variables so credentials do not have to live
# in the notebook; the fallbacks keep the original local development values.
# NOTE(review): the fallback password is still committed with the notebook --
# drop it once HOLOCLEAN_DB_PASSWORD is set wherever this notebook runs.
conn = psycopg2.connect(host=os.environ.get("HOLOCLEAN_DB_HOST", "localhost"),
                        database="{}_{}".format(dataname, index),
                        user=os.environ.get("HOLOCLEAN_DB_USER", "holocleanuser"),
                        password=os.environ.get("HOLOCLEAN_DB_PASSWORD", "abcd1234"))

In [6]:
def query_df(q, value=None):
    """Run the SQL query `q` on the notebook's connection and return a DataFrame.

    Parameters
    ----------
    q : str
        SQL text to execute.
    value : sequence or mapping, optional
        Bind parameters for the placeholders in `q`. ``None`` or an empty
        container means the query is executed without parameters.

    Returns
    -------
    pandas.DataFrame
        The query result.
    """
    # A None default avoids sharing one mutable list object between calls.
    # An empty container is normalised to None, which is read_sql_query's own
    # default, so both former branches collapse into a single call.
    params = value if value is not None and len(value) != 0 else None
    return pd.read_sql_query(q, conn, params=params)

In [7]:
def table_lists():
    """Return a DataFrame holding the name of every table in the ``public`` schema."""
    return query_df("""SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'""")

In [8]:
# Show the tables available in the public schema of the connected database.
table_lists()

Unnamed: 0,table_name
0,dk_cells
1,cell_domain
2,pos_values
3,cell_distr
4,inf_values_idx
5,inf_values_dom
6,small_census_01_repaired
7,small_census_01_clean
8,small_census_01


In [9]:
# Get all attribute names: load the initial table and take its column names,
# skipping the first column.
# NOTE(review): the skipped first column is presumably the `_tid_` tuple id -- confirm.
data = query_df("select * from {}".format(dataname))
attributes = data.columns.values[1:]
attributes

array(['age', 'class_of_worker', 'detailed_industry_recode',
       'detailed_occupation_recode', 'education', 'wage_per_hour',
       'enroll_in_edu_inst_last_wk', 'marital_stat',
       'major_industry_code', 'major_occupation_code', 'race',
       'hispanic_origin', 'sex', 'member_of_a_labor_union',
       'reason_for_unemployment', 'full_or_part_time_employment_stat',
       'capital_gains', 'capital_losses', 'dividends_from_stocks',
       'tax_filer_stat', 'region_of_previous_residence',
       'state_of_previous_residence',
       'detailed_household_and_family_stat',
       'detailed_household_summary_in_household', 'instance_weight',
       'migration_code-change_in_msa', 'migration_code-change_in_reg',
       'migration_code-move_within_reg', 'live_in_this_house_1_year_ago',
       'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer',
       'family_members_under_18', 'country_of_birth_father',
       'country_of_birth_mother', 'country_of_birth_self', 'citizens

### Total Errors

In [10]:
# Per-attribute query listing the erroneous cells: rows where column $attr of
# $init_table differs from the `_value_` stored for the same `_tid_` and
# attribute in $grdt_table.
# Placeholders: $attr (column / attribute name), $init_table, $grdt_table.
errors_template = Template('SELECT t1._tid_, t2._attribute_, t1.\"$attr\" as init, t2._value_ as gt '\
                            'FROM $init_table as t1, $grdt_table as t2 '\
                            'WHERE t1._tid_ = t2._tid_ '\
                              'AND t2._attribute_ = \'$attr\' '\
                              'AND t1.\"$attr\" != t2._value_')

In [11]:
def get_total_errors():
    """Run the error query once per attribute.

    Returns a list with one DataFrame per entry of `attributes` (in order),
    each holding the cells of that attribute whose initial value differs from
    the value in the "<dataname>_clean" table.
    """
    clean_table = '%s_clean'%dataname
    return [
        query_df(errors_template.substitute(
            init_table=dataname, grdt_table=clean_table, attr=column))
        for column in attributes
    ]

In [12]:
# Build the combined error table in a single step so `all_error` is only ever
# a DataFrame. Previously it was first bound to the list of per-attribute
# frames and rebound by a second cell, which made that concat cell fail when
# re-run on its own (pd.concat rejects a bare DataFrame).
all_error = pd.concat(get_total_errors(), ignore_index=True)

In [14]:
# Display the erroneous cells: tuple id, attribute, initial value and ground truth.
all_error

Unnamed: 0,_tid_,_attribute_,init,gt
0,828,age,3x,33
1,674,class_of_worker,prxvate,private
2,60,detailed_industry_recode,x,4
3,213,detailed_industry_recode,x3,43
4,528,detailed_industry_recode,x,0
5,922,detailed_occupation_recode,x,0
6,459,marital_stat,married-a f sxouse present,married-a f spouse present
7,286,major_occupation_code,adm support includixg clerical,adm support including clerical
8,429,race,whxte,white
9,753,hispanic_origin,all oxher,all other


In [18]:
# Look up the cell_domain row(s) of a single cell, identified by its
# attribute name ($attr) and tuple id ($tid).
vid_template = Template("SELECT * from cell_domain where attribute = \'$attr\' and _tid_ = \'$tid\'")

In [51]:
# Example lookup: the cell_domain entry for attribute
# detailed_household_and_family_stat of tuple 632.
query_df(vid_template.substitute(attr='detailed_household_and_family_stat', tid=632))

Unnamed: 0,_cid_,_tid_,_vid_,attribute,domain,domain_size,fixed,init_index,init_value
0,26566,632,9324,detailed_household_and_family_stat,child 18+ never marr not in a subfamily|||chil...,6,0,2,chxld <18 never marr not in subfamily


In [66]:
# Total number of erroneous cells (row count of `all_error`).
init_count = all_error.shape[0]
init_count

1407

### Total Repair

In [21]:
# All repairs: cells whose inferred value (inf_values_dom.rv_value) differs
# from their initial value (cell_domain.init_value), matched on tuple id and
# attribute.
query = "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair " \
                 "FROM %s as t1, %s as t2 " \
                 "WHERE t1._tid_ = t2._tid_ " \
                   "AND t1.attribute = t2.attribute " \
                   "AND t1.init_value != t2.rv_value"\
                %('cell_domain', 'inf_values_dom')
all_repair = query_df(query)

In [22]:
# Preview the first repaired cells.
all_repair.head()

Unnamed: 0,_tid_,attribute,init,repair
0,3,detailed_occupation_recode,3,0
1,3,detailed_industry_recode,34,0
2,4,detailed_occupation_recode,37,0
3,7,country_of_birth_self,columbia,united-states
4,9,country_of_birth_self,mexico,united-states


In [23]:
# Number of cells whose repaired value differs from the initial value.
total_repair_init_count = all_repair.shape[0]
total_repair_init_count

854

In [24]:
# Same repair query as for `all_repair`, additionally joined with the
# "<dataname>_clean" table on tuple id and attribute, so only repaired cells
# that have a row in that table are returned.
query = "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair " \
         "FROM %s as t1, %s as t2, %s as t3 " \
         "WHERE t1._tid_ = t2._tid_ " \
           "AND t1.attribute = t2.attribute " \
           "AND t1.init_value != t2.rv_value " \
           "AND t1._tid_ = t3._tid_ " \
           "AND t1.attribute = t3._attribute_"\
        %('cell_domain', 'inf_values_dom', '%s_clean'%dataname)
repair_gt = query_df(query)

In [25]:
# Preview the first 15 repaired cells that have a ground-truth row.
repair_gt.head(15)

Unnamed: 0,_tid_,attribute,init,repair
0,3,detailed_industry_recode,34,0
1,3,detailed_occupation_recode,3,0
2,4,detailed_occupation_recode,37,0
3,7,country_of_birth_self,columbia,united-states
4,9,country_of_birth_self,mexico,united-states
5,12,detailed_industry_recode,45,0
6,17,detailed_industry_recode,45,0
7,17,detailed_occupation_recode,23,0
8,18,country_of_birth_self,peru,united-states
9,19,detailed_industry_recode,34,0


In [26]:
# Number of repaired cells that have a ground-truth row.
repair_gt_init_count = repair_gt.shape[0]
repair_gt_init_count

854

### Correct Repair

In [39]:
# Per-attribute query for correct repairs. The inner SELECT (`errors`) is the
# same error selection as errors_template: cells of $attr whose value in
# $init_table differs from the `_value_` in $grdt_table. It is joined with
# $inf_dom on tuple id and attribute and keeps the rows whose repaired value
# (rv_value) equals the ground-truth value.
# Placeholders: $attr, $init_table, $grdt_table, $inf_dom.
correct_repairs_template = Template('SELECT errors._tid_, errors._attribute_, '\
                            ' errors.init, errors._value_ as gt, repairs.rv_value as repair FROM'\
                            '(SELECT t2._tid_, t2._attribute_, t2._value_, t1.\"$attr\" as init '\
                             'FROM $init_table as t1, $grdt_table as t2 '\
                             'WHERE t1._tid_ = t2._tid_ '\
                               'AND t2._attribute_ = \'$attr\' '\
                               'AND t1.\"$attr\" != t2._value_ ) as errors, $inf_dom as repairs '\
                              'WHERE errors._tid_ = repairs._tid_ '\
                                'AND errors._attribute_ = repairs.attribute '\
                                'AND errors._value_ = repairs.rv_value')

In [28]:
def get_total_repair():
    """Run the correct-repair query once per attribute.

    Returns a list with one DataFrame per entry of `attributes` (in order),
    each holding the erroneous cells of that attribute whose repaired value in
    `inf_values_dom` equals the value in the "<dataname>_clean" table.
    """
    clean_table = '%s_clean'%dataname
    return [
        query_df(correct_repairs_template.substitute(
            init_table=dataname, grdt_table=clean_table,
            inf_dom='inf_values_dom', attr=column))
        for column in attributes
    ]

In [29]:
# Build the table of correct repairs in a single step so `all_repair_gt` is
# only ever a DataFrame. Previously it was first bound to the list of
# per-attribute frames and rebound by a second cell, which made that concat
# cell fail when re-run on its own (pd.concat rejects a bare DataFrame).
all_repair_gt = pd.concat(get_total_repair(), ignore_index=True)

In [31]:
# Preview up to 30 correctly repaired cells (initial value, ground truth, repair).
all_repair_gt.head(30)

Unnamed: 0,_tid_,_attribute_,init,gt,repair
0,528,detailed_industry_recode,x,0,0
1,922,detailed_occupation_recode,x,0,0
2,323,citizenship,native- born in the unxted states,native- born in the united states,native- born in the united states


In [32]:
# Number of erroneous cells whose repair matches the ground truth.
all_repair_gt.shape[0]

3

In [46]:
# Per-attribute query for wrong repairs. Identical to correct_repairs_template
# except for the last condition: it keeps the erroneous cells whose value in
# $inf_dom (rv_value) differs from the ground-truth value.
# NOTE(review): the result displayed below includes rows where `repair` equals
# `init`, i.e. errors that were left unchanged rather than changed to a wrong
# value -- confirm that counting those as wrong repairs is intended.
# Placeholders: $attr, $init_table, $grdt_table, $inf_dom.
wrong_repairs_template = Template('SELECT errors._tid_, errors._attribute_, '\
                            ' errors.init, errors._value_ as gt, repairs.rv_value as repair FROM'\
                            '(SELECT t2._tid_, t2._attribute_, t2._value_, t1.\"$attr\" as init '\
                             'FROM $init_table as t1, $grdt_table as t2 '\
                             'WHERE t1._tid_ = t2._tid_ '\
                               'AND t2._attribute_ = \'$attr\' '\
                               'AND t1.\"$attr\" != t2._value_ ) as errors, $inf_dom as repairs '\
                              'WHERE errors._tid_ = repairs._tid_ '\
                                'AND errors._attribute_ = repairs.attribute '\
                                'AND errors._value_ <> repairs.rv_value')

In [47]:
def get_total_wrong_repair():
    """Run the wrong-repair query once per attribute.

    Returns a list with one DataFrame per entry of `attributes` (in order),
    each holding the erroneous cells of that attribute whose value in
    `inf_values_dom` differs from the value in the "<dataname>_clean" table.
    """
    clean_table = '%s_clean'%dataname
    return [
        query_df(wrong_repairs_template.substitute(
            init_table=dataname, grdt_table=clean_table,
            inf_dom='inf_values_dom', attr=column))
        for column in attributes
    ]

In [48]:
# Build the table of wrong repairs in a single step so `wrong_repair_gt` is
# only ever a DataFrame. Previously it was first bound to the list of
# per-attribute frames and rebound by a second cell, which made that concat
# cell fail when re-run on its own (pd.concat rejects a bare DataFrame).
wrong_repair_gt = pd.concat(get_total_wrong_repair(), ignore_index=True)

In [50]:
# Display the erroneous cells whose repaired value does not match the ground truth.
wrong_repair_gt

Unnamed: 0,_tid_,_attribute_,init,gt,repair
0,60,detailed_industry_recode,x,4,0
1,213,detailed_industry_recode,x3,43,0
2,632,detailed_household_and_family_stat,chxld <18 never marr not in subfamily,child <18 never marr not in subfamily,chxld <18 never marr not in subfamily
3,705,detailed_household_and_family_stat,child <18 never marr not in xubfamily,child <18 never marr not in subfamily,child <18 never marr not in xubfamily
4,299,migration_code-move_within_reg,different staxe in west,different state in west,different staxe in west
5,217,migration_prev_res_in_sunbelt,nxt in universe,not in universe,nxt in universe
6,5,num_persons_worked_for_employer,x,0,x
7,144,num_persons_worked_for_employer,x,0,x
8,244,num_persons_worked_for_employer,x,0,x
9,527,num_persons_worked_for_employer,x,4,x


### Detected Errors

In [33]:
# Detected errors: cells of cell_domain that are listed in dk_cells (joined on
# `_cid_`) and whose initial value differs from the value in the
# "<dataname>_clean" table for the same tuple id and attribute.
# The query is built and executed in the same cell: `query` is a name reused
# by several cells, so running it from a separate cell could silently execute
# a stale statement left behind by another section.
query = "SELECT t1._tid_, t1.init_value as init, t2._value_ as gt " \
        "FROM %s as t1, %s as t2, %s as t3 " \
        "WHERE t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ " \
        "AND t1.attribute = t2._attribute_ " \
        "AND t1.init_value != t2._value_" \
        % ('cell_domain', '%s_clean'%dataname, 'dk_cells')
init_error = query_df(query)

In [35]:
# Number of detected cells that are true errors with respect to the ground truth.
init_error.shape[0]

14

In [38]:
# Preview the first 10 detected errors (tuple id, initial value, ground truth).
init_error.head(10)

Unnamed: 0,_tid_,init,gt
0,5,x,0
1,60,x,4
2,144,x,0
3,213,x3,43
4,217,nxt in universe,not in universe
5,244,x,0
6,299,different staxe in west,different state in west
7,323,native- born in the unxted states,native- born in the united states
8,527,x,4
9,528,x,0


### Feature Weights

In [None]:
# example: 2777
