In [1]:
import psycopg2
import pandas as pd
from string import Template

In [2]:
index = 7

### Connect to db

In [3]:
conn = psycopg2.connect(host="localhost",
                        database="food_{}".format(index), 
                        user="holocleanuser", 
                        password="abcd1234")

In [4]:
def query_df(q, value = []):
    if len(value) != 0:
        df = pd.read_sql_query(q, conn, params=value)
    else:
        df = pd.read_sql_query(q, conn)
    return df

In [5]:
def table_lists():
    df = query_df("""SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'""")
    return df

In [6]:
table_lists()

Unnamed: 0,table_name
0,food
1,dk_cells
2,cell_domain
3,pos_values
4,cell_distr
5,inf_values_idx
6,inf_values_dom
7,food_repaired
8,food_clean


In [7]:
# get all attributes 
data = query_df("select * from food")
attributes = data.columns.values[1:]
attributes

array(['akaname', 'inspectionid', 'city', 'state', 'results', 'longitude',
       'latitude', 'inspectiondate', 'risk', 'location', 'license',
       'facilitytype', 'address', 'inspectiontype', 'dbaname', 'zip'],
      dtype=object)

### Total Errors

In [8]:
errors_template = Template('SELECT t1._tid_, t2._attribute_, t1.$attr as init, t2._value_ as gt '\
                            'FROM $init_table as t1, $grdt_table as t2 '\
                            'WHERE t1._tid_ = t2._tid_ '\
                              'AND t2._attribute_ = \'$attr\' '\
                              'AND t1.$attr != t2._value_')

In [9]:
def get_total_errors():
    all_error = []
    for attr in attributes:
        query = errors_template.substitute(init_table='food', grdt_table='food_clean',
                        attr=attr)
        df = query_df(query)
        all_error.append(df)
    return all_error

In [10]:
all_error = get_total_errors()

In [11]:
all_error = pd.concat(all_error, ignore_index=True)

In [12]:
all_error.head()

Unnamed: 0,_tid_,_attribute_,init,gt
0,113940,city,_nan_,chicago
1,118355,city,_nan_,chicago
2,119036,city,_nan_,chicago
3,119660,city,_nan_,chicago
4,368,city,_nan_,chicago


In [13]:
init_count = all_error.shape[0]
init_count

619

In [14]:
def contains_in_order(init, gt):
    if not isinstance(init, str) or not isinstance(gt, str):
        return False
    if init.strip().isdigit():
        return False
    for token in init.split(" "):
        token = token.strip()
        if token in ["st", "street", "pl", "rd", "blvd", "ave", "bldg"]:
            continue
        if token not in gt:
            return False
    return True

In [15]:
def contains(init, gt):
    return contains_in_order(init, gt) or contains_in_order(gt, init)

In [16]:
false_error = all_error[[contains(t[1]['init'], t[1]['gt']) for t in all_error.iterrows()]]

In [17]:
false_error_tid = false_error._tid_.values

In [18]:
true_error_count = init_count - len(false_error_tid)
true_error_count

405

In [19]:
false_error.head(10)

Unnamed: 0,_tid_,_attribute_,init,gt
141,132542,facilitytype,mobile food,mobile food dispenser
146,132816,facilitytype,mobile food,mobile food dispenser
147,132851,facilitytype,mobile food,mobile food dispenser
198,54907,facilitytype,store,grocery store
245,73770,facilitytype,rooftops,rooftop
348,100063,address,5633 n ashland ave,5633-5635 n ashland ave
349,100291,address,5633 n ashland ave,5633-5635 n ashland ave
350,104353,address,3632 n pulaski rd,3632-3640 n pulaski rd
351,105178,address,333 w 35th st bldg,333 w 35th st
352,106229,address,2008 n halsted,2008 n halsted st


### Total Repair

In [20]:
query = "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair " \
                 "FROM %s as t1, %s as t2 " \
                 "WHERE t1._tid_ = t2._tid_ " \
                   "AND t1.attribute = t2.attribute " \
                   "AND t1.init_value != t2.rv_value"\
                %('cell_domain', 'inf_values_dom')
all_repair = query_df(query)

In [21]:
all_repair.head()

Unnamed: 0,_tid_,attribute,init,repair
0,462,facilitytype,_nan_,restaurant
1,2746,dbaname,whale tale,filippos
2,4563,dbaname,table,bonsoiree
3,7096,dbaname,galleria market,europa galleria inc
4,9895,risk,_nan_,risk 2 (medium)


In [22]:
false_repair = all_repair[[contains(t[1]['init'], t[1]['repair']) for t in all_repair.iterrows()]]

In [23]:
false_repair.head(10)

Unnamed: 0,_tid_,attribute,init,repair
15,59974,dbaname,sports service soldier field,sportservice soldier field
32,76078,dbaname,gallery food express,gallery food express inc.
40,91077,address,201 n state st fl,201 n state st
47,108759,dbaname,sports service soldier field,sportservice soldier field
52,117866,akaname,carniceria guanajuato,carniceria guanajuato #3
58,125634,dbaname,harolds chicken,harolds chicken shack
62,128387,address,1300 e 47th st bldg,1300 e 47th st
64,133691,address,1960 w 13 th st,1960 w 13th st


In [24]:
false_repair_tid = false_repair._tid_.values

In [25]:
total_repair_init_count = all_repair.shape[0]
total_repair_init_count

65

In [26]:
true_total_repair_count = total_repair_init_count - len(false_repair_tid)
true_total_repair_count

57

In [27]:
query = "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair " \
         "FROM %s as t1, %s as t2, %s as t3 " \
         "WHERE t1._tid_ = t2._tid_ " \
           "AND t1.attribute = t2.attribute " \
           "AND t1.init_value != t2.rv_value " \
           "AND t1._tid_ = t3._tid_ " \
           "AND t1.attribute = t3._attribute_"\
        %('cell_domain', 'inf_values_dom', 'food_clean')
repair_gt = query_df(query)

In [28]:
repair_gt.head()

Unnamed: 0,_tid_,attribute,init,repair
0,26760,facilitytype,navy pier kiosk,restaurant
1,57615,facilitytype,_nan_,restaurant
2,59977,facilitytype,_nan_,restaurant
3,63404,facilitytype,_nan_,restaurant
4,67929,facilitytype,_nan_,restaurant


In [29]:
repair_gt_init_count = repair_gt.shape[0]
repair_gt_init_count

20

In [30]:
false_repair_gt = repair_gt[[contains(t[1]['init'], t[1]['repair']) for t in repair_gt.iterrows()]]

In [31]:
false_repair_gt.head(10)

Unnamed: 0,_tid_,attribute,init,repair
13,91077,address,201 n state st fl,201 n state st


In [32]:
false_repair_gt_tid = false_repair_gt._tid_.values

In [33]:
true_repair_gt_count = repair_gt_init_count - len(false_repair_gt_tid)
true_repair_gt_count

19

### Correct Repair

In [34]:
repairs_template = Template('SELECT errors._tid_, errors._attribute_, '\
                            ' errors.init, errors._value_ as gt, repairs.rv_value as repair FROM'\
                            '(SELECT t2._tid_, t2._attribute_, t2._value_, t1.$attr as init '\
                             'FROM $init_table as t1, $grdt_table as t2 '\
                             'WHERE t1._tid_ = t2._tid_ '\
                               'AND t2._attribute_ = \'$attr\' '\
                               'AND t1.$attr != t2._value_ ) as errors, $inf_dom as repairs '\
                              'WHERE errors._tid_ = repairs._tid_ '\
                                'AND errors._attribute_ = repairs.attribute ')

In [35]:
def get_total_repair():
    all_rp = []
    for attr in attributes:
        query = repairs_template.substitute(init_table='food', grdt_table='food_clean', 
                                            inf_dom = 'inf_values_dom', attr=attr)
        df = query_df(query)
        all_rp.append(df)
    return all_rp

In [36]:
all_repair_gt = get_total_repair()

In [37]:
all_repair_gt = pd.concat(all_repair_gt, ignore_index=True)

In [38]:
all_repair_gt.head()

Unnamed: 0,_tid_,_attribute_,init,gt,repair
0,152,facilitytype,childrens services facility,school,childrens services facility
1,1090,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
2,1094,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
3,1096,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
4,2275,facilitytype,mobile food dispenser,restaurant,mobile food dispenser


In [39]:
all_repair_gt.shape[0]

500

In [40]:
# 1. remove false errors & false repairs
to_remove = list(set(false_error_tid).union(set(false_repair_gt_tid)))
repair_df = all_repair_gt[~all_repair_gt['_tid_'].isin(to_remove)]
repair_df.shape[0]

343

In [41]:
# 2. pick two mutual exclusive parts: 1) strict equal -- init_count 2) non-strict equal
strict_equal = repair_df[repair_df['repair'] == repair_df['gt']]

In [42]:
strict_equal_count = strict_equal.shape[0]

In [43]:
nonstrict = repair_df[repair_df['repair'] != repair_df['gt']]

In [44]:
nonstrict

Unnamed: 0,_tid_,_attribute_,init,gt,repair
0,152,facilitytype,childrens services facility,school,childrens services facility
1,1090,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
2,1094,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
3,1096,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
4,2275,facilitytype,mobile food dispenser,restaurant,mobile food dispenser
5,2535,facilitytype,_nan_,restaurant,_nan_
6,3317,facilitytype,childrens services facility,school,childrens services facility
7,3807,facilitytype,_nan_,restaurant,_nan_
8,3922,facilitytype,mobile prepared food vendor,bakery,mobile prepared food vendor
9,4242,facilitytype,childrens services facility,daycare above and under 2 years,childrens services facility


In [45]:
nonstrict_equal = nonstrict[[contains(t[1]['gt'], t[1]['repair']) for t in nonstrict.iterrows()]]

In [46]:
nonstrict_count = nonstrict_equal.shape[0]
nonstrict_count

0

In [47]:
correct_repair_count = strict_equal_count + nonstrict_count
correct_repair_count

17

### Detected Errors

In [48]:
query = "SELECT t1._tid_, t1.init_value as init, t2._value_ as gt " \
        "FROM %s as t1, %s as t2, %s as t3 " \
        "WHERE t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ " \
        "AND t1.attribute = t2._attribute_ " \
        "AND t1.init_value != t2._value_" \
        % ('cell_domain', 'food_clean', 'dk_cells')

In [49]:
init_error = query_df(query)

In [50]:
init_error.shape[0]

508

In [51]:
# remove false error
true_error = init_error[~init_error['_tid_'].isin(false_error_tid)]

In [52]:
true_error.shape[0]

351

In [53]:
true_detected_error = true_error.shape[0]

### Compute Statistics

In [54]:
precision = float(correct_repair_count) / true_repair_gt_count

In [55]:
precision

0.8947368421052632

In [56]:
recall = float(correct_repair_count) / true_error_count
recall

0.04197530864197531

In [57]:
repairing_recall = float(correct_repair_count) / true_detected_error
repairing_recall

0.04843304843304843

In [58]:
f1 = 2*(precision*recall)/(precision+recall)
f1

0.08018867924528301

In [59]:
repairing_f1 = 2*(precision*repairing_recall)/(precision+repairing_recall)
repairing_f1

0.09189189189189188

### Read Configuration from Old Evaluation

In [60]:
ev = pd.read_csv("/fastdisk/Evaluation_Results/hc_eval_food_GL.csv")

In [61]:
config = ev[ev['notes'] == index]
config

Unnamed: 0,data,dc,prunning_k,weight_decay,normalize,bias,featurizer,notes,precision,recall,repairing_recall,F1,repairing_F1,detected_errors,total_errors,correct_repairs,total_repairs,total_repairs(Grdth_present),runtime,location
6,food,1113194633_food_GLD_cov_alpha0dot01_topk_3_s40...,0.1,0.01,False,False,initsim-constraint-initattr-freq-occur,7,0.9,0.0291,0.0354,0.0563,0.0682,508.0,619.0,18.0,65.0,20.0,9756.543961,/fastdisk/ProfilerData/food/GL/GLD/sparsity_0....


In [62]:
config.shape[1]

20

In [63]:
first_half = config.iloc[:,0:8].values[0]
former = ','.join(map(str, first_half))
former

'food,1113194633_food_GLD_cov_alpha0dot01_topk_3_s400_k8_b001_multiple_topdown_knn10_euclidean_dc.txt,0.1,0.01,False,False,initsim-constraint-initattr-freq-occur,7'

In [64]:
second_half = config.iloc[:,18:].values[0]
latter = ','.join(map(str, second_half))
latter

'9756.54396081,/fastdisk/ProfilerData/food/GL/GLD/sparsity_0.01/decomposition/knn/multiple_topdown'

In [65]:
ev.columns.values

array(['data', 'dc', 'prunning_k', 'weight_decay', 'normalize', 'bias',
       'featurizer', 'notes', 'precision', 'recall', 'repairing_recall',
       'F1', 'repairing_F1', 'detected_errors', 'total_errors',
       'correct_repairs', 'total_repairs', 'total_repairs(Grdth_present)',
       'runtime', 'location'], dtype=object)

### Output Results

In [66]:
out = open("/fastdisk/Evaluation_Results/food-true-evaluation.csv", "a+")

In [67]:
meat = ["%.4f"%precision, "%.4f"%recall, "%.4f"%repairing_recall,
        "%.4f"%f1, "%.4f"%repairing_f1,
        str(true_detected_error),str(true_error_count),
        str(correct_repair_count),str(true_total_repair_count),
        str(true_repair_gt_count)]

In [68]:
",".join(meat)

'0.8947,0.0420,0.0484,0.0802,0.0919,351,405,17,57,19'

In [69]:
out.write("{},{},{}\n".format(former, ",".join(meat),latter))

In [70]:
out.close()