In [2]:
import py_entitymatching as em

TypeError: required field "type_ignores" missing from Module

# Down Sampling

In [None]:
# Read the CSV file; 'Id' is registered as the key attribute of the table.
A = em.read_csv_metadata('./data/csv_example_messy_input.csv', key='Id')
print(len(A))

# Downsample the datasets (the table is sampled against itself).
# y_param: Control the down sample size of table sample_A.
# seed: fixed so that Restart Kernel -> Run All draws the same samples.
sample_A, sample_B = em.down_sample(A, A,
                                    size=500,
                                    y_param=1,
                                    seed=0)

print(len(sample_A), len(sample_B))

# Blocker

In [None]:
# Attribute-equivalence blocking on the 'Phone' attribute of both samples.
# l_output_attrs / r_output_attrs: attribute names from each table to be
# included in the output candidate set.
output_attrs = ['Site name', 'Address', 'Phone', 'Zip']

ab = em.AttrEquivalenceBlocker()
C0 = ab.block_tables(
    sample_A, sample_B,
    'Phone', 'Phone',
    l_output_attrs=output_attrs,
    r_output_attrs=output_attrs,
)
C0.head()

In [None]:
# Overlap blocking on the 'Address' attribute of both samples.
# word_level: whether the overlap attributes should be tokenized as words.
output_attrs = ['Site name', 'Address', 'Phone', 'Zip']

ob = em.OverlapBlocker()
C1 = ob.block_tables(
    sample_A, sample_B,
    'Address', 'Address',
    word_level=True,
    overlap_size=2,
    l_output_attrs=output_attrs,
    r_output_attrs=output_attrs,
)
C1.head()

In [2]:
# Blackbox Blockers
def match_func(ltuple, rtuple):
    """Return True when two records differ in BOTH 'Phone' and 'Zip'.

    Both arguments are indexed by attribute name. The result is False as
    soon as either the phone or the zip values compare equal.

    NOTE(review): this is registered as the BlackBoxBlocker function; per
    the py_entitymatching docs a True result presumably means "drop the
    pair", which the name `match_func` does not suggest -- confirm.
    """
    phones_differ = ltuple['Phone'] != rtuple['Phone']
    zips_differ = ltuple['Zip'] != rtuple['Zip']
    return bool(phones_differ and zips_differ)


# Register match_func as the black-box function and block the two samples.
output_attrs = ['Site name', 'Address', 'Phone', 'Zip']

bb = em.BlackBoxBlocker()
bb.set_black_box_function(match_func)
C2 = bb.block_tables(
    sample_A, sample_B,
    l_output_attrs=output_attrs,
    r_output_attrs=output_attrs,
)
C2.head()

TypeError: required field "type_ignores" missing from Module

In [None]:
# Build the feature table that can be used for blocking purposes.
# validate_inferred_attr_types: whether to show the user the inferred
# attribute types and the features chosen for those types (off here).
block_f = em.get_features_for_blocking(
    sample_A, sample_B,
    validate_inferred_attr_types=False,
)
block_f

In [None]:
# Rule-based blocker: a single rule with one predicate, evaluated against
# the blocking feature table `block_f`.
rule1 = ['Address_Address_lev_dist(ltuple, rtuple) > 5']
output_attrs = ['Site name', 'Address', 'Phone', 'Zip']

rb = em.RuleBasedBlocker()
rb.add_rule(rule1, block_f)

C3 = rb.block_tables(
    sample_A, sample_B,
    l_output_attrs=output_attrs,
    r_output_attrs=output_attrs,
)
C3.head()

In [None]:
# Combining multiple blockers: the candidate set produced by the first
# blocker is passed to the second one via block_candset.
output_attrs = ['Site name', 'Address', 'Phone', 'Zip']

# Step 1: attribute equivalence on 'Phone'.
ab = em.AttrEquivalenceBlocker()
C0 = ab.block_tables(
    sample_A, sample_B,
    'Phone', 'Phone',
    l_output_attrs=output_attrs,
    r_output_attrs=output_attrs,
)

# Step 2: word-level overlap on 'Address', applied to the candidate set C0.
ob = em.OverlapBlocker()
C4 = ob.block_candset(
    C0,
    'Address', 'Address',
    word_level=True,
    overlap_size=2,
)
C4.head()

# Sampling and Labeling

In [None]:
# Read the CSV file; 'Id' is registered as the key attribute of the table.
A = em.read_csv_metadata('./data/csv_example_messy_input.csv', key='Id')
# Downsample the datasets (the table is sampled against itself).
# seed: fixed so that the blocked, sampled and labelled data built from
# these samples can be regenerated after a kernel restart.
sample_A, sample_B = em.down_sample(A, A, size=500, y_param=1,
                                    show_progress=False, seed=0)


def match_func(ltuple, rtuple):
    """Return True when two records differ in BOTH 'Phone' and 'Zip'.

    Both arguments are indexed by attribute name. The result is False as
    soon as either the phone or the zip values compare equal.

    NOTE(review): this is registered as the BlackBoxBlocker function; per
    the py_entitymatching docs a True result presumably means "drop the
    pair", which the name `match_func` does not suggest -- confirm.
    """
    phones_differ = ltuple['Phone'] != rtuple['Phone']
    zips_differ = ltuple['Zip'] != rtuple['Zip']
    return bool(phones_differ and zips_differ)


# Stage 1: black-box blocking of the two samples with match_func.
output_attrs = ['Site name', 'Address', 'Phone', 'Zip']
bb = em.BlackBoxBlocker()
bb.set_black_box_function(match_func)
C = bb.block_tables(
    sample_A, sample_B,
    l_output_attrs=output_attrs,
    r_output_attrs=output_attrs,
)

# Stage 2: word-level overlap blocking on 'Address' over the candidate set.
ob = em.OverlapBlocker()
C1 = ob.block_candset(
    C,
    'Address', 'Address',
    word_level=True,
    overlap_size=2,
)

# Sample 450 pairs from the candidate set.
S = em.sample_table(C1, 450)

# Label the sampled set; 'label' is the name of the label column.
G = em.label_table(S, 'label')

# Writing is off by default; flip the flag to save the samples and labels.
save_file = False
if save_file:
    for table, path in (
        (sample_A, './data/a.csv'),
        (sample_B, './data/b.csv'),
        (G, './data/labelled_data.csv'),
    ):
        em.to_csv_metadata(table, path)

# Matcher

In [None]:
# Load the two saved samples (keyed on 'Id') and the labelled candidate set,
# which is linked back to them through its foreign-key columns.
sample_A = em.read_csv_metadata('./data/a.csv', key='Id')
sample_B = em.read_csv_metadata('./data/b.csv', key='Id')

G = em.read_csv_metadata(
    './data/labelled_data.csv',
    key='_id',
    ltable=sample_A,
    rtable=sample_B,
    fk_ltable='ltable_Id',
    fk_rtable='rtable_Id',
)

In [None]:
# Build the feature table for matching from the two samples
# (validate_inferred_attr_types is switched off) and list the feature names.
F = em.get_features_for_matching(
    sample_A, sample_B,
    validate_inferred_attr_types=False,
)
F.feature_name

In [None]:
# Turn the labelled set G into feature vectors using the feature table F.
# attrs_after: attributes from the input candset that should be added
# after the feature vectors (here the 'label' column).
H = em.extract_feature_vecs(
    G,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)
H.head()

In [None]:
# Fill missing feature values with the mean of each column; the id and
# label columns are excluded from imputation.
non_feature_attrs = ['_id', 'ltable_Id', 'rtable_Id', 'label']
H = em.impute_table(H, exclude_attrs=non_feature_attrs, strategy='mean')

In [None]:
# Fixed seed so that the tree-based matchers and the cross-validation folds
# give the same cv_stats after a kernel restart.
SEED = 0

# Create a set of ML-matchers.
# random_state is forwarded to the underlying scikit-learn classifier.
dt = em.DTMatcher(name='DecisionTree', random_state=SEED)
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RF', random_state=SEED)
lg = em.LogRegMatcher(name='LogReg')
ln = em.LinRegMatcher(name='LinReg')

# Select the best ML matcher using CV
# k: The k value for cross-validation
# random_state: seeds the fold assignment used for cross-validation
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
                           exclude_attrs=['_id', 'ltable_Id', 'rtable_Id', 'label'],
                           k=5,
                           target_attr='label',
                           random_state=SEED)
print(result['cv_stats'])

# Entity Matching

In [None]:
import py_entitymatching as em

# Load both inputs, keyed on 'Id': the table that carries the true ids (A)
# and the messy input table (A_0).
A = em.read_csv_metadata(
    './data/csv_example_input_with_true_ids.csv',
    key='Id',
)
A_0 = em.read_csv_metadata(
    './data/csv_example_messy_input.csv',
    key='Id',
)


def match_func(ltuple, rtuple):
    """Return True when two records differ in BOTH 'Phone' and 'Zip'.

    Both arguments are indexed by attribute name. The result is False as
    soon as either the phone or the zip values compare equal.

    NOTE(review): this is registered as the BlackBoxBlocker function; per
    the py_entitymatching docs a True result presumably means "drop the
    pair", which the name `match_func` does not suggest -- confirm.
    """
    phones_differ = ltuple['Phone'] != rtuple['Phone']
    zips_differ = ltuple['Zip'] != rtuple['Zip']
    return bool(phones_differ and zips_differ)


# Seed shared by the train/test split and the random forest so that the
# printed evaluation and the saved predictions are reproducible across
# kernel restarts.
SEED = 0

# has_label selects the path: False builds and labels the candidate set,
# True trains and applies a matcher using the previously saved files.
has_label = True
if not has_label:
    # Combining Multiple Blockers
    bb = em.BlackBoxBlocker()
    bb.set_black_box_function(match_func)
    C = bb.block_tables(A, A,
                        l_output_attrs=['Site name', 'Address', 'Zip', 'Phone', 'True Id'],
                        r_output_attrs=['Site name', 'Address', 'Zip', 'Phone', 'True Id'])

    ob = em.OverlapBlocker()
    C1 = ob.block_candset(C,
                          'Address', 'Address',
                          word_level=True,
                          overlap_size=2)

    # Persist the full candidate set; the other branch reads it back.
    em.to_csv_metadata(C1, './data/block_tables.csv')

    # Sample Candidate Set
    S = em.sample_table(C1, 400)

    # Label the sampled set
    # Specify the name for the label column
    G = em.label_table(S, 'label')
    em.to_csv_metadata(G, './data/labelled_data_full.csv')

else:
    # Load the labelled pairs (G) and the full candidate set (C); both are
    # linked to the messy input table through their foreign-key columns.
    G = em.read_csv_metadata('./data/labelled_data_full.csv',
                             key='_id',
                             fk_ltable='ltable_Id', fk_rtable='rtable_Id',
                             ltable=A_0, rtable=A_0)
    C = em.read_csv_metadata('./data/block_tables.csv',
                             key='_id',
                             fk_ltable='ltable_Id', fk_rtable='rtable_Id',
                             ltable=A_0, rtable=A_0)

    # 70/30 split of the labelled data; seeded so the split is repeatable.
    train_test = em.split_train_test(G, train_proportion=0.7,
                                     random_state=SEED)
    devel_set = train_test['train']
    eval_set = train_test['test']

    # Generate a set of features
    F = em.get_features_for_matching(A_0, A_0, validate_inferred_attr_types=False)

    # Convert the data into a set of feature vectors using F
    H_train = em.extract_feature_vecs(devel_set,
                                      feature_table=F,
                                      attrs_after='label')
    H_eval = em.extract_feature_vecs(eval_set,
                                     feature_table=F,
                                     attrs_after='label')
    C_test = em.extract_feature_vecs(C,
                                     feature_table=F)

    # Columns that identify a pair and must not be treated as features.
    id_attrs = ['_id', 'ltable_Id', 'rtable_Id']
    labelled_exclude = id_attrs + ['label']

    # Impute feature vectors with the mean of the column values.
    H_train = em.impute_table(H_train,
                              exclude_attrs=labelled_exclude,
                              strategy='mean')
    H_eval = em.impute_table(H_eval,
                             exclude_attrs=labelled_exclude,
                             strategy='mean')
    C_test = em.impute_table(C_test,
                             exclude_attrs=id_attrs,
                             strategy='mean')

    # Train Matcher (random_state is forwarded to the scikit-learn forest)
    rf = em.RFMatcher(name='RF', random_state=SEED)
    rf.fit(table=H_train,
           exclude_attrs=labelled_exclude,
           target_attr='label')

    # Test1: predict on the held-out labelled pairs and print the summary.
    pred_table = rf.predict(table=H_eval,
                            exclude_attrs=labelled_exclude,
                            append=True,
                            target_attr='predicted_labels')

    eval_summary = em.eval_matches(pred_table, 'label', 'predicted_labels')
    print(eval_summary)

    # Test2: predict on the full candidate set and save the predictions.
    pred_table = rf.predict(table=C_test,
                            exclude_attrs=id_attrs,
                            append=True,
                            target_attr='predicted_labels')
    # Save the table returned by predict, which is the one guaranteed to
    # carry 'predicted_labels'; the original saved C_test and so relied on
    # predict having modified its input in place.
    em.to_csv_metadata(pred_table, './data/result.csv')

# Evaluation

In [None]:
from future.utils import viewitems

import csv
import collections
import itertools


def evaluateDuplicates(found_dupes, true_dupes):
    """Print the number of found pairs, then pair-level precision and recall.

    Args:
        found_dupes: set of predicted duplicate pairs.
        true_dupes: set of ground-truth duplicate pairs, with elements
            comparable to those of found_dupes.

    A metric whose denominator is empty is printed as 0.0 instead of
    raising ZeroDivisionError.
    """
    true_positives = found_dupes.intersection(true_dupes)
    false_positives = found_dupes.difference(true_dupes)

    # Guard both divisions: an empty prediction set or an empty ground truth
    # would otherwise abort the evaluation half-way through its output.
    if found_dupes:
        precision = 1 - len(false_positives) / float(len(found_dupes))
    else:
        precision = 0.0

    if true_dupes:
        recall = len(true_positives) / float(len(true_dupes))
    else:
        recall = 0.0

    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    print(precision)

    print('recall')
    print(recall)


def mydupePairs(filename):
    """Collect the predicted duplicate pairs from a prediction CSV file.

    Args:
        filename: path of a UTF-8 CSV file with the columns 'ltable_Id',
            'rtable_Id' and 'predicted_labels'.

    Returns:
        A set of frozensets, one per row whose 'predicted_labels' value is
        the string '1' and whose two ids differ. Pairs are unordered, so
        (a, b) and (b, a) collapse into a single entry.
    """
    dupe_s = set()
    # newline='' is what the csv module requires of the file object, so that
    # line endings inside quoted fields are handled by the reader itself.
    with open(filename, encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        for row in reader:
            ltable_Id = row['ltable_Id']
            rtable_Id = row['rtable_Id']
            if row['predicted_labels'] == '1':
                pair = frozenset((ltable_Id, rtable_Id))
                # A record paired with itself collapses to one element.
                if len(pair) > 1:
                    dupe_s.add(pair)
    return dupe_s


def dupePairs(filename, rowname):
    """Build the set of duplicate pairs implied by a cluster-id column.

    Args:
        filename: path of a UTF-8 CSV file with an 'Id' column.
        rowname: name of the column holding the cluster id of each record.

    Returns:
        A set of frozensets, one for every 2-combination of 'Id' values
        that share the same value in the `rowname` column. Records whose
        cluster id is 'x' are ignored.
    """
    dupe_d = collections.defaultdict(list)

    # newline='' is what the csv module requires of the file object, so that
    # line endings inside quoted fields are handled by the reader itself.
    with open(filename, encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        for row in reader:
            dupe_d[row[rowname]].append(row['Id'])

    # Drop the 'x' group -- presumably a placeholder for records without a
    # known cluster; confirm against the data.
    dupe_d.pop('x', None)

    dupe_s = set()
    # Plain dict.values() replaces future.utils.viewitems: this function
    # already needs Python 3 (open(..., encoding=...)) and never used the key.
    for cluster in dupe_d.values():
        if len(cluster) > 1:
            for pair in itertools.combinations(cluster, 2):
                dupe_s.add(frozenset(pair))

    return dupe_s

# Inputs of the evaluation: the file carrying the true cluster ids and the
# prediction file written by the entity-matching step.
manual_clusters = './data/csv_example_input_with_true_ids.csv'
dedupe_clusters = './data/result.csv'

# Ground-truth pairs come from the 'True Id' column; predicted pairs come
# from the rows of the result file labelled as matches.
true_dupes = dupePairs(manual_clusters, 'True Id')
test_dupes = mydupePairs(dedupe_clusters)

# Print the number of found pairs, precision and recall.
evaluateDuplicates(test_dupes, true_dupes)