# 1. Load Data

In [1]:
# surpress scipy warning
# https://github.com/scipy/scipy/issues/5998
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
import py_entitymatching as em
import pandas as pd
# Seed value intended for the notebook's randomized steps.
# NOTE: it only has an effect where a step actually passes `random_state=seed`.
seed = 0

In [3]:
# Read the two source tables; both use the `vin` column as their key.
A = em.read_csv_metadata('cars_sample.csv', key='vin')
B = em.read_csv_metadata('market_sample.csv', key='vin')

# Read the pre-labeled pairs and link them to A and B through their foreign keys.
S = em.read_csv_metadata(
    'pairs_sample_label.csv',
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_vin',
    fk_rtable='rtable_vin',
)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Then, split the labeled data into a development set and an evaluation set. Use the development set to select the best learning-based matcher.



In [4]:
# Split the labeled pairs S into a development set I (75%) and an evaluation set J (25%).
# Use the notebook-wide `seed` (defined in the setup cell) instead of a hard-coded
# literal so that changing the seed in one place affects the split as well.
IJ = em.split_train_test(S, train_proportion=0.75, random_state=seed)
I = IJ['train']
J = IJ['test']

# 2. Extracting Feature Vectors
**TODO:** Loading the feature table does not work (this still needs to be investigated), so the features are generated below instead.

In [5]:
# Attributes (present in both tables) from which matching features are generated.
match_attrs = ['name', 'price', 'maker', 'miles', 'exterior_color']

# Generate the feature table from the projected tables, skipping the interactive
# validation of the inferred attribute types.
F = em.get_features_for_matching(
    A[match_attrs],
    B[match_attrs],
    validate_inferred_attr_types=False,
)

In [6]:
# List the generated features; all similarity functions are used.
F['feature_name']

0                                       price_price_exm
1                                       price_price_anm
2                                  price_price_lev_dist
3                                   price_price_lev_sim
4                                  maker_maker_lev_dist
5                                   maker_maker_lev_sim
6                                       maker_maker_jar
7                                       maker_maker_jwn
8                                       maker_maker_exm
9                           maker_maker_jac_qgm_3_qgm_3
10                                      miles_miles_exm
11                                      miles_miles_anm
12                                 miles_miles_lev_dist
13                                  miles_miles_lev_sim
14        exterior_color_exterior_color_jac_qgm_3_qgm_3
15    exterior_color_exterior_color_cos_dlm_dc0_dlm_dc0
16    exterior_color_exterior_color_jac_dlm_dc0_dlm_dc0
17                    exterior_color_exterior_co

In [7]:
# Build feature vectors for the development pairs in I, keeping the `label`
# column after the feature columns.
H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label', show_progress=False)

In [8]:
# Preview the first five feature vectors.
H.head(n=5)

Unnamed: 0,_id,ltable_vin,rtable_vin,price_price_exm,price_price_anm,price_price_lev_dist,price_price_lev_sim,maker_maker_lev_dist,maker_maker_lev_sim,maker_maker_jar,...,miles_miles_lev_sim,exterior_color_exterior_color_jac_qgm_3_qgm_3,exterior_color_exterior_color_cos_dlm_dc0_dlm_dc0,exterior_color_exterior_color_jac_dlm_dc0_dlm_dc0,exterior_color_exterior_color_mel,exterior_color_exterior_color_lev_dist,exterior_color_exterior_color_lev_sim,exterior_color_exterior_color_nmw,exterior_color_exterior_color_sw,label
46,10277,WBAJE7C52JG891355,WBAJE7C52JG891355,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,12.0,12.0,1
263,40862,WDDWK8HB0HF508055,4JGDA7FB4HA900448,,,,,0.0,1.0,1.0,...,0.5,0.0,0.0,0.0,0.328986,21.0,0.086957,-16.0,2.0,0
458,73829,WAUB1GFF5J1008968,WA1BNAFY6J2021089,0.0,0.993673,3.0,0.571429,0.0,1.0,1.0,...,0.5,0.184211,0.0,0.0,0.659916,12.0,0.478261,6.0,9.0,0
230,39224,WDDSJ4GB2JN556731,WDC0G4KB2JV060541,,,,,0.0,1.0,1.0,...,0.5,0.0,0.0,0.0,0.0,12.0,0.0,-7.0,0.0,0
107,16958,WA1LAAF75JD001766,WA1LAAF75JD001766,,,,,0.0,1.0,1.0,...,1.0,0.555556,0.333333,0.2,0.957895,2.0,0.894737,17.0,17.0,1


In [9]:
# Check whether any cell of H is missing.
# Iterating a DataFrame yields its column labels, so the built-in `any(...)` over
# `pd.isnull(H)` only tests the truthiness of the column names and is True no
# matter what the data contains. Reduce over the boolean values instead.
pd.isnull(H).values.any()

True

In [10]:
# Replace missing feature values with the mean of their column; the id and
# label columns are excluded from imputation.
H = em.impute_table(
    H,
    strategy='mean',
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
)

# 3. Cross Validation
Train different matchers using 5-fold cross-validation (CV) and pick the best matcher based on its average F1 score.

In [11]:
# Create the candidate ML matchers.
# Matchers that accept `random_state` take the notebook-wide `seed` (defined in the
# setup cell) rather than a hard-coded literal, so the seed is controlled in one place.
dt = em.DTMatcher(name='DecisionTree', random_state=seed)
svm = em.SVMMatcher(name='SVM', random_state=seed)
rf = em.RFMatcher(name='RF', random_state=seed)
lg = em.LogRegMatcher(name='LogReg', random_state=seed)
# No `random_state` is passed to the linear-regression matcher.
ln = em.LinRegMatcher(name='LinReg')

In [12]:
# Select the best ML matcher with 5-fold cross-validation, ranked by F1.
# The id and label columns are excluded from the features. The fold assignment
# uses the notebook-wide `seed` (defined in the setup cell) instead of a literal.
result = em.select_matcher(
    [dt, rf, svm, ln, lg],
    table=H,
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
    k=5,
    target_attr='label',
    metric_to_select_matcher='f1',
    random_state=seed,
)
# Summary table: average precision / recall / F1 per matcher.
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.992308,1.0,0.996078
1,RF,1.0,1.0,1.0
2,SVM,1.0,0.949315,0.973211
3,LinReg,1.0,0.969433,0.98441
4,LogReg,1.0,0.977856,0.988718


In [13]:
# Per-fold precision of every matcher.
precision_by_fold = result['drill_down_cv_stats']['precision']
precision_by_fold

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x119f8fef0>,5,1.0,1.0,1.0,1.0,0.961538,0.992308
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x119f8f4a8>,5,1.0,1.0,1.0,1.0,1.0,1.0
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x119f8f518>,5,1.0,1.0,1.0,1.0,1.0,1.0
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x119f8f898>,5,1.0,1.0,1.0,1.0,1.0,1.0
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x119f8fcc0>,5,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Per-fold recall of every matcher.
recall_by_fold = result['drill_down_cv_stats']['recall']
recall_by_fold

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x119f8fef0>,5,1.0,1.0,1.0,1.0,1.0,1.0
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x119f8f4a8>,5,1.0,1.0,1.0,1.0,1.0,1.0
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x119f8f518>,5,0.97561,1.0,0.9,0.870968,1.0,0.949315
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x119f8f898>,5,0.95122,0.961538,0.966667,0.967742,1.0,0.969433
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x119f8fcc0>,5,1.0,0.961538,1.0,0.967742,0.96,0.977856


In [15]:
# Per-fold F1 of every matcher.
f1_by_fold = result['drill_down_cv_stats']['f1']
f1_by_fold

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x119f8fef0>,5,1.0,1.0,1.0,1.0,0.980392,0.996078
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x119f8f4a8>,5,1.0,1.0,1.0,1.0,1.0,1.0
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x119f8f518>,5,0.987654,1.0,0.947368,0.931034,1.0,0.973211
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x119f8f898>,5,0.975,0.980392,0.983051,0.983607,1.0,0.98441
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x119f8fcc0>,5,1.0,0.980392,1.0,0.983607,0.979592,0.988718


# 4. Train and Predict
Random forest is selected by cross-validation, so train an RF matcher on I and apply it to J to make predictions. The evaluation below shows that the matcher performs very well on J.

In [16]:
# Show which matcher cross-validation selected.
selected_matcher = result['selected_matcher']
selected_matcher

<py_entitymatching.matcher.rfmatcher.RFMatcher at 0x119f8f4a8>

In [17]:
# Create a fresh random-forest matcher for the final training run, seeded with the
# notebook-wide `seed` (defined in the setup cell) instead of a hard-coded literal.
rf = em.RFMatcher(name='RF', random_state=seed)

In [18]:
# Fit the matcher on the feature vectors extracted from I (table H);
# the id and label columns are excluded from the features.
rf.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
)

In [19]:
# Build feature vectors for the evaluation pairs in J, keeping the `label`
# column after the feature columns.
L = em.extract_feature_vecs(
    J,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)

In [20]:
# Check whether any cell of L is missing.
# Iterating a DataFrame yields its column labels, so the built-in `any(...)` over
# `pd.isnull(L)` only tests the truthiness of the column names and is True no
# matter what the data contains. Reduce over the boolean values instead.
pd.isnull(L).values.any()

True

In [21]:
# Replace missing feature values in L with the mean of their column; the id and
# label columns are excluded from imputation.
L = em.impute_table(
    L,
    strategy='mean',
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
)

In [22]:
# Predict on the evaluation feature vectors. The result is returned as a new table
# (inplace=False) with the predicted label in `predicted` and the probability in `proba`.
predictions = rf.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
    target_attr='predicted',
    probs_attr='proba',
    return_probs=True,
    append=True,
    inplace=False,
)

In [23]:
# Preview the ids together with the predicted label and its probability.
shown_cols = ['_id', 'ltable_vin', 'rtable_vin', 'predicted', 'proba']
predictions[shown_cols].head()

Unnamed: 0,_id,ltable_vin,rtable_vin,predicted,proba
434,70427,WDDLJ9BB2EA100847,WDDLJ9BB2EA100847,1,1.0
122,19114,2T2BK1BA3FC327307,2T2BK1BA3FC327307,1,1.0
224,39210,4JGDA5HB0JB086817,WDC0G4KB2JV060541,0,0.0
479,75120,JTHBL46F375003358,JTHBL46F375003358,1,1.0
205,36375,WA1LFAFP3FA108602,WAUWFLFF8F1037786,0,0.0


In [24]:
# Compare the `label` column against the `predicted` column and print the summary.
gold_col, predicted_col = 'label', 'predicted'
eval_result = em.eval_matches(predictions, gold_col, predicted_col)
em.print_eval_summary(eval_result)

Precision : 100.0% (58/58)
Recall : 100.0% (58/58)
F1 : 100.0%
False positives : 0 (out of 58 positive predictions)
False negatives : 0 (out of 92 negative predictions)
