# 1. Load Data

In [1]:
# surpress scipy warning
# https://github.com/scipy/scipy/issues/5998
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
import py_entitymatching as em
import pandas as pd
# Seed value for the randomized steps of this notebook (kept in one place for reproducibility)
seed = 0

In [3]:
# Read the two input tables; both use the 'vin' column as their key.
A = em.read_csv_metadata('cars_sample.csv', key='vin')
B = em.read_csv_metadata('market_sample.csv', key='vin')

# Read the pre-labeled pairs and link them to A (left) and B (right)
# through the 'ltable_vin' / 'rtable_vin' foreign-key columns.
S = em.read_csv_metadata(
    'pairs_sample_label.csv',
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_vin',
    fk_rtable='rtable_vin',
)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Then, split the labeled data into a development set and an evaluation set. The development set is used to select the best learning-based matcher.



In [4]:
# Split S into I (development set, 75% of the labeled pairs) and J (evaluation set, the rest).
# The split uses the notebook-wide `seed` so that it is reproducible and
# can be changed in a single place.
IJ = em.split_train_test(S, train_proportion=0.75, random_state=seed)
I = IJ['train']
J = IJ['test']

# 2. Extracting Feature Vectors
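**TODO:** Loading a feature table does not work at the moment (needs investigation). The cell below builds the feature table with `em.get_features_for_matching`.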

In [5]:
# Attributes used for feature generation; the same projection is applied to both tables.
match_attrs = ['name', 'price', 'maker', 'miles', 'exterior_color']
F = em.get_features_for_matching(
    A[match_attrs],
    B[match_attrs],
    validate_inferred_attr_types=False,
)

In [6]:
# List the names of all generated features (every one of them is used below)
F['feature_name']

0                                       price_price_exm
1                                       price_price_anm
2                                  price_price_lev_dist
3                                   price_price_lev_sim
4                                  maker_maker_lev_dist
5                                   maker_maker_lev_sim
6                                       maker_maker_jar
7                                       maker_maker_jwn
8                                       maker_maker_exm
9                           maker_maker_jac_qgm_3_qgm_3
10                                      miles_miles_exm
11                                      miles_miles_anm
12                                 miles_miles_lev_dist
13                                  miles_miles_lev_sim
14        exterior_color_exterior_color_jac_qgm_3_qgm_3
15    exterior_color_exterior_color_cos_dlm_dc0_dlm_dc0
16    exterior_color_exterior_color_jac_dlm_dc0_dlm_dc0
17                    exterior_color_exterior_co

In [7]:
# Convert the development set I into feature vectors using feature table F;
# the 'label' column is carried over after the feature columns.
H = em.extract_feature_vecs(
    I, feature_table=F, attrs_after='label', show_progress=False
)

In [8]:
# Preview the first five feature vectors of the development set
H.head(n=5)

Unnamed: 0,_id,ltable_vin,rtable_vin,price_price_exm,price_price_anm,price_price_lev_dist,price_price_lev_sim,maker_maker_lev_dist,maker_maker_lev_sim,maker_maker_jar,...,miles_miles_lev_sim,exterior_color_exterior_color_jac_qgm_3_qgm_3,exterior_color_exterior_color_cos_dlm_dc0_dlm_dc0,exterior_color_exterior_color_jac_dlm_dc0_dlm_dc0,exterior_color_exterior_color_mel,exterior_color_exterior_color_lev_dist,exterior_color_exterior_color_lev_sim,exterior_color_exterior_color_nmw,exterior_color_exterior_color_sw,label
46,7405,WBA8D9G53JNU67833,WBA4J3C5XJBG91177,0.0,0.967234,5.0,0.285714,0.0,1.0,1.0,...,0.666667,0.0,0.0,0.0,0.0,8.0,0.111111,-4.0,1.0,0
263,39491,4JGDA5HB3FA529524,4JGDA5HB2FA475455,0.0,0.963942,4.0,0.428571,0.0,1.0,1.0,...,0.285714,0.0,0.0,0.0,0.50505,10.0,0.090909,-4.0,1.0,0
458,69958,WBAVL1C5XFVY34215,WBAVL1C5XFVY34215,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,12.0,12.0,1
230,36575,WBA8D9G54JNU68540,WBA8D9G54JNU67730,0.0,0.977572,4.0,0.428571,0.0,1.0,1.0,...,0.666667,0.027778,0.0,0.0,0.496032,18.0,0.142857,-7.0,3.0,0
107,19234,WBA3V9C51FP798550,WBA3V9C51FP798550,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,12.0,12.0,1


In [9]:
# Check whether H contains any missing value.
# Note: `any(pd.isnull(H))` would iterate over the column *labels* of the
# boolean frame (non-empty strings, hence always truthy), so the check is
# done on the underlying values instead.
bool(pd.isnull(H).values.any())

True

In [10]:
# Replace missing feature values with the mean of their column.
# The id, foreign-key and label columns are passed as exclude_attrs.
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
    strategy='mean',
)

# 3. Cross Validation
Train several matchers using 5-fold cross-validation (CV) and pick the best matcher based on its average F1 score.

In [11]:
# Create the set of candidate ML matchers.
# Matchers that accept a random_state share the notebook-wide `seed`;
# LinRegMatcher is constructed without a random_state.
dt = em.DTMatcher(name='DecisionTree', random_state=seed)
svm = em.SVMMatcher(name='SVM', random_state=seed)
rf = em.RFMatcher(name='RF', random_state=seed)
lg = em.LogRegMatcher(name='LogReg', random_state=seed)
ln = em.LinRegMatcher(name='LinReg')

In [12]:
# Select the best ML matcher with 5-fold cross-validation on H, ranked by F1.
# The id, foreign-key and label columns are excluded from the features, and
# the notebook-wide `seed` drives the CV so the result is reproducible.
result = em.select_matcher(
    [dt, rf, svm, ln, lg],
    table=H,
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
    k=5,
    target_attr='label',
    metric_to_select_matcher='f1',
    random_state=seed,
)
# Average precision / recall / F1 of each matcher across the folds
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.993548,1.0,0.996721
1,RF,0.993548,1.0,0.996721
2,SVM,0.980345,0.955654,0.9676
3,LinReg,1.0,0.955758,0.97681
4,LogReg,1.0,0.993103,0.996491


In [13]:
# Per-fold precision of each matcher, with the mean across the folds
result['drill_down_cv_stats']['precision']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x10cc16160>,5,1.0,1.0,1.0,0.967742,1.0,0.993548
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x10cc16b38>,5,1.0,1.0,1.0,0.967742,1.0,0.993548
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x10cc16390>,5,0.969697,1.0,1.0,0.967742,0.964286,0.980345
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10cc165c0>,5,1.0,1.0,1.0,1.0,1.0,1.0
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x10cc16f98>,5,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Per-fold recall of each matcher, with the mean across the folds
result['drill_down_cv_stats']['recall']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x10cc16160>,5,1.0,1.0,1.0,1.0,1.0,1.0
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x10cc16b38>,5,1.0,1.0,1.0,1.0,1.0,1.0
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x10cc16390>,5,0.941176,0.966667,0.939394,1.0,0.931034,0.955654
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10cc165c0>,5,1.0,0.966667,0.878788,0.933333,1.0,0.955758
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x10cc16f98>,5,1.0,1.0,1.0,1.0,0.965517,0.993103


In [15]:
# Per-fold F1 of each matcher, with the mean across the folds
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x10cc16160>,5,1.0,1.0,1.0,0.983607,1.0,0.996721
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x10cc16b38>,5,1.0,1.0,1.0,0.983607,1.0,0.996721
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x10cc16390>,5,0.955224,0.983051,0.96875,0.983607,0.947368,0.9676
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10cc165c0>,5,1.0,0.983051,0.935484,0.965517,1.0,0.97681
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x10cc16f98>,5,1.0,1.0,1.0,1.0,0.982456,0.996491


# 4. Train and Predict
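Decision tree and random forest tie for the best average F1 in cross-validation (0.996721), and `select_matcher` reports the decision tree as the selected matcher (see the output below). Here we train a random forest (RF) matcher on I and apply it to J to predict. The evaluation at the end of this section shows that the matcher performs very well on J (F1 of 97.74%).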

In [16]:
# Show which matcher object select_matcher returned as the best one
result['selected_matcher']

<py_entitymatching.matcher.dtmatcher.DTMatcher at 0x10cc16160>

In [17]:
# Create a new RF matcher instance (rebinding `rf`) for the final training run,
# seeded with the notebook-wide `seed`.
rf = em.RFMatcher(name='RF', random_state=seed)

In [18]:
# Fit the RF matcher on H (the feature vectors extracted from I).
# 'label' is the training target; the id, foreign-key and label columns
# are passed as exclude_attrs.
rf.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
)

In [19]:
# Convert the evaluation set J into feature vectors with the same feature table F
L = em.extract_feature_vecs(
    J,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)

In [20]:
# Check whether L contains any missing value.
# Note: `any(pd.isnull(L))` would iterate over the column *labels* of the
# boolean frame (non-empty strings, hence always truthy), so the check is
# done on the underlying values instead.
bool(pd.isnull(L).values.any())

True

In [21]:
# Replace missing feature values in L with the column mean.
# NOTE(review): the table being imputed is L itself, so the means presumably
# come from the evaluation set rather than from the training table H; confirm
# this is intended.
L = em.impute_table(
    L,
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
    strategy='mean',
)

In [22]:
# Apply the trained RF matcher to L. With append=True and inplace=False the
# call is asked for a separate table carrying the predicted label in
# 'predicted' and the probability in 'proba'.
predictions = rf.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_vin', 'rtable_vin', 'label'],
    target_attr='predicted',
    probs_attr='proba',
    append=True,
    return_probs=True,
    inplace=False,
)

In [23]:
# Preview the ids, predicted label and probability of the first few pairs
preview_cols = ['_id', 'ltable_vin', 'rtable_vin', 'predicted', 'proba']
predictions[preview_cols].head()

Unnamed: 0,_id,ltable_vin,rtable_vin,predicted,proba
434,66276,WA1BNAFY5J2099783,WA1BNAFY8J2017383,0,0.0
122,21429,5UXWX9C34H0T23686,5UXWX9C34H0T23686,1,1.0
224,35685,WA1C2AFP9HA099371,WA1C2AFP4HA096460,0,0.0
479,72736,WDDZF4KB9JA409188,WDC0G4KB8JV060995,0,0.0
205,32389,WBA3B3G54FNR87396,WBA3B3G54FNR87396,1,1.0


In [24]:
# Compare the 'predicted' column against the 'label' column of the
# predictions table and print the resulting summary.
label_col, prediction_col = 'label', 'predicted'
eval_result = em.eval_matches(predictions, label_col, prediction_col)
em.print_eval_summary(eval_result)

Precision : 97.01% (65/67)
Recall : 98.48% (65/66)
F1 : 97.74%
False positives : 2 (out of 67 positive predictions)
False negatives : 1 (out of 83 negative predictions)
