In [4]:
from future.builtins import next
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

import pandas as pd

In [5]:
pd.options.display.float_format = '{:20,.2f}'.format
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

In [11]:
ebooks1_all_path = (r'/home/ubuntu/jupyter/ServerX/1_Standard Data Integration/Sample Datasets'
                    r'/Processed Data/product_samples/ebooks1_all.csv')

In [12]:
input_file = ebooks1_all_path
output_file = 'ebooks1_output2.csv'
settings_file = 'ebooks1_learned_settings2'
training_file = 'ebooks1_training2.json'

## DF and corpus prep

In [19]:
fields_of_interest = [
    'Id',
    'name',
    'description',
    'producer',
    'price',
    'source'
]

In [20]:
ebooks1_all = pd.read_csv(ebooks1_all_path, sep=',', quotechar='"')[fields_of_interest]

In [21]:
ebooks1_all.columns

Index(['Id', 'name', 'description', 'producer', 'price', 'source'], dtype='object')

In [48]:
nan_float_ids = [4614, 4770, 8449]

In [51]:
ebooks1_all = ebooks1_all[~(ebooks1_all['name'].isnull())]

In [57]:
ebooks1_all[(ebooks1_all['description'].isnull())].head()

Unnamed: 0,Id,name,description,producer,price,source
943,944,Running Technique,,Brian Martin,8.99,itunes
7694,1195,Canoe Country,,University of Minnesota Press,60.0,ebooks
8165,1666,Best Bike Rides Philadelphia,,Falcon Guides,17.99,ebooks
10047,3548,A Guide to Improvised Weaponry,,F+W Media,15.99,ebooks
10114,3615,The Official Gun Digest Book of Guns & Prices 2015,,F+W Media,26.99,ebooks


In [41]:
x = ebooks1_all[(ebooks1_all['name'].isnull()]
x.columns

Index(['Id', 'name', 'description', 'producer', 'price', 'source'], dtype='object')

In [37]:
type(x['name'][11113])

float

In [12]:
ebooks1_all[ebooks1_all['name'] == None]

Unnamed: 0.1,Unnamed: 0,Id,name,description,producer,price,source


In [14]:
description_corpus = ebooks1_all['description'].to_list()
description_corpus = [x for x in description_corpus if str(x) != 'nan']

In [58]:
producer_corpus = ebooks1_all.drop_duplicates().to_dict('records')

In [59]:
producers = list(ebooks1_all['producer'].unique())
producers = [x for x in producers if str(x) != 'nan']

## ----------------------------------------------------------------------------------------------------------

In [65]:
def preProcess(key, column):
    
    try : # python 2/3 string differences
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    column = column.lower()
    if not column:
        return None
        
    if key == 'price':
        column = float(column) 
    return column

In [66]:
def readData(filename):
    
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(k, v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d 

In [67]:
print('importing data ...')
data_d = readData(input_file)

importing data ...


## Dedupe Processing

In [68]:
fields = [
    {'field' : 'name', 'type': 'Name'},
#    {'field' : 'name', 'type': 'String'},
 #   {'field' : 'description', 
 #    'type': 'Text',
 #    'corpus': description_corpus,
 #    'has_missing': True
 #   },
#    {'field' : 'category', 
#     'type': 'FuzzyCategorical',
#     'categories': categories,
#     'corpus': category_corpus,
#     'has missing' : True
#    },        
    {'field' : 'producer', 
     'type': 'FuzzyCategorical',
     'categories': producers,
     'corpus': producer_corpus,
     'has_missing': True
    },
    {'field' : 'price', 
     'type': 'Price',
     'has_missing': True
    },
]

In [69]:
deduper = dedupe.Dedupe(fields)

In [70]:
deduper.prepare_training(data_d)

INFO:dedupe.canopy_index:Removing stop word  d
INFO:dedupe.canopy_index:Removing stop word  h
INFO:dedupe.canopy_index:Removing stop word de
INFO:dedupe.canopy_index:Removing stop word en
INFO:dedupe.canopy_index:Removing stop word es
INFO:dedupe.canopy_index:Removing stop word s 
INFO:dedupe.canopy_index:Removing stop word t 
INFO:dedupe.canopy_index:Removing stop word y 
INFO:dedupe.canopy_index:Removing stop word  b
INFO:dedupe.canopy_index:Removing stop word  r
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word fo
INFO:dedupe.canopy_index:Removing stop word in
INFO:dedupe.canopy_index:Removing stop word nd
INFO:dedupe.canopy_index:Removing stop word or
INFO:dedupe.canopy_index:Removing stop word r 
INFO:dedupe.canopy_index:Removing stop word to
INFO:dedupe.canopy_index:Removing stop word un
INFO:dedupe.canopy_index:Removing stop word  l
INFO:dedupe.canopy_index:Removing stop word io
INFO:dedupe.canopy_index:Removing stop word on
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word se
INFO:dedupe.canopy_index:Removing stop word am
INFO:dedupe.canopy_index:Removing stop word ro
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word ic
INFO:dedupe.canopy_index:Removing stop word ts
INFO:dedupe.canopy_index:Removing stop word ll
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.8, name, CorporationName), PartialPredicate: (commonTwoTokens, name, CorporationName))


In [71]:
dedupe.consoleLabel(deduper)

name : brett
producer : triumph books
price : 11.99

name : birdie
producer : triumph books
price : 11.99

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


name : hiking michigan
producer : human kinetics
price : 19.95

name : hiking missouri
producer : human kinetics
price : 19.95

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things bulls fans should know & do before they die
producer : triumph books
price : 11.99

name : the psycho 100
producer : triumph books
price : 11.99

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things syracuse fans should know & do before they die
producer : triumph books
price : 11.99

name : the psycho 100
producer : triumph books
price : 11.99

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : instant tennis 2
producer : infinite ideas
price : 2.95

name : instant tennis 2
producer : infinite ideas
price : 2.95

0/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : golf for dummies
producer : wiley
price : 0.0

name : golf for dummies
producer : wiley
price : 0.0

1/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.8, name, CorporationName), PartialPredicate: (commonTwoTokens, name, CorporationName))
INFO:dedupe.training:(PartialIndexLevenshteinCanopyPredicate: (4, name, CorporationName), SimplePredicate: (alphaNumericPredicate, name))
name : 101 youth netball drills age 7-11
producer : bloomsbury publishing
price : 16.99

name : 101 youth netball drills age 12-16
producer : bloomsbury publishing
price : 19.99

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.6, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialIndexLevenshteinCanopyPredicate: (4, name, CorporationName), SimplePredicate: (alphaNumericPredicate, name))
name : the gun digest book of tactical weapons assembly/disassembly
producer : f+w media
price : 24.99

name : the gun digest book of tactical weapons assembly/disassembly
producer : f+w media
price : 29.99

2/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : outdoor recreation management
producer : taylor and francis
price : 66.95

name : outdoor recreation management
producer : taylor and francis
price : 66.95

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.2, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialIndexLevenshteinCanopyPredicate: (4, name, CorporationName), SimplePredicate: (alphaNumericPredicate, name))
name : sporting nationalisms
producer : taylor and francis
price : 64.95

name : sporting cultures
producer : taylor and francis
price : 54.95

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (twoGramFingerprint, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialIndexLevenshteinCanopyPredicate: (4, name, CorporationName), SimplePredicate: (alphaNumericPredicate, name))
name : the football fanatic's essential guide part 2: 1978 to 2010
producer : hachette india
price : 0.99

name : the football fanatic's essential guide part 1: origins to 1974
producer : hachette india
price : 0.99

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : wilderness world of cameron mcneish
producer : neil wilson publishing
price : 14.57

name : wilderness world of cameron mcneish
producer : neil wilson publishing
price : 14.57

4/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : the danihers
producer : allen & unwin
price : 29.99

name : the danihers
producer : allen & unwin
price : 29.99

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (wholeFieldPredicate, price))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.2, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
name : slow journeys
producer : allen & unwin
price : 25.44

name : slow journeys
producer : allen & unwin
price : 25.45

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : seeing the sunrise
producer : allen & unwin
price : 22.71

name : seeing the sunrise
producer : allen & unwin
price : 22.72

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (wholeFieldPredicate, price))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.2, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialIndexTfidfNGramCanopyPredicate: (0.8, name, CorporationName), SimplePredicate: (sameThreeCharStartPredicate, name))
name : shooter's bible
producer : skyhorse publishing
price : 29.95

name : shooter's bible
producer : skyhorse publishing
price : 29.99

8/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, name), SimplePredicate: (wholeFieldPredicate, producer))
name : legendary hunts
producer : boone and crockett club
price : 9.99

name : legendary hunts ii
producer : boone and crockett club
price : 9.99

9/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun digest book of concealed carry
producer : f+w media
price : 27.99

name : the gun digest book of concealed carry
producer : f+w media
price : 24.99

9/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : never mind the bluebirds 2
producer : the history press
price : 7.99

name : never mind the bluebirds
producer : the history press
price : 9.99

10/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (wholeFieldPredicate, price))
name : the complete book of surf fishing
producer : skyhorse publishing
price : 16.99

name : the complete book of surf fishing
producer : skyhorse publishing
price : 16.95

10/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : the card
producer : harpercollins
price : 10.99

name : the card
producer : harpercollins
price : 11.14

11/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), PartialIndexTfidfNGramCanopyPredicate: (0.4, name, CorporationName))
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (roundTo1, price))
name : the everything running book
producer : f+w media
price : 14.95

name : the everything running book
producer : f+w media
price : 16.95

12/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : sport and the law
producer : university of nsw press
price : 38.95

name : sport and the law
producer : university of nsw press
price : 49.95

13/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (roundTo1, price))
name : communication and sport
producer : sage publications
price : 68.0

name : communication and sport
producer : sage publications
price : 55.0

14/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.2, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (wholeFieldPredicate, price))
name : in search of burningbush
producer : mcgraw-hill education
price : 19.95

name : in search of burningbush
producer : mcgraw-hill education
price : 14.95

15/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : cartridges of the world
producer : f+w media
price : 47.49

name : cartridges of the world
producer : f+w media
price : 34.99

16/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, name), SimplePredicate: (wholeFieldPredicate, producer))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
name : food, nutrition and sports performance ii
producer : taylor and francis
price : 75.95

name : food, nutrition and sports performance iii
producer : taylor and francis
price : 47.95

17/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sport and the english, 1918-1939
producer : taylor and francis
price : 215.0

name : sport and social mobility
producer : taylor and francis
price : 48.95

17/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : children and exercise xxviii
producer : taylor and francis
price : 53.95

name : children and exercise xxvii
producer : taylor and francis
price : 48.95

17/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : best easy day hikes st. louis
producer : falcon guides
price : 8.99

name : best easy day hikes bend and central oregon
producer : falcon guides
price : 8.99

17/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : modelling the messerschmitt bf 109b/c/d/e
producer : osprey publishing
price : 15.95

name : modelling the messerschmitt me 262
producer : osprey publishing
price : 15.95

17/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : little book of golf tips
producer : g2 rights ltd
price : 4.36

name : little book of golf legends
producer : g2 rights ltd
price : 4.36

17/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : 60 hikes within 60 miles: harrisburg
producer : menasha ridge press
price : 15.95

name : 60 hikes within 60 miles: st. louis
producer : menasha ridge press
price : 15.95

17/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : no holds barred fighting
producer : tracks publishing
price : 9.99

name : no holds barred fighting: takedowns
producer : tracks publishing
price : 9.99

17/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : best tent camping: kentucky
producer : menasha ridge press
price : 15.95

name : best tent camping: georgia
producer : menasha ridge press
price : 15.95

18/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialIndexTfidfNGramCanopyPredicate: (0.4, name, CorporationName), TfidfTextCanopyPredicate: (0.6, name))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.6, name, CorporationName), PartialPredicate: (commonThreeTokens, name, CorporationName))
INFO:dedupe.training:(PartialPredicate: (wholeFieldPredicate, name, CorporationName), SimplePredicate: (roundTo1, price))
name : best hikes near st. louis
producer : falcon guides
price : 18.99

name : best hikes near salt lake city
producer : falcon guides
price : 17.99

18/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : kinanthropometry ix
producer : taylor and francis
price : 54.95

name : kinanthropometry viii
producer : taylor and francis
price : 49.95

18/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : modelling the messerschmitt bf 110
producer : osprey publishing
price : 15.95

name : modelling the messerschmitt bf 109f and early g series
producer : osprey publishing
price : 15.95

18/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the best american sports writing 2011
producer : houghton mifflin harcourt
price : 14.95

name : the best american sports writing 2012
producer : houghton mifflin harcourt
price : 14.95

18/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : no holds barred fighting: the ultimate guide to conditioning
producer : tracks publishing
price : 9.99

name : no holds barred fighting
producer : tracks publishing
price : 9.99

18/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : best bike rides boston
producer : falcon guides
price : 20.99

name : best bike rides chicago
producer : falcon guides
price : 17.99

19/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (twoGramFingerprint, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, price), TfidfTextCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.8, name, CorporationName), PartialPredicate: (commonTwoTokens, name, CorporationName))
name : 60 hikes within 60 miles: nashville
producer : menasha ridge press
price : 16.95

name : 60 hikes within 60 miles: st. louis
producer : menasha ridge press
price : 15.95

19/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 60 hikes within 60 miles: sacramento
producer : menasha ridge press
price : 17.95

name : 60 hikes within 60 miles: st. louis
producer : menasha ridge press
price : 15.95

19/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 60 hikes within 60 miles: st. louis
producer : menasha ridge press
price : 15.95

name : 60 hikes within 60 miles: portland
producer : menasha ridge press
price : 18.95

19/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : tales from the indiana high school basketball locker room
producer : sports publishing
price : 19.95

name : tales from the indiana hoosiers locker room
producer : sports publishing
price : 19.95

19/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : game changers: new york giants
producer : triumph books
price : 24.95

name : game changers: penn state
producer : triumph books
price : 24.99

20/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (twoGramFingerprint, name, CorporationName), TfidfNGramCanopyPredicate: (0.4, name))
INFO:dedupe.training:(PartialPredicate: (sameSevenCharStartPredicate, name, CorporationName), SimplePredicate: (wholeFieldPredicate, price))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.8, name, CorporationName), PartialPredicate: (commonTwoTokens, name, CorporationName))
name : best bike rides denver and boulder
producer : falcon guides
price : 20.99

name : best bike rides los angeles
producer : falcon guides
price : 21.99

20/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : biomechanics and medicine in swimming vii
producer : taylor and francis
price : 54.95

name : biomechanics and medicine in swimming v1
producer : taylor and francis
price : 49.95

20/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the gun digest book of the ar-15
producer : f+w media
price : 27.99

name : gun digest book of the ar-15, volume iv
producer : f+w media
price : 37.49

20/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : crack of the bat
producer : unp - nebraska
price : 28.95

name : crack of the bat
producer : sports publishing
price : 29.95

20/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : day and section hikes pacific crest trail: southern california
producer : wilderness press
price : 14.95

name : day & section hikes pacific crest trail: northern california
producer : wilderness press
price : 13.95

20/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : best easy day hikes salt lake city
producer : falcon guides
price : 8.99

name : best easy day hikes
producer : falcon guides
price : 13.99

20/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : death at the ballpark
producer : mcfarland & company, inc., publishers
price : 39.95

name : death at the ballpark
producer : mcfarland & company, inc., publishers
price : 65.0

20/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : best easy day hikes salt lake city
producer : falcon guides
price : 11.99

name : best easy day hikes salt lake city
producer : falcon guides
price : 8.99

21/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : the cooperstown symposium on baseball and american culture, 2002
producer : mcfarland & company, inc., publishers
price : 45.0

name : the cooperstown symposium on baseball and american culture, 2013-2014
producer : mcfarland & company, inc., publishers
price : 45.0

22/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the complete idiot's guide to running, 3rd edition
producer : dk publishing
price : 16.95

name : the complete idiot's guide to golf, 2nd edition
producer : dk publishing
price : 19.95

22/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : death to the bcs
producer : penguin publishing group
price : 16.99

name : death to the bcs: totally revised and updated
producer : penguin publishing group
price : 17.99

22/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : it's only a game
producer : penguin publishing group
price : 17.99

name : it's only a game
producer : atria books
price : 15.99

22/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the physics of rugby
producer : nottingham university press
price : 28.95

name : the physics of rugby
producer : 5m publishing ltd
price : 32.73

22/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : fergie the greatest
producer : john blake
price : 7.99

name : fergie the greatest
producer : john blake
price : 11.99

22/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : best hikes near seattle
producer : falcon guides
price : 17.99

name : best hikes near seattle
producer : falcon guides
price : 21.99

23/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : how to win competitions
producer : allen & unwin
price : 9.08

name : how to win competitions
producer : allen & unwin
price : 18.14

24/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : ronaldo - 2014 updated edition
producer : icon books
price : 6.99

name : ronaldo - 2015 updated edition
producer : icon books
price : 6.99

24/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the complete idiot's guide to rving, 3e
producer : dk publishing
price : 19.95

name : the complete idiot's guide to golf, 2nd edition
producer : dk publishing
price : 19.95

24/10 positive, 32/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : modelling the us army m4 (76mm) sherman medium tank
producer : osprey publishing
price : 15.95

name : modelling the us army m4 (75mm) sherman medium tank
producer : osprey publishing
price : 15.95

24/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : kinanthropometry and exercise physiology laboratory manual: tests, procedures and data
producer : taylor and francis
price : 75.95

name : kinanthropometry and exercise physiology laboratory manual: tests, procedures and data
producer : taylor and francis
price : 150.0

25/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : sailing alone around the world
producer : harpercollins canada
price : 0.99

name : sailing alone around the world
producer : harpercollins canada
price : 4.99

26/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : the story of the world cup
producer : faber & faber
price : 18.94

name : the story of the world cup: 2014
producer : faber & faber
price : 11.65

27/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : sport and exercise psychology
producer : taylor and francis
price : 49.95

name : sport and exercise psychology: the key concepts
producer : taylor and francis
price : 30.95

27/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : sport management cultures
producer : taylor and francis
price : 54.95

name : sport management: the basics
producer : taylor and francis
price : 33.95

28/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (sameSevenCharStartPredicate, name, CorporationName), TfidfTextCanopyPredicate: (0.4, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, price), TfidfTextCanopyPredicate: (0.6, name))
name : heart of a coach playbook
producer : baker publishing group
price : 6.99

name : heart of a coach
producer : baker publishing group
price : 11.99

28/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : remember this titan: the bill yoast story
producer : taylor trade publishing
price : 13.99

name : remember this titan
producer : taylor trade publishing
price : 9.99

28/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : sport and exercise science
producer : taylor and francis
price : 74.95

name : sport and exercise psychology: the key concepts
producer : taylor and francis
price : 33.95

28/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : baseball injuries
producer : mcfarland & company, inc., publishers
price : 45.0

name : baseball state by state
producer : mcfarland & company, inc., publishers
price : 55.0

28/10 positive, 35/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : shooter's bible guide to extreme iron
producer : skyhorse publishing
price : 19.95

name : shooter's bible
producer : skyhorse publishing
price : 29.95

28/10 positive, 36/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : the cooperstown symposium on baseball and american culture, 2002
producer : mcfarland & company, inc., publishers
price : 45.0

name : the cooperstown symposium on baseball and american culture, 2007-2008
producer : mcfarland & company, inc., publishers
price : 45.0

28/10 positive, 36/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : wing chun kung-fu volume 3
producer : tuttle publishing
price : 12.95

name : wing chun kung-fu volume 1
producer : tuttle publishing
price : 8.95

28/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : british sport - a bibliography to 2000
producer : taylor and francis
price : 54.95

name : british sport: a bibliography to 2000
producer : taylor and francis
price : 215.0

28/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : instant golf 2
producer : infinite ideas
price : 2.95

name : instant tennis
producer : infinite ideas
price : 2.95

29/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things 76ers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things 49ers fans should know & do before they die
producer : triumph books
price : 14.99

29/10 positive, 39/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : you're the ref
producer : skyhorse publishing
price : 8.99

name : you're the ref
producer : skyhorse publishing
price : 8.95

29/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : sport and physical education: the key concepts
producer : taylor and francis
price : 33.95

name : sport and physical education: the key concepts
producer : taylor and francis
price : 26.95

30/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : handbook of sports medicine and science, rowing
producer : wiley
price : 84.95

name : handbook of sports medicine and science, sports therapy
producer : wiley
price : 62.0

31/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : handbook of sports medicine and science, road cycling
producer : wiley
price : 64.95

name : handbook of sports medicine and science, basketball
producer : wiley
price : 75.95

32/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : messi - 2014 updated edition
producer : icon books
price : 6.99

name : messi - 2015 updated edition
producer : icon books
price : 6.99

32/10 positive, 41/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the cooperstown symposium on baseball and american culture, 2000
producer : mcfarland & company, inc., publishers
price : 45.0

name : the cooperstown symposium on baseball and american culture, 2011-2012
producer : mcfarland & company, inc., publishers
price : 45.0

32/10 positive, 42/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : practical karate volume 3
producer : tuttle publishing
price : 9.95

name : practical karate volume 4
producer : tuttle publishing
price : 9.95

32/10 positive, 43/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : ally mccoist
producer : john blake
price : 8.99

name : ally mccoist
producer : john blake
price : 9.99

32/10 positive, 44/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : 109 walks in british columbia's lower mainland
producer : greystone books
price : 19.95

name : 109 walks in british columbia's lower mainland
producer : greystone books
price : 15.95

33/10 positive, 44/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : gone pro: alabama
producer : clerisy press
price : 17.95

name : gone pro: alabama
producer : clerisy press
price : 16.95

34/10 positive, 44/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : a game of two halves
producer : taylor and francis
price : 50.95

name : a game of two halves
producer : black & white publishing
price : 1.99

35/10 positive, 44/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : introduction to sports biomechanics
producer : taylor and francis
price : 70.95

name : introduction to sports biomechanics
producer : taylor and francis
price : 49.95

35/10 positive, 45/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : the best american poetry 2006
producer : scribner
price : 13.99

name : the best american poetry 2008
producer : scribner
price : 13.99

36/10 positive, 45/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : practical karate volume 2
producer : tuttle publishing
price : 9.95

name : practical karate volume 1
producer : tuttle publishing
price : 9.95

36/10 positive, 46/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the cooperstown symposium on baseball and american culture, 2007-2008
producer : mcfarland & company, inc., publishers
price : 45.0

name : the cooperstown symposium on baseball and american culture, 2009-2010
producer : mcfarland & company, inc., publishers
price : 38.0

36/10 positive, 47/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the shorter wisden 2015
producer : bloomsbury publishing
price : 11.99

name : the shorter wisden 2013
producer : bloomsbury publishing
price : 16.99

36/10 positive, 48/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 50 ways to improve your powerboat driving
producer : bloomsbury publishing
price : 12.99

name : 50 ways to improve your navigation
producer : bloomsbury publishing
price : 12.99

36/10 positive, 49/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : it's how you play the game
producer : harpercollins
price : 10.99

name : it's how you play the game
producer : harpercollins
price : 8.99

36/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : hiking olympic national park
producer : falcon guides
price : 15.99

name : hiking olympic national park
producer : falcon guides
price : 21.99

37/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : the pro wrestling hall of fame
producer : ecw press
price : 11.95

name : the pro wrestling hall of fame
producer : ecw press
price : 14.95

38/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : hiking the absaroka-beartooth wilderness
producer : falcon guides
price : 21.99

name : hiking the absaroka-beartooth wilderness
producer : falcon guides
price : 18.99

39/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : 100 things blues fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things bulls fans should know & do before they die
producer : triumph books
price : 11.99

40/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the complete idiot's guide to marathon training
producer : dk publishing
price : 18.95

name : the complete idiot's guide to karate
producer : dk publishing
price : 18.95

40/10 positive, 51/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : game day: tennessee football
producer : triumph books
price : 24.99

name : game day: georgia football
producer : triumph books
price : 24.99

40/10 positive, 52/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : instant golf 2
producer : infinite ideas
price : 2.95

name : instant golf
producer : infinite ideas
price : 2.95

40/10 positive, 53/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the cooperstown symposium on baseball and american culture, 2013-2014
producer : mcfarland & company, inc., publishers
price : 45.0

name : the cooperstown symposium on baseball and american culture, 2007-2008
producer : mcfarland & company, inc., publishers
price : 45.0

40/10 positive, 54/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things ravens fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things raiders fans should know & do before they die
producer : triumph books
price : 11.99

40/10 positive, 55/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things bears fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things mets fans should know & do before they die
producer : triumph books
price : 11.99

40/10 positive, 56/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun trader's guide
producer : skyhorse publishing
price : 29.95

name : gun trader's guide
producer : skyhorse publishing
price : 29.99

40/10 positive, 57/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : standard catalog of military firearms
producer : f+w media
price : 29.99

name : standard catalog of military firearms
producer : f+w media
price : 34.99

41/10 positive, 57/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : coaching science
producer : sage publications
price : 32.0

name : coaching science
producer : wiley
price : 180.0

42/10 positive, 57/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things tigers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things giants fans should know & do before they die
producer : triumph books
price : 11.99

42/10 positive, 58/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things a's fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things angels fans should know & do before they die
producer : triumph books
price : 11.99

42/10 positive, 59/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : 100 things bears fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things brewers fans should know & do before they die
producer : triumph books
price : 11.99

43/10 positive, 59/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things reds fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things lakers fans should know & do before they die
producer : triumph books
price : 11.99

43/10 positive, 60/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things giants fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things jets fans should know & do before they die
producer : triumph books
price : 11.99

43/10 positive, 61/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun digest's defensive handgun training rules and tips eshort
producer : f+w media
price : 1.5

name : gun digest's defensive handgun training eshort
producer : f+w media
price : 1.5

43/10 positive, 62/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : 100 things pirates fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things phillies fans should know & do before they die
producer : triumph books
price : 11.99

44/10 positive, 62/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things panthers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things raptors fans should know & do before they die
producer : triumph books
price : 0.0

44/10 positive, 63/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : ronaldo - 2014 updated edition
producer : icon books
price : 6.99

name : ronaldo - 2013 edition
producer : icon books
price : 7.99

44/10 positive, 64/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things raiders fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things angels fans should know & do before they die
producer : triumph books
price : 11.99

44/10 positive, 65/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things eagles fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things orioles fans should know & do before they die
producer : triumph books
price : 11.99

44/10 positive, 66/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things packers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things tigers fans should know & do before they die
producer : triumph books
price : 11.99

44/10 positive, 67/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sailing alone around the world
producer : harpercollins canada
price : 0.99

name : sailing alone around the world
producer : bloomsbury publishing
price : 11.99

44/10 positive, 68/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun trader's guide
producer : skyhorse publishing
price : 29.95

name : gun trader's guide
producer : skyhorse publishing
price : 29.99

44/10 positive, 69/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : 100 things lions fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things lakers fans should know & do before they die
producer : triumph books
price : 11.99

45/10 positive, 69/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things flyers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things bears fans should know & do before they die
producer : triumph books
price : 11.99

45/10 positive, 70/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : beaver
producer : allen & unwin
price : 23.95

name : beaver
producer : allen & unwin
price : 27.22

45/10 positive, 71/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : molina
producer : simon & schuster
price : 11.99

name : molina
producer : simon & schuster
price : 12.99

46/10 positive, 71/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : sea kayaking
producer : greystone books
price : 14.95

name : sea kayaking
producer : greystone books
price : 19.95

47/10 positive, 71/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : science and football vii
producer : taylor and francis
price : 42.95

name : science and soccer
producer : taylor and francis
price : 65.95

48/10 positive, 71/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : gun digest 2015
producer : f+w media
price : 34.99

name : gun digest 2012
producer : f+w media
price : 32.99

48/10 positive, 71/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun digest 2015
producer : f+w media
price : 34.99

name : gun digest 2011
producer : f+w media
price : 32.99

48/10 positive, 72/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : science and soccer
producer : taylor and francis
price : 71.95

name : science and football vi
producer : taylor and francis
price : 54.95

48/10 positive, 73/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : science and racket sports iii
producer : taylor and francis
price : 54.95

name : science and football vii
producer : taylor and francis
price : 42.95

48/10 positive, 74/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : hiking pennsylvania
producer : falcon guides
price : 21.99

name : hiking pennsylvania
producer : falcon guides
price : 17.99

48/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : paterno legacy
producer : triumph books
price : 11.99

name : paterno legacy
producer : triumph books
price : 13.99

49/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : core four
producer : triumph books
price : 11.99

name : core four
producer : triumph books
price : 13.99

50/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : my story
producer : penguin books ltd
price : 14.38

name : my story
producer : penguin books ltd
price : 32.0

51/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : taekwondo patterns
producer : crowood
price : 16.99

name : taekwon-do patterns
producer : crowood
price : 24.78

51/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : shotgunning
producer : skyhorse publishing
price : 14.95

name : shotgunning
producer : skyhorse publishing
price : 29.95

52/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (sameSevenCharStartPredicate, name, CorporationName), TfidfTextCanopyPredicate: (0.4, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), TfidfTextCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialPredicate: (sameSevenCharStartPredicate, name, Surname), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, price), TfidfTextCanopyPredicate: (0.6, name))
name : arete
producer : university of california press
price : 28.95

name : arete
producer : university of california press
price : 15.95

53/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : golf made easy!
producer : publish on demand global llc
price : 9.99

name : golf made easy!
producer : sbpra
price : 2.99

54/10 positive, 75/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : once bitten
producer : panoma press
price : 6.99

name : once bitten
producer : ecademy press ltd
price : 14.99

54/10 positive, 76/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : john muir trail
producer : wilderness press
price : 18.95

name : john muir trail
producer : wilderness press
price : 15.95

54/10 positive, 77/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : instant golf 2
producer : infinite ideas
price : 2.95

name : instant golf
producer : infinite ideas
price : 2.95

55/10 positive, 77/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : cinderella man
producer : harpercollins
price : 0.99

name : cinderella man
producer : houghton mifflin harcourt
price : 13.99

55/10 positive, 78/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : jean beliveau
producer : harpercollins canada
price : 0.99

name : jean beliveau
producer : greystone books
price : 18.95

55/10 positive, 79/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : born to fight
producer : john blake
price : 9.99

name : born to fight
producer : hachette australia
price : 12.49

55/10 positive, 80/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : walking san francisco
producer : falcon guides
price : 13.99

name : walking san francisco
producer : wilderness press
price : 16.95

55/10 positive, 81/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things bills fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things knicks fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 82/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things blues fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things jets fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 83/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n 


(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things lions fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things patriots fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 84/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things falcons fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things rangers fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 85/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things knicks fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things nascar fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 86/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things sabres fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things giants fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 87/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things auburn fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things lakers fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 88/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things flyers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things giants fans should know & do before they die
producer : triumph books
price : 11.99

55/10 positive, 89/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : nord perou : les andes, guide de trekking
producer : primento digital publishing
price : 5.99

name : sud perou : les andes, guide de trekking
producer : primento digital publishing
price : 5.99

55/10 positive, 90/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : muhammad ali
producer : temple university press
price : 29.95

name : muhammad ali
producer : open road media
price : 14.99

56/10 positive, 90/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : children and exercise xxiv
producer : taylor and francis
price : 54.95

name : children and exercise xxvii
producer : taylor and francis
price : 48.95

56/10 positive, 91/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things eagles fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things reds fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 92/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things duke fans should know & do before they die
producer : triumph books
price : 0.0

name : 100 things ravens fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 93/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 60 hikes within 60 miles: salt lake city
producer : menasha ridge press
price : 16.95

name : 60 hikes within 60 miles: madison
producer : menasha ridge press
price : 18.95

56/10 positive, 94/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : game of my life new york rangers
producer : sports publishing
price : 24.95

name : game of my life new york yankees
producer : sports publishing
price : 24.95

56/10 positive, 95/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : krav maga
producer : skyhorse publishing
price : 19.95

name : krav maga
producer : st. martin's press
price : 9.99

56/10 positive, 95/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : krav maga
producer : tuttle publishing
price : 17.95

name : krav maga
producer : st. martin's press
price : 9.99

56/10 positive, 96/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things arkansas fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things broncos fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 97/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things broncos fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things mariners fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 98/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things indians fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things redskins fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 99/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things broncos fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things giants fans should know & do before they die
producer : triumph books
price : 14.95

56/10 positive, 100/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things braves fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things rangers fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 101/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things nascar fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things rangers fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 102/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things indians fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things royals fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 103/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : road biking(tm) florida
producer : falcon guides
price : 15.99

name : road biking(tm) ohio
producer : falcon guides
price : 15.99

56/10 positive, 104/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun digest's defensive handgun loading eshort
producer : f+w media
price : 1.5

name : gun digest's defensive handgun drills & techniques collection eshort
producer : f+w media
price : 2.99

56/10 positive, 105/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : 100 things cubs fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things buckeyes fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 105/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things braves fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things 49ers fans should know & do before they die
producer : triumph books
price : 14.99

56/10 positive, 106/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : usain bolt
producer : sportsbooks
price : 7.0

name : usain bolt
producer : arcadia books limited
price : 3.99

56/10 positive, 107/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : boxing
producer : elsevier science
price : 290.0

name : boxing
producer : crowood
price : 11.65

56/10 positive, 108/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things blues fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things giants fans should know & do before they die
producer : triumph books
price : 14.95

56/10 positive, 109/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the cycling anthology: volume five
producer : random house
price : 14.38

name : the cycling anthology: volume one
producer : random house
price : 14.38

56/10 positive, 110/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things bills fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things kansas fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 111/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things penguins fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things patriots fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 112/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things patriots fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things mariners fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 113/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things royals fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things cardinals fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 114/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : last breath
producer : random house publishing group
price : 11.99

name : last breath
producer : simon & schuster
price : 7.99

56/10 positive, 115/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sports journalism
producer : sage publications
price : 42.0

name : sports journalism
producer : taylor and francis
price : 61.95

56/10 positive, 116/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : capoeira
producer : taylor and francis
price : 84.95

name : capoeira
producer : north atlantic books
price : 15.95

56/10 positive, 117/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : the encyclopaedia of sports medicine: an ioc medical commission publication, strength and power in sport
producer : wiley
price : 219.95

name : the encyclopaedia of sports medicine: an ioc medical commission publication, women in sport
producer : wiley
price : 255.95

56/10 positive, 118/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : 100 things red sox fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things rockies fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 118/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things broncos fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things falcons fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 119/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things brewers fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things rockies fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 120/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things colts fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things panthers fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 121/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things blues fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things steelers fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 122/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things wildcats fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things bulls fans should know & do before they die
producer : triumph books
price : 11.99

56/10 positive, 123/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : gun digest's the future of defensive handgun training skills eshort
producer : f+w media
price : 1.5

name : gun digest's defensive handgun training skills definitions eshort
producer : f+w media
price : 1.5

56/10 positive, 124/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : shooter's bible
producer : skyhorse publishing
price : 29.95

name : shooter's bible
producer : skyhorse publishing
price : 29.99

57/10 positive, 124/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : 100 things duke fans should know & do before they die
producer : triumph books
price : 0.0

name : 100 things raiders fans should know & do before they die
producer : triumph books
price : 11.99

58/10 positive, 124/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things utes fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things dodgers fans should know & do before they die
producer : triumph books
price : 11.99

58/10 positive, 125/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things indians fans should know & do before they die
producer : triumph books
price : 11.99

name : 100 things cardinals fans should know & do before they die
producer : triumph books
price : 11.99

58/10 positive, 126/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : if these walls could talk: dallas cowboys
producer : triumph books
price : 11.99

name : if these walls could talk: philadelphia phillies
producer : triumph books
price : 11.99

58/10 positive, 127/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 100 things giants fans should know & do before they die
producer : triumph books
price : 14.95

name : 100 things mariners fans should know & do before they die
producer : triumph books
price : 11.99

58/10 positive, 128/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : win forever
producer : penguin publishing group
price : 12.99

name : win forever
producer : penguin group us
price : 18.99

58/10 positive, 129/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : seeing red
producer : harpercollins publishers
price : 8.99

name : seeing red
producer : mainstream publishing
price : 12.78

59/10 positive, 129/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : fitness boxing
producer : crowood
price : 18.94

name : fitness boxing
producer : meyer & meyer
price : 13.35

59/10 positive, 129/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : game over
producer : atria books
price : 13.99

name : game over
producer : the new press
price : 18.95

59/10 positive, 130/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : total recall
producer : simon & schuster
price : 14.99

name : total recall
producer : quiller
price : 19.99

59/10 positive, 131/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [72]:
deduper.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.6341124584163941
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, producer), TfidfTextCanopyPredicate: (0.6, name))
INFO:dedupe.training:(PartialIndexLevenshteinCanopyPredicate: (4, name, CorporationName), SimplePredicate: (oneGramFingerprint, name))
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, name), SimplePredicate: (wholeFieldPredicate, price))


In [73]:
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

In [74]:
with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

In [75]:
threshold = deduper.threshold(data_d, recall_weight=1)
threshold

INFO:dedupe.canopy_index:Removing stop word and
INFO:dedupe.canopy_index:Removing stop word the
INFO:dedupe.canopy_index:Removing stop word of
INFO:dedupe.blocking:10000, 45.9522732 seconds
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.676
INFO:dedupe.api:precision: 0.548
INFO:dedupe.api:With threshold: 0.303


0.3033746

In [76]:
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets', len(clustered_dupes))

INFO:dedupe.canopy_index:Removing stop word and
INFO:dedupe.canopy_index:Removing stop word the
INFO:dedupe.canopy_index:Removing stop word of
INFO:dedupe.blocking:10000, 47.4198362 seconds


# duplicate sets 327


In [80]:
for key, values in data_d.items():
    values['price'] = str(values['price']) 

In [81]:
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

In [84]:
singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)

In [91]:
ebooks1_output = pd.read_csv('ebooks1_output2.csv', sep=',', quotechar='"')

In [93]:
ebooks1_output.columns

Index(['Cluster ID', 'confidence_score', 'Unnamed: 2', 'Id', 'name', 'description', 'producer', 'price', 'source', 'canonical_', 'canonical_Id', 'canonical_name', 'canonical_description', 'canonical_producer', 'canonical_price', 'canonical_source'], dtype='object')

In [94]:
fields_to_compare = [
    'Cluster ID',
    'name',
    'price',
    'producer',
    'confidence_score'
]
ebooks1_output = ebooks1_output[fields_to_compare]

In [96]:
ebooks1_output[ebooks1_output['confidence_score'] > 0.6].sort_values('Cluster ID').head(10)

Unnamed: 0,Cluster ID,name,price,producer,confidence_score
16014,0,Steve Cooper's Australian Fishing Guide,17.5,Hardie Grant Books,0.64
19562,0,College Football's Most Memorable Games,29.95,"McFarland & Company, Inc., Publishers",0.64
19341,0,The Gun Digest Book of Firearms Assembly/Disassembly Part IV - Centerfire Rifles,24.99,F+W Media,0.64
19502,1,"The Complete Sailor, Second Edition",18.0,McGraw-Hill Education,0.64
19474,1,Aussie Rules For Dummies,0.0,Wiley,0.64
16606,1,Heroes are Forever,12.78,Mainstream Publishing,0.64
19607,1,Death to the BCS: Totally Revised and Updated,17.99,Penguin Publishing Group,0.64
17881,2,The Last Days of Shea,9.99,Taylor Trade Publishing,0.64
19654,2,Ice Time,11.99,Crown/Archetype,0.64
19810,3,"Have Glove, Will Travel",9.99,Crown/Archetype,0.64
