In [23]:
from future.builtins import next
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

import pandas as pd

In [24]:
pd.options.display.float_format = '{:20,.2f}'.format
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

In [25]:
amazon_walmart_all_path = (r'/home/ubuntu/jupyter/ServerX/1_Standard Data Integration/Sample Datasets'
                    r'/Processed Data/product_samples/amazon_walmart_all.csv')

## Prepare df and dict corpus

In [65]:
fields_of_interest = [
    'Id',
    'name',
    'producer',
    'description',
    'price',
    'category',
    'source'
]

In [66]:
amazon_walmart_all_df = pd.read_csv(amazon_walmart_all_path, sep=',', quotechar='"')[fields_of_interest]

In [None]:
amazon_walmart_all_df.dtypes

In [None]:
x = amazon_walmart_all_df[amazon_walmart_all_df['category'].isnull()]
x.head(1)

In [None]:
z = amazon_walmart_all_df[amazon_walmart_all_df['producer'].isnull()]
z.head(1)

In [None]:
y = amazon_walmart_all_df[amazon_walmart_all_df['price'].isnull()]
y.head(1)

In [None]:
h = amazon_walmart_all_df[amazon_walmart_all_df['description'].isnull()]
h.head()

In [None]:
amazon_walmart_all_df[amazon_walmart_all_df['name'].isnull()]

In [67]:
description_corpus = amazon_walmart_all_df['description'].to_list()
description_corpus = [x for x in description_corpus if str(x) != 'nan']

In [68]:
description_corpus[1]

'EPSON ELPLP12 1500HRS 200V REPL LAMP FOR LAMP POWERLITE FOR 7700P 5600P 7600 Features Lamp Life 1500 Hour Manufacturer Epson Corporation Compatible Devices LCD Manufacturer Part Number ELPLP12 Manufacturer Website Address www.epson.com Product Name Replacement Lamp Package Type Retail Product Type 200W UHE Projector Lamp Tech Specs Manufacturer Epson Corporation Manufacturer Part Number ELPLP12 Shipping Dimensions 5.25  Depth Manufacturer Website Address www.epson.com Lamp Life 1500 Hour Compatibility Epson Powerlite 7700P Projector Epson Powerlite 7600P Projector Epson Powerlite 5600P Projector Compatible Devices LCD Product Name Replacement Lamp Shipping Weight 1 lb Package Type Retail Product Type 200W UHE Projector Lamp'

In [69]:
category_corpus = amazon_walmart_all_df.drop_duplicates().to_dict('records')

In [70]:
categories = list(amazon_walmart_all_df['category'].unique())
categories = [x for x in categories if str(x) != 'nan']

In [71]:
producer_corpus = amazon_walmart_all_df.drop_duplicates().to_dict('records')

In [72]:
producers = list(amazon_walmart_all_df['producer'].unique())
producers = [x for x in producers if str(x) != 'nan']

In [77]:
producers.sort()
producers

['-NA-',
 '1d4',
 '24/7 Cases',
 '3 in 1 Charger',
 '3D Connexion',
 '3DRose',
 '3DTV Corp',
 '3Dconnexion',
 '3M',
 '3M#',
 '3gjuice',
 '4inkjets',
 '501001717398',
 'A Days Tech',
 'A Young Life',
 'A-DATA',
 'A4TECH',
 'AAS',
 'AAXA',
 'ABC Products',
 'ACCESSORYSQUAD',
 'ACCO',
 'ACD',
 'ACD Systems',
 'ACP',
 'ACP-EP Memory',
 'AGF',
 'AGPtek',
 'AIDATA',
 'AKG',
 'ALFA',
 'ALL COLORS',
 'AMD',
 'AMP',
 'AMP Energy',
 'AMZER',
 'ANTEC',
 'AO Safety',
 'AOC',
 'APC',
 'ARCLYTE TECHNOLOGIES  INC.',
 'ARCLYTE TECHNOLOGIES INC.',
 'ARKON',
 'ARKVIEW',
 'ART',
 'ASI',
 'AT T',
 'AT-A-GLANCE',
 'ATDEC',
 'ATI',
 'ATP',
 'ATREND',
 'ATREND-BBOX',
 'AURIA LLC',
 'AVF',
 'AVF Group',
 'AVID',
 'AVID TECHNOLOGY',
 'AVerMedia',
 'AZiO',
 'AblePlanet',
 'Acase',
 'Accell',
 'Accentra  Inc.',
 "Accessories Zone's Bundle",
 'AccessoriesZone',
 'Accessory Export',
 'Accessory Export  LLC',
 'Accessory Genie',
 'Accessory Power',
 'Accessory Workshop',
 'Acclivity - MYOB',
 'Acco',
 'Ace',
 'Acer

In [78]:
input_file = amazon_walmart_all_path
output_file = 'amazon_walmart_output3.csv'
settings_file = 'amazon_walmart_learned_settings3'
training_file = 'amazon_walmart_training3.json'

In [14]:
float('1.25')

1.25

In [79]:
def preProcess(key, column):
    
    try : # python 2/3 string differences
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    column = column.lower()
    if not column:
        return None
        
    if key == 'price':
        column = float(column) 
    return column

def readData(filename):
    
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(k, v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d    

In [80]:
print('importing data ...')
data_d = readData(input_file)

importing data ...


In [81]:
fields = [
    {'field' : 'name', 'type': 'Name'},
    {'field' : 'name', 'type': 'String'},
    {'field' : 'description', 
     'type': 'Text',
     'corpus': description_corpus,
     'has_missing': True
    },
    {'field' : 'category', 
     'type': 'FuzzyCategorical',
     'categories': categories,
     'corpus': category_corpus,
     'has missing' : True
    },        
    {'field' : 'producer', 
     'type': 'FuzzyCategorical',
     'categories': producers,
     'corpus': producer_corpus,
     'has_missing': True
    },
    {'field' : 'price', 
     'type': 'Price',
     'has_missing': True
    },
]

In [82]:
deduper = dedupe.Dedupe(fields)

In [None]:
# took about 20 min with blocked proportion 0.8
deduper.prepare_training(data_d)

INFO:dedupe.canopy_index:Removing stop word with
INFO:dedupe.canopy_index:Removing stop word 7
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word than
INFO:dedupe.canopy_index:Removing stop word get
INFO:dedupe.canopy_index:Removing stop word quickly
INFO:dedupe.canopy_index:Removing stop word that
INFO:dedupe.canopy_index:Removing stop word your
INFO:dedupe.canopy_index:Removing stop word into
INFO:dedupe.canopy_index:Removing stop word and
INFO:dedupe.canopy_index:Removing stop word share
INFO:dedupe.canopy_index:Removing stop word works
INFO:dedupe.canopy_index:Removing stop word you
INFO:dedupe.canopy_index:Removing stop word protect
INFO:dedupe.canopy_index:Removing stop word of
INFO:dedupe.canopy_index:Removing stop word access
INFO:dedupe.canopy_index:Removing stop word are
INFO:dedupe.canopy_index:Removing stop word a
INFO:dedupe.canopy_index:Removing stop word easy
INFO:dedupe.canopy_index:Removing stop word to
INFO:dedupe.canopy_index:R

INFO:dedupe.canopy_index:Removing stop word three
INFO:dedupe.canopy_index:Removing stop word 12
INFO:dedupe.canopy_index:Removing stop word 6
INFO:dedupe.canopy_index:Removing stop word provide
INFO:dedupe.canopy_index:Removing stop word out
INFO:dedupe.canopy_index:Removing stop word phone
INFO:dedupe.canopy_index:Removing stop word made
INFO:dedupe.canopy_index:Removing stop word amp
INFO:dedupe.canopy_index:Removing stop word delivers
INFO:dedupe.canopy_index:Removing stop word range
INFO:dedupe.canopy_index:Removing stop word take
INFO:dedupe.canopy_index:Removing stop word long
INFO:dedupe.canopy_index:Removing stop word between
INFO:dedupe.canopy_index:Removing stop word color
INFO:dedupe.canopy_index:Removing stop word lcd
INFO:dedupe.canopy_index:Removing stop word user
INFO:dedupe.canopy_index:Removing stop word battery
INFO:dedupe.canopy_index:Removing stop word 16
INFO:dedupe.canopy_index:Removing stop word 15
INFO:dedupe.canopy_index:Removing stop word weight
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word ol
INFO:dedupe.canopy_index:Removing stop word  v
INFO:dedupe.canopy_index:Removing stop word cl
INFO:dedupe.canopy_index:Removing stop word ga
INFO:dedupe.canopy_index:Removing stop word hd
INFO:dedupe.canopy_index:Removing stop word mi
INFO:dedupe.canopy_index:Removing stop word n 
INFO:dedupe.canopy_index:Removing stop word no
INFO:dedupe.canopy_index:Removing stop word to
INFO:dedupe.canopy_index:Removing stop word  n
INFO:dedupe.canopy_index:Removing stop word ap
INFO:dedupe.canopy_index:Removing stop word cr
INFO:dedupe.canopy_index:Removing stop word en
INFO:dedupe.canopy_index:Removing stop word fi
INFO:dedupe.canopy_index:Removing stop word id
INFO:dedupe.canopy_index:Removing stop word im
INFO:dedupe.canopy_index:Removing stop word ma
INFO:dedupe.canopy_index:Removing stop word nc
INFO:dedupe.canopy_index:Removing stop word r 
INFO:dedupe.canopy_index:Removing stop word tr
INFO:dedupe.canopy_index:Removing stop word 10
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word ul
INFO:dedupe.canopy_index:Removing stop word 2 
INFO:dedupe.canopy_index:Removing stop word ba
INFO:dedupe.canopy_index:Removing stop word iv
INFO:dedupe.canopy_index:Removing stop word ot
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word si
INFO:dedupe.canopy_index:Removing stop word um
INFO:dedupe.canopy_index:Removing stop word 10
INFO:dedupe.canopy_index:Removing stop word ce
INFO:dedupe.canopy_index:Removing stop word du
INFO:dedupe.canopy_index:Removing stop word 12
INFO:dedupe.canopy_index:Removing stop word bo
INFO:dedupe.canopy_index:Removing stop word ie
INFO:dedupe.canopy_index:Removing stop word rd
INFO:dedupe.canopy_index:Removing stop word ua
INFO:dedupe.canopy_index:Removing stop word br
INFO:dedupe.canopy_index:Removing stop word as
INFO:dedupe.canopy_index:Removing stop word pl
INFO:dedupe.canopy_index:Removing stop word 60
INFO:dedupe.canopy_index:Removing stop word ay
INFO:dedupe.c

In [39]:
dedupe.consoleLabel(deduper)

name : durable bridge
category : audio video accessories
producer : durable
price : 203.86

name : durable bridge
category : audio video accessories
producer : durable
price : 203.86

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


name : hp pavilion dv6-3013nr 15.6-inch laptop - argento
category : laptops
producer : hp
price : None

name : hp pavilion dv6-6140us entertainment notebook pc silver
category : laptops
producer : hp
price : 649.99

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, name), SimplePredicate: (wholeFieldPredicate, price))
name : acer as5750-6438 15.6-inch laptop mesh black
category : laptops
producer : acer
price : 548.72

name : acer as5552-6838 15.6-inch laptop mesh black
category : laptops
producer : acer
price : 529.6

1/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : durable bridge
category : audio video accessories
producer : durable
price : 215.48

name : durable bridge
category : audio video accessories
producer : durable
price : 203.86

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : pentax k-r 12.4 mp digital slr camera with 3.0-inch lcd and 18-55mm f 3.5-5.6 lens black
category : digital slr cameras
producer : pentax
price : None

name : pentax k-r 12.4 mp digital slr camera with 3.0-inch lcd and 18-55mm f 3.5-5.6 lens red
category : digital slr cameras
producer : pentax
price : None

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : blow molded case in black 13 x 18 x 4.25
category : media storage organization
producer : platt
price : 56.99

name : blow molded case in black 11.5 x 16 x 4.38
category : media storage organization
producer : platt
price : None

1/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : elite screens ezframe fixed frame projection screen 16 9 aspect ratio - 120in. cine white
category : projection screens
producer : elite screens
price : None

name : elite screens ezframe fixed frame projection screen 16 9 aspect ratio - 100in. cine white
category : projection screens
producer : elite screens
price : 405.99

1/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : da-lite tensioned cosmopolitan electrol - projection screen rear motorized - da-tex
category : projection screens
producer : da-lite
price : None

name : da-lite tensioned cosmopolitan electrol - projection screen rear motorized - da-tex
category : projection screens
producer : da-lite
price : None

1/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : da-lite advantage electrol high power - projection screen motorized
category : projection screens
producer : da-lite
price : None

name : da-lite advantage electrol high power - projection screen motorized
category : projection screens
producer : da-lite
price : None

2/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, name), SimplePredicate: (fingerprint, name))
name : buffalo technology ministation plus 500 gb usb 3.0 portable external hard drive with shock protection hd-pnt500u3s silver
category : external hard drives
producer : buffalo technology
price : 74.88

name : buffalo technology ministation plus 500 gb usb 3.0 portable external hard drive with shock protection hd-pnt500u3b black
category : external hard drives
producer : buffalo technology
price : 75.24

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sony cyber-shot dsc-hx7v 16.2 mp exmor r cmos digital still camera with 10x wide-angle optical zoom g lens 3d sweep panorama and full 1080 60i hd video black
category : point shoot digital cameras
producer : sony
price : 269.0

name : sony cyber-shot dsc-hx7v 16.2 mp exmor r cmos digital still camera with 10x wide-angle optical zoom g lens 3d sweep panorama and full 1080 60i hd video blue
category : point shoot digital cameras
producer : sony
price : 269.0

3/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sony cyber-shot dsc-tx10 16.2 mp waterproof digital still camera with exmor r cmos sensor 3d sweep panorama and full hd 1080 60i video blue
category : point shoot digital cameras
producer : sony
price : 329.0

name : sony cyber-shot dsc-tx10 16.2 mp waterproof digital still camera with exmor r cmos sensor 3d sweep panorama and full hd 1080 60i video black
category : point shoot digital cameras
producer : sony
price : 329.0

3/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sylvania 4 gb 3.6-inch touch screen video mp3 player media center with expandable memory slot built-in speakerphone pink
category : mp3 players
producer : curtis
price : 39.99

name : sylvania 4 gb 3.6-inch touch screen video mp3 player media center with expandable memory slot built-in speakerphone blue
category : mp3 players
producer : curtis
price : 39.99

3/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : mydesk blue
category : None
producer : lap desk
price : 14.56

name : black mydesk
category : lap desks
producer : lap desk
price : 13.22

3/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : transcend 32 gb class 10 sdhc flash memory card ts32gsdhc10e
category : blank media
producer : transcend
price : 46.93

name : transcend 32 gb class 10 sdhc flash memory card ts32gsdhc10
category : blank media
producer : transcend
price : 46.49

3/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : transcend 8 gb class 6 sdhc flash memory card ts8gsdhc6
category : blank media
producer : transcend
price : 11.99

name : transcend 8 gb class 6 sdhc flash memory card ts8gsdhc6e
category : blank media
producer : transcend
price : 11.9

4/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, name), SimplePredicate: (sortedAcronym, name))
name : nikon coolpix s9100 12.1 mp cmos digital camera with 18x nikkor ed wide-angle optical zoom lens and full hd 1080p video black
category : point shoot digital cameras
producer : nikon
price : None

name : nikon coolpix s9100 12.1 mp cmos digital camera with 18x nikkor ed wide-angle optical zoom lens and full hd 1080p video silver
category : point shoot digital cameras
producer : nikon
price : None

5/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : thinskin personalization films iphone 4 carbon fiber s
category : covers skins
producer : truepower
price : 19.95

name : thinskin personalization films iphone 4 carbon fiber
category : covers skins
producer : truepower
price : 19.95

5/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : panasonic lumix dmc-fp5 14.1 mp digital camera with 4x optical image stabilized zoom with 3.0-inch touch-screen lcd blue
category : point shoot digital cameras
producer : panasonic
price : 129.95

name : panasonic lumix dmc-fp5 14.1 mp digital camera with 4x optical image stabilized zoom with 3.0-inch touch-screen lcd silver
category : point shoot digital cameras
producer : panasonic
price : 129.95

6/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, name), SimplePredicate: (wholeFieldPredicate, price))
name : nikon coolpix s4100 14 mp digital camera with 5x nikkor wide-angle optical zoom lens and 3-inch touch-panel lcd plum
category : point shoot digital cameras
producer : nikon
price : None

name : nikon coolpix s4100 14 mp digital camera with 5x nikkor wide-angle optical zoom lens and 3-inch touch-panel lcd black
category : point shoot digital cameras
producer : nikon
price : None

6/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : fujifilm finepix z700exr 12 mp super ccd exr digital camera with 5x optical zoom and 3.5-inch touch-screen lcd red
category : point shoot digital cameras
producer : fuji
price : 176.08

name : fujifilm finepix z700exr 12 mp super ccd exr digital camera with 5x optical zoom and 3.5-inch touch-screen lcd black
category : point shoot digital cameras
producer : fuji
price : 169.95

6/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sony dcr-sr68 80gb hard disk drive handycam camcorder
category : camcorders
producer : sony
price : 328.0

name : sony dcr-sr68 80gb hard disk drive handycam camcorder red
category : camcorders
producer : sony
price : 328.0

6/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : da-lite tensioned cosmopolitan electrol - projection screen motorized - 1 1 - dual vision
category : projection screens
producer : da-lite
price : None

name : da-lite tensioned cosmopolitan electrol - projection screen motorized - 1 1 - dual vision
category : projection screens
producer : da-lite
price : None

6/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : charge sync cable audio
category : audio cables
producer : dreamgear
price : 25.74

name : charge sync cable audio
category : audio cables
producer : dreamgear
price : 25.74

7/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : hp 92 black ink cartridge in retail packaging c9362wn
category : inkjet printer ink
producer : hp
price : 14.15

name : hp 92 black ink cartridge in retail packaging c9362wn 140
category : inkjet printer ink
producer : hp
price : 13.9

8/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : fujifilm finepix xp30 14 mp waterproof digital camera with fujinon 5x optical zoom lens and gps geo-tagging function black
category : point shoot digital cameras
producer : fuji
price : 179.95

name : fujifilm finepix xp30 14 mp waterproof digital camera with fujinon 5x optical zoom lens and gps geo-tagging function green
category : point shoot digital cameras
producer : fuji
price : 189.99

9/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (roundTo1, price), TfidfTextCanopyPredicate: (0.8, name))
name : draper revelation motorized ceiling-recessed projector mount model a - mounting kit ceiling mount for projector - ceiling mountable
category : projector mounts
producer : draper
price : None

name : draper revelation motorized ceiling-recessed projector mount model b - mounting kit ceiling mount for projector - ceiling mountable
category : projector mounts
producer : draper
price : None

9/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : high power multi-mask imager fixed frame screen - 45 x 80 hdtv format size 45 x 80
category : projection screens
producer : da-lite
price : 5362.25

name : high power multi-mask imager fixed frame screen - 45 x 80 hdtv format size 49 x 87
category : projection screens
producer : da-lite
price : 5529.82

9/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : casio ex-z550 14.1mp digital camera with 4x wide angle zoom with ccd shift image stabilization and 2.7 inch lcd black
category : point shoot digital cameras
producer : casio
price : 112.78

name : casio ex-z550 14.1mp digital camera with 4x wide angle zoom with ccd shift image stabilization and 2.7 inch lcd red
category : point shoot digital cameras
producer : casio
price : 104.95

9/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : wacom bamboo replacement nib set pen not included for ctl460 cth460 cth461 cth661 intuos4
category : keyboards
producer : wacom
price : 9.99

name : wacom bamboo replacement nib set pen not included for ctl460 cth460 cth461 cth661 intuos4
category : keyboards
producer : wacom
price : 9.99

9/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : da-lite advantage electrol high power - projection screen motorized
category : projection screens
producer : da-lite
price : None

name : da-lite advantage electrol high power - projection screen motorized
category : projection screens
producer : da-lite
price : None

10/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [46]:
data_d[1]

{'': '0',
 'Id': '1',
 'name': 'koss eq50 3-band stereo equalizer',
 'producer': 'koss',
 'description': 'the pocket-size koss 3-band equalizer delivers high-fidelity performance and output normally reserved for more expensive home systems. with a 10db boost or -10db cut range of level it features a 3-band equalizer that allows for convenient and individual bass midrange and treble adjustment. power output is greater than 20mw per channel providing clean and undistorted output into your favorite stereophones. ergonomically designed for easy handling a rotary volume control and on off switch are placed for convenient usage.',
 'price': 12.65,
 'category': 'headphone accessories',
 'source': 'amazon'}

In [53]:
deduper.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
  * (true_distinct + false_distinct)))
INFO:rlr.crossvalidation:optimum alpha: 0.000100, score 0.24869807198976637
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (roundTo1, price), TfidfTextCanopyPredicate: (0.8, name))


In [54]:
threshold = deduper.threshold(data_d, recall_weight=1)
threshold

INFO:dedupe.canopy_index:Removing stop word new
INFO:dedupe.canopy_index:Removing stop word 4
INFO:dedupe.canopy_index:Removing stop word black
INFO:dedupe.canopy_index:Removing stop word digital
INFO:dedupe.canopy_index:Removing stop word with
INFO:dedupe.canopy_index:Removing stop word and
INFO:dedupe.canopy_index:Removing stop word white
INFO:dedupe.canopy_index:Removing stop word case
INFO:dedupe.canopy_index:Removing stop word x
INFO:dedupe.canopy_index:Removing stop word inch
INFO:dedupe.canopy_index:Removing stop word 1
INFO:dedupe.canopy_index:Removing stop word for
INFO:dedupe.canopy_index:Removing stop word gb
INFO:dedupe.canopy_index:Removing stop word 2
INFO:dedupe.canopy_index:Removing stop word usb
INFO:dedupe.canopy_index:Removing stop word 8
INFO:dedupe.canopy_index:Removing stop word 3
INFO:dedupe.blocking:10000, 2.3534082 seconds
INFO:dedupe.blocking:20000, 4.5438192 seconds
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.891
INFO:dedup

0.42378402

In [55]:
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

clustering...


INFO:dedupe.canopy_index:Removing stop word new
INFO:dedupe.canopy_index:Removing stop word 4
INFO:dedupe.canopy_index:Removing stop word black
INFO:dedupe.canopy_index:Removing stop word digital
INFO:dedupe.canopy_index:Removing stop word with
INFO:dedupe.canopy_index:Removing stop word and
INFO:dedupe.canopy_index:Removing stop word white
INFO:dedupe.canopy_index:Removing stop word case
INFO:dedupe.canopy_index:Removing stop word x
INFO:dedupe.canopy_index:Removing stop word inch
INFO:dedupe.canopy_index:Removing stop word 1
INFO:dedupe.canopy_index:Removing stop word for
INFO:dedupe.canopy_index:Removing stop word gb
INFO:dedupe.canopy_index:Removing stop word 2
INFO:dedupe.canopy_index:Removing stop word usb
INFO:dedupe.canopy_index:Removing stop word 8
INFO:dedupe.canopy_index:Removing stop word 3
INFO:dedupe.blocking:10000, 2.2952282 seconds
INFO:dedupe.blocking:20000, 4.4590712 seconds


# duplicate sets 104


In [56]:
for key, values in data_d.items():
    values['price'] = str(values['price']) 

In [57]:
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    
    
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)
        

In [58]:
fields_of_interest = ['Cluster ID', 'confidence_score', 'Id', 'name', 'producer', 'description', 'price']

In [59]:
amazon_walmart_output = pd.read_csv('amazon_walmart_output2.csv', sep=',', quotechar='"')[fields_of_interest]

In [None]:
amazon_walmart_output[amazon_walmart_output['confidence_score'] == None]

In [None]:
amazon_walmart_output = amazon_walmart_output[fields_of_interest]

In [61]:
amazon_walmart_output[amazon_walmart_output['confidence_score'] > 0.9].sort_values('Cluster ID')

Unnamed: 0,Cluster ID,confidence_score,Id,name,producer,description,price
101,1,1.0,102,Case Logic SLR Camera Backpack,Case Logic,This SLR backpack combines practicality and style with customizable organization and sleek lines. Carry everythingSLR camera lenses flash and laptopcomfortably on your back. Product Material Nylon Product Weight 3.39 lbs. Laptop Compartment Dimensions 14.9 or 14 inch PC fit in the padded laptop compartment Unzip side pocket slide your tripod inside and secure it at the top with the adjustable buckle serves as additional accessory storage when not using a tripod Weather hood fits snugly over the bag to protect from the elements when not in use roll into its stuff sack for storage Front pocket provides storage for any non-camera items you require A padded front panel ensures your camera and accessories are protected from impact Convenient mesh water bottle pouch Innovative strap management system eliminates messy hanging straps,79.99
2655,1,1.0,102,AGF HA0435-M005 Case for BlackBerry Torch 9800 and 9810 Endo GRT - 1 Pack - Retail Packaging - Black,AGF,This case fits the BlackBerry Torch 9800 amp 9810 and features a patented 2-piece glide rail technology case. It is made with super-slick glide tech resin so phone opens amp closes with ease. The polycarbonate internal support structure provides utmost protection from every angle. The case attaches to itself so there s no need to stress over any scratches.,23.93
23530,1,1.0,20977,New-Advantus 75456 - ID Badge Holder Horizontal 4w x 3h Clear 50 Pack - AVT75456,Advantus,ID name badge holders are prepunched for chain or clip use. Includes clip. Insert Width 4 in Insert Height 3 in Orientation Horizontal Color s Clear. Badge Badges Holder Horizontal ID Badge Holder ID ID Badge Holder Security Badge Security Passes Identification Pass-cards Tags,17.95
286,5,1.0,287,Motion Systems IPAD01 Charlie iPad Sleeve Black,MOTION SYSTEMS,Motion Systems IPAD01 Charlie iPad Sleeve Sleeve design for convenient access Hard protective exterior Soft interior design Perfect for the on-the-go individual,16.99
2840,5,1.0,287,Da-Lite 27565 Da-Plex Deluxe Rear Projection Screen - 72 x 96 Video Format,Da-Lite,Da-Lite 27565 Da-Lite offers a wide range of rigid rear projection screen systems for any need ranging from corporate boardrooms to home theaters designed to provide the highest resolution and most accurate color fidelity. With Da-Lite rear projection screens viewers can enjoy bright high resolution images without turning the lights off. This coating is deposited on a transparent glass Da-Glas or acrylic Da-Plex substrate. Da-Lite utilizes a special coating process which chemically bonds the optical layer to the substrate creating a very high degree of adhesion guaranteed not to peel or strip off. Deluxe Frame Features -Impressive architectural design adds sophistication to any installation -1-3 4 x 3 rectangular base tube -Dovetail frame eliminates light leakage -Black anodized finish -Frame size equals screen viewing area plus 5-1 2,
15307,5,1.0,12754,Lowepro Ridge 30 Camera Case Black,Lowepro,Designed as a protective carrying case for cameras ranging in size from ultra-compact to compact the Lowepro Ridge 30 camera bag is constructed of lightweight water-resistant materials. The main compartment is lined with brushed tricot to protect lenses and displays from nicks and scratches. Additional pockets make it easy to store memory cards and spare batteries and a belt loop and shoulder strap both let you keep your camera secure and your hands free no matter where you are or what you re doing. The Lowepro Ridge is recommended for a number of cameras including the following Canon S40 50 60 70 and A310 400 Fuji F700 710 810 and A120 HP 435 and M307 407 Kodak LS743 753 Minolta F100 200 300 Kyocera L30 4V 3V Olympus D450 and Sony DSC-8 10 32 52 72 73 93 100.,10.13
3290,8,0.99,737,Microsoft Natural Ergonomic Keyboard 4000,Microsoft,From the Manufacturer Overview Make the move to superior comfort with this breakthrough keyboard design Place your hands and arms naturally and comfortably with the Microsoft Natural Ergonomic Keyboard 4000. The improved split design encourages a more natural hand wrist and forearm position and its new curved key layout makes keys easier to reach. The Zoom Slider lets you zoom in for close-ups of Web pages images and more with just the touch of a finger. And 5 customizable My Favorites keys give you instant access to the folders files and Web pages you use most. Features Ergonomic Keyboard Design The advanced design of the Microsoft Natural Ergonomic Keyboard promotes a more natural hand wrist and forearm position for greater comfort. 14-Degree Gable Advanced ergonomic design with enhanced slope reduces wrist pronation. 12-Degree Split and Natural Arc The split keyboard design encourages natural wrist posture while mimicking the curved placement of the fingertips to reduce key reach. 7-Degree Reverse Slope Removable palm lift creates a reverse slope for a more natural wrist posture. Additional Features Better Support - Cushioned Palm Rest - Get better support while resting your hands with this plush integrated palm rest. See It Your Way - Zoom Slider - Easily zoom in for close-ups of digital pictures maps and more. Instant Access - My Favorites Hot Keys - Customize Hot Keys to instantly open the programs files and Web pages you use the most. Simply push and hold to set-just like your car radio. Improved Number Pad - Common numeric functions such as and Backspace right where you need them most - above the number pad. System Requirements Windows Vista Windows XP Pro Pro x64 Edition Home Media Center Edition Tablet PC Edition Windows 2000 with Service Pack 4 SP4 or later 60 MB free or Mac OS X v10.2x 10.4x. Expos will only work on Mac OS X v10.3 or later 30 MB free Powered USB port CD drive Users of Mac OS X v10.2.x must install IntelliType Pro and IntelliPoint software before connecting the hardware. Users of Mac OS X v10.3.x must upgrade to Mac OS X v10.3.9 before connecting hardware OR install IntelliType Pro and IntelliPoint software before connecting the hardware.,34.61
736,8,0.99,737,Hammermill Everyday Copy And Print Paper 92 Bright 20lb Letter White 5000 Sheets Ctn,Hammermill,Hammermill Everyday Copy and Print Paper 8-1 2 10 Reams per Carton 500 Sheets per Ream 92 brightness 20 lb 99.99 percent jam-free guarantee Acid-free for archival quality Helps eliminate misfeeds internal jams and uneven stacking ColorLok for bolder blacks brighter colors and faster drying,46.47
22114,8,0.99,19561,MaxWhite Kestrel Series Floor Electric Projection Screen - 72 Diagonal Size 92 diagonal,Elite,FE92H Size 92 diagonal The Kestrel Electric Floor Series has a stylish piano gloss finish makes it an attractive centerpiece for home theater and high level conference rooms where wall ceiling installations are not practical. It is suitable for fixed or mobile presentations. Features -Portable Motorized Electric Floor Screen. -Screen Material Elite Screens MaxWhite. -Stylish black piano gloss finish. -Tubular motor allows swift operation. -Internal Radio Frequency and Infrared Receivers. -Standard Universal Learning IR Remote ZR800D . -Includes RF remote and 12v Trigger adapter. -Built-In 5-12V projector trigger - Synchronize Screen Up Down with Projector On Off. -The screen surface is our MaxWhite FGT premium grade fiber glass-backed material. -Rises and lowers by our motorized precision scissor-backed cross spring risers. Specifications -16 9 aspect ratio. -72 diagonal. -Screen Gain 1.1. -Overall dimensions 77.3 H x 66.8 W x 7.8 D. -2-year parts and labor manufacturer warranty. Brochure Screen Material User Guide,1595.0
1640,20,0.95,1641,Sharp EL-334MB Basic Calculator,Sharp,Sharp EL-334MB Basic Calculator Perfect for home or business 10-digit display Shows punctuation for larger calculations Dual-power,15.99
