This notebook uses the `dedupe` library to find duplicate records within a single CSV file.

In [None]:
import os
import csv
import re

import dedupe
from unidecode import unidecode

In [2]:
# Be sure to edit this before running
# Path of the CSV file to deduplicate. It is opened as a file further down, so
# the bare directory below is only a placeholder.
input_file = '../Data/'
# CSV written at the end of the run: the input rows plus the cluster columns.
output_file = '../Data/dedupe_results/sunshine_physicians/sunshine_physicians.csv'
# Learned dedupe settings: loaded if this file exists, otherwise written after training.
settings_file = '../Data/dedupe_results/sunshine_physicians_learned_settings'
# Labeled training examples: read if this file exists, and (re)written after labeling.
training_file = '../Data/dedupe_results/sunshine_physicians_training.json'
# Name of the column whose value readData converts to int and uses as the
# record key. The empty string is a placeholder.
id_column = ''

In [None]:
# Variable definitions handed to dedupe.Dedupe(fields) when no saved settings
# file exists.
# NOTE(review): these field names (price, modelno, title, brand) look like they
# belong to a product dataset, while the configured paths mention
# sunshine_physicians -- confirm they match the columns of input_file.
# NOTE(review): `descriptions` is not defined or imported anywhere in the
# visible notebook, so evaluating this cell raises NameError unless it is
# defined elsewhere.
fields = [{'field':'price','type':'Price','has missing': True},
                  {'field':'modelno','type':'String','has missing': True},
                  {'field':'title','type':'Text','has missing': True,'corpus': descriptions()},
                  {'field':'brand','type':'String','has missing': True}]

In [4]:
def preProcess(column):
    """
    Normalize a single field value so that trivial formatting differences
    (casing, extra spaces, quotes, punctuation, new lines) are ignored.

    The value is passed through unidecode, then: newlines, '/' and ':' become
    a space; '-', "'" and ',' are removed; runs of two or more spaces collapse
    to one; surrounding whitespace and quote characters are stripped; and the
    result is lower-cased.

    Args:
        column: the raw field value (a string), or None for a missing field.

    Returns:
        The cleaned string, or None if the value is missing or empty after
        cleaning.
    """
    # csv.DictReader fills fields that are absent from a short row with None;
    # treat those as missing instead of crashing inside unidecode.
    if column is None:
        return None

    column = unidecode(column)
    # Applied in this order: the space-collapsing rule must run last so it also
    # cleans up the spaces introduced by the earlier replacements.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
        ('  +', ' '),
    )
    for pattern, replacement in substitutions:
        column = re.sub(pattern, replacement, column)

    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column

In [5]:
def readData(filename):
    """
    Read records from a CSV file into a dictionary keyed by record ID.

    Every value of a row is cleaned with preProcess. The key is the integer
    value of the row's `id_column` column (a module-level setting), taken from
    the raw, uncleaned row. If two rows share an ID, the later row replaces
    the earlier one.

    Args:
        filename: path of the CSV file to read; the first row is used as the
            header.

    Returns:
        dict mapping int record ID -> dict of {column name: cleaned value}.

    Raises:
        KeyError: if `id_column` is not a column of the file.
        ValueError: if an ID value cannot be converted to int.
    """
    data_d = {}
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields containing embedded newlines are parsed correctly.
    with open(filename, newline='') as f:
        for row in csv.DictReader(f):
            clean_row = {k: preProcess(v) for k, v in row.items()}
            row_id = int(row[id_column])
            data_d[row_id] = clean_row

    return data_d

In [6]:
if __name__ == '__main__':
    # End-to-end run: load the records, obtain a deduper (from saved settings,
    # or by active labeling and training), cluster the records, and write the
    # input rows back out annotated with their cluster.

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        # Settings from an earlier run exist: reuse them and skip labeling
        # and training entirely.
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)

    else:
        deduper = dedupe.Dedupe(fields)
        if os.path.exists(training_file):
            # Start from the examples labeled in an earlier session.
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(data_d,
                                         training_file=tf,
                                         sample_size=15000)
        else:
            deduper.prepare_training(data_d)

        print('starting active labeling...')
        dedupe.console_label(deduper)
        deduper.train()
        # Persist both the labeled examples and the learned settings so the
        # next run can pick up from here.
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)
        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

    print('clustering...')
    # 0.5 is the score threshold passed to partition.
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # Map each record ID to the cluster it landed in and its score.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    print('writing file')
    # newline='' on both files, as the csv module requires; without it the
    # writer emits blank rows on Windows.
    with open(output_file, 'w', newline='') as f_output, \
            open(input_file, newline='') as f_input:

        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Use the same ID column readData keyed the records by, so the
            # lookup matches the keys in cluster_membership.
            row_id = int(row[id_column])
            row.update(cluster_membership[row_id])
            writer.writerow(row)
    print('done')

importing data ...


INFO:dedupe.api:Predicate set:
INFO:dedupe.api:(TfidfTextSearchPredicate: (0.6, title), SimplePredicate: (commonThreeTokens, title))
INFO:dedupe.api:(LevenshteinSearchPredicate: (1, modelno), SimplePredicate: (orderOfMagnitude, price), SimplePredicate: (sameSevenCharStartPredicate, brand))
INFO:dedupe.api:(TfidfTextSearchPredicate: (0.4, title), SimplePredicate: (roundTo1, price))
INFO:dedupe.api:(TfidfTextSearchPredicate: (0.6, title), SimplePredicate: (commonTwoTokens, brand), SimplePredicate: (hundredIntegersOddPredicate, title))


reading from ./runs/walmart_amazon_learned_settings
clustering...


INFO:dedupe.blocking:10000, 2.7190802 seconds


# duplicate sets 836
writing file
done
