This notebook uses the dedupe library to link matching records across two CSV files (record linkage).

In [1]:
import os
import csv
import re

import dedupe
from unidecode import unidecode

In [2]:
# Be sure to edit these paths before running.
# The two CSV files whose records are linked against each other.
input_file_left = '../Data/Outputs_Cleanup/physicians_info_dedup.csv'
input_file_right = '../Data/Outputs_Cleanup/prescriber_dedup.csv'
# CSV written at the end: every input row plus its cluster ID and link score.
output_file = '../Data/recordLink_results/physicians.csv'
# Learned linker settings; when this file already exists, training is skipped.
settings_file = '../Data/recordLink_results/physicians_learned_settings'
# Labeled training examples; read if present and rewritten after training.
training_file = '../Data/recordLink_results/physicians_training.json'

In [3]:
# Field definitions handed to dedupe.RecordLink: every listed column is
# declared with the 'String' type.
fields = [{'field': column_name, 'type': 'String'}
          for column_name in ('fname', 'lname', 'type', 'city')]

In [4]:
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.

    Parameters
    ----------
    column : str or None
        Raw cell value. csv.DictReader fills the missing cells of a short
        row with None, so None is accepted and treated like an empty cell.

    Returns
    -------
    str or None
        The transliterated, lower-cased, whitespace-normalised value, or
        None when nothing is left after cleaning.
    """

    # Missing (None) and empty cells have nothing to clean; returning early
    # also keeps unidecode from being called on a non-string.
    if not column:
        return None

    column = unidecode(column)
    # Characters that act as separators become a single space ...
    for separator in ('\n', '/', ':'):
        column = column.replace(separator, ' ')
    # ... while punctuation that carries no meaning is dropped entirely.
    for noise in ('-', "'", ','):
        column = column.replace(noise, '')
    # Collapse the runs of spaces the substitutions above may have produced.
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column or None

In [5]:
def readData(filename):
    """
    Load a CSV file into a dictionary of cleaned records.

    Every value of every row is passed through preProcess. Each record is
    stored under a key made of the file name followed by the zero-based row
    number, and is itself a dict mapping column name to cleaned value.
    """

    records = {}
    with open(filename) as handle:
        for index, raw_row in enumerate(csv.DictReader(handle)):
            record_key = filename + str(index)
            records[record_key] = {name: preProcess(value)
                                   for name, value in raw_row.items()}

    return records

In [7]:
if __name__ == '__main__':
    # Record-linkage driver: load both inputs, obtain a linker (from saved
    # settings when they exist, otherwise by active labeling and training),
    # link the two datasets, and write every input row to the output CSV
    # annotated with its cluster ID and link score.

    print('importing data ...')
    data_1 = readData(input_file_left)
    data_2 = readData(input_file_right)

    if os.path.exists(settings_file):
        # Settings written by an earlier run exist: load them, skip training.
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)

    else:
        linker = dedupe.RecordLink(fields)
        if os.path.exists(training_file):
            # Start from the examples labeled in an earlier session.
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1,
                                        data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            linker.prepare_training(data_1, data_2, sample_size=5000,blocked_proportion=0.5)
        print('starting active labeling...')
        dedupe.console_label(linker)
        linker.train()
        # Save the labels and the learned settings so later runs can reuse them.
        with open(training_file, 'w') as tf:
            linker.write_training(tf)
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    print('clustering...')
    linked_records = linker.join(data_1, data_2, threshold=0.5,constraint='one-to-one')
    print('# duplicate sets', len(linked_records))

    # Index the link results by record ID for lookup while writing rows.
    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(linked_records):
        for record_id in cluster:
            cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                             'Link Score': score}
    print('writing file')
    with open(output_file, 'w',newline='') as f:

        header_unwritten = True

        # Iterate over the same paths that were passed to readData so the
        # record IDs rebuilt below match the keys of data_1 / data_2.
        for fileno, filename in enumerate((input_file_left, input_file_right)):
            with open(filename) as f_input:
                reader = csv.DictReader(f_input)

                if header_unwritten:
                    # NOTE(review): the header is taken from the first file
                    # only; csv.DictWriter raises ValueError for row keys
                    # missing from fieldnames, so this presumably relies on
                    # both files sharing the same columns -- confirm.
                    fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                                  reader.fieldnames)

                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()

                    header_unwritten = False

                for row_id, row in enumerate(reader):

                    # Same key format as readData: file name + row index.
                    record_id = filename + str(row_id)
                    # Rows that were not linked get no cluster columns.
                    cluster_details = cluster_membership.get(record_id, {})
                    row['source file'] = fileno
                    row.update(cluster_details)

                    writer.writerow(row)
        print('done')

importing data ...


OverflowError: Python int too large to convert to C long