# Dedupe openfda versus medicare part D

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use RecordLink with two comma separated
values (CSV) files. We have listings of drug brand names from two
different sources (the FDA / Medicare Part D matched drug list and the
Sunshine drug list). The task is to link drugs between the datasets.

The output will be a CSV with our linked results.

"""
import os
import csv
import re
import logging
import optparse
import pandas as pd

import dedupe
from unidecode import unidecode

In [2]:
# Build the left-hand input for record linkage: load the matched
# FDA / Part D drug table and keep only its brand-name column.
fda_partD_drugs = pd.read_csv(
    '../../Data/Outputs_Cleanup/FDA_partD_drug_matching/fda_partD_drugs_matched.csv'
)
df = fda_partD_drugs.loc[:, ['brand_name']]

# NOTE: to_csv is called with its defaults, so the DataFrame index is
# written too, as an unnamed first column of the input file.
df.to_csv(
    '../../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/fda_partD_input.csv'
)

In [3]:
#sunshine_drugs = pd.read_csv('../../Data/Outputs_Cleanup/Sunshine/sunshine_drugs_distinct.csv')
#df = sunshine_drugs[['brand_name']]
#df.to_csv('../../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/sunshine_input.csv')

In [4]:
def preProcess(column):
    """
    Normalise one CSV cell value so that superficial differences
    (accents, casing, punctuation, spacing) do not affect matching.

    Steps, in order: transliterate to ASCII with Unidecode; turn newlines,
    '/' and ':' into spaces; delete '-', apostrophes and commas; collapse
    runs of spaces; strip surrounding whitespace and quote characters;
    lower-case the result.

    Parameters
    ----------
    column : str or None
        Raw field value. None is accepted because csv.DictReader uses it
        for fields that are missing from a short row.

    Returns
    -------
    str or None
        The cleaned value, or None when the input is None or nothing is
        left after cleaning (None marks a missing value).
    """

    # A short CSV row yields None for its missing fields; treat that as a
    # missing value instead of letting unidecode()/re.sub() raise on it.
    if column is None:
        return None

    column = unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column


def readData(filename):
    """
    Load a CSV file into a dictionary of cleaned records.

    Every data row becomes one record: a dict mapping each column name to
    the result of preProcess() on that cell. Records are keyed by the
    file name concatenated with the zero-based row number, so the same
    key can be rebuilt later from (filename, row position).
    """

    records = {}

    with open(filename) as csv_file:
        for row_number, raw_row in enumerate(csv.DictReader(csv_file)):
            record_id = filename + str(row_number)
            records[record_id] = {
                field: preProcess(value) for (field, value) in raw_row.items()
            }

    return records

In [5]:
 # ## Setup
# File locations are defined first so that the retrain step below can
# refer to the real settings file.
output_file = '../../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/data_matching_output.csv'
settings_file = '../../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/data_matching_learned_settings'
training_file = '../../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/data_matching_training.json'

left_file = '../../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/fda_partD_input.csv'
right_file = '../../Data/Outputs_Cleanup/Sunshine/sunshine_drugs_distinct.csv'

retrain = input('Do you want to add on to your training (y/n). If you wanted to start over, delete your .json file')
isretrain = retrain == 'y'

if isretrain:
    # Later in this cell, training is skipped whenever settings_file
    # exists. Deleting the learned settings is therefore what makes a
    # 'y' answer reopen active labelling (the labelled examples in
    # training_file are left in place and get reloaded).
    try:
        os.remove(settings_file)
    except FileNotFoundError:
        print('Your settings file appears to not have existed.')


print('importing data ...')
data_1 = readData(left_file)
data_2 = readData(right_file)

def descriptions():
    """
    Lazily yield the 'description' value of every record, first from
    data_1 and then from data_2.

    NOTE(review): no active field definition in this cell uses this
    generator, and it raises KeyError for records that lack a
    'description' key.
    """
    for records in (data_1, data_2):
        yield from (entry['description'] for entry in records.values())

# ## Training

if not os.path.exists(settings_file):
    # No learned settings on disk: build a linker and train it
    # interactively.
    #
    # The data model tells the linker which fields to compare; here
    # records are compared on the brand name alone, as a plain string.
    fields = [
        {'field': 'brand_name', 'type': 'String'},
    ]

    # Create a new linker object and pass our data model to it.
    linker = dedupe.RecordLink(fields)

    # Labelled examples saved by a previous run are loaded when present.
    # __Note:__ to train from scratch, delete the training_file first.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.prepare_training(
                data_1, data_2, training_file=tf, sample_size=15000
            )
    else:
        linker.prepare_training(data_1, data_2, sample_size=15000)

    # ## Active learning
    # Dedupe shows the pair of records it is least certain about and asks
    # for a label: 'y', 'n' or 'u' to flag the pair, 'f' when finished.
    print('starting active labeling...')

    dedupe.console_label(linker)

    linker.train()

    # Persist the labelled pairs so a later run can build on them.
    with open(training_file, 'w') as tf:
        linker.write_training(tf)

    # Persist the learned weights and predicates. While this file exists,
    # later runs skip training and load the settings instead.
    with open(settings_file, 'wb') as sf:
        linker.write_settings(sf)

else:
    # Learned settings already exist: load them and skip training.
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

# ## Blocking

# ## Clustering

# Link records across the two datasets. The third positional argument is
# 0.0; in dedupe's RecordLink.join this is the score threshold, so no
# candidate link is discarded for having a low score (TODO confirm
# against the installed dedupe version).

print('clustering...')
linked_records = linker.join(data_1, data_2, 0.0)

print('# duplicate sets', len(linked_records))
# ## Writing Results

# Write the original rows of both input files to one CSV, adding
# 'Cluster ID' and 'Link Score' (which rows were linked, and how
# strongly) plus 'source file' (0 = left file, 1 = right file).

cluster_membership = {}
for cluster_id, (cluster, score) in enumerate(linked_records):
    for record_id in cluster:
        cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                         'Link Score': score}

input_files = (left_file, right_file)

# Build the output header from BOTH inputs. Taking it from the left file
# only makes DictWriter raise ValueError on the first right-file row that
# carries a column the left file does not have. Columns keep first-seen
# order; an empty input file (fieldnames is None) contributes nothing.
fieldnames = ['Cluster ID', 'Link Score', 'source file']
for filename in input_files:
    with open(filename) as f_input:
        for name in csv.DictReader(f_input).fieldnames or []:
            if name not in fieldnames:
                fieldnames.append(name)

# newline='' is what the csv module asks for on files handed to a writer;
# without it, platforms that translate '\n' emit a blank line per row.
with open(output_file, 'w', newline='') as f:

    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

    for fileno, filename in enumerate(input_files):
        # The inputs are opened with a plain open(filename) and rows are
        # numbered from 0, so filename + str(row_id) reproduces the record
        # ids that readData() assigned.
        with open(filename) as f_input:
            reader = csv.DictReader(f_input)

            for row_id, row in enumerate(reader):

                record_id = filename + str(row_id)
                # Rows that were not linked keep empty cluster columns.
                cluster_details = cluster_membership.get(record_id, {})
                row['source file'] = fileno
                row.update(cluster_details)

                writer.writerow(row)

Do you want to add on to your training (y/n). If you wanted to start over, delete your .json file n


importing data ...


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (commonTwoTokens, brand_name)
brand_name : belsomra

brand_name : belsomra

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...


 y


brand_name : faslodex

brand_name : faslodex

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, brand_name)
brand_name : jentadueto

brand_name : jentadueto xr

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : synjardy

brand_name : synjardy xr

2/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : synjardy xr

brand_name : synjardy

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : janumet

brand_name : janumet xr

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : bydureon bcise

brand_name : bydureon

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : aristada

brand_name : aristada initio

2/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : arnuity ellipta

brand_name : arnuity

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, brand_name)
brand_name : clinpro 5000

brand_name : clinpro 5000 toothpastevanilla mint

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : penlac

brand_name : pentacel

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : clinpro 5000

brand_name : clinpro 5000 toothpaste bubble gum

4/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : clinpro 5000

brand_name : clinpro 5000 toothpastespearmint

4/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : janumet xr

brand_name : janumet xr

4/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : rhofade

brand_name : rhofade

5/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : synjardy

brand_name : synjardy

6/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : fabrazyme

brand_name : fabrazyme

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : fasenra

brand_name : fasenra

8/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : darzalex

brand_name : darzalex

9/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : clindamycin phosphate and benzoyl peroxide clindamycin phosphate and benzoyl peroxide

brand_name : fluarix

10/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : insulin lispro protamine and insulin lispro injectable suspension mix7525 kwikpen

brand_name : xgeva

10/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : bydureon

brand_name : bydureon

10/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : lumizyme

brand_name : lumizyme

11/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : cerdelga

brand_name : cerdelga

12/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : sivextro

brand_name : sivextro

13/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : jentadueto xr

brand_name : jentadueto xr

14/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : motegrity

brand_name : motegrity

15/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : clinisol

brand_name : clinpro 5000 toothpastevanilla mint

16/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : clindamycin phosphate and benzoyl peroxide clindamycin phosphate and benzoyl peroxide

brand_name : fasenra

16/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


brand_name : dupixent

brand_name : dupixent

16/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : dulera

brand_name : dulera

17/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : aldurazyme

brand_name : aldurazyme

18/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : cerezyme

brand_name : cerezyme

19/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : jardiance

brand_name : jardiance

20/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


brand_name : steglatro

brand_name : steglatro

21/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.7416676652276867
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfNGramSearchPredicate: (0.6, brand_name), SimplePredicate: (sameThreeCharStartPredicate, brand_name))


clustering...
# duplicate sets 26


In [None]:
# Display the header list that the previous cell built for the output CSV
# (inspection cell; it has not been executed in this saved copy).
fieldnames