# OpenFDA Manufacturer Deduplication
## Authors: 
    1. Lam Ho
    2. Jonah Breslow
    3. Jeffrey Kagan
## Purpose:
The purpose of this notebook is to deduplicate the manufacturer records within the OpenFDA data, i.e. to group different spellings of the same manufacturer together. The output will be matched against the Sunshine Act data, which contains the manufacturer node information. For this procedure, we use the `dedupe` Python library from Dedupe.io.

### Importing modules

In [18]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe with a comma separated values
(CSV) file. All operations are performed in memory, so will run very
quickly on datasets up to ~10,000 rows.

We start with a CSV file containing our messy data. In this notebook,
it is the list of manufacturer names (with their `fda_manuf_id`)
extracted from the processed OpenFDA data.

The output will be a CSV with our clustered results.

For larger datasets, see our [mysql_example](mysql_example.html)
"""

import os
import csv
import re
import logging
import optparse
import pandas as pd
import pickle

import dedupe
from unidecode import unidecode

### Importing Data

In [19]:
# Load the processed OpenFDA extract and keep one row per distinct
# (normalized manufacturer name, manufacturer id) pair.
openfda = pd.read_csv('../Data/Outputs_Cleanup/FDA/openfda_processed.csv')
openfda_manuf = (
    openfda[['manufacturer_name_normalized', 'fda_manuf_id']]
    .drop_duplicates()
    .rename(columns={'manufacturer_name_normalized': 'manufacturer_name'})
)

# Persist the pairs as the CSV that the dedupe step reads back in.
openfda_manuf.to_csv(
    '../Data/Outputs_Cleanup/FDA/openfda_manufacturer_dedupe_input.csv',
    index=False,
)

### Running the Dedupe procedure

In [20]:
def preProcess(column):
    """
    Clean a single raw CSV value with the help of Unidecode and Regex.

    The value is transliterated with Unidecode, line breaks are turned into
    spaces, runs of spaces are collapsed into one, surrounding whitespace and
    quote characters are stripped, and the result is lower-cased.

    Returns the cleaned string, or `None` when the value is missing
    (`None` on input, or empty once cleaned).
    """
    # csv.DictReader fills the fields of short rows with `None`; treat those
    # as missing data instead of failing inside unidecode.
    if column is None:
        return None

    column = unidecode(column)
    # Line breaks must become spaces *before* runs of spaces are collapsed,
    # otherwise "a \n b" would still contain several consecutive spaces.
    column = re.sub(r'[\r\n]', ' ', column)
    column = re.sub(r'  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by returning `None`
    return column or None


def readData(filename, encoding='utf-8'):
    """
    Read in our data from a CSV file and create a dictionary of records.

    Each key is the record's `fda_manuf_id` converted to `int`; each value is
    a dict mapping every column name of the row to its value cleaned by
    `preProcess`.

    `encoding` is the text encoding of the file. It defaults to UTF-8, which
    is what pandas' `to_csv` writes by default, rather than depending on the
    platform's locale encoding.

    Raises `KeyError` if the file has no `fda_manuf_id` column and
    `ValueError` if an id cannot be converted to `int`.
    """
    data_d = {}
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(filename, newline='', encoding=encoding) as f:
        reader = csv.DictReader(f)
        for row in reader:
            # The key is taken from the raw (uncleaned) id value.
            row_id = int(row['fda_manuf_id'])
            data_d[row_id] = {k: preProcess(v) for k, v in row.items()}

    return data_d

In [21]:
# ## Setup

# Locations of the dedupe input and output, and of the artifacts persisted
# between runs (learned settings and labeled training examples).
input_file = '../Data/Outputs_Cleanup/FDA/openfda_manufacturer_dedupe_input.csv'
output_file = '../Data/Outputs_Cleanup/FDA/openfda_manuf_deduplicated.csv'
settings_file = '../Data/Outputs_Cleanup/FDA/csv_example_learned_settings'
training_file = '../Data/Outputs_Cleanup/FDA/csv_example_training.json'

retrain = input('Do you want to add on to your training (y/n). If you wanted to start over, delete your .json file')
isretrain = retrain == 'y'

if isretrain:
    # Remove the learned settings at the path the rest of the notebook
    # actually checks (`settings_file`), so the training branch runs again.
    # The labeled examples in `training_file` are left untouched.
    try:
        os.remove(settings_file)
    except FileNotFoundError:
        print('Your settings file appears to not have existed.')
    except OSError as err:
        # Stay best-effort, but do not hide why the settings are still there.
        print('Could not remove the settings file:', err)

print('importing data ...')
data_d = readData(input_file)

# Train a new model only when no learned settings are stored on disk;
# otherwise the stored settings are loaded and training is skipped.
if not os.path.exists(settings_file):
    # ## Training

    # The data model: the manufacturer name is the only field that dedupe
    # is told to compare.
    fields = [
        {'field': 'manufacturer_name', 'type': 'String'},
    ]

    # Build a trainable deduper from the data model.
    deduper = dedupe.Dedupe(fields)

    # Labeled examples saved by a previous run are fed back in when present.
    # __Note:__ delete the training_file to train from scratch.
    if not os.path.exists(training_file):
        deduper.prepare_training(data_d)
    else:
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as labeled_fh:
            deduper.prepare_training(data_d, labeled_fh)

    # ## Active learning
    # Dedupe presents the record pairs it is least certain about and asks
    # for a label: 'y', 'n' and 'u' flag duplicates, 'f' finishes.
    print('starting active labeling...')

    dedupe.console_label(deduper)

    # Learn the model and the blocking predicates from the labeled pairs.
    deduper.train()

    # Keep the labeled examples so a later run can add on to them.
    with open(training_file, 'w') as training_fh:
        deduper.write_training(training_fh)

    # Keep the learned weights and predicates; while this file exists the
    # whole training branch is skipped on the next run.
    with open(settings_file, 'wb') as settings_fh:
        deduper.write_settings(settings_fh)
else:
    print('reading from', settings_file)
    with open(settings_file, 'rb') as settings_fh:
        deduper = dedupe.StaticDedupe(settings_fh)

# ## Clustering

# `partition` will return sets of records that dedupe
# believes are all referring to the same entity. Each element is consumed
# below as a (record ids, scores) pair; 0.5 is the threshold passed in.

print('clustering...')
clustered_dupes = deduper.partition(data_d, 0.5)

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with two new columns:
# 'Cluster ID', which indicates which records refer to each other, and
# 'confidence_score', the score dedupe returned for the record.

# Map every record id to the cluster it landed in and its score.
cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score,
        }

# newline='' leaves line-ending handling to the csv module (otherwise the
# writer emits blank rows on Windows); the encoding is pinned to UTF-8 so the
# copy does not depend on the platform's locale encoding.
with open(output_file, 'w', newline='', encoding='utf-8') as f_output, \
        open(input_file, newline='', encoding='utf-8') as f_input:

    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # Rows are re-read from the raw input, so the output keeps the
        # original (uncleaned) values next to the cluster information.
        row_id = int(row['fda_manuf_id'])
        row.update(cluster_membership[row_id])
        writer.writerow(row)

Do you want to add on to your training (y/n). If you wanted to start over, delete your .json file 


importing data ...
reading from ../Data/Outputs_Cleanup/FDA/csv_example_learned_settings


INFO:dedupe.api:Predicate set:
INFO:dedupe.api:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.api:(SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name), TfidfNGramCanopyPredicate: (0.4, manufacturer_name))
INFO:dedupe.api:(LevenshteinCanopyPredicate: (1, manufacturer_name), TfidfTextCanopyPredicate: (0.2, manufacturer_name), SimplePredicate: (sortedAcronym, manufacturer_name))
INFO:dedupe.api:(TfidfNGramCanopyPredicate: (0.6, manufacturer_name), SimplePredicate: (oneGramFingerprint, manufacturer_name), SimplePredicate: (suffixArray, manufacturer_name))
INFO:dedupe.api:(SimplePredicate: (firstIntegerPredicate, manufacturer_name), SimplePredicate: (commonSixGram, manufacturer_name), SimplePredicate: (tokenFieldPredicate, manufacturer_name))
INFO:dedupe.api:(TfidfNGramCanopyPredicate: (0.8, manufacturer_name), TfidfNGramCanopyPredicate: (0.2, manufacturer_name), TfidfTextCanopyPredicate: (0.4, manufacturer_name))


clustering...


INFO:dedupe.canopy_index:Removing stop word  l
INFO:dedupe.canopy_index:Removing stop word e 
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word td
INFO:dedupe.canopy_index:Removing stop word  i
INFO:dedupe.canopy_index:Removing stop word al
INFO:dedupe.canopy_index:Removing stop word at
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word ha
INFO:dedupe.canopy_index:Removing stop word in
INFO:dedupe.canopy_index:Removing stop word ma
INFO:dedupe.canopy_index:Removing stop word o 
INFO:dedupe.canopy_index:Removing stop word rm
INFO:dedupe.canopy_index:Removing stop word ti
INFO:dedupe.canopy_index:Removing stop word ut
INFO:dedupe.canopy_index:Removing stop word io
INFO:dedupe.canopy_index:Removing stop word lc
INFO:dedupe.canopy_index:Removing stop word n 
INFO:dedupe.canopy_index:Removing stop word pr
INFO:dedupe.canopy_index:Removing stop word s 
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.c

# duplicate sets 9779


### Post-Processing the Manufacturer Matches
When several manufacturer records are clustered into the same entity, the cluster needs a single name. The function below picks the longest `manufacturer_name` in the cluster as the cluster name (on a tie, the first of the longest names is kept).

In [22]:
def pick_brand(brands):
    """
    Return the longest name in `brands`.

    When several names share the maximum length, the one that appears first
    is returned. An empty `brands` yields `None`.
    """
    # max() keeps the first maximal element, matching a strict "longer than"
    # comparison while scanning from the start.
    return max(brands, key=len, default=None)

In [23]:
# Load the clustered output written by the dedupe step: the original input
# columns plus the 'Cluster ID' and 'confidence_score' columns.
openfda_dedupe = pd.read_csv('../Data/Outputs_Cleanup/FDA/openfda_manuf_deduplicated.csv')

### Exporting Data to Pickle

In [24]:
# Build one row per cluster: every manufacturer name and every
# `fda_manuf_id` that was grouped into it, as lists.
clusters = openfda_dedupe.groupby('Cluster ID')
df_cluster = pd.DataFrame(clusters['manufacturer_name'].apply(list))
df_cluster['fda_manuf_id'] = clusters['fda_manuf_id'].apply(list)

# Choose a single representative name per cluster with `pick_brand`.
df_cluster['manuf_name_picked'] = df_cluster['manufacturer_name'].apply(pick_brand)

# After renaming, `manuf_names` holds the list of all names in the cluster
# and `manufacturer_name` holds the picked representative name.
df_cluster.columns = ['manuf_names', 'fda_manuf_id', 'manufacturer_name']

# Use a context manager so the file is flushed and closed once the pickle
# has been written.
with open("../Data/Outputs_Cleanup/FDA/openfda_manufacturer_deduplicated_single_manuf.p", "wb") as pickle_file:
    pickle.dump(df_cluster, pickle_file)