In [9]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe with a comma separated values
(CSV) file. All operations are performed in memory, so will run very
quickly on datasets up to ~10,000 rows.

We start with a CSV file containing our messy data. In this example,
it is the list of manufacturer names (with their `fda_manuf_id`) taken
from the processed openFDA table.

The output will be a CSV with our clustered results.

For larger datasets, see our [mysql_example](mysql_example.html)
"""

import os
import csv
import re
import logging
import optparse
import pandas as pd
import pickle

import dedupe
from unidecode import unidecode

In [2]:
# Load the processed openFDA table and show which columns are available.
openfda = pd.read_csv('../../Data/Outputs_Cleanup/FDA/openfda_processed.csv')
display(openfda.columns)

# Keep one row per distinct (normalized name, manufacturer id) pair; the
# normalized name is exposed as `manufacturer_name` in the dedupe input file.
openfda_manuf = (
    openfda.loc[:, ['manufacturer_name_normalized', 'fda_manuf_id']]
    .drop_duplicates()
    .rename(columns={'manufacturer_name_normalized': 'manufacturer_name'})
)
openfda_manuf.to_csv(
    '../../Data/Outputs_Cleanup/FDA/openfda_manufacturer_dedupe_input.csv',
    index=False,
)

Index(['fda_drug_id', 'brand_name', 'generic_name', 'manufacturer_name',
       'product_ndc', 'product_type', 'route', 'substance_name', 'spl_id',
       'spl_set_id', 'package_ndc', 'is_original_packager', 'nui',
       'pharm_class_epc', 'pharm_class_pe', 'pharm_class_cs', 'unii',
       'application_number', 'rxcui', 'upc', 'original_packager_product_ndc',
       'pharm_class_moa', 'manufacturer_name_normalized', 'fda_manuf_id'],
      dtype='object')

In [3]:
def preProcess(column):
    """
    Normalize one raw CSV field so that cosmetic differences do not matter.

    The value is transliterated with Unidecode, newlines are turned into
    spaces, runs of spaces are collapsed to one, surrounding whitespace and
    quote characters are stripped, and the result is lower-cased.

    :param column: raw field text, or ``None`` for a missing field
    :return: the cleaned string, or ``None`` when the field is missing or
        empty after cleaning
    """
    # csv.DictReader fills short rows with None; report that as missing data
    # instead of letting unidecode() fail on a non-string.
    if column is None:
        return None
    column = unidecode(column)
    # Replace newlines *before* collapsing spaces: the other order leaves
    # multiple spaces behind for input such as "a \n b".
    column = re.sub(r'\n', ' ', column)
    column = re.sub(r'  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column


def readData(filename):
    """
    Load the CSV at `filename` into a dictionary of cleaned records.

    Every field of every row is passed through `preProcess`. Rows are keyed
    by the integer value of their `fda_manuf_id` column, so a later row with
    the same id replaces an earlier one.

    :param filename: path of a UTF-8 CSV file whose header row includes an
        `fda_manuf_id` column
    :return: dict mapping ``int(fda_manuf_id)`` to a
        ``{column name: cleaned value}`` dict
    :raises KeyError: if the file has no `fda_manuf_id` column
    :raises ValueError: if an `fda_manuf_id` value cannot be parsed as an int
    """
    data_d = {}
    # Decode explicitly as UTF-8 (the default encoding pandas' to_csv writes)
    # so the result does not depend on the platform's locale encoding.
    with open(filename, encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # The key is taken from the raw row, not from the cleaned values.
            row_id = int(row['fda_manuf_id'])
            data_d[row_id] = {k: preProcess(v) for (k, v) in row.items()}

    return data_d

In [4]:
# ## Setup

# File locations are defined first so that the retrain step below can refer
# to the same settings file that is checked before training.
#input_file = 'csv_example_messy_input.csv'
input_file = '../../Data/Outputs_Cleanup/FDA/openfda_manufacturer_dedupe_input.csv'
output_file = '../../Data/Outputs_Cleanup/FDA/openfda_manuf_deduplicated.csv'
settings_file = '../../Data/Outputs_Cleanup/FDA/csv_example_learned_settings'
training_file = '../../Data/Outputs_Cleanup/FDA/csv_example_training.json'

# Answering 'y' discards the learned settings so that the training branch
# below runs again; labeled examples already saved in `training_file` are
# reloaded there, so new labels are added on top of them.
retrain = input('Do you want to add on to your training (y/n). If you wanted to start over, delete your .json file')
isretrain = retrain.strip().lower() == 'y'

if isretrain:
    try:
        # Remove the file that is actually checked further down. A bare
        # 'csv_example_learned_settings' would resolve against the current
        # working directory and leave the real settings file in place.
        os.remove(settings_file)
    except FileNotFoundError:
        print('Your settings file appears to not have existed.')

print('importing data ...')
data_d = readData(input_file)

# If a settings file already exists, we'll just load that and skip training
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # ## Training

    # Define the fields dedupe will pay attention to
    fields = [
        {'field': 'manufacturer_name', 'type': 'String'},
        #{'field': 'Address', 'type': 'String'},
        #{'field': 'Zip', 'type': 'Exact', 'has missing': True},
        #{'field': 'Phone', 'type': 'String', 'has missing': True},
        ]

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.console_label(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

# ## Clustering

# `partition` will return sets of records that dedupe
# believes are all referring to the same entity.

print('clustering...')
clustered_dupes = deduper.partition(data_d, 0.5)

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

# newline='' is what the csv module asks for on files it reads or writes;
# without it the writer emits an extra '\r' per row on Windows. The encoding
# is pinned so the output does not depend on the platform's locale.
with open(output_file, 'w', newline='', encoding='utf-8') as f_output, \
        open(input_file, newline='', encoding='utf-8') as f_input:

    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # Rows are matched to clusters through the same integer id that
        # readData used as the record key.
        row_id = int(row['fda_manuf_id'])
        row.update(cluster_membership[row_id])
        writer.writerow(row)

Do you want to add on to your training (y/n). If you wanted to start over, delete your .json file n


importing data ...


INFO:dedupe.canopy_index:Removing stop word llc
INFO:dedupe.canopy_index:Removing stop word inc
INFO:dedupe.canopy_index:Removing stop word co
INFO:dedupe.canopy_index:Removing stop word  l
INFO:dedupe.canopy_index:Removing stop word it
INFO:dedupe.canopy_index:Removing stop word lc
INFO:dedupe.canopy_index:Removing stop word sa
INFO:dedupe.canopy_index:Removing stop word ti
INFO:dedupe.canopy_index:Removing stop word  i
INFO:dedupe.canopy_index:Removing stop word al
INFO:dedupe.canopy_index:Removing stop word e 
INFO:dedupe.canopy_index:Removing stop word ma
INFO:dedupe.canopy_index:Removing stop word or
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.canopy_index:Removing stop word d 
INFO:dedupe.canopy_index:Removing stop word he
INFO:dedupe.canopy_index:Removing stop word la
INFO:dedupe.canopy_index:Removing stop word  c
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word co
INFO:dedupe.canopy_index:Removing stop word ed
INFO:dedupe

starting active labeling...


 y


manufacturer_name : your military exchanges

manufacturer_name : your military exchange

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, manufacturer_name)
INFO:dedupe.training:TfidfTextCanopyPredicate: (0.8, manufacturer_name)
manufacturer_name : valley of the sun cosmetics llc

manufacturer_name : valley of the sun cosmetics

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:LevenshteinCanopyPredicate: (1, manufacturer_name)
INFO:dedupe.training:TfidfTextCanopyPredicate: (0.8, manufacturer_name)
manufacturer_name : aquagenics technologies

manufacturer_name : aquagenics technologies inc

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.6, manufacturer_name)
manufacturer_name : body one products inc

manufacturer_name : body one products inc

4/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : guangzhou oumiao cosmetics co ltd

manufacturer_name : guangzhou oumiao cosmetics co ltd

5/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.6, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, manufacturer_name)
manufacturer_name : eminence organic skin care

manufacturer_name : eminence organic skin care ltd

6/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : forreal pharmaceuticals llc

manufacturer_name : forreal pharmaceuticals

7/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, manufacturer_name)
manufacturer_name : valu merchandiser

manufacturer_name : valu merchandisersco

8/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : intermed inc

manufacturer_name : intermed

9/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (3, manufacturer_name)
manufacturer_name : home health

manufacturer_name : home healthworks inc

10/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfTextCanopyPredicate: (0.6, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (3, manufacturer_name)
manufacturer_name : publix super markets inc

manufacturer_name : publix super markets

11/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name)
manufacturer_name : 98 leaf inc

manufacturer_name : 98 leaf

12/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : 3lab inc

manufacturer_name : 3lab

13/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (firstIntegerPredicate, manufacturer_name)
manufacturer_name : cvs pharmacy

manufacturer_name : cvs

14/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : o2

manufacturer_name : o2 clean

15/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
manufacturer_name : georgia pacific consumer products

manufacturer_name : georgiapacific consumer products

16/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : edgewell personal care brands llc

manufacturer_name : edgewellpersonal care brands llc

17/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (fingerprint, manufacturer_name)
manufacturer_name : fustinxiamen commodity co ltd

manufacturer_name : fustin xiamen commodity co ltd

18/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (twoGramFingerprint, manufacturer_name)
manufacturer_name : sanofiaventis us llc

manufacturer_name : sanofaventis us llc

19/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : amerisource bergen

manufacturer_name : amerrisource bergen

20/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (1, manufacturer_name)
manufacturer_name : preferred pharmaceuticals inc

manufacturer_name : preferredpharmaceuticals inc

21/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : amazoncom services llc

manufacturer_name : amazon com services llc

22/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (doubleMetaphone, manufacturer_name)
manufacturer_name : unifirst first aid corporation

manufacturer_name : unifirstfirst aid

23/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : sovents and petroleum inc

manufacturer_name : solvents ans petroleum inc

24/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (doubleMetaphone, manufacturer_name)
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.8, manufacturer_name)
manufacturer_name : dr reddys laboratories limited

manufacturer_name : drreddys laboratories ltd

25/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (1, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (twoGramFingerprint, manufacturer_name)
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.8, manufacturer_name)
manufacturer_name : crosstown concepts corporation

manufacturer_name : crossj

26/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (1, manufacturer_name)
manufacturer_name : aksan kozmetik sanayi ve ticaret anonim sirketi

manufacturer_name : erte kozmetik sanayi ve ticaret anonim sirketi

26/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : poly pharmaceuticals inc

manufacturer_name : bell pharmaceuticals inc

27/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (1, manufacturer_name)
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.6, manufacturer_name)
manufacturer_name : 3b international limited liability company

manufacturer_name : 3d international llc

27/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : nucare pharmaceuticals inc

manufacturer_name : nucar pharmaceuticalsinc

28/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (1, manufacturer_name)
INFO:dedupe.training:SimplePredicate: (firstIntegerPredicate, manufacturer_name)
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.6, manufacturer_name)
manufacturer_name : vanda pharmaceuticals inc

manufacturer_name : vyne pharmaceuticals inc

28/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : arcet equipment company dba arc3 gases north

manufacturer_name : 3h

28/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : u2

manufacturer_name : rxo2 oxygen medical equipment supply co inc

28/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : ani pharmaceuticals inc

manufacturer_name : wayne pharmaceuticals inc

28/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : arimed pharmaceuticals

manufacturer_name : aries pharmaceuticals inc

28/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : camber pharmaceuticals inc

manufacturer_name : be pharmaceuticals inc

28/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : intermed

manufacturer_name : interbrilho higiene e limpeza ltda

28/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : 3d imaging drug design and development llc

manufacturer_name : 3lab

28/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : centrix inc

manufacturer_name : centrix pharmaceutical inc

28/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


manufacturer_name : innovative med

manufacturer_name : innovative safety supply inc

29/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


manufacturer_name : southern gas supply llc

manufacturer_name : southern gas and supply of mississippi inc

29/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.7883396022142921
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, manufacturer_name)
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, manufacturer_name), TfidfNGramCanopyPredicate: (0.4, manufacturer_name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, manufacturer_name), TfidfTextCanopyPredicate: (0.2, manufacturer_name), SimplePredicate: (sortedAcronym, manufacturer_name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.6, manufacturer_name), SimplePredicate: (oneGramFingerprint, manufacturer_name), SimplePredicate: (suffixArray, manufacturer_name))
INFO:dedupe.training:(SimplePredicate: (firstIntegerPredicate, manufacturer_name), SimplePredicate: (commonSixGram, manufacturer_name), SimplePredicate: (tokenFieldPredicate, manufacturer_name))
INFO:dedupe.t

clustering...


INFO:dedupe.canopy_index:Removing stop word  l
INFO:dedupe.canopy_index:Removing stop word it
INFO:dedupe.canopy_index:Removing stop word lc
INFO:dedupe.canopy_index:Removing stop word sa
INFO:dedupe.canopy_index:Removing stop word ti
INFO:dedupe.canopy_index:Removing stop word  i
INFO:dedupe.canopy_index:Removing stop word al
INFO:dedupe.canopy_index:Removing stop word e 
INFO:dedupe.canopy_index:Removing stop word ma
INFO:dedupe.canopy_index:Removing stop word or
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.canopy_index:Removing stop word d 
INFO:dedupe.canopy_index:Removing stop word he
INFO:dedupe.canopy_index:Removing stop word la
INFO:dedupe.canopy_index:Removing stop word  c
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word co
INFO:dedupe.canopy_index:Removing stop word ed
INFO:dedupe.canopy_index:Removing stop word g 
INFO:dedupe.canopy_index:Removing stop word ic
INFO:dedupe.canopy_index:Removing stop word lt
INFO:dedupe.c

# duplicate sets 9779


In [4]:
def pick_brand(brands):
    """
    Return the longest entry of `brands`.

    When several entries share the greatest length, the one that appears
    first is returned. An empty `brands` yields ``None``.
    """
    # max() keeps the first maximal element, which matches scanning with a
    # strict "longer than the current pick" comparison.
    return max(brands, key=len, default=None)

#df = pd.read_csv('MedicareD_drug_deduplicated.csv')

In [5]:
# Read the clustered manufacturer CSV back in for post-processing.
dedupe_output_path = '../../Data/Outputs_Cleanup/FDA/openfda_manuf_deduplicated.csv'
openfda_dedupe = pd.read_csv(dedupe_output_path)
# Preview the first rows as this cell's output.
openfda_dedupe.head()

Unnamed: 0,Cluster ID,confidence_score,manufacturer_name,fda_manuf_id
0,0,0.950438,BIOACTIVE NUTRITIONAL INC,0
1,1,0.793765,URIEL PHARMACY INC,1
2,853,1.0,ULTRA SEAL CORPORATION,2
3,854,1.0,L PERRIGO COMPANY,3
4,2,0.632143,STRIDES PHARMA INC,4


In [11]:
# Collapse each dedupe cluster into one row holding every member name, every
# member id, and a single representative name chosen by pick_brand.
clusters = openfda_dedupe.groupby('Cluster ID')
df_cluster = pd.DataFrame({
    'manuf_names': clusters['manufacturer_name'].apply(list),
    'fda_manuf_id': clusters['fda_manuf_id'].apply(list),
})
df_cluster['manufacturer_name'] = df_cluster['manuf_names'].apply(pick_brand)

# Use a context manager so the pickle file handle is flushed and closed
# instead of being left open until garbage collection.
with open("../../Data/Outputs_Cleanup/FDA/openfda_manufacturer_deduplicated_single_manuf.p", "wb") as pickle_file:
    pickle.dump(df_cluster, pickle_file)
df_cluster.to_csv('../../Data/Outputs_Cleanup/FDA/openfda_manufacturer_deduplicated_single_manuf.csv')