# Medicare Part D - OpenFDA Drug Matching
## Authors: 
    1. Lam Ho
    2. Jonah Breslow
    3. Jeffrey Kagan
## Purpose:
The purpose of this notebook is to match drugs in the Medicare Part D drug data to drugs in the OpenFDA drug data. For this procedure, we used the Python implementation of Dedupe.io.

### Importing modules

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code uses dedupe's RecordLink on two comma separated values (CSV)
files: drug brand names from OpenFDA and drug brand names from Medicare
Part D. The task is to link drugs between the two datasets.

The output will be a CSV with our linked results.

"""
import os
import csv
import re
import logging
import optparse
import pandas as pd
import numpy as np
import pickle

import dedupe
from unidecode import unidecode

### Importing Data

In [2]:
# Load the deduplicated OpenFDA drug table and export its brand names as a CSV
# input for record linkage.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own cleanup step.
with open("Data/Outputs_Cleanup/FDA/Openfda_Drug_deduplicated.p", "rb") as fda_pickle:
    # The with-block closes the handle, which the bare open() call never did.
    fda_drugs = pickle.load(fda_pickle).reset_index()
df = fda_drugs[['brand_name']]
# to_csv also writes the row index (its default); the post-processing cells
# later merge on that column.
df.to_csv('Data/Outputs_Cleanup/FDA/openfda_drug_dedup_input.csv')

In [3]:
# Load the deduplicated Medicare Part D drug table and export its brand names
# as a CSV input for record linkage.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own cleanup step.
with open("Data/Outputs_Cleanup/Part_d/dedupe_output/MedicareD_Drug_deduplicated_singlebrand.p", "rb") as medD_pickle:
    # The with-block closes the handle, which the bare open() call never did.
    medD_drugs = pickle.load(medD_pickle).reset_index()
df = medD_drugs[['brand_name']]
# to_csv also writes the row index (its default); the post-processing cells
# later merge on that column.
df.to_csv('Data/Outputs_Cleanup/Part_d/dedupe_output/medD_drug_dedup_input.csv')

### Running the Dedupe procedure

In [4]:
def preProcess(column):
    """
    Normalise one raw CSV value so that cosmetic differences are ignored.

    The value is passed through Unidecode; newlines, slashes and colons
    become spaces; hyphens, apostrophes and commas are dropped; runs of
    spaces collapse to a single space; surrounding whitespace and quote
    characters are stripped; and the text is lower-cased.

    Returns the cleaned string, or None when nothing is left.
    """
    # (character, replacement) pairs, applied in the listed order.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (',', ''),
        (':', ' '),
    )

    text = unidecode(column)
    for old, new in substitutions:
        text = text.replace(old, new)

    # Collapse any run of two or more spaces left behind by the replacements.
    text = re.sub('  +', ' ', text)

    text = text.strip()
    text = text.strip('"').strip("'")
    text = text.lower().strip()

    return text if text else None


def readData(filename):
    """
    Read a CSV file into a dictionary of cleaned records.

    Each data row becomes a dict mapping column name to the value returned
    by preProcess for that cell. Records are keyed by the file name followed
    by the 0-based row number (header excluded); the result-writing code
    rebuilds the same key to look records up again.

    Returns the dictionary of records.
    """

    data_d = {}

    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(filename, newline='') as f:
        for i, row in enumerate(csv.DictReader(f)):
            data_d[filename + str(i)] = {k: preProcess(v)
                                         for k, v in row.items()}

    return data_d

### parameters

In [5]:
# Switch read by the matching cell below: when True, that cell first tries to
# delete a previously saved settings file.
retrain = False

In [6]:
# Parameters
# NOTE(review): repeats the assignment from the previous cell; this looks like
# an injected parameters cell (e.g. from a notebook runner) -- confirm before
# removing either copy.
retrain = False


In [7]:

# File locations are defined before the retrain switch so that the switch can
# refer to the settings file this notebook actually uses.
output_file = 'Data/Outputs_Cleanup/FDA_partD_drug_matching/data_matching_output.csv'
settings_file = 'Data/Outputs_Cleanup/FDA_partD_drug_matching/data_matching_learned_settings'
training_file = 'Data/Outputs_Cleanup/FDA_partD_drug_matching/data_matching_training.json'

left_file = 'Data/Outputs_Cleanup/FDA/openfda_drug_dedup_input.csv'
right_file = 'Data/Outputs_Cleanup/Part_d/dedupe_output/medD_drug_dedup_input.csv'

# Deleting the learned settings makes the os.path.exists(settings_file) check
# further down fall through to the training branch. The previous code removed
# 'csv_example_learned_settings', a file this notebook never writes, so the
# switch had no effect. The labelled examples in training_file are kept.
if retrain:
    try:
        os.remove(settings_file)
    except FileNotFoundError:
        # Only a missing file is expected here; other OS errors propagate.
        print('Your settings file appears to not have existed.')

print('importing data ...')
data_1 = readData(left_file)
data_2 = readData(right_file)

def descriptions():
    """Yield the 'description' value of every record in data_1, then data_2.

    NOTE(review): only referenced from the commented-out field definitions
    in this cell; the input CSVs written by this notebook are built from the
    brand_name column alone, so 'description' is presumably absent --
    confirm before re-enabling those fields.
    """
    for dataset in (data_1, data_2):
        yield from (record['description'] for record in dataset.values())

# ## Training

# Reuse previously learned settings when the settings file exists; otherwise
# train a new linker interactively and save both the labelled examples and
# the learned settings.
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

else:
    # Define the fields the linker will pay attention to. Only 'brand_name'
    # is active, declared as a String field.
    #
    # NOTE(review): the commented-out entries look like leftovers from the
    # example this cell was adapted from and are not used -- confirm before
    # deleting them.
    fields = [
        {'field': 'brand_name', 'type': 'String'},
        #{'field': 'title', 'type': 'Text', 'corpus': descriptions()},
        #{'field': 'description', 'type': 'Text',
        # 'has missing': True, 'corpus': descriptions()},
        #{'field': 'price', 'type': 'Price', 'has missing': True}
    ]

    # Create a new linker object and pass our data model to it.
    linker = dedupe.RecordLink(fields)

    # If we have training data saved from a previous run of the linker,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.prepare_training(data_1,
                                    data_2,
                                    training_file=tf,
                                    sample_size=15000)
    else:
        linker.prepare_training(data_1, data_2, sample_size=15000)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    # Interactive step: blocks on console input until labelling is finished.
    dedupe.console_label(linker)

    linker.train()

    # When finished, save our labelled training examples to disk
    with open(training_file, 'w') as tf:
        linker.write_training(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        linker.write_settings(sf)

# ## Blocking

# ## Clustering

# Link the two datasets with the linker loaded or trained above. The third
# positional argument passed to join is 0.0.
#
# NOTE(review): the wording that used to sit here described searching for a
# threshold that maximises a recall-weighted average of precision and recall,
# but no such search is performed in this cell. Presumably 0.0 is a score
# threshold that keeps every candidate link -- confirm against the dedupe
# version in use.

print('clustering...')
linked_records = linker.join(data_1, data_2, 0.0)

print('# duplicate sets', len(linked_records))
# ## Writing Results

# Write our original data back out to a CSV with new columns 'Cluster ID',
# 'Link Score' and 'source file'. Rows that refer to each other share a
# 'Cluster ID'.

# Map every linked record ID to its cluster number and link score. Records
# that were not linked get no entry, so their two columns are left empty.
cluster_membership = {}
for cluster_id, (cluster, score) in enumerate(linked_records):
    for record_id in cluster:
        cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                         'Link Score': score}

# newline='' is what the csv module documents for files handed to its readers
# and writers; without it the writer can emit an extra blank line after each
# row on platforms that use \r\n line endings.
with open(output_file, 'w', newline='') as f:

    header_unwritten = True

    for fileno, filename in enumerate((left_file, right_file)):
        with open(filename, newline='') as f_input:
            reader = csv.DictReader(f_input)

            if header_unwritten:

                # The header comes from the first input file; later files are
                # written through the same writer with the same columns.
                fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                              reader.fieldnames)

                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()

                header_unwritten = False

            for row_id, row in enumerate(reader):

                # Same key format as readData: file name + 0-based row number.
                record_id = filename + str(row_id)
                cluster_details = cluster_membership.get(record_id, {})
                # 0 for left_file, 1 for right_file.
                row['source file'] = fileno
                row.update(cluster_details)

                writer.writerow(row)

importing data ...
reading from Data/Outputs_Cleanup/FDA_partD_drug_matching/data_matching_learned_settings


INFO:dedupe.api:Predicate set:


INFO:dedupe.api:(TfidfNGramSearchPredicate: (0.8, brand_name), SimplePredicate: (firstTokenPredicate, brand_name), SimplePredicate: (commonSixGram, brand_name))


clustering...


# duplicate sets 1512


### Post-Processing the Drug Matches

In [8]:
# Reload the linkage output and split it by source. 'source file' is 0 for
# rows from the OpenFDA input and 1 for rows from the Medicare Part D input
# (the order in which the two files were written out).
df = pd.read_csv('Data/Outputs_Cleanup/FDA_partD_drug_matching/data_matching_output.csv')
#df.sort_values('Cluster ID')
df_fda = df[df['source file'] == 0]
df_partD = df[df['source file'] == 1]
# Relabel all five columns by position. The fourth one is the row-index column
# that the input CSVs carry without a header name; it is called 'index' here.
df_partD.columns = ['Cluster ID','Link Score','source file','index','brand_name']
df_fda.columns = ['Cluster ID','Link Score','source file','index','brand_name']

# NOTE(review): both frames were already reset_index()'d when loaded, so which
# column ends up named 'index' here depends on the pickled frames' original
# index names -- confirm it is the same row number that was written to the CSV.
fda_drugs_merge = fda_drugs.reset_index()
medD_drugs_merge = medD_drugs.reset_index()

#### Pulling the identifiers back in from the source data

In [9]:
# Attach each source's own drug identifier to its rows of the linkage output,
# matching on brand name together with the row index. The left join keeps
# every linkage row even when no identifier is found for it.
df_fda = pd.merge(
    df_fda,
    fda_drugs_merge[['brand_name', 'fda_drug_id', 'index']],
    how='left',
    on=['brand_name', 'index'],
)
df_partD = pd.merge(
    df_partD,
    medD_drugs_merge[['brand_name', 'MedD_drug_id', 'index']],
    how='left',
    on=['brand_name', 'index'],
)

#### Aggregating to the Drug-Level 
We want our output table to be unique by drug

In [10]:
# Stack the FDA and Part D rows; rows that were linked share a Cluster ID.
drug_merge = pd.concat([df_fda, df_partD], axis=0).sort_values('Cluster ID')
# Each frame carries only its own ID column, so the other source's ID is
# missing after the concat. '[]' is the placeholder that pick_list skips.
drug_merge['fda_drug_id'] = drug_merge['fda_drug_id'].fillna('[]')
drug_merge['MedD_drug_id'] = drug_merge['MedD_drug_id'].fillna('[]')

drug_merge = drug_merge.reset_index()
# Rows that were not linked have no Cluster ID, so each gets a synthetic one.
# 'index' restarts at 0 within each source file, so the offset is scaled by
# the source number (FDA -> 1e8 + index, Part D -> 2e8 + index). With a single
# shared offset, an unlinked FDA row and an unlinked Part D row that happen to
# have the same row number received the same ID and were grouped together as
# if they had been matched.
# Assumes fewer than 100000000 rows per source and real cluster IDs below
# that value (the same assumption the original offset made).
drug_merge['Cluster ID'] = np.where(drug_merge['Cluster ID'].isna(),
                                    drug_merge['index']
                                    + 100000000 * (drug_merge['source file'] + 1),
                                    drug_merge['Cluster ID']
    )
# One row per cluster; every remaining column becomes a list of member values.
drug_merge = drug_merge.groupby('Cluster ID').agg(list)

#### Extracting a single ID per source from the aggregated lists

In [11]:
def pick_list(x):
    """Return the last entry of x that is not the '[]' placeholder.

    Returns None when x is empty or holds nothing but placeholders.
    """
    kept = [entry for entry in x if not (entry == '[]')]
    return kept[-1] if kept else None

# Collapse each cluster's list of IDs to a single value per source column
# (None when the list holds only '[]' placeholders).
for id_column in ('fda_drug_id', 'MedD_drug_id'):
    drug_merge[id_column] = drug_merge[id_column].apply(pick_list)

#### When we cluster multiple drug brand names into the same drug, we create a function that picks the longest brand_name to be the cluster name

In [12]:
def pick_longest(x):
    """Return the longest item of x, measured with len().

    When several items tie for the greatest length, the earliest one wins.
    Returns None when x is empty.
    """
    # max() keeps the first maximal item, matching a strict '>' comparison.
    return max(x, key=len, default=None)

### Exporting Results

In [13]:
# Name each cluster after its longest brand name, keep only the name and the
# two source identifiers, and pickle the result.
drug_merge['brand_name'] = drug_merge['brand_name'].apply(pick_longest)
drug_merge = drug_merge[['brand_name', 'fda_drug_id', 'MedD_drug_id']]
# The with-block closes (and therefore flushes) the output file explicitly;
# the bare open() call left that to garbage collection.
with open("Data/Outputs_Cleanup/FDA_partD_drug_matching/fda_partD_drugs_matched.p", "wb") as matched_pickle:
    pickle.dump(drug_merge, matched_pickle)
# drug_merge.to_csv('Data/Outputs_Cleanup/FDA_partD_drug_matching/fda_partD_drugs_matched.csv', index=False)