# Data Integration
 Purpose: This notebook takes all the matched entities from the Drug, Manufacturer, and Physician entity resolutions and creates the node and edge tables to be loaded into Neo4j.
 
## Importing dependencies

In [24]:
import pandas as pd
import numpy as np
import pickle

## Importing Entity Resolution Lookup tables
### Manufacturer Node

In [25]:
# Load the manufacturer entity-resolution lookup. The context manager closes the
# file handle as soon as the pickle has been read (a bare open() left it open).
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles that
# were produced by this project's own pipeline.
with open('../Data/Outputs_Cleanup/Manufacturer_entity_matching/Manufacturer_Matching_lookup.p','rb') as manuf_lookup_file:
    manuf_lookup = pickle.load(manuf_lookup_file)

# Manufacturer node table: id and name columns only.
# to_csv keeps its default index=True, so the row index is written as the first column.
manuf_lookup[['manuf_id','manufacturer_name']].to_csv('../Nodes/Manufacturer.csv')

### Drug Node

In [26]:
def string_to_ordered_list (x):
    '''
    Convert a value that looks like a list of integers into a sorted list of unique ints.

    The value is passed through str(), the characters '[', ']', space and single
    quote are stripped, and the remainder is split on commas. Each piece is
    converted with int(), duplicates are removed, and the result is sorted
    in ascending order.

    Parameters
    ----------
    x : any
        Typically a string such as "[3, 1, 2]" or an actual list of ints.

    Returns
    -------
    list of int, or float('nan')
        The sorted unique integers, or np.nan when any piece cannot be parsed
        as an integer (for example a missing value or an empty string).
    '''
    try:
        pieces = str(x).replace('[','').replace(']','').replace(' ','').replace("'",'').split(',')
        return sorted(set(map(int, pieces)))
    except (TypeError, ValueError):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

In [27]:
## Importing drug matching output
# The context manager closes the pickle file once it has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../Data/Outputs_Cleanup/Sunshine_dedupe_drug_integration/final_drug_lookup.p','rb') as drug_lookup_file:
    drug_node = pickle.load(drug_lookup_file)
drug_node

Unnamed: 0,drug_id,brand_name,fda_drug_id,MedD_drug_id,sunshine_drug_id
0,0,CORTISONE ACETATE,[104661],[162898],
1,1,CORTEF,[170796],[159595],
2,2,CORLANOR,[110500],[16549],
3,3,COREG CR,[161742],[16548],
4,4,CORDRAN,[35144],[118759],
...,...,...,...,...,...
5127,5127,TES AIAPACK CALIBRATOR SET,,,TES AIAPACK CALIBRATOR SET
5128,5128,ALTERNARIA ALTERNATE ALTERNARIA TENUIS,,,ALTERNARIA ALTERNATE ALTERNARIA TENUIS
5129,5129,BUNAVAIL 63 MG 30COUNT BOX,,,BUNAVAIL 63 MG 30COUNT BOX
5130,5130,CLINPRO 5000 TOOTHPASTEVANILLA MINT,,,CLINPRO 5000 TOOTHPASTEVANILLA MINT


In [28]:
## Importing drug information from the deduped FDA data
# The context manager closes the pickle file once it has been read.
with open("../Data/Outputs_Cleanup/FDA/Openfda_Drug_deduplicated.p", "rb") as openfda_drug_file:
    openfda_drug_dedupe = pickle.load(openfda_drug_file)

## De-duplicating every cell (set -> list), then sorting the fda_drug_id list column so that
## when we convert to a string, it will match the fda_drug_id list column in the drug_node dataframe
# NOTE(review): assumes every cell holds an iterable of hashable values -- confirm.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of DataFrame.map;
# kept here so the cell still runs on older pandas.
openfda_drug_dedupe = openfda_drug_dedupe.applymap(set).applymap(list)
openfda_drug_dedupe.fda_drug_id = openfda_drug_dedupe.fda_drug_id.apply(sorted)

## converting the fda_drug_id string column into a sorted list column
drug_node['fda_drug_id'] = drug_node.fda_drug_id.apply(string_to_ordered_list)

## converting fda_drug_id columns in drug_node df and openfda_drug_dedupe df
## to string from list for merging (lists are not hashable, their string form is)
drug_node['fda_drug_id'] = drug_node['fda_drug_id'].astype(str)
openfda_drug_dedupe['fda_drug_id'] = openfda_drug_dedupe['fda_drug_id'].astype(str)

# Left merge keeps every drug_node row; drugs without an FDA match get NaN attributes.
# NOTE: this reassigns drug_node, so re-running this cell without first re-running the
# load cell would merge the FDA columns a second time (pandas would suffix them _x/_y).
drug_node = drug_node.merge(openfda_drug_dedupe, how='left', on='fda_drug_id')

## Dropping id's into other datasources before writing the node table
drug_node_output = drug_node.drop(columns = ['fda_drug_id','MedD_drug_id','sunshine_drug_id','manufacturer_name'])
drug_node_output.to_csv('../Nodes/Drug_Node.csv')

In [29]:
# Display drug_node, which by this point has been left-merged with the OpenFDA drug attributes.
drug_node

Unnamed: 0,drug_id,brand_name,fda_drug_id,MedD_drug_id,sunshine_drug_id,generic_name,product_type,route,substance_name,manufacturer_name,pharm_class_epc,pharm_class_pe,pharm_class_cs,pharm_class_moa
0,0,CORTISONE ACETATE,[104661],[162898],,[CORTISONE ACETATE],[HUMAN PRESCRIPTION DRUG],[ORAL],[CORTISONE ACETATE],[HIKMA PHARMACEUTICALS USA INC],"[""""]","[""""]","[""""]","[""""]"
1,1,CORTEF,[170796],[159595],,[HYDROCORTISONE],[HUMAN PRESCRIPTION DRUG],[ORAL],[HYDROCORTISONE],[PHARMACIA AND UPJOHN COMPANY LLC],[CORTICOSTEROID EPC],"[""""]","[""""]",[CORTICOSTEROID HORMONE RECEPTOR AGONISTS MOA]
2,2,CORLANOR,[110500],[16549],,[IVABRADINE],[HUMAN PRESCRIPTION DRUG],[ORAL],[IVABRADINE HYDROCHLORIDE],[AMGEN INC],"[""""]","[""""]","[""""]","[""""]"
3,3,COREG CR,[161742],[16548],,[CARVEDILOL PHOSPHATE],[HUMAN PRESCRIPTION DRUG],[ORAL],[CARVEDILOL PHOSPHATE],[GLAXOSMITHKLINE LLC],"[""""]","[""""]","[""""]","[""""]"
4,4,CORDRAN,[35144],[118759],,[FLURANDRENOLIDE],[HUMAN PRESCRIPTION DRUG],[TOPICAL],[FLURANDRENOLIDE],[ALMIRALL LLC],[CORTICOSTEROID EPC],"[""""]","[""""]",[CORTICOSTEROID HORMONE RECEPTOR AGONISTS MOA]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5127,5127,TES AIAPACK CALIBRATOR SET,,,TES AIAPACK CALIBRATOR SET,,,,,,,,,
5128,5128,ALTERNARIA ALTERNATE ALTERNARIA TENUIS,,,ALTERNARIA ALTERNATE ALTERNARIA TENUIS,,,,,,,,,
5129,5129,BUNAVAIL 63 MG 30COUNT BOX,,,BUNAVAIL 63 MG 30COUNT BOX,,,,,,,,,
5130,5130,CLINPRO 5000 TOOTHPASTEVANILLA MINT,,,CLINPRO 5000 TOOTHPASTEVANILLA MINT,,,,,,,,,


## Starting with the Sunshine Data
### Hospital Node

In [30]:
# Hospital nodes: read the Sunshine hospital table and write it unchanged to the Nodes folder.
# to_csv keeps its default index=True, so the row index is written as the first column.
hospital_node = pd.read_csv(filepath_or_buffer='../Data/Outputs_Cleanup/Sunshine/hosp_info.csv')
hospital_node.to_csv(path_or_buf='../Nodes/Hospital_Node.csv')

### Payments Node

In [31]:
# Payment nodes: read the Sunshine payments table and write it unchanged to the Nodes folder.
# to_csv keeps its default index=True, so the row index is written as the first column.
payments_node = pd.read_csv(filepath_or_buffer='../Data/Outputs_Cleanup/Sunshine/payments.csv')
payments_node.to_csv(path_or_buf='../Nodes/Payments_Node.csv')

### Manufacturer-Payments Edge

In [32]:
# Manufacturer-payment edges: attach the resolved manuf_id to every payment record
# through the Sunshine manufacturer/GPO id, then keep only the two edge endpoints.
# The left merge keeps payment rows that have no match in manuf_lookup (manuf_id is NaN there).
manuf_payments_edge = (
    pd.read_csv('../Data/Outputs_Cleanup/Sunshine/pharmCo_payments.csv')
    .merge(manuf_lookup,
           how='left',
           on='Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID')
    .loc[:, ['Record_ID','manuf_id']]
)
manuf_payments_edge.to_csv('../Edges/Manufacturer_Payment_Edge.csv')

### Payment-Drug Edge

In [33]:
# NOTE(review): the saved output of this cell contains a truncated warning; if it is a
# read_csv DtypeWarning, pass explicit dtypes for the affected columns -- confirm.
payment_drug_edge = pd.read_csv('../Data/Outputs_Cleanup/Sunshine/record_drugs.csv')
cols = ['Record_ID','Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1',
        'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2',
        'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3',
        'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4',
        'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5'
       ]

# Wide -> long: one row per (Record_ID, product name), dropping the empty name slots.
payment_drug_edge = (
    payment_drug_edge[cols]
    .melt(id_vars = 'Record_ID')
    .dropna()
    [['Record_ID','value']]
    .rename(columns={'value': 'sunshine_drug_id'})
)

# Attach drug_id by the Sunshine product name. Rows of drug_node without a
# sunshine_drug_id are dropped first so they cannot take part in the join.
sunshine_drug_lookup = drug_node[['drug_id','sunshine_drug_id']].dropna()
payment_drug_edge = payment_drug_edge.merge(sunshine_drug_lookup,
                                           how='left',
                                           on='sunshine_drug_id')

# Keep only the rows that found a drug. The left merge turned drug_id into a float
# column (NaN for misses), so cast it back to int once the misses are gone.
has_drug = payment_drug_edge.drug_id.notna()
payment_drug_edge = (
    payment_drug_edge
    .loc[has_drug, ['Record_ID','drug_id']]
    .assign(drug_id=lambda edge: edge.drug_id.astype(int))
)
payment_drug_edge.to_csv('../Edges/Payment_Drug_Edge.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### Manufacturer-Drug Edge

In [34]:
# The context manager closes the pickle file once it has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("../Data/Outputs_Cleanup/FDA/openfda_manufacturer_deduplicated_single_manuf.p", "rb") as openfda_manuf_file:
    openfda_manuf_dedupe = pickle.load(openfda_manuf_file)

# One row per (drug_id, FDA manufacturer name); drugs with no manufacturer are dropped.
manuf_drug_edge = drug_node[['drug_id','manufacturer_name']]
manuf_drug_edge = manuf_drug_edge.explode('manufacturer_name')
manuf_drug_edge = manuf_drug_edge.dropna()

# Match each raw name against the exploded name variants of the de-duplicated
# FDA manufacturers. Both frames carry a 'manufacturer_name' column, so pandas
# suffixes them: _x is the drug side, _y is the de-duplicated manufacturer side.
manuf_drug_edge = manuf_drug_edge.merge(openfda_manuf_dedupe.explode('manuf_names'),
                                        how='left',
                                        left_on='manufacturer_name',
                                        right_on='manuf_names')

# One row per fda_manuf_id, then one row per (drug, de-duplicated manufacturer).
manuf_drug_edge = manuf_drug_edge[['drug_id','fda_manuf_id','manufacturer_name_y']].explode('fda_manuf_id')
manuf_drug_edge = manuf_drug_edge.drop_duplicates(['drug_id','manufacturer_name_y'])[['drug_id','fda_manuf_id']]

# Cast the key to str before joining on it.
# NOTE(review): presumably manuf_lookup stores fda_manuf_id values as strings -- confirm.
manuf_drug_edge['fda_manuf_id'] = manuf_drug_edge['fda_manuf_id'].astype(str)
manuf_drug_edge = manuf_drug_edge.merge(manuf_lookup.explode('fda_manuf_id'), how='left', on='fda_manuf_id')
manuf_drug_edge = manuf_drug_edge[['drug_id','manuf_id']]
manuf_drug_edge.to_csv('../Edges/Manufacturer_Drug_Edge.csv')

### Prescription Node

In [35]:
# The context manager closes the pickle file once it has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("../Data/Outputs_Cleanup/Part_d/prescription_information.p", "rb") as prescriptions_file:
    prescriptions = pickle.load(prescriptions_file)

# Lookup from Part D drug id to drug_id. The explicit .copy() makes the column
# assignment below operate on an independent frame, which removes the
# SettingWithCopyWarning this cell used to emit.
partD_to_drug_id = drug_node[['drug_id','MedD_drug_id']].copy()
partD_to_drug_id['MedD_drug_id'] = partD_to_drug_id['MedD_drug_id'].apply(string_to_ordered_list)

# One row per individual MedD_drug_id. Rows without one are dropped because pandas
# matches null merge keys against each other, which would fan out any prescription
# row with a missing MedD_drug_id across every drug that lacks a Part D id.
partD_to_drug_id = partD_to_drug_id.explode('MedD_drug_id').dropna(subset=['MedD_drug_id'])
partD_to_drug_id['MedD_drug_id'] = partD_to_drug_id['MedD_drug_id'].astype(float)

# Left merge keeps every prescription row; unmatched rows get a NaN drug_id.
prescriptions = prescriptions.merge(partD_to_drug_id, how='left', on='MedD_drug_id')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partD_to_drug_id['MedD_drug_id'] = partD_to_drug_id['MedD_drug_id'].apply(string_to_ordered_list)


In [38]:
# Display the prescription rows with the drug_id column attached by the left merge on MedD_drug_id.
prescriptions

Unnamed: 0,Prscrbr_NPI,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Tot_Benes,GE65_Tot_Clms,GE65_Tot_30day_Fills,GE65_Tot_Drug_Cst,GE65_Tot_Day_Suply,GE65_Tot_Benes,Year,MedD_drug_id,drug_id
0,1003000837,54,58.0,544,407.28,,,,,,,2019,0,1290
1,1003001363,31,31.0,879,629.18,13.0,,,,,,2019,1,1598
2,1003001363,45,45.0,1350,17152.97,21.0,20.0,20.0,7683.43,600.0,,2019,2,967
3,1003001363,95,97.0,2883,1976.63,35.0,22.0,24.0,308.41,720.0,12.0,2019,3,1600
4,1003001363,211,213.0,6390,4637.66,73.0,79.0,81.0,1642.86,2430.0,34.0,2019,4,236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455588,1992999551,31,51.0,1450,22451.35,,,,,,,2019,507,248
2455589,1992999551,12,12.0,330,135.50,,12.0,12.0,135.50,330.0,,2019,11568,223
2455590,1992999551,27,27.0,800,311.59,,27.0,27.0,311.59,800.0,,2019,49,199
2455591,1992999569,17,18.5,472,461.76,,17.0,18.5,461.76,472.0,,2019,119,1491
