In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict

### Import and inspect data

In [2]:
#import data
#index_col = 0 tells pandas to use the first column of each CSV as the row index
#rather than loading it as a regular data column
base_data = pd.read_csv('meps_base_data.csv', index_col = 0)
meds_data = pd.read_csv('meps_meds.csv', index_col = 0)

  mask |= (ar1 == a)


In [3]:
#summarise the columns, dtypes and non-null counts of the base data
#info() prints its own report and returns None, so it is called directly;
#wrapping it in print() would add a stray "None" line to the output
base_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61489 entries, 1 to 61489
Data columns (total 17 columns):
id                       61489 non-null int64
panel                    61489 non-null int64
pooledWeight             61489 non-null float64
age                      61489 non-null int64
sex                      61489 non-null object
race                     61489 non-null object
married                  61489 non-null object
highBPDiagnosed          61489 non-null object
diabetesDiagnosed        61489 non-null object
chdDiagnosed             61489 non-null object
miDiagnosed              61489 non-null object
anginaDiagnosed          61489 non-null object
strokeDiagnosed          61489 non-null object
emphysemaDiagnosed       61489 non-null object
asthmaDiagnosed          61489 non-null object
otherHDDiagnosed         61489 non-null object
heartFailureDiagnosed    61489 non-null object
dtypes: float64(1), int64(3), object(13)
memory usage: 8.4+ MB
None


In [4]:
#preview the first five rows of the base (patient-level) data
print(base_data.head(n=5))

         id  panel  pooledWeight  age     sex      race  \
1  10007101     15   3603.881236   28    Male     White   
2  10007102     15   2544.550424   25  Female     White   
3  10007103     15   4050.397468    4    Male     White   
4  10007104     15   3064.059720    3  Female     White   
5  10008101     15   3635.552466   51    Male  Multiple   

                   married highBPDiagnosed diabetesDiagnosed  chdDiagnosed  \
1                  MARRIED             Yes                No            No   
2                  MARRIED              No                No            No   
3  UNDER 16 - INAPPLICABLE    Inapplicable      Inapplicable  Inapplicable   
4  UNDER 16 - INAPPLICABLE    Inapplicable      Inapplicable  Inapplicable   
5                  MARRIED              No                No            No   

    miDiagnosed anginaDiagnosed strokeDiagnosed emphysemaDiagnosed  \
1            No              No              No                 No   
2            No              No     

In [5]:
#summarise the columns, dtypes and non-null counts of the medications data
#info() prints its own report and returns None, so it is called directly;
#wrapping it in print() would add a stray "None" line to the output
meds_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1148347 entries, 1 to 3000992
Data columns (total 7 columns):
id              1148347 non-null int64
rxStartMonth    1148347 non-null int64
rxStartYear     1148347 non-null int64
rxName          1148347 non-null object
rxNDC           1148347 non-null int64
rxQuantity      1148347 non-null float64
rxForm          1148347 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 70.1+ MB
None


In [6]:
#preview the first five rows of the medications (prescription-level) data
print(meds_data.head(n=5))

         id  rxStartMonth  rxStartYear                          rxName  \
1  10007104             3         2011                     AMOXICILLIN   
2  10007104             3         2011              OTIC EDGE SOLUTION   
3  10008102             3         2011  NASAL DECONGESTANT 0.05% SPRAY   
4  10008102             3         2011  NASAL DECONGESTANT 0.05% SPRAY   
5  10008102             9         2011                    DIPHENHYDRAM   

         rxNDC  rxQuantity rxForm  
1    143988775        75.0   SUSR  
2  68032032814        14.0    SOL  
3  63981056903        15.0    SPR  
4  63981056903        15.0    SPR  
5    603333921        30.0    CAP  


#### I want to do a quick check to make sure that there are no duplicate records for a patient in the base data

In [7]:
#check to make sure that there is only one record for each id in base_data
#count the rows per id; any id with n > 1 would be a duplicated patient record
id_count = base_data.groupby('id').size().reset_index(name='n')

#an empty frame (0 entries) here means no id is repeated
#info() prints its own report and returns None, so it is called directly;
#wrapping it in print() would add a stray "None" line to the output
id_count[id_count.n > 1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 2 columns):
id    0 non-null int64
n     0 non-null int64
dtypes: int64(2)
memory usage: 0.0 bytes
None


#### I now want to create a tidy dataframe that has a field for patient id, disease, and diagnosis. This will help with future analyses.

In [8]:
#get one data frame from base data that has only the id and the disease diagnoses
#the diagnosis columns are selected by name (they all end in 'Diagnosed') rather than
#by position, so the cell keeps working if columns are added or reordered upstream
diagnosis_cols = [col for col in base_data.columns if col.endswith('Diagnosed')]
base_data_id = base_data[['id']]
base_data_disease = base_data[diagnosis_cols]
base_data_trunc = base_data[['id'] + diagnosis_cols]

#melt that dataframe so we have one row per (id, disease) with its diagnosis
base_data_melt = pd.melt(base_data_trunc, id_vars='id',
                         var_name='disease', value_name='diagnosis')

#info() prints its own report and returns None, so it is called directly;
#wrapping it in print() would add a stray "None" line to the output
base_data_melt.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614890 entries, 0 to 614889
Data columns (total 3 columns):
id           614890 non-null int64
disease      614890 non-null object
diagnosis    614890 non-null object
dtypes: int64(1), object(2)
memory usage: 14.1+ MB
None


In [9]:
#list every distinct diagnosis answer together with how many rows carry it
print(base_data_melt['diagnosis'].value_counts())

No                 431619
Inapplicable       147195
Yes                 35735
DK                    237
not ascertained        57
Refused                47
Name: diagnosis, dtype: int64


In [10]:
#count the positive ('Yes') diagnoses for each patient
#patients without any positive diagnosis have no row in the result
patient_dis_count = (
    base_data_melt.loc[base_data_melt['diagnosis'] == 'Yes']
    .groupby('id')
    .size()
    .reset_index(name='n')
)

#share of these patients that have exactly n positive diagnoses
print(patient_dis_count['n'].value_counts(normalize=True))

1     0.614856
2     0.210417
3     0.085260
4     0.042847
5     0.026453
6     0.011413
7     0.006287
8     0.001741
9     0.000629
10    0.000097
Name: n, dtype: float64


#### Finally, I want to create two subsets of my tidy dataframe. One with patients who were only diagnosed with one disease, and one with patients who were diagnosed with two or fewer diseases.

In [11]:
#keep only the patients with exactly one positive diagnosis
single_dis = patient_dis_count.loc[patient_dis_count['n'] == 1]

#inner join onto the melted data to pull every diagnosis row for those patients
single_dis_df = pd.merge(base_data_melt, single_dis, how='inner', on='id')

In [12]:
#keep only the patients with one or two positive diagnoses
two_dis = patient_dis_count.loc[patient_dis_count['n'] <= 2]

#inner join onto the melted data to pull every diagnosis row for those patients
two_dis_df = pd.merge(base_data_melt, two_dis, how='inner', on='id')

#### I want to check to see if any patients were prescribed a medication more than once

In [13]:
#group medications data by id and rxName to get one row per patient / medication pair
#n is the number of records in meds_data for that pair
meds_pat_count = meds_data.groupby(['id', 'rxName']).size().reset_index(name='n')

#summarise the pairs that were recorded more than once
#info() prints its own report and returns None, so it is called directly;
#wrapping it in print() would add a stray "None" line to the output
meds_pat_count[meds_pat_count.n > 1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146310 entries, 0 to 242638
Data columns (total 3 columns):
id        146310 non-null int64
rxName    146310 non-null object
n         146310 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.5+ MB
None


In [14]:
#look at the first five patient / medication pairs and their record counts
print(meds_pat_count.head(n=5))

         id        rxName  n
0  10007101      ATENOLOL  4
1  10007101  AZITHROMYCIN  1
2  10007102  CARISOPRODOL  1
3  10007102  HYDROCO/APAP  1
4  10007102      TREXIMET  2


#### Since it looks like we can have multiple records for patients with the same medication, I want to inspect one of these scenarios to ensure that they aren't duplicated

In [15]:
#pull every ATENOLOL record for one patient to see what repeated entries look like
print(meds_data.loc[(meds_data['id'] == 10007101) & (meds_data['rxName'] == 'ATENOLOL')])

               id  rxStartMonth  rxStartYear    rxName        rxNDC  \
152314   10007101            -1         2005  ATENOLOL  68382002210   
2100000  10007101            -1         2005  ATENOLOL  68382002210   
310115   10007101            -1         2005  ATENOLOL  68382002210   
510000   10007101            12         2010  ATENOLOL  51079068463   

         rxQuantity rxForm  
152314         30.0   TABS  
2100000        30.0   TABS  
310115         30.0   TABS  
510000         90.0    TAB  


#### The negative codes in the month column seem strange to me. My first thought is that they are indicative of erroneous records. I want to examine the month column values further to check, though.

In [16]:
#list every distinct rxStartMonth value together with how many records carry it
print(meds_data['rxStartMonth'].value_counts())

-1     776549
-8      46862
 1      41646
 2      33345
 3      30912
 6      26877
 7      24598
 8      23968
 4      23198
 10     22712
 12     21881
 9      21752
 5      21666
 11     21249
-9      10922
-7        210
Name: rxStartMonth, dtype: int64


#### It looks like there are a number of negative codes mixed in with the different months. This tells me that there is likely meaning for each negative code. Since I don't have a data dictionary to help me interpret the negatives, I am going to ignore this column and treat each record as if it is accurate. I am also going to assume that duplicate medication entries for a patient are accurate.

#### I just want to take a look at the number of patients diagnosed with each disease in the datasets that I created above

In [17]:
#number of patients with a positive ('Yes') diagnosis for each disease, over all patients
patient_counts = base_data_melt.loc[base_data_melt['diagnosis'] == 'Yes', 'disease'].value_counts()
print(patient_counts)

highBPDiagnosed          13637
asthmaDiagnosed           6001
diabetesDiagnosed         4340
otherHDDiagnosed          3868
chdDiagnosed              2328
strokeDiagnosed           1556
miDiagnosed               1527
anginaDiagnosed           1138
emphysemaDiagnosed         907
heartFailureDiagnosed      433
Name: disease, dtype: int64


In [18]:
#number of patients with a positive ('Yes') diagnosis for each disease,
#restricted to the patients diagnosed with exactly ONE disease
single_pat_counts = single_dis_df.loc[single_dis_df['diagnosis'] == 'Yes', 'disease'].value_counts()
print(single_pat_counts)

highBPDiagnosed          6676
asthmaDiagnosed          3942
diabetesDiagnosed         833
otherHDDiagnosed          817
strokeDiagnosed           164
emphysemaDiagnosed        100
chdDiagnosed               80
anginaDiagnosed            42
miDiagnosed                41
heartFailureDiagnosed      19
Name: disease, dtype: int64


In [19]:
#number of patients with a positive ('Yes') diagnosis for each disease,
#restricted to the patients diagnosed with ONE or TWO diseases
two_pat_counts = two_dis_df.loc[two_dis_df['diagnosis'] == 'Yes', 'disease'].value_counts()
print(two_pat_counts)

highBPDiagnosed          10398
asthmaDiagnosed           4918
diabetesDiagnosed         2576
otherHDDiagnosed          1779
strokeDiagnosed            582
chdDiagnosed               388
emphysemaDiagnosed         338
miDiagnosed                207
anginaDiagnosed            181
heartFailureDiagnosed       49
Name: disease, dtype: int64


### Top medications for each disease

#### In order to get the medication prescribed the most to patients diagnosed with a disease, I first need to write a function to compile the diseases, medications, and prescriptions.

In [20]:
def med_counts(med_df, disease_df, disease):
    """
    takes a dataframe of medications, a dataframe of disease diagnoses, and a disease
    identifies medications prescribed to patients diagnosed with the target disease

    med_df = dataframe with (at least) the columns 'id' and 'rxName'
    disease_df = dataframe with (at least) the columns 'id', 'disease' and 'diagnosis'
    disease = value of the 'disease' column to select

    returns an OrderedDict with keys = medications and values = number of joined rows
    for that medication (as plain ints), ordered by that count, descending
    when med_df has one row per (id, rxName) pair, the count is the number of
    diagnosed patients prescribed the medication
    returns an empty OrderedDict when no diagnosed patient has a medication record
    """
    #get dataframe of patients diagnosed with the target disease
    diagnosed = disease_df[(disease_df.disease == disease) & (disease_df.diagnosis == 'Yes')]

    #join medications for those patients
    disease_meds = diagnosed.merge(med_df, how='inner', on='id')

    #count the joined rows for each medication (value_counts sorts descending)
    rx_counts = disease_meds.rxName.value_counts()

    #iterate over (label, count) pairs instead of indexing the Series with integers:
    #rx_counts is labelled by medication name, and integer keys used as positions
    #on a label-indexed Series are deprecated in recent pandas versions
    return OrderedDict((med, int(count)) for med, count in rx_counts.items())

In [21]:
#build the medication counts for each disease
#all patients are used here, however many diseases they were diagnosed with

disease_dict = {}

#one pass per disease: store the full counts, then show the leaders
for disease in base_data_melt.disease.unique():
    disease_dict[disease] = med_counts(meds_pat_count, base_data_melt, disease)
    #the four medications prescribed to the most patients with this disease
    top_disease = list(disease_dict[disease].items())[:4]
    print("{}: {}".format(disease, top_disease))

highBPDiagnosed: [('LISINOPRIL', 2996), ('SIMVASTATIN', 2184), ('HYDROCHLOROTHIAZIDE', 1464), ('LIPITOR', 1351)]
diabetesDiagnosed: [('METFORMIN', 1543), ('LISINOPRIL', 1214), ('SIMVASTATIN', 1019), ('LIPITOR', 619)]
chdDiagnosed: [('SIMVASTATIN', 606), ('LISINOPRIL', 601), ('FUROSEMIDE', 537), ('PLAVIX', 457)]
miDiagnosed: [('LISINOPRIL', 439), ('SIMVASTATIN', 408), ('FUROSEMIDE', 360), ('PLAVIX', 349)]
anginaDiagnosed: [('LISINOPRIL', 269), ('SIMVASTATIN', 263), ('FUROSEMIDE', 245), ('PLAVIX', 241)]
strokeDiagnosed: [('SIMVASTATIN', 376), ('LISINOPRIL', 372), ('FUROSEMIDE', 290), ('PLAVIX', 269)]
emphysemaDiagnosed: [('LISINOPRIL', 183), ('ALBUTEROL', 177), ('SIMVASTATIN', 166), ('FUROSEMIDE', 155)]
asthmaDiagnosed: [('ALBUTEROL', 1174), ('AMOXICILLIN', 795), ('AZITHROMYCIN', 789), ('PROAIR HFA', 717)]
otherHDDiagnosed: [('LISINOPRIL', 738), ('SIMVASTATIN', 665), ('FUROSEMIDE', 615), ('AZITHROMYCIN', 465)]
heartFailureDiagnosed: [('FUROSEMIDE', 235), ('LISINOPRIL', 152), ('CARVEDILOL

In [22]:
#for each disease, show only the single most prescribed medication (all patients)
for disease in base_data_melt.disease.unique():
    print(disease)
    #iterating a dict yields its keys, so the first element is the top medication
    top_disease = list(disease_dict[disease])[0]
    #end="\n\n" leaves the same blank line after each entry as a separate print()
    print(top_disease, end="\n\n")

highBPDiagnosed
LISINOPRIL

diabetesDiagnosed
METFORMIN

chdDiagnosed
SIMVASTATIN

miDiagnosed
LISINOPRIL

anginaDiagnosed
LISINOPRIL

strokeDiagnosed
SIMVASTATIN

emphysemaDiagnosed
LISINOPRIL

asthmaDiagnosed
ALBUTEROL

otherHDDiagnosed
LISINOPRIL

heartFailureDiagnosed
FUROSEMIDE



#### The lists above are interesting but they aren't very interpretable. As we can see, Lisinopril was the most prescribed medication for patients with 5 of the 10 diseases. This could be interpreted a few ways. One is that Lisinopril is a medication which has utility for a number of diseases and is therefore prescribed frequently for a myriad of patients. Another interpretation is that Lisinopril is prescribed frequently for one disease that is often diagnosed alongside other diseases. The likely scenario is some combination of both of the above possibilities.

#### Since I don't have the ability to ascertain WHY a medication was prescribed, it is impossible to get meaning out of a dataset with patients who were diagnosed with multiple diseases


#### So, I want to do the same exercise, but only using data for patients who were diagnosed with ONE disease

In [23]:
#build the medication counts for each disease,
#restricted to the patients diagnosed with exactly ONE disease
disease_dict_single = {}

for disease in base_data_melt.disease.unique():
    disease_dict_single[disease] = med_counts(meds_pat_count, single_dis_df, disease)
    #the four medications prescribed to the most patients with this disease
    top_disease = list(disease_dict_single[disease].items())[:4]
    print("{}: {}".format(disease, top_disease))

highBPDiagnosed: [('LISINOPRIL', 1103), ('SIMVASTATIN', 670), ('HYDROCHLOROTHIAZIDE', 644), ('AZITHROMYCIN', 608)]
diabetesDiagnosed: [('METFORMIN', 309), ('LISINOPRIL', 144), ('SIMVASTATIN', 125), ('METFORMIN HCL', 118)]
chdDiagnosed: [('SIMVASTATIN', 12), ('IBUPROFEN', 7), ('AMOXICILLIN', 6), ('FUROSEMIDE', 6)]
miDiagnosed: [('HYDROCO/APAP', 7), ('SIMVASTATIN', 6), ('LIPITOR', 5), ('LISINOPRIL', 5)]
anginaDiagnosed: [('AZITHROMYCIN', 7), ('AMOXICILLIN', 4), ('IBUPROFEN', 3), ('LEVOTHYROXINE SODIUM', 3)]
strokeDiagnosed: [('SIMVASTATIN', 23), ('AZITHROMYCIN', 20), ('AMOXICILLIN', 17), ('IBUPROFEN', 12)]
emphysemaDiagnosed: [('SPIRIVA', 12), ('PREDNISONE', 11), ('AZITHROMYCIN', 10), ('AMOXICILLIN', 9)]
asthmaDiagnosed: [('ALBUTEROL', 809), ('AMOXICILLIN', 591), ('AZITHROMYCIN', 476), ('SINGULAIR', 460)]
otherHDDiagnosed: [('AMOXICILLIN', 111), ('AZITHROMYCIN', 107), ('IBUPROFEN', 60), ('SIMVASTATIN', 49)]
heartFailureDiagnosed: [('SYNTHROID', 2), ('FUROSEMIDE', 2), ('AMLODIPINE', 2), (

In [24]:
#for each disease, show only the single most prescribed medication (ONE-disease patients)
for disease in base_data_melt.disease.unique():
    print(disease)
    #iterating a dict yields its keys, so the first element is the top medication
    top_disease = list(disease_dict_single[disease])[0]
    #end="\n\n" leaves the same blank line after each entry as a separate print()
    print(top_disease, end="\n\n")

highBPDiagnosed
LISINOPRIL

diabetesDiagnosed
METFORMIN

chdDiagnosed
SIMVASTATIN

miDiagnosed
HYDROCO/APAP

anginaDiagnosed
AZITHROMYCIN

strokeDiagnosed
SIMVASTATIN

emphysemaDiagnosed
SPIRIVA

asthmaDiagnosed
ALBUTEROL

otherHDDiagnosed
AMOXICILLIN

heartFailureDiagnosed
SYNTHROID



#### Looking at this list, we now see a unique medication for almost all of the diseases. At face value this would seem to answer our next question (what medications are the most indicative of each disease?) but when looking closer it seems that many diseases share top medications. In the next section I will try to solve this issue.

In [26]:
#build the medication counts for patients with two or fewer diseases
#this dictionary is used in the next section
disease_dict_two = {}

for disease in base_data_melt.disease.unique():
    disease_dict_two[disease] = med_counts(meds_pat_count, two_dis_df, disease)
    #the four medications prescribed to the most patients with this disease
    top_disease = list(disease_dict_two[disease].items())[:4]
    print("{}: {}".format(disease, top_disease))

highBPDiagnosed: [('LISINOPRIL', 2035), ('SIMVASTATIN', 1366), ('HYDROCHLOROTHIAZIDE', 1124), ('AZITHROMYCIN', 976)]
diabetesDiagnosed: [('METFORMIN', 967), ('LISINOPRIL', 655), ('SIMVASTATIN', 522), ('METFORMIN HCL', 381)]
chdDiagnosed: [('SIMVASTATIN', 82), ('LISINOPRIL', 58), ('LIPITOR', 48), ('AZITHROMYCIN', 41)]
miDiagnosed: [('SIMVASTATIN', 39), ('LISINOPRIL', 38), ('LIPITOR', 28), ('AZITHROMYCIN', 25)]
anginaDiagnosed: [('OMEPRAZOLE', 25), ('SIMVASTATIN', 24), ('AZITHROMYCIN', 24), ('ATENOLOL', 18)]
strokeDiagnosed: [('SIMVASTATIN', 117), ('LISINOPRIL', 106), ('HYDROCHLOROTHIAZIDE', 59), ('AZITHROMYCIN', 56)]
emphysemaDiagnosed: [('AZITHROMYCIN', 50), ('PREDNISONE', 50), ('ALBUTEROL', 46), ('SPIRIVA', 41)]
asthmaDiagnosed: [('ALBUTEROL', 962), ('AMOXICILLIN', 701), ('AZITHROMYCIN', 631), ('SINGULAIR', 567)]
otherHDDiagnosed: [('AZITHROMYCIN', 220), ('AMOXICILLIN', 207), ('SIMVASTATIN', 175), ('LISINOPRIL', 160)]
heartFailureDiagnosed: [('FUROSEMIDE', 15), ('LISINOPRIL', 9), ('CA

### Meds indicative of each disease

In [27]:
def unique_meds(disease_dict, disease, unique_dis, n):
    """
    finds the medications that are in the top n for the target disease
    but not in the top n of any other disease

    disease_dict = a nested dict with top-level keys = diseases and second-level keys = medications
                   values of second level dict are number of patients prescribed that medication
                   "top n" means the first n keys of each second-level dict, so those dicts
                   are assumed to be ordered from most to least prescribed
    disease = target disease
    unique_dis = iterable of unique diseases in dictionary (the target itself is skipped)
    n = number of medications to pull from dictionary

    returns a list of (medication, count) tuples for the target disease,
    in the order the medications appear in the target's own dict
    """
    def leading_meds(dis_name):
        #first n medication names for one disease, in the dict's own order
        return list(disease_dict[dis_name].keys())[:n]

    #pool the top n meds of every disease other than the target
    taken = set()
    for other in unique_dis:
        if other != disease:
            taken.update(leading_meds(other))

    #keep the target's top meds that no other disease shares, paired with their counts
    target_counts = disease_dict[disease]
    return [(med, target_counts[med]) for med in leading_meds(disease) if med not in taken]

In [28]:
#inspect the unique meds prescribed for each disease,
#using only the patients diagnosed with one disease

#iterating a dict yields its keys, i.e. the diseases
unique_diseases_one = list(disease_dict_single)
for dis in unique_diseases_one:
    #disease name and the number of patients diagnosed with it
    print(dis, single_pat_counts[dis])
    #meds found only in this disease's top 10, with the number of patients prescribed each
    print(unique_meds(disease_dict_single, dis, unique_diseases_one, 10))

highBPDiagnosed 6676
[('HYDROCHLOROT', 445), ('AMLODIPINE BESYLATE', 398)]
diabetesDiagnosed 833
[('METFORMIN', 309), ('METFORMIN HCL', 118), ('ACTOS', 110), ('ONETOUCH', 108), ('METFORMIN HYDROCHLORIDE', 95), ('GLIPIZIDE', 93), ('GLYBURIDE', 81)]
chdDiagnosed 80
[('METOPROL TAR', 5)]
miDiagnosed 41
[('PLAVIX', 5), ('METOPROLOL SUCCINATE ER', 4), ('PANTOPRAZOLE', 4), ('METOPROLOL TARTRATE', 4)]
anginaDiagnosed 42
[('LEVOTHYROXINE SODIUM', 3), ('PRISTIQ', 2), ('ACETAMINOPHEN-HYDROCODONE BITARTRATE', 2), ('DIAZEPAM', 2)]
strokeDiagnosed 164
[('GABAPENTIN', 8)]
emphysemaDiagnosed 100
[('SPIRIVA', 12), ('CLONAZEPAM', 8), ('ALPRAZOLAM', 8), ('ADVAIR DISKU', 8)]
asthmaDiagnosed 3942
[('SINGULAIR', 460), ('PROAIR HFA', 405), ('VENTOLIN HFA', 213), ('PROVENTIL', 171), ('ADVAIR DISKUS', 157)]
otherHDDiagnosed 817
[('NAPROXEN', 36), ('OMEPRAZOLE', 33)]
heartFailureDiagnosed 19
[('CARVEDILOL', 2), ('TRAMADOL HCL', 2), ('CLOBETASOL PROPIONATE', 1), ('FOLBIC', 1), ('CEROVITE SILVER', 1)]


In [29]:
#repeat of previous cell, except only print the name of the most prescribed unique med
#and the proportion of the disease's patients who were prescribed it

unique_diseases_one = list(disease_dict_single.keys())
for dis in unique_diseases_one:
    print(dis, 'Number of Patients:', single_pat_counts[dis])
    #compute the unique meds once per disease and reuse the result
    meds = unique_meds(disease_dict_single, dis, unique_diseases_one, 10)
    if meds:
        top_med, num_patients = meds[0]
        prop_patients = round(num_patients / single_pat_counts[dis], 4)
        print(top_med +": "+ str(prop_patients))
    else:
        #unique_meds returns an empty list when every top med is shared with another disease
        print('no unique medication found')
    print()
    print()

highBPDiagnosed Number of Patients: 6676
HYDROCHLOROT: 0.0667

diabetesDiagnosed Number of Patients: 833
METFORMIN: 0.3709

chdDiagnosed Number of Patients: 80
METOPROL TAR: 0.0625

miDiagnosed Number of Patients: 41
PLAVIX: 0.122

anginaDiagnosed Number of Patients: 42
LEVOTHYROXINE SODIUM: 0.0714

strokeDiagnosed Number of Patients: 164
GABAPENTIN: 0.0488

emphysemaDiagnosed Number of Patients: 100
SPIRIVA: 0.12

asthmaDiagnosed Number of Patients: 3942
SINGULAIR: 0.1167

otherHDDiagnosed Number of Patients: 817
NAPROXEN: 0.0441

heartFailureDiagnosed Number of Patients: 19
CARVEDILOL: 0.1053



#### I wanted to perform the same analysis for the patients who were diagnosed with one or two diseases and for all patients in the dataset. What I found is that in order to find a unique medication for each disease, I had to choose many more medications to compare. The result was that, generally, a smaller proportion of patients diagnosed with a particular disease were prescribed the medication that was indicated by the analysis.

#### If I used all of the data in the dataset, I could not find a unique medication for every disease, even after looking at the top 50 medications for each disease.

In [30]:
#print unique meds and counts for patients diagnosed with ONE or TWO diseases

#iterating a dict yields its keys, i.e. the diseases
unique_diseases_two = list(disease_dict_two)
for dis in unique_diseases_two:
    #disease name and the number of patients diagnosed with it
    print(dis, two_pat_counts[dis])
    #meds found only in this disease's top 29, with the number of patients prescribed each
    print(unique_meds(disease_dict_two, dis, unique_diseases_two, 29))

highBPDiagnosed 10398
[('DIOVAN', 442), ('LISINOP/HCTZ', 321)]
diabetesDiagnosed 2576
[('METFORMIN HCL', 381), ('ONETOUCH', 350), ('METFORMIN HYDROCHLORIDE', 329), ('ACTOS', 323), ('GLIPIZIDE', 310), ('GLYBURIDE', 273), ('LANTUS', 220), ('INSULIN SYRG', 207), ('HUMULIN N', 200), ('GLIMEPIRIDE', 199), ('NOVOLIN N', 171), ('JANUVIA', 139), ('TRUETRACK SMART SYSTEM (MONITOR)', 138), ('BAYER CONTOR', 136)]
chdDiagnosed 388
[('PRAVASTATIN', 15)]
miDiagnosed 207
[('RANITIDINE', 8)]
anginaDiagnosed 181
[('PREVACID', 11), ('LYRICA', 10), ('TRAMADOL HYDROCHLORIDE', 10)]
strokeDiagnosed 582
[('LEXAPRO', 25), ('SIMVASTATIN (FILM-COATED)', 24), ('AGGRENOX', 24)]
emphysemaDiagnosed 338
[('SPIRIVA', 41), ('COMBIVENT', 25), ('SPIRIVA (W/ HANDIHALER)', 24), ('CLONAZEPAM', 18), ('LORAZEPAM', 17), ('TRAZODONE', 17)]
asthmaDiagnosed 4918
[('SINGULAIR', 567), ('VENTOLIN HFA', 267), ('PROVENTIL', 216), ('SINGULAIR (UNIT OF USE)', 168), ('ALBUTEROL SULFATE', 149), ('FLOVENT HFA', 148), ('NASONEX', 141), ('L

In [31]:
#print the most prescribed unique med for each disease and the proportion of the
#disease's patients who were prescribed it (patients with ONE or TWO diseases)
unique_diseases_two = list(disease_dict_two.keys())
for dis in unique_diseases_two:
    print(dis, 'Number of Patients:', two_pat_counts[dis])

    meds = unique_meds(disease_dict_two, dis, unique_diseases_two, 29)
    if meds:
        top_med, num_patients = meds[0]
        prop_patients = round(num_patients / two_pat_counts[dis], 4)
        print(top_med +": "+ str(prop_patients))
    else:
        #unique_meds returns an empty list when every top med is shared with another disease
        print('no unique medication found')
    print()

highBPDiagnosed Number of Patients: 10398
DIOVAN: 0.0425

diabetesDiagnosed Number of Patients: 2576
METFORMIN HCL: 0.1479

chdDiagnosed Number of Patients: 388
PRAVASTATIN: 0.0387

miDiagnosed Number of Patients: 207
RANITIDINE: 0.0386

anginaDiagnosed Number of Patients: 181
PREVACID: 0.0608

strokeDiagnosed Number of Patients: 582
LEXAPRO: 0.043

emphysemaDiagnosed Number of Patients: 338
SPIRIVA: 0.1213

asthmaDiagnosed Number of Patients: 4918
SINGULAIR: 0.1153

otherHDDiagnosed Number of Patients: 1779
CELEBREX: 0.0309

heartFailureDiagnosed Number of Patients: 49
CARVEDILOL: 0.1429



In [32]:
#print unique meds using all patients in the dataset

#iterating a dict yields its keys, i.e. the diseases
unique_diseases = list(disease_dict)
for dis in unique_diseases:
    #disease name and the number of patients diagnosed with it
    print(dis, patient_counts[dis])
    #meds found only in this disease's top 50, with the number of patients prescribed each
    print(unique_meds(disease_dict, dis, unique_diseases, 50))

highBPDiagnosed 13637
[('LISINOP/HCTZ', 395), ('DIOVAN HCT', 371), ('MELOXICAM', 370), ('NORVASC', 359)]
diabetesDiagnosed 4340
[('GLIMEPIRIDE', 354), ('BAYER CONTOR', 281), ('JANUVIA', 234), ('FREESTYLE', 190), ('ACCU-CHEK', 186)]
chdDiagnosed 2328
[('ALLOPURINOL', 113), ('TRICOR', 108)]
miDiagnosed 1527
[('RANITIDINE', 76)]
anginaDiagnosed 1138
[('ISOSORB MONO', 60)]
strokeDiagnosed 1556
[('SIMVASTATIN (FILM-COATED)', 65)]
emphysemaDiagnosed 907
[('SPIRIVA', 141), ('COMBIVENT', 84), ('SPIRIVA (W/ HANDIHALER)', 53), ('LEVAQUIN', 49), ('CLONAZEPAM', 49), ('CELEBREX', 44)]
asthmaDiagnosed 6001
[('SINGULAIR (UNIT OF USE)', 218), ('NASONEX', 181), ('LORATADINE', 179), ('FLOVENT HFA', 177), ('PROVENTIL HFA', 158), ('CEPHALEXIN', 148), ('VICODIN', 141), ('ZITHROMAX', 140), ('FLUTICASONE PROPIONATE', 133), ('FLUTICASONE', 125), ('CYMBALTA', 122), ('ADVAIR', 121), ('CYCLOBENZAPR', 120)]
otherHDDiagnosed 3868
[]
heartFailureDiagnosed 433
[('LASIX', 38), ('SPIRONOLACTONE', 36), ('SPIRONOLACT', 