In [18]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [19]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [20]:
def big_data(data, fields):
    dfs = []
    for filename in os.listdir(data):
        f = os.path.join(data,filename)
        df = pd.read_csv(f, delimiter='$',skipinitialspace=True,usecols=fields)
        dfs.append(df)
    return pd.concat(dfs)

In [21]:
y = []
for filename in os.listdir('DEMO'):
    f = os.path.join('DEMO', filename)
    y.append(f)
print(y)

['DEMO\\DEMO16Q1.txt', 'DEMO\\DEMO16Q2.txt', 'DEMO\\DEMO16Q3.txt', 'DEMO\\DEMO16Q4.txt', 'DEMO\\DEMO17Q1.txt', 'DEMO\\DEMO17Q2.txt', 'DEMO\\DEMO17Q3.txt', 'DEMO\\DEMO17Q4.txt', 'DEMO\\DEMO18Q1.txt', 'DEMO\\DEMO18Q2.txt', 'DEMO\\DEMO18Q3.txt', 'DEMO\\DEMO18Q4.txt', 'DEMO\\DEMO19Q1.txt', 'DEMO\\DEMO19Q2.txt', 'DEMO\\DEMO19Q3.txt', 'DEMO\\DEMO19Q4.txt', 'DEMO\\DEMO20Q1.txt', 'DEMO\\DEMO20Q2.txt', 'DEMO\\DEMO20Q3.txt', 'DEMO\\DEMO20Q4.txt', 'DEMO\\DEMO21Q1.txt']


In [22]:
drug_fields = ['primaryid','caseid','drugname','prod_ai']
drugs = big_data('DRUG', drug_fields)
demo_fields = ["primaryid","caseid","age","sex","wt"]
demo = big_data('DEMO', demo_fields)
react_fields = ['primaryid','caseid','pt']
react = big_data('REAC', react_fields)

In [23]:
demo[demo['primaryid'] == 100049985].head()

Unnamed: 0,primaryid,caseid,age,sex,wt
3,100049985,10004998,2.0,F,


In [24]:
drugs[drugs['primaryid'] == 100049985].head()

Unnamed: 0,primaryid,caseid,drugname,prod_ai
5,100049985,10004998,AFINITOR,EVEROLIMUS
6,100049985,10004998,AFINITOR,EVEROLIMUS
7,100049985,10004998,KEPPRA,LEVETIRACETAM
8,100049985,10004998,SABRIL,VIGABATRIN
9,100049985,10004998,FELBAMATE.,FELBAMATE


In [25]:
react[react['primaryid'] == 100049985].head()

Unnamed: 0,primaryid,caseid,pt
6,100049985,10004998,Drug level decreased
7,100049985,10004998,Seizure
8,100049985,10004998,Weight increased


In [26]:
print(len(pd.unique(react['primaryid'])))
print(len(pd.unique(react['caseid'])))

8307468
7259365


In [27]:
drugs.head()
print(drugs.shape[0])

33480762


In [28]:
demo.head()
print(demo.shape[0])
demo.drop_duplicates()
print(demo.shape[0])

8307690
8307690


In [29]:
react.head()

Unnamed: 0,primaryid,caseid,pt
0,100036613,10003661,Depression
1,100036613,10003661,Drug effect decreased
2,100036613,10003661,Wheelchair user
3,100043112,10004311,Renal artery thrombosis
4,100045852,10004585,Cataract


In [30]:
df = pd.merge(demo, drugs.drop(columns=['caseid']), left_on='primaryid', right_on='primaryid', how='left')

In [31]:
df.shape[0]

33488110

In [32]:
df = pd.merge(df,react.drop(columns=['caseid']), left_on='primaryid', right_on='primaryid', how='inner')

In [33]:
df.shape[0]

161114107

In [34]:
df.head()

Unnamed: 0,primaryid,caseid,age,sex,wt,drugname,prod_ai,pt
0,100036613,10003661,,F,,AMPYRA,DALFAMPRIDINE,Depression
1,100036613,10003661,,F,,AMPYRA,DALFAMPRIDINE,Drug effect decreased
2,100036613,10003661,,F,,AMPYRA,DALFAMPRIDINE,Wheelchair user
3,100043112,10004311,73.0,M,,PRADAXA,DABIGATRAN ETEXILATE MESYLATE,Renal artery thrombosis
4,100043112,10004311,73.0,M,,PRADAXA,DABIGATRAN ETEXILATE MESYLATE,Renal artery thrombosis


In [35]:
df['prod_ai'].unique()

array(['DALFAMPRIDINE', 'DABIGATRAN ETEXILATE MESYLATE', 'LATANOPROST',
       ..., 'ALPHA-PYRROLIDINOHEPTAPHENONE',
       'CALCIUM CARBONATE\\VITAMIN D\\ZINC',
       'CAFFEINE\\FAMOTIDINE\\IBUPROFEN\\ONDANSETRON'], dtype=object)

In [36]:
df.isnull().sum(axis = 0)

primaryid            0
caseid               0
age           45151600
sex           16630225
wt           101525959
drugname          1940
prod_ai        3371670
pt                   0
dtype: int64

In [37]:
df = df.dropna()

In [38]:
df.isnull().sum(axis = 0)

primaryid    0
caseid       0
age          0
sex          0
wt           0
drugname     0
prod_ai      0
pt           0
dtype: int64

In [39]:
df.shape[0]

50120772

In [40]:
df = df.drop_duplicates(keep='first')
print(df.shape[0])
df.head()

36882955


Unnamed: 0,primaryid,caseid,age,sex,wt,drugname,prod_ai,pt
75,100051155,10005115,59.0,M,86.26,HUMIRA,ADALIMUMAB,Abdominal pain
76,100051155,10005115,59.0,M,86.26,HUMIRA,ADALIMUMAB,Adenocarcinoma of colon
77,100051155,10005115,59.0,M,86.26,HUMIRA,ADALIMUMAB,Gastrointestinal inflammation
78,100051155,10005115,59.0,M,86.26,HUMIRA,ADALIMUMAB,Intestinal mucosal hypertrophy
79,100051155,10005115,59.0,M,86.26,HUMIRA,ADALIMUMAB,Intestinal obstruction


In [41]:
print(len(list(df['pt'].value_counts(ascending=True))))

15671


In [None]:
pt_dict = dict(df['pt'].value_counts(ascending=False))
print(pt_dict)

In [23]:
ai_dict = dict(df['prod_ai'].value_counts(ascending=True))
# df['prod_ai'].value_counts(ascending=True)
filter_dict = dict((k,v) for k,v in ai_dict.items() if v >= 100)
print(len(ai_dict))
print(len(filter_dict))
filter_list = list(filter_dict.keys())
filter_df = df[df['prod_ai'].isin(filter_list)]
filter_df.prod_ai.nunique()
filter_df['prod_ai'].value_counts(ascending=True)

8711
3922


CHENOPODIUM ALBUM POLLEN                                                                                                              100
LEVOMEPROMAZINE HYDROCHLORIDE                                                                                                         100
CLOSTRIDIUM TETANI TOXOID ANTIGEN (FORMALDEHYDE INACTIVATED)\MENINGOCOCCAL POLYSACCHARIDE VACCINE, GROUPS A, C, Y,W135 COMBINED       100
TRIBASIC CALCIUM PHOSPHATE                                                                                                            100
ALGINIC ACID\ALUMINUM HYDROXIDE\MAGNESIUM TRISILICATE                                                                                 100
                                                                                                                                    ...  
CHOLECALCIFEROL                                                                                                                    395204
AMLODIPINE BESYLATE               

In [24]:
filter_df = filter_df[~filter_df['sex'].isin(['UNK','T','I','P'])]
filter_df.loc[filter_df['sex'] == 'F','sex'] = 0
filter_df.loc[filter_df['sex'] == 'M','sex'] = 1
filter_df['sex'].value_counts()

0    23749080
1    13022136
Name: sex, dtype: int64

In [25]:
filter_df = filter_df[(filter_df['age'] <= 120) & (filter_df['age'] >= 0)]
filter_df['age'].value_counts()

65.0     1051895
64.0      932504
68.0      922485
62.0      910589
59.0      905596
          ...   
113.0         18
109.0         14
117.0         12
116.0         11
105.0         11
Name: age, Length: 118, dtype: int64

In [26]:
for i in range(6):
    filter_df.loc[(filter_df['age'] >= i*20) & (filter_df['age'] <= (i+1)*20),'age'] = i
filter_df.age.value_counts()

3.0    15363469
2.0    12201634
1.0     3781266
4.0     2310270
0.0     1806512
5.0        3125
Name: age, dtype: int64

In [27]:
from sklearn.preprocessing import LabelEncoder

print(dict(filter_df.pt.value_counts())) ['Nasopharyngitis','Pyrexia','Vomiting','Pneumonia']

In [28]:
from sklearn.preprocessing import LabelBinarizer
import category_encoders as ce

In [29]:
lb_prod_ai = ce.BinaryEncoder(cols='prod_ai', return_df=True)
binary_data = lb_prod_ai.fit_transform(filter_df)
# lb_prod_ai = LabelEncoder()
# filter_df['prod_ai_n'] = lb_prod_ai.fit_transform(filter_df['prod_ai'])
# filter_df.head()

  elif pd.api.types.is_categorical(cols):


MemoryError: Unable to allocate 3.44 GiB for an array with shape (13, 35466276) and data type int64

In [None]:
filter_df = binary_data 
filter_df.head()

In [None]:
filter_df.shape[0]

filter_df.to_csv('binary.csv',sep='$', index=False)

adverse_effects = ['Neutropenia','Nasopharyngitis','Pyrexia','Vomiting','Pneumonia', 'Type 2 diabetes mellitus','Acute coronary syndrome', 'Nausea']
clf = []
for effect in adverse_effects:
    temp_df = filter_df.copy()
    temp_df.loc[(temp_df['pt'] != effect),'pt'] = 0
    temp_df.loc[(temp_df['pt'] == effect),'pt'] = 1
    print(effect)
    sample_amount = len(temp_df[temp_df['pt'] == 1])
    data = temp_df[temp_df['pt'] == 1]
    data = data.append(temp_df[temp_df['pt'] != 1].sample(sample_amount),ignore_index=True)
    print(data.pt.value_counts())
    features = data.drop(columns=['primaryid','caseid','drugname','pt'])
    target = data.drop(columns=['primaryid','caseid','drugname','age','sex','wt'])
    features = features.drop(columns=['prod_ai'])
    final_target = target['pt'].astype('int')
    print(final_target.value_counts())
    X_train, X_test, y_train, y_test = train_test_split(features, final_target, test_size = 0.3, random_state=0, shuffle= True)
    rfc = RandomForestClassifier(n_estimators = 250)
    rfc.fit(X_train,y_train)
    rfc_pred = rfc.predict(X_test)
    clf.append(rfc)
    print(rfc.feature_importances_)
    print(classification_report(y_test, rfc_pred))

for c in clf:
    prediction = c.predict_proba([[3.0, 1, 59.0, 2520]])
    print(prediction)

features.head()

final_target.head()

filter_df.loc[(filter_df['pt'] != 'Neutropenia'), 'pt'] = 'not'

filter_df.pt.value_counts([1.0, 0, 44.44, 2039])

data = filter_df[filter_df['pt'] == 'Neutropenia']
data = data.append(filter_df[filter_df['pt'] != 'Neutropenia'].sample(30000),ignore_index=True)
data.head()

data.pt.value_counts()

features = data.drop(columns=['primaryid','caseid','drugname','pt'])
target = data.drop(columns=['primaryid','caseid','drugname','age','sex','wt'])
target.head()

lb_prod_ai = LabelEncoder()
features['prod_ai_n'] = lb_prod_ai.fit_transform(features['prod_ai'])
features = features.drop(columns=['prod_ai'])
features.head()


lb_pt = LabelEncoder()
target['pt_n'] = lb_pt.fit_transform(target['pt'])
target.head()
target.pt_n.value_counts()
final_target = target['pt_n']
final_target.head()

print(list(features.prod_ai_n.unique()))

X_train, X_test, y_train, y_test = train_test_split(features, final_target, test_size = 0.3, random_state=0, shuffle= True)
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train,y_train)
dtc_pred = dtc.predict(X_test)
confusion_matrix(y_test, dtc_pred)


print(classification_report(y_test, dtc_pred))