# 1 - Packages

Imports the Python packages required for deep learning:
1.  numpy - number processing
2.  tensorflow - deep learning framework
3.  matplotlib.pyplot - python plotting library
4.  pandas - dataframes
5.  sklearn - machine learning
6.  cPickle - save models



In [6]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import sys
import os
import datetime

# Raise TensorFlow's C++ log threshold so INFO and WARNING messages are hidden.
# NOTE(review): this runs after `import tensorflow`; the variable is usually
# set before the import -- confirm it takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

#from sklearn.metrics import roc_curve, auc, average_precision_score, precision_recall_curve
#import _pickle as cPickle

# Notebook magic: draw matplotlib figures inline, below the cell that creates them.
%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Notebook magic: reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Seed NumPy's global random generator.
np.random.seed(1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load and Format Data

In [7]:
# Directory that holds the per-iteration feature matrices (CSV files).
data_dir = '/home/ec2-user/cs230/scripts/matrix/'

# Use the first 4 iteration files.
#file_names = [data_dir+'test_iteration_%s.txt'% str(i) for i in range(0,4) ]
file_names = [data_dir + 'Feature_Iteration_%s_baseline.csv' % i for i in range(4)]

# One dataframe per file; the first row of each file is the header.
input_data_list = [pd.read_csv(fn, sep=',', header=0) for fn in file_names]

# Stack the per-file frames into a single dataframe.
# pd.concat replaces DataFrame.append (removed in pandas 2.0). ignore_index=True
# gives the result one unique 0..N-1 row index instead of repeating each file's
# own 0..n-1 labels, so label-based and position-based row access agree.
# NOTE(review): the per-file frames stay referenced by input_data_list, so peak
# memory is roughly twice the data size; drop that list if memory is tight.
all_data = pd.concat(input_data_list, ignore_index=True)

    

MemoryError: 

In [None]:
#Get drug columns
def _is_drug_column(name):
    '''True when the column name contains 'post.1d' together with 'MED' or 'RX'.'''
    return ('MED' in name or 'RX' in name) and 'post.1d' in name


# First pass: drug columns, skipping the copies whose name ends in ".1"
feature_names = list(all_data)
drugs = [feat for feat in feature_names
         if _is_drug_column(feat) and not feat.endswith('.1')]

#Check if columns with ".1" at the end are identical to drug columns
for d in drugs:
    difference = all_data[d] - all_data[d + '.1']
    if np.linalg.norm(difference) != 0:
        print("MISMATCH: %s" % d)

#Drop duplicate columns
to_drop_names = [d + '.1' for d in drugs]
all_data_uniq = all_data.drop(to_drop_names, axis=1)

#Get final drug columns (second pass over the de-duplicated frame)
feature_names = list(all_data_uniq)
drugs = [feat for feat in feature_names if _is_drug_column(feat)]

print(drugs)
print(len(drugs))

In [None]:
def summarize_drugs(df,drug_names):
    '''
    Sums every requested drug column of the dataframe.
    (The sum is the number of occurrences if the columns are 0/1 indicators --
    presumably the case here; confirm against the data.)
    Parameters:
        @df: pandas dataframe
        @drug_names: list of column names to summarize
    Returns:
        summary: python dict, drug_name:column sum
    '''
    return {name: df[name].sum() for name in drug_names}

# Per-drug column totals for the de-duplicated dataframe.
drug_summary = summarize_drugs(all_data_uniq, drugs)

# dict.items() exists on Python 2 and 3; dict.iteritems() is Python 2 only.
for k, v in drug_summary.items():
    print("%s\t%d" % (k, v))

print("TOTAL: %d" % sum(drug_summary.values()))

In [None]:
#Checking for missing data, drop unwanted columns
def clean_data(df):
    '''
    Reports columns with missing values and drops unwanted columns.

    Prints one "<column>\t<count>" line for every column that contains nulls,
    then returns a new dataframe without:
      * every column "<name>.1" whose base column "<name>" is also present,
      * the 'Unnamed: 0' column, when present,
      * every column whose name ends in 'post', 'post.1' or 'postTimeDays.1'.
    The input dataframe is not modified.

    Parameters:
        @df: pandas dataframe
    Returns:
        @df_clean: pandas dataframe "cleaned"
    '''
    # Null count per column, reported in column order.
    null_counts = df.isnull().sum()
    for name, count in zip(null_counts.index, null_counts):
        if count != 0:
            print("%s\t%d" % (name, count))

    columns = list(df)
    # A set makes each ".1" membership test O(1) instead of rescanning the columns.
    column_set = set(columns)
    to_drop = [name + '.1' for name in columns if name + '.1' in column_set]
    df_clean = df.drop(to_drop, axis=1)

    # 'Unnamed: 0' is only dropped when it exists, so frames without that
    # column no longer raise a KeyError.
    unwanted_suffixes = ('post', 'post.1', 'postTimeDays.1')
    more_drop = [feat for feat in df_clean
                 if feat == 'Unnamed: 0' or feat.endswith(unwanted_suffixes)]

    return df_clean.drop(more_drop, axis=1)

# Clean the frame that already had its duplicated drug columns removed.
df_clean = clean_data(all_data_uniq)



In [None]:

def train_validate_test_split(df, train_percent=.9, validate_percent=.05, seed=None):
    '''
    Splits data into train/dev/test splits after shuffling the rows.

    Parameters:
        @df: pandas dataframe
        @train_percent: fraction of rows placed in the train split
        @validate_percent: fraction of rows placed in the validation split;
                           the rows left over form the test split
        @seed: value passed to np.random.seed before shuffling
    Returns:
        @train,validate,test: pandas dataframe for each split
    '''
    # NOTE: this reseeds NumPy's global generator; with seed=None any seed set
    # earlier by the caller is replaced by a fresh one.
    np.random.seed(seed)
    m = len(df.index)
    # Shuffle row POSITIONS, not index labels: the result is used with .iloc,
    # so labels that are duplicated or do not start at 0 would otherwise pick
    # the wrong rows or raise an IndexError.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

#Split data into train/dev/test sets
train, val, test = train_validate_test_split(df_clean)

# Labels are the drug columns; every other column is an input feature.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
y_train, x_train = train[drugs].values, train.drop(drugs, axis=1).values
y_val, x_val = val[drugs].values, val.drop(drugs, axis=1).values
y_test, x_test = test[drugs].values, test.drop(drugs, axis=1).values

# Explicit formatting prints "(rows, cols) (rows, cols)" on Python 2 and 3 alike.
print("%s %s" % (x_train.shape, y_train.shape))



# Model

Defines the simple NN model.

Architecture

   (dense --> batch_normalization --> leaky_relu --> dropout) x L --> dense output --> sigmoid
   (L = params['L'] hidden layers; layer widths come from params['num_neurons'])

Parameters

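   weighted sigmoid cross-entropy loss (positive-class weight = params['pw'])
   ADAM optimizer for parameter updates
   learning rate = 1e-4
   alpha = 0.2 (for leaky_relu)
   dropout rate = 0.2 on every hidden layer (earlier versions used 0.05 <= dropout rate <= 0.2, depending on number of parameters in layer).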

In [None]:
def dense_batchnorm_relu_dropout(inputs,units,rate,mode):
    '''
    Wrapper function for dense --> batchnorm --> leaky relu --> dropout

    parameters:
       @inputs: input from previous layer
       @units: number of hidden units
       @rate: dropout probability
       @mode: tf.estimator.ModeKeys value; batch normalization and dropout run
              in training mode only when it equals ModeKeys.TRAIN
    returns:
        @dropout_n: output to next layer
    '''
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    l_n = tf.layers.dense(inputs=inputs, units=units,
                          kernel_initializer=tf.contrib.layers.xavier_initializer())
    # `training` defaults to False, which normalizes with the moving statistics
    # even while training; pass the mode so batch statistics are used in TRAIN.
    bn_n = tf.layers.batch_normalization(l_n, training=is_training)
    relu_n = tf.nn.leaky_relu(bn_n)
    dropout_n = tf.layers.dropout(inputs=relu_n, rate=rate, training=is_training)

    return dropout_n


def f1_score(labels,predictions):
    '''
    Streaming F1 metric (harmonic mean of precision and recall).

    parameters:
        @labels: ground-truth labels
        @predictions: dict holding the predicted classes under 'classes'
    returns:
        (f1_value, f1_update_op) tuple, the form eval_metric_ops expects
    '''
    # tf.metrics.* return (value, update_op) pairs; combine each half separately
    # instead of dividing by the pair itself.
    recall, recall_update = tf.metrics.recall(
        labels=labels, predictions=predictions['classes'])
    precision, precision_update = tf.metrics.precision(
        labels=labels, predictions=predictions['classes'])

    def harmonic_mean(p, r):
        # 2/(1/r + 1/p) == 2pr/(p+r); the floor on the denominator yields 0
        # instead of NaN when precision and recall are both 0.
        return 2. * p * r / tf.maximum(p + r, 1e-12)

    return (harmonic_mean(precision, recall),
            harmonic_mean(precision_update, recall_update))


def nn_model_fn(features,labels,mode,params):
    '''
    Model function for NN

    parameters:
        @features: dict with the input matrix under key 'x'
        @labels: labels for dataset (None when the estimator runs in PREDICT mode)
        @mode: training, eval, or predict (tf.estimator.ModeKeys)
        @params: dict with
                   'L': number of hidden layers
                   'num_neurons': list with the width of each hidden layer
                   'pw': positive-class weight of the loss
                   'learning_rate': Adam learning rate
                   'n_y' (optional): number of outputs, used when labels is None
    return:
        tf.estimator.EstimatorSpec for the requested mode
    raises:
        ValueError: labels is None and params has no 'n_y'
    '''
    n = features['x'].shape[1] # number of features

    # The output width normally comes from the labels, but labels is None in
    # PREDICT mode, so fall back to an explicit params['n_y'].
    if labels is not None:
        n_y = labels.shape[1]
    elif 'n_y' in params:
        n_y = params['n_y']
    else:
        raise ValueError("labels is None; pass the number of outputs as params['n_y']")

    input_layer = tf.cast(tf.reshape(features['x'],[-1,n]),tf.float32)
    prev_layer = input_layer

    # Stack params['L'] hidden layers, each with a dropout rate of 0.2.
    for l in range(params['L']):
        prev_layer = dense_batchnorm_relu_dropout(prev_layer, params['num_neurons'][l], 0.2, mode)

    # One logit per label; the sigmoid turns each into an independent probability.
    logits = tf.layers.dense(inputs=prev_layer, units=n_y,
                             kernel_initializer=tf.contrib.layers.xavier_initializer())
    probs = tf.nn.sigmoid(logits, name='sigmoid_tensor')

    #Stores class predictions and associated probabilities
    predictions = {
      "classes": tf.to_int32(probs >= 0.5),
      "probabilities": probs
    }

    #If in PREDICT mode we don't need to continue with parameter optimization
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    labels = tf.cast(labels,tf.float32)
    # Calculate Loss. The op applies the sigmoid itself, so it must receive the
    # raw logits -- passing `probs` would apply the sigmoid twice.
    loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
        targets=labels, logits=logits, pos_weight=params['pw']))

    # Configure the Training Op (i.e which optimizer to use --> ADAM)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=params['learning_rate'])
        # Batch normalization registers its moving-average updates in UPDATE_OPS;
        # they only run if the train op depends on them.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(
                loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels,predictions=predictions['classes']),
        'recall':tf.metrics.recall(labels=labels,predictions=predictions['classes']),
        'precision':tf.metrics.precision(labels=labels,predictions=predictions['classes'])}
        #'f1_score':f1_score(labels,predictions)}

    return tf.estimator.EstimatorSpec(
      mode=mode,loss=loss, eval_metric_ops=eval_metric_ops)

# Random Search for Parameters

In [None]:
import math
def random_search(start, end, scale='uniform'):
    '''
    Draws one random hyperparameter value between start and end.

    Parameters:
        @start: lower bound
        @end: upper bound
        @scale: how the value is sampled
            'uniform' or None: float from np.random.uniform(start, end)
            'int': integer from np.random.randint(start, end) (end excluded)
            'log': 10**r with r uniform in [log10(start), log10(end)]
                   ex. start=0.0001, end=1 returns 10**r with r in [-4, 0]
    Returns:
        the sampled value
    Raises:
        ValueError: scale is not one of the values listed above
    '''
    if scale is None or scale == 'uniform':
        return np.random.uniform(start, end)
    if scale == 'int':
        return np.random.randint(start, end)
    if scale == 'log':
        r = np.random.uniform(math.log10(start), math.log10(end))
        return 10**r
    # Fail loudly: a returned 'ERROR' string would otherwise travel on as a
    # hyperparameter value.
    raise ValueError("unknown scale %r; expected 'uniform', 'int' or 'log'" % (scale,))

In [None]:
def random_grid_search(vals):
    '''Returns one element of vals, picked at a random position with np.random.randint.'''
    position = np.random.randint(len(vals))
    return vals[position]

In [None]:
def decrement_num_neurons(first, min_val, num_layers, step=25):
    '''
    Builds a list of layer widths that shrinks by a fixed step per layer.

    Parameters:
        @first: width of the first layer (used as given)
        @min_val: floor applied to every later layer
        @num_layers: number of layers; values below 1 still return [first]
        @step: amount subtracted from one layer to the next (default 25)
    Returns:
        list of widths, e.g. (250, 50, 4) -> [250, 225, 200, 175]
    '''
    # The previous version also drew a random number into a local that was
    # never used; that dead code is gone, so no random state is consumed here.
    layers = [first]
    for _ in range(num_layers - 1):
        layers.append(max(layers[-1] - step, min_val))
    return layers

In [None]:
# Quick check of the layer-width schedule: 8 layers, first width 250, floor 50.
decrement_num_neurons(250, 50, 8)

# Train the model and evaluate performance

In [None]:
# Log the tensor named "sigmoid_tensor" under the label "probabilities"
# every 1000 training steps.
tensors_to_log = {"probabilities": "sigmoid_tensor"}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=1000)
tf.logging.set_verbosity(tf.logging.INFO)

# Validation F1 score of every run, keyed by the positive-class weight of that run.
f1_scores = {}

# Values to tune hyperparameters
pw_low = 5
pw_high = 20
learning_rate_low = 0.0001
learning_rate_high = 1
num_layers_low = 3
num_layers_high = 10
num_neurons_1 = [100,150,200,250]

# One timestamped log file per execution of this cell; create the folder first
# so open() cannot fail on a missing directory.
now = datetime.datetime.now()
if not os.path.isdir('log_dir'):
    os.makedirs('log_dir')

# The with-block closes (and flushes) the log even if a run raises.
with open('log_dir/log_{}.txt'.format(str(now)), 'w') as f:

    #Try different parameters. N iterations with random parameters
    for i in range(100):

        pw = random_search(pw_low, pw_high, scale='int') # positive error weight

        #learning_rate = random_search(learning_rate_low,learning_rate_high,scale='log')
        learning_rate = 0.0001

        num_layers = random_search(num_layers_low, num_layers_high, scale='int')

        num_neurons = decrement_num_neurons(random_grid_search(num_neurons_1), 50, num_layers)

        # NOTE: the sampled values above are overridden by this fixed
        # configuration, and the loop stops after one run (see `break` below).
        num_neurons = [200,175,150]
        num_layers = 3
        pw = 13

        params = {
            'pw': pw,
            'L': num_layers,
            'learning_rate': learning_rate,
            'num_neurons': num_neurons,
        }

        # NOTE(review): every run shares model_dir './log_dir2', so a later run
        # picks up the checkpoints of the previous one; use a per-run directory
        # before sweeping different architectures.
        order_classifier = tf.estimator.Estimator(model_fn=nn_model_fn, params=params, model_dir='./log_dir2')

        #Set up training
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": x_train},
            y=y_train,
            batch_size=128,
            num_epochs=50,
            shuffle=True)

        order_classifier.train(input_fn=train_input_fn, hooks=[logging_hook])

        # validation -- a single pass over the data: repeating it for 50 epochs
        # only re-scores the same examples and multiplies the evaluation time.
        eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": x_val},
            y=y_val,
            num_epochs=1,
            shuffle=False)

        # test (defined for the commented-out evaluation below)
        test_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": x_test},
            y=y_test,
            num_epochs=1,
            shuffle=False)

        #train_results = order_classifier.evaluate(input_fn=train_input_fn)
        eval_results = order_classifier.evaluate(input_fn=eval_input_fn)
        #test_results = order_classifier.evaluate(input_fn=test_input_fn)

        # F1 = harmonic mean of precision and recall; defined as 0 when both are 0.
        recall = eval_results['recall']
        precision = eval_results['precision']
        if precision + recall > 0:
            f1_val = 2. * precision * recall / (precision + recall)
        else:
            f1_val = 0.
        f1_scores[pw] = f1_val

        for param in ['pw', 'L', 'learning_rate', 'num_neurons']:
            f.write(param + ' : ' + str(params[param]) + '\n')

        # dict.items() exists on Python 2 and 3; dict.iteritems() is Python 2 only.
        for k, v in eval_results.items():
            f.write(str(k) + ' : ' + str(v) + '\n')

        f.write('f1_score: %.3f\n' % f1_val)

        f.write('###############\n')
        break
 
    
    

In [None]:
# .items() and print(...) keep this cell runnable on both Python 2 and 3.
for k, v in f1_scores.items():
    print("Pos Weight: %s\tF1_score: %.3f" % (k, v))

In [None]:

# Single, unshuffled pass over the test set.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": x_test},
    y=y_test,
    num_epochs=1,
    shuffle=False)

# checkpoint_path is left out on purpose: it must name one specific checkpoint,
# and 'model_dir' is not where order_classifier writes its checkpoints
# (it was created with model_dir='./log_dir2'). Without it, predict() restores
# the latest checkpoint from the estimator's own model_dir.
predict_results = order_classifier.predict(input_fn=predict_input_fn)

# predict() returns a generator; each item is the predictions dict of one example.
for p in predict_results:
    print(p)

# Evaluate Model Functions

In [None]:
#Plotting Functions
def plot_roc_curve(prediction_results,y_test,title,text,y_probs=None):
    '''
    Plots a ROC curve for given set of predictions

    parameters:
        @prediction_results: iterable of prediction dicts (each with a
                             'probabilities' entry); only read when y_probs is empty
        @y_test: true labels, one per example
        @title: plot title
        @text: legend label; the AUC is appended to it
        @y_probs: optional precomputed scores, one per example; when non-empty
                  they are used instead of prediction_results
    return:
        None
    '''
    # NOTE(review): roc_curve and auc must already be in scope; the
    # sklearn.metrics import near the top of this notebook is commented out.

    # None replaces the former mutable default ([]); an empty sequence passed
    # explicitly is still treated as "not provided".
    if y_probs is None or len(y_probs) == 0:
        # NOTE(review): index 1 takes the second entry of every probability
        # vector -- confirm that is the output this curve should describe.
        y_hat = [p_d['probabilities'][1] for p_d in prediction_results]
    else:
        y_hat = y_probs

    # Column vectors: one row per example.
    y = np.asarray(y_test).reshape((len(y_test),1))
    y_hat = np.asarray(y_hat).reshape((len(y_hat),1))

    fpr, tpr, _ = roc_curve(y,y_hat)

    AUROC = auc(fpr,tpr)
    plt.plot(fpr,tpr,label=text + ' (AUC = %.3f)' % AUROC)
    plt.legend(loc='lower right')
    plt.title(title)
    plt.xlabel('1 - Specificity')
    plt.ylabel('Sensitivity')
    plt.show()


def plot_pr_curve(prediction_results,y_test,title,text,y_probs=None):
    '''
    Plots a PR Curve for a given set of predictions

    parameters:
        @prediction_results: iterable of prediction dicts (each with a
                             'probabilities' entry); only read when y_probs is empty
        @y_test: true labels, one per example
        @title: plot title
        @text: legend label; the average precision is appended to it
        @y_probs: optional precomputed scores, one per example; when non-empty
                  they are used instead of prediction_results
    return:
        None
    '''
    # NOTE(review): precision_recall_curve and average_precision_score must
    # already be in scope; the sklearn.metrics import near the top of this
    # notebook is commented out.

    # None replaces the former mutable default ([]); an empty sequence passed
    # explicitly is still treated as "not provided".
    if y_probs is None or len(y_probs) == 0:
        # NOTE(review): index 1 takes the second entry of every probability
        # vector -- confirm that is the output this curve should describe.
        y_hat = [p_d['probabilities'][1] for p_d in prediction_results]
    else:
        y_hat = y_probs

    # Column vectors: one row per example.
    y = np.asarray(y_test).reshape((len(y_test),1))
    y_hat = np.asarray(y_hat).reshape((len(y_hat),1))

    precision, recall, thresholds = precision_recall_curve(y, y_hat)
    AUPR = average_precision_score(y,y_hat)

    plt.plot(recall,precision,label=text + ' (AP = %.3f)' % AUPR,color='r')
    plt.legend(loc='lower left')
    plt.title(title)
    plt.xlabel('Recall (p(y_hat ==1 | y==1))')
    plt.ylabel('Precision')
    plt.show()