# Tensorflow Deep Neural Network Classification
## Predicting Dietary Guidelines based on Food Consumption

### Author: Nick Rosso

In [1]:
"""
=======
Predicting Dietary Guidelines Based on food factors
Author: Nick Rosso

"""
# Print the string above as a banner. It is available as __doc__ because it is
# the first statement of this cell, so keep it first.
print (__doc__)


Predicting Dietary Guidelines Based on food factors
Author: Nick Rosso




In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import defaultdict
from sklearn.model_selection import KFold

import pandas_ml as pdml
import pandas
import numpy as np
import tensorflow as tf
import csv



#Dataset used when predicting the Sugar, Fat, SatFat and Salt guidelines.
filename = "tensorflowDataset-Complete.csv"
#Dataset used when predicting the fruits and vegetables guideline.
removed_Unknowns = "tensorflowDataset-FruitsVeg.csv"

In [3]:
#Guideline column names. They are hard-coded here; the commented-out lines
#below are the earlier approach that read the header row from the csv file.
#f = open(filename, 'r', newline='', encoding='utf8')
#row = f.readline()
#guideline = row.split(',')
#headers is the list of guideline column names; a guideline's position in this
#list is the "guideline index" passed around as x / index elsewhere in the notebook.
headers = ["GDLINES_PortionFruitsVeg", "GDLINES_FreeSugars", "GDLINES_Salt", "GDLINES_Fat", "GDLINES_SatFat"]

In [4]:
#Drops every guideline column except one; x is the index (into headers) of the guideline column to keep.

def keep_Guideline(dataframe,x):
    """
        Keep one guideline column and drop all the other guideline columns.

        Input:
            dataframe: pandas dataframe that contains the columns named in the
                       module-level list 'headers'.
            x: index into 'headers' of the guideline column to keep.
        Output:
            a new dataframe without the guideline columns other than headers[x].
            Columns that are not listed in 'headers' are left untouched. If x does
            not match any position in 'headers', every guideline column is dropped.
        Raises:
            whatever DataFrame.drop raises when one of the guideline columns to
            drop is not present in the dataframe.
    """
    # Built from 'headers' itself rather than a hard-coded range(0, 5), so the
    # function keeps working if the list of guidelines changes length.
    unwanted = [name for position, name in enumerate(headers) if position != x]

    # axis is passed by keyword: the positional form drop(label, 1) was
    # deprecated and later removed from pandas.
    return dataframe.drop(unwanted, axis=1)

In [5]:
#Applies SMOTE over-sampling to the dataset stored in the given csv file.
#x is the index (into headers) of the guideline column used as the target for SMOTE.

def SMOTE_Dataset(filename,x):
    """
        Load a csv dataset and balance the classes of one guideline with SMOTE.

        Input:
            filename: path of the csv file to read (comma separated, column names
                      in the first row). In this case-> tensorflowDataset-Complete.csv
            x: index into the module-level list 'headers' of the guideline to use
               as the target.
        Output:
            the over-sampled frame returned by fit_sample, with every value
            rounded to 0 decimals.

        Side effects: prints the class counts of the target before and after
        over-sampling.

        NOTE(review): the whole file is over-sampled here, before any train/test
        split. If the result is later split for cross validation, synthetic rows
        may be derived from rows that end up in a test fold -- confirm that this
        is acceptable for the reported accuracies.
    """
    # The docstring must be the first statement of the function; it used to sit
    # below this assignment, which left SMOTE_Dataset.__doc__ empty.
    target=headers[x]

    df = pandas.read_csv(filename, sep=',', header=0)
    # Wrap the data in a pandas_ml ModelFrame so that the target column is known
    # to the sampler below.
    df = pdml.ModelFrame(df.to_dict(orient='list'),target=target)

    #Fruits and vegetables dataset is different than the others so nothing needs to be dropped
    #invokes keep_Guideline to remove the unneeded guidelines.
    if target != "GDLINES_PortionFruitsVeg":
        df = keep_Guideline(df,x)

    print("%s has a class count of:\n%s" % (target, df.target.value_counts()))

    #applies SMOTE to the dataframe
    sampler = df.imbalance.over_sampling.SMOTE()
    sampled = df.fit_sample(sampler)
    print("%s has a class count of:\n%s" % (target, sampled.target.value_counts()))

    #rounds the SMOTEd data to whole numbers (0 or 1 according to the original note)
    sampled = sampled.round(0)

    return sampled


In [None]:
def ten_Fold_Cross_validation(dataframe, no_folds, guideline, hidden_layer_array, random_state=None):
    """
        Run k fold cross validation of a DNN classifier and return the accuracies.

        Input:
            dataframe: SMOTED dataframe whose column 0 is the target and whose
                       remaining columns are the features.
            no_folds: amount of folds.
            guideline: guideline index. Not used inside this function; kept so
                       existing callers keep working.
            hidden_layer_array: list with the number of units of each hidden layer.
            random_state: optional seed forwarded to KFold so the shuffled split
                          can be reproduced. The default (None) keeps the previous
                          unseeded behaviour. It does not seed TensorFlow.
        Output:
            list with the DNN classifier accuracy on the test part of each fold.

        This function performs k fold cross validation on a DNN using Proximal
        adaptive gradient Optimization. For every fold a fresh classifier is
        trained for 1000 steps and evaluated; each accuracy is also printed.

        The number of feature columns is taken from the data itself instead of a
        hard-coded 3913, so every column except column 0 is used for training.
        The previous index list was built with 'i ++1' (which Python reads as
        i + (+1)) over range(1,3913), i.e. columns 2..3913, and therefore
        silently skipped column 1.
    """
    #converts dataframe to numpy array (.values replaces as_matrix(), which was removed from pandas)
    data = dataframe.values

    #column 0 is the target and must be ignored as a feature; everything else is a feature
    labels = data[:, 0]
    features = data[:, 1:]
    feature_count = features.shape[1]

    kf = KFold(n_splits = no_folds, shuffle=True, random_state=random_state)
    print("Splitting data: %s times" % (kf.get_n_splits(data)))
    print(kf)

    #the same column description is valid for every fold, so it is built once
    feature_columns = [tf.contrib.layers.real_valued_column("", dimension=feature_count)]

    accuracy_list = []
    for train, test in kf.split(data):

        #train / test are arrays of row positions; indexing with them selects the
        #rows of the fold in one step instead of appending row by row.
        #Converts to boolean np arrays so they can be used by the DNN classifier.
        trainX = np.asarray(features[train], dtype = np.bool_)
        trainY = np.asarray(labels[train], dtype = np.bool_)
        testX = np.asarray(features[test], dtype = np.bool_)
        testY = np.asarray(labels[test], dtype = np.bool_)

        #a new classifier per fold so no weights leak from one fold to the next
        classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns, hidden_units= hidden_layer_array,
                                                    n_classes = 2,
                                                    #model_dir="/tmp/FruitsVegies/8-7-2017-109",
                                                    optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.1,
                                                                                                l1_regularization_strength=0.001),
                                                    enable_centered_bias=None)

        classifier.fit(x=trainX, y=trainY, steps=1000)

        accuracy_score = classifier.evaluate(x=testX,y=testY)["accuracy"]
        print('Accuracy: {0:f}'.format(accuracy_score))

        accuracy_list.append(accuracy_score)

    return accuracy_list
        


### Driving Script:

In [None]:
#list of accuracies for each network configuration,
#keyed as "<guideline>-<hidden layers>", e.g. "GDLINES_Salt-[64, 32]"
accuracy_dict = defaultdict(list)
#list of hidden layer configurations to test the network on.
hidden_layer_parameters = [[64,32]]


for hidden_layers in hidden_layer_parameters:
    #every guideline in headers is tested with the current configuration
    for index in range(len(headers)):
        print("Performing test on %s with hidden layers: %s------------------------------------------" % (headers[index],hidden_layers))

        #The two cases only differ in the file that is loaded (and in the exact
        #banner printed at the end, which is kept as it was for each case).
        if index == 0:
            #Separate dataset for predicting fruits and vegetables. It contained Unknowns that needed to be removed
            dataset_path = removed_Unknowns
            done_banner = "TEST %s DONE ----------------------------------------------------------------------"
        else:
            #uses other file that is not altered from having the unknowns in the dataset.
            dataset_path = filename
            done_banner = "TEST %s DONE -----------------------------------------------------------------------"

        smoted_Df = SMOTE_Dataset(dataset_path, index)
        #list of fold accuracies to be added to the accuracy dictionary
        results = ten_Fold_Cross_validation(smoted_Df, 10, index, hidden_layers)
        print(done_banner % (index+1))

        #joins guideline name and hidden layer configuration to form the key in the dict
        dict_key = ("%s-%s" % (headers[index], hidden_layers))
        accuracy_dict[dict_key].extend(results)

    #printed once per hidden layer configuration
    print("Tests Done!")

Performing test on GDLINES_PortionFruitsVeg with hidden layers: [64, 32]------------------------------------------
GDLINES_PortionFruitsVeg has a class count of:
0    2311
1     656
Name: GDLINES_PortionFruitsVeg, dtype: int64
GDLINES_PortionFruitsVeg has a class count of:
1    2311
0    2311
Name: GDLINES_PortionFruitsVeg, dtype: int64
Splitting data: 10 times
KFold(n_splits=10, random_state=None, shuffle=True)
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_environment': 'local', '_num_worker_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000000138F4DD8>, '_master': '', '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_num_ps_replicas': 0, '_task_id': 0, '_is_chief': True, '_model_dir': 'C:\\Users\\Nick\\AppData\\Local\\Temp\\t

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Nick\AppData\Local\Temp\tmpd8qs2mh9\model.ckpt.
INFO:tensorflow:loss = 0.693962, step = 1
INFO:tensorflow:global_step/sec: 6.74553
INFO:tensorflow:loss = 0.374606, step = 101 (14.815 sec)
INFO:tensorflow:global_step/sec: 7.34413
INFO:tensorflow:loss = 0.279715, step = 201 (13.610 sec)
INFO:tensorflow:global_step/sec: 7.47332
INFO:tensorflow:loss = 0.192271, step = 301 (13.381 sec)
INFO:tensorflow:global_step/sec: 7.36101
INFO:tensorflow:loss = 0.128187, step = 401 (13.585 sec)
INFO:tensorflow:global_step/sec: 7.45982
INFO:tensorflow:loss = 0.0749103, step = 501 (13.410 sec)
INFO:tensorflow:global_step/sec: 7.64445
INFO:tensorflow:loss = 0.0575531, step = 601 (13.092 sec)
INFO:tensorflow:global_step/sec: 7.51632
INFO:tensorflow:loss = 0.0485711, step = 701 (13.304 sec)
INFO:tensorflow:global_step/sec: 7.65039
INFO:tensorflow:loss = 0.0440161, step = 801 (13.071 sec)
INFO:tensorflow:global_

INFO:tensorflow:Starting evaluation at 2017-09-12-19:19:39
INFO:tensorflow:Restoring parameters from C:\Users\Nick\AppData\Local\Temp\tmp9lf_264s\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-09-12-19:19:41
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.91342, accuracy/baseline_label_mean = 0.497835, accuracy/threshold_0.500000_mean = 0.91342, auc = 0.950722, auc_precision_recall = 0.924832, global_step = 1000, labels/actual_label_mean = 0.497835, labels/prediction_mean = 0.538559, loss = 0.292876, precision/positive_threshold_0.500000_mean = 0.876984, recall/positive_threshold_0.500000_mean = 0.96087
Accuracy: 0.913420
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_environment': 'local', '_num_worker_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000000016DC6198>, '_master': '', '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_evaluation_master': '', 

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Nick\AppData\Local\Temp\tmprcvenrcn\model.ckpt.
INFO:tensorflow:loss = 0.697056, step = 1
INFO:tensorflow:global_step/sec: 6.90664
INFO:tensorflow:loss = 0.365971, step = 101 (14.479 sec)
INFO:tensorflow:global_step/sec: 7.61015
INFO:tensorflow:loss = 0.255073, step = 201 (13.130 sec)
INFO:tensorflow:global_step/sec: 7.23547
INFO:tensorflow:loss = 0.16775, step = 301 (13.816 sec)
INFO:tensorflow:global_step/sec: 7.56026
INFO:tensorflow:loss = 0.0927327, step = 401 (13.227 sec)
INFO:tensorflow:global_step/sec: 7.40192
INFO:tensorflow:loss = 0.0634074, step = 501 (13.

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-09-12-19:27:31
INFO:tensorflow:Restoring parameters from C:\Users\Nick\AppData\Local\Temp\tmp0zfs791q\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-09-12-19:27:32
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.935065, accuracy/baseline_label_mean = 0.497835, accuracy/threshold_0.500000_mean =

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Sav

### Stores Output into csv file.

In [None]:
outputFile = 'results.csv'

# newline='' hands control of line endings to the csv module. Without it every
# row is followed by an empty line when the notebook runs on Windows.
with open(outputFile, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    # Header row: one column per key of accuracy_dict.
    writer.writerow(accuracy_dict.keys())
    # zip(*...) transposes the per-key lists, so row i holds the i-th value of
    # every list. zip stops at the shortest list, so extra values in longer
    # lists would not be written.
    writer.writerows(zip(*accuracy_dict.values()))