In [1]:
# imports

import os
import sys
import re
from math import pi
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_rows = 4000

from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool, Label, Legend, Div, LabelSet, FuncTickFormatter
from bokeh.models.widgets import Panel, Tabs
from bokeh.models.tickers import FixedTicker
from bokeh.plotting import figure
from bokeh.transform import dodge, cumsum
from bokeh.layouts import row
from bokeh.io import output_notebook
output_notebook()

In [2]:
def preprocess_intake_files(subject):
    '''
    Function that preprocesses the intakes files from the given subject to the correct formattings
    
    Every '.csv' file in the intake folder whose path contains 'subject_<subject>' is read.
    The first nine lines of each file are skipped and only the lines whose first field
    is a weekday name are kept. Lines from files with 'week1' or 'week2' in their path
    are written to the baseline file, lines from all other files to the vegan file.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial
    
    Returns
    --------------
    list
        baseline_filename    the filename of the created baseline intake file
        vegan_filename       the filename of the created vegan intake file
    
    Raises
    --------------
    FileNotFoundError
        If the intake files folder does not exist
    '''
    
    days = {"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"}
    baseline = ("week1", "week2")
    header = "Day,Date,Year,Calories,Fat,Saturated fat,Carbs,Fiber,Sugar,Protein,Sodium,Cholesterol,Potassium\n"
    # the food log starts at line 10 of every intake file
    skipped_lines = 9
    
    food_intake_files_path = '../data/intake_files'
    processed_files_path = food_intake_files_path + '/processed'
    
    if not os.path.isdir(food_intake_files_path):
        raise FileNotFoundError('Intake files folder not found: ' + food_intake_files_path)
    # the output folder may not exist yet
    os.makedirs(processed_files_path, exist_ok=True)
    
    baseline_filename = processed_files_path + '/nutrition_baseline_subject_'+ subject +'.csv'
    vegan_filename = processed_files_path + '/nutrition_vegan_subject_'+ subject +'.csv'
    
    # only use the files from the given subject; collect the paths first so the
    # directory iterator is closed again and the files are read in a fixed order
    with os.scandir(food_intake_files_path) as entries:
        intake_paths = sorted(entry.path for entry in entries
                              if entry.is_file()
                              and 'subject_' + subject in entry.path
                              and entry.path.lower().endswith('.csv'))
    
    # open both output files in one with-statement so neither handle can leak
    with open(baseline_filename, 'w') as n_baseline, open(vegan_filename, 'w') as n_vegan:
        n_baseline.write(header)
        n_vegan.write(header)
        
        for filename in intake_paths:
            is_baseline = any(week in filename.lower() for week in baseline)
            target = n_baseline if is_baseline else n_vegan
            
            with open(filename, 'r') as food_intake:
                for line_number, line in enumerate(food_intake):
                    # files shorter than the skipped part simply contribute nothing
                    if line_number < skipped_lines:
                        continue
                    # replace double quotes with single quotes
                    index = 1 if '""' in line else 0
                    line = line.strip().replace('""', '"')[index:].rstrip('"')
                    # get the nutrition details per day
                    if line.split(',')[0].strip('"').lower() in days:
                        target.write(line + '\n')
    
    return [baseline_filename, vegan_filename]

In [3]:
def create_nutrition_tables(subject):
    '''
    Function that creates the food intake data tables for the given subject
    
    The baseline and vegan intake files are preprocessed, cleaned in the same way,
    labelled with their diet and combined into one table.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial
    
    Returns
    --------------
    DataFrame
        nutrition_data    food intake data table of both diets, indexed and sorted by date
    '''
    
    convert_columns = ['Calories', 'Fat', 'Saturated fat', 'Carbs', 
                       'Fiber', 'Sugar', 'Protein', 
                       'Sodium', 'Cholesterol', 'Potassium']
    # sodium, cholesterol and potassium are initially in mg
    milligram_columns = convert_columns[7:]
    
    baseline_filename, vegan_filename = preprocess_intake_files(subject)
    
    diet_tables = []
    for filename, diet in ((baseline_filename, 'Normal'), (vegan_filename, 'Vegan')):
        diet_data = pd.read_csv(filename, sep=',')
        
        # convert commas to points as decimal separator and convert to numeric values
        diet_data[convert_columns] = (diet_data[convert_columns]
                                      .replace(',', '.', regex=True)
                                      .apply(pd.to_numeric, errors='coerce'))
        
        # convert from milligrams to grams (1 g = 1000 mg)
        diet_data[milligram_columns] = diet_data[milligram_columns] / 1000
        
        # set the saturated fat to '0' if it is higher than the fat value
        diet_data.loc[diet_data['Saturated fat'] > diet_data['Fat'], 'Saturated fat'] = 0
        
        # add the diet definition column
        diet_data['Diet'] = diet
        diet_tables.append(diet_data)
    
    # concat diets and set date as index column
    nutrition_data = pd.concat(diet_tables)
    nutrition_data['Date'] = pd.to_datetime(
        nutrition_data['Date'].astype(str) + ' ' + nutrition_data['Year'].astype(str),
        format=' %B %d %Y')
    
    return nutrition_data.drop(columns=['Year']).set_index('Date').sort_index()

In [26]:
def create_nutrition_boxplot(subject, nutrition_data):
    '''
    Function that creates and shows the nutrition box plot for the given subject
    
    For every nutrient two boxes are drawn next to each other: one for the
    'Normal' diet and one for the 'Vegan' diet. Values outside of the
    1.5 * IQR fences are drawn as separate outlier points.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial
    nutrition_data : DataFrame
        The food intake data table of the given subject
    
    Returns
    --------------
    None
        The figure is only shown, not returned
    '''
    
    # graph labels and positions 
    bar_labels = ['Carbs', 'Protein', 'Fat', 
                 'Saturated fat', 'Fiber', 'Sugar', 
                 'Sodium', 'Cholesterol', 'Potassium']
    base_positions = [0.5, 2.5, 4.5, 6.5, 8.5, 10.5, 12.5, 14.5, 16.5]
    vegan_positions = [pos+0.6 for pos in base_positions]
    label_positions = [pos+0.3 for pos in base_positions]
    graph_labels = str(dict(zip(label_positions, bar_labels)))
    
    p = figure(tools="", toolbar_location=None, plot_height=500, plot_width=900,
               title="Nutritional intake distribution subject " + subject)
    
    diets = (('Normal', base_positions, "#abdfff"),
             ('Vegan', vegan_positions, "#ceffc4"))
    for diet, positions, fill_color in diets:
        diet_values = nutrition_data.loc[nutrition_data['Diet'] == diet][bar_labels]
        # without any logged day there is nothing to draw for this diet
        if diet_values.empty:
            continue
        _draw_diet_boxes(p, diet_values, bar_labels, positions, fill_color)

    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = "white"
    p.grid.grid_line_width = 2
    p.xaxis.major_label_text_font_size = "9px"
    p.xaxis.major_label_orientation = 0.5
    p.xaxis.ticker = label_positions
    
    # set labels at the tick positions
    p.xaxis.formatter = FuncTickFormatter(code="""
            var mapping = {};
            return mapping[tick];
        """.format(graph_labels))
    
    show(p)


def _to_long_format(diet_values, labels):
    '''
    Helper that stacks the given nutrient columns into a 2 column ('group', 'value') DataFrame
    '''
    
    # build all parts first and concat once instead of growing a frame inside a loop
    return pd.concat([diet_values[label].to_frame('value').assign(group=label)[['group', 'value']]
                      for label in labels])


def _draw_diet_boxes(p, diet_values, labels, positions, fill_color):
    '''
    Helper that draws the stems, boxes, whiskers and outliers of one diet on the given figure
    '''
    
    groups = _to_long_format(diet_values, labels).groupby('group', sort=False)
    
    # find the quartiles and IQR for each category
    q1 = groups.quantile(q=0.25)
    q2 = groups.quantile(q=0.5)
    q3 = groups.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr
    
    # get the outliers (must use the fences before they are shrunk below)
    out = groups.apply(get_outliers, upper=upper, lower=lower).dropna()
    
    # shrink lengths of stems to be no longer than the minimums or maximums
    upper_whisker = np.minimum(groups.quantile(q=1.00)['value'], upper['value'])
    lower_whisker = np.maximum(groups.quantile(q=0.00)['value'], lower['value'])
    
    # stems
    p.segment(positions, upper_whisker, positions, q3.value, line_color="black")
    p.segment(positions, lower_whisker, positions, q1.value, line_color="black")
    
    # boxes
    p.vbar(positions, 0.4, q2.value, q3.value, fill_color=fill_color, line_color="black")
    p.vbar(positions, 0.4, q1.value, q2.value, fill_color=fill_color, line_color="black")

    # whiskers
    p.rect(positions, lower_whisker, 0.2, 0.01, line_color="black")
    p.rect(positions, upper_whisker, 0.2, 0.01, line_color="black")

    # outliers, plotted at the x position of the category they belong to
    if not out.empty:
        outlier_plot_positions = dict(zip(labels, positions))
        outx = [outlier_plot_positions[keys[0]] for keys in out.index]
        outy = [out.loc[keys[0]].loc[keys[1]] for keys in out.index]
        p.circle(outx, outy, size=3, color="#F38630", fill_alpha=0.6)

def get_outliers(group, upper, lower):
    '''
    Function that finds the outliers for each category
    
    Parameters
    --------------
    group : DataFrame
        The rows of one category; its 'name' attribute is the category label
    upper : DataFrame
        The upper limits per category in the 'value' column
    lower : DataFrame
        The lower limits per category in the 'value' column
    
    Returns
    --------------
    Series
        The values of the category that are above the upper or below the lower limit
    '''
    
    category = group.name
    upper_limit = upper.loc[category]['value']
    lower_limit = lower.loc[category]['value']
    
    values = group['value']
    is_outlier = (values > upper_limit) | (values < lower_limit)
    return values[is_outlier]

In [5]:
def create_average_nutrition_figure(subject, nutrition_data):
    '''
    Function that creates the nutrition graph for the given subject
    
    The graph shows the average value of every nutrient as two bars next to each
    other, one per diet. The figure and a text block with the average calories
    per diet are shown.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial
    nutrition_data : DataFrame
        The food intake data table of the given subject
    
    Returns
    --------------
    figure
        The created nutrition graph visualization
    '''
    
    # calculate the average nutritional values; numeric_only skips text columns
    # such as 'Day', which newer pandas versions refuse to average
    averages = nutrition_data.groupby(['Diet']).mean(numeric_only=True).round(2)
    
    bar_labels = ['Carbs', 'Protein', 'Fat', 
                 'Saturated fat', 'Fiber', 'Sugar', 
                 'Sodium', 'Cholesterol', 'Potassium']
    data_obj = {
            'labels': bar_labels,
            'Normal': averages[bar_labels].loc['Normal'],
            'Vegan': averages[bar_labels].loc['Vegan'],
            'None' : [0] * len(bar_labels)
        }
    source = ColumnDataSource(data=data_obj)
    
    p = figure(x_range=bar_labels, y_range=(0, 400), plot_height=400,
               title="Average nutritional information subject " + subject + " in grams",
               toolbar_location=None, tools="")
    
    # the bars of both diets are shifted sideways so they end up next to each other
    p.vbar(x=dodge('labels', -0.18, range=p.x_range), 
           top='Normal', name='Normal', width=0.3, 
           source=source, color="#abdfff", legend_label="Normal diet", line_color="#75cbff", line_width=0.4)

    # middle label: a bar of height 0 at the centre of every category
    p.vbar(x=dodge('labels',  0,  range=p.x_range), top='None', width=0.1, source=source)
    
    p.vbar(x=dodge('labels',  0.18,  range=p.x_range), 
           top='Vegan', name='Vegan', width=0.3, 
           source=source, color="#ceffc4", legend_label="Vegan diet", line_color="#8dff75", line_width=0.4)
    
    p.x_range.range_padding = 0.05
    p.xgrid.grid_line_color = None
    p.legend.location = "top_right"
    p.legend.orientation = "vertical"
    p.xaxis.major_label_orientation = 0.5
    
    # the hover tool is restricted to the two named diet bars
    p.add_tools(HoverTool(
        names = ['Normal', 'Vegan'],
        tooltips = [
            ('', '$name diet'),
            ('', '@$name{1.11} grams')
        ],
        mode = 'mouse',
        show_arrow = False,
        point_policy = 'follow_mouse'
    ))
    
    style = {
        'font-size': 'smaller'
    }
    div = Div(text=
              """Average calories:<br>
                 Normal diet: {} kcal <br>
                 Vegan diet:  {} kcal"""
              .format(averages['Calories'].loc['Normal'], averages['Calories'].loc['Vegan']),
              width=200, height=100, style=style)
    show(p)
    show(div)
    return p

In [6]:
def create_nutrition_piecharts(subject, nutrition_data):
    '''
    Function that creates the food intake pie charts per week for the given subject
    
    For every week a tab with three pie charts is created: the macronutrients
    (carbs, protein and fat), sugar versus fiber and sodium versus potassium.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial
    nutrition_data : DataFrame
        The food intake data table
    
    Returns
    --------------
    None
        The tabs with the pie charts are only shown, not returned
    '''
    
    weekly_averages = get_weekly_averages(nutrition_data)
    
    # nutrient columns, colors and title of each of the three pie charts
    chart_definitions = [
        (['Carbs', 'Protein', 'Fat'], ['#266298', '#89c609', '#fc2a35'],
         'Average calorie composition'),
        (['Sugar', 'Fiber'], ['#f9dda9', '#ac545c'],
         'Average sugar and fiber division'),
        (['Sodium', 'Potassium'], ['#a21d22', '#f6871e'],
         'Average sodium and potassium division'),
    ]
    
    tabs = []
    for i in range(len(weekly_averages.index)):
        week = weekly_averages.iloc[i]
        pie_charts = row(*[get_piechart_data(week[columns], colors, title)
                           for columns, colors, title in chart_definitions])
        # the tabs are numbered by their position, starting at 1
        tabs.append(Panel(child=pie_charts, title="Week " + str(i+1)))
        
    show(Tabs(tabs=tabs))

In [7]:
def get_piechart_data(nutrients, nutrient_colors, chart_description):
    '''
    Function that creates a food intake pie chart for the given nutrient values
    
    Parameters
    --------------
    nutrients : Series
        The nutrient values, labelled with the nutrient names
    nutrient_colors : list
        The nutrient colors list, one color per nutrient
    chart_description : string
        The title of the pie chart graph
    
    Returns
    --------------
    p
        The generated pie chart
    '''
    
    radius = 0.35
    axis_range = (-radius * 1.1, radius * 1.5)
    chart_size = 350
    
    # one row per nutrient with the columns 'nutrient' and 'value'
    chart_data = (pd.Series(nutrients.to_dict())
                  .reset_index(name='value')
                  .rename(columns={'index':'nutrient'}))
    total = chart_data['value'].sum()
    
    # every wedge ends at the cumulative share of the circle and starts where the previous one ended
    chart_data['end_angle'] = np.cumsum(chart_data['value'] / total * 2 * pi)
    chart_data['start_angle'] = np.pad(chart_data['end_angle'], (1, 0))[:-1]
    
    # the percentage labels are placed near the edge, at the start angle of their wedge
    chart_data['label_x'] = radius * 0.9 * np.cos(chart_data['start_angle'])
    chart_data['label_y'] = radius * 0.95 * np.sin(chart_data['start_angle'])
    chart_data['percentage'] = (chart_data['value'] / total * 100).round(2).astype(str) + '%'
    chart_data['color'] = nutrient_colors[:len(nutrients)]

    p = figure(title=chart_description, toolbar_location=None,
               plot_height=chart_size, plot_width=chart_size,
               x_range=axis_range, y_range=axis_range,
               tools="")
    p.wedge(x=0, y=0, radius=radius,
            start_angle='start_angle', end_angle='end_angle',
            line_color="white", fill_color='color', legend_field='nutrient', source=chart_data)

    percentage_labels = LabelSet(x='label_x', y='label_y', text='percentage',
                                 angle='start_angle', source=ColumnDataSource(chart_data),
                                 render_mode='canvas', 
                                 text_align='right', text_font_size="9pt", text_color='white')
    p.add_layout(percentage_labels)
    
    # a pie chart needs no axes or grid
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None
    p.legend.label_text_font_size = '9pt'
    
    return p

In [8]:
def get_weekly_averages(nutrition_data):
    '''
    Function that calculates the weekly average nutrient values
    
    A new week starts at every row whose 'Day' is a monday, so the rows are
    expected to be in chronological order. The given table is not changed.
    
    Parameters
    --------------
    nutrition_data : DataFrame
        The food intake data table with a 'Day' column
    
    Returns
    --------------
    DataFrame
        The average of the numeric columns per week number, rounded to 2 decimals
    '''
    
    # count the mondays seen so far; assign() works on a copy of the table
    week_numbers = (nutrition_data['Day'].str.lower() == 'monday').cumsum()
    numbered_data = nutrition_data.assign(week_number=week_numbers)
    
    # numeric_only skips text columns such as 'Day' and 'Diet'
    return numbered_data.groupby('week_number').mean(numeric_only=True).round(2)

In [31]:
def create_nutrition_graphs(subjects):
    '''
    Function that creates the nutrition graphs for each of the given subjects
    
    Parameters
    --------------
    subjects : list
        The initials of the subjects
    '''
    
    for subject in subjects:
        # currently only the box plot is created, the other graphs are disabled:
        #figure = create_average_nutrition_figure(subject, nutrition_data)
        #create_nutrition_piecharts(subjects[:-1], nutrition_data)
        create_nutrition_boxplot(subject, create_nutrition_tables(subject))

def main():
    '''
    Function that creates the nutrition graphs for the selected subjects
    '''
    
    # all subjects: ['A', 'B', 'C', 'D', 'E']
    create_nutrition_graphs(['E'])

# entry point: only call main() when this code is executed directly, not when it is imported
if __name__ == '__main__':
    main()