# Visualizing Boxplots

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import matplotlib.backends.backend_pdf

import pandas as pd
import numpy as np
import time
import os

from boxplot_utils import *

In [2]:
# Read the input table. Column 0 is used as the sample-name column, column 1
# as the group-name column, and every remaining column is treated as one
# metabolite.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
fn_in = "/Users/don/Desktop/Hamish McWilliam & SebastianMCG_5_OP_RU_Metaboanalyst_Final2.csv"
#fn_in = "/Users/don/Documents/flask_boxplot_reports_v2/sample_data/kiryu_gcms_normalized.csv"
d0 = pd.read_csv(fn_in)

sample_name_col = d0.columns[0]
grp_name_col = d0.columns[1]

# Metabolite columns, sorted alphabetically; reorder the frame to match
metabs_ls = sorted(d0.columns[2:])
d0 = d0[[sample_name_col, grp_name_col] + metabs_ls]

# Unique group names (as strings) in order of first appearance.
# dict.fromkeys() de-duplicates while preserving insertion order, and reading
# the column directly avoids the per-row dtype upcasting of iterrows().
groups_ls = list(dict.fromkeys(str(grp) for grp in d0[grp_name_col]))

# Number of metabolites drawn per figure (one figure = one PDF page)
batch_size = 30

print("Num. metabs = %s" % len(metabs_ls))
batches_ls = batch_metabs(metabs_ls, batch_size=batch_size)

# Guard against an empty batch list before indexing the last batch
if len(batches_ls) > 0:
    print("Grouping into %s batches: first %s of size %s, last of size %s"
          % (len(batches_ls), len(batches_ls)-1, batch_size, len(batches_ls[-1])))

# Set some plot params
matplotlib.rc('axes', edgecolor='grey')
title_fontsize = 11
# colours_ls holds the eight colours of ColorBrewer's qualitative 8-class Set3
# (one fill colour per group; every entry is distinct)
colours_ls = ["#8dd3c7", "#bebada", "#fb8072", "#ffffb3", "#80b1d3", "#fdb462", "#b3de69", "#fccde5"]

Num. metabs = 199
Grouping into 7 batches: first 6 of size 30, last of size 19


In [3]:
# Build one figure per batch of metabolites: a grid of subplots, one subplot
# per metabolite and one box per group, with the individual data points
# overlaid on top of each box. Figures are collected in figs_ls.
t0 = time.time()
figs_ls = []

num_cols = 5
# Seeded generator so the horizontal jitter of the overlaid points is
# identical from run to run
jitter_rng = np.random.RandomState(0)

for batch_counter, batch in enumerate(batches_ls):
    if len(batch) == 0:
        # Nothing to draw; plt.subplots() cannot build a zero-row grid
        continue

    # Ceiling division: only as many rows as THIS batch needs
    # (a full batch of 30 gives 6 rows; a shorter last batch gives fewer)
    num_rows = (len(batch) + num_cols - 1) // num_cols
    print("batch %s: len(batch) = %s, num_rows = %s, num_cols = %s" % (batch_counter, len(batch), num_rows, num_cols))

    # squeeze=False keeps axarr two-dimensional even when num_rows == 1,
    # so axarr[i, j] is valid for every grid shape
    fig, axarr = plt.subplots(num_rows, num_cols, figsize=(20, 4.5*num_rows),
                              sharex='col', squeeze=False)

    # Fill the grid row by row; any trailing cells of a short batch stay empty
    for idx, metab in enumerate(batch):
        i, j = divmod(idx, num_cols)
        ax = axarr[i, j]

        plot_input_arr = get_bplot_inputs(d0, groups_ls, metab, group_colname=grp_name_col)

        ax.set_title(metab, fontsize=title_fontsize)
        bplot = ax.boxplot(plot_input_arr,
                           patch_artist=True,
                           widths=tuple([0.85]*len(groups_ls)),
                           labels=groups_ls,
                           showfliers=False,
                           zorder=10)

        # Manually add a scatterplot of the datapoints, jittered around each
        # box's x position (boxes sit at x = 1, 2, ...)
        for grp_idx in range(len(groups_ls)):
            scatter_y = plot_input_arr[grp_idx]
            scatter_x = jitter_rng.normal(grp_idx+1, 0.04, size=len(scatter_y))
            ax.scatter(scatter_x, scatter_y, c="black", zorder=11, alpha=0.7)

        # Shade every other row (row index 0, 2, 4, ...)
        if i % 2 == 0:
            ax.set_facecolor('#EAF2F6')

        # Add grid
        ax.grid(True)
        # Use scientific format for numbers
        ax.ticklabel_format(axis="y", scilimits=(0,0))
        # Rotate xtick labels
        for tck in ax.get_xticklabels():
            tck.set_rotation(90)

        # Boxplot aesthetics: black medians, thicker outlines, one fill colour
        # per group (colours repeat if there are more groups than colours)
        for box_idx, box in enumerate(bplot['boxes']):
            bplot['medians'][box_idx].set_color('black')
            box.set(linewidth=1.5)
            box.set_facecolor(colours_ls[box_idx % len(colours_ls)])

    fig.subplots_adjust(wspace=0.125, hspace=0.12)
    figs_ls.append(fig)

    # Close this specific figure so it is not rendered inline and pyplot does
    # not keep it open; the Figure object itself stays in figs_ls
    plt.close(fig)

print("Done in %.2fs" % (time.time() - t0))

batch 0: len(batch) = 30, num_rows = 6, num_cols = 5
batch 1: len(batch) = 30, num_rows = 6, num_cols = 5
batch 2: len(batch) = 30, num_rows = 6, num_cols = 5
batch 3: len(batch) = 30, num_rows = 6, num_cols = 5
batch 4: len(batch) = 30, num_rows = 6, num_cols = 5
batch 5: len(batch) = 30, num_rows = 6, num_cols = 5


  wiskhi = x[x <= hival]
  wisklo = x[x >= loval]
  x[x < stats['whislo']],
  x[x > stats['whishi']],


batch 6: len(batch) = 19, num_rows = 4, num_cols = 5
Done in 27.56s


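>Note: by default (`squeeze=True`), `plt.subplots` returns a 2-D array of axes when `num_rows` > 1 but only a 1-D array when `num_rows` == 1. In the single-row case `axarr[i, j]` therefore raises an `IndexError` (not a syntax error) and `axarr[j]` must be used instead, unless `squeeze=False` is passed to `plt.subplots`, which always yields a 2-D array.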

In [4]:
# Write the boxplot figures to a multi-page PDF, one figure per page.
# This is the time-consuming step.

t0 = time.time()

# NOTE(review): absolute, machine-specific path — consider a configurable output dir.
fn_out = "/Users/don/Desktop/hamish-sebastian-boxplots.pdf"
# The context manager closes (and thereby finalises) the PDF file even if
# saving one of the pages raises
with matplotlib.backends.backend_pdf.PdfPages(fn_out) as pdf:
    for fig in figs_ls:
        pdf.savefig(fig, bbox_inches='tight')

print("Done in %.2fs" % (time.time() - t0))

Done in 42.75s
