In [1]:
%matplotlib notebook

import os
import pandas
import numpy as np

In [2]:
%reload_ext autoreload
%autoreload 2

<hr style="border-width:4px; border-color:coral"/>

# Timing : Read the data

<hr style="border-width:4px; border-color:coral"/>

In this notebook, you will 

* Read in the data from a data file

* Create a Pandas DataFrame

* Display the DataFrame

* Explore information from the DataFrame and practice using MultiIndex indexing in Pandas.

<hr style="border-width:2px; border-color:black"/>

### Try out this notebook on sample data : 

This notebook should run on sample data stored in the `sample_data` directory under this directory.  This directory contains data for the shockbubble problem.  

In [3]:
# Examples to analyse.  Only the first entry is selected below.
ex_list = ['shockbubble']   # Later : ['shockbubble','swirl', ...]

example = ex_list[0]

# The 'results.out' file for each run is expected at
#    <path_to_data>/shockbubble/gpu
#    <path_to_data>/shockbubble/cpu

# Explicit locations of the sample result files (paths relative to the
# directory the notebook is started from).
rsb_cpu = 'Documents/phd-research/cpu_gpu/timing_results/sample_data/shockbubble/cpu/results.out'
rsb_gpu = 'Documents/phd-research/cpu_gpu/timing_results/sample_data/shockbubble/gpu/results.out'


path_to_data = './sample_data'   # Data from sample shockbubble example.

<hr style="border-width:2px; border-color:black"/>

### Explore your own data

To explore your own data, you need to first create data from a series of runs.  

We will focus on the "shockbubble" problem.  Later, we can add more examples. 

**1.**  Create a directory `results` where you can store output from your runs. 


**2.** Run the shockbubble code on the CPU on 1, 2, 4, 8 and 16 processors.  Store the resulting SLURM console output files (e.g. sb_0001.o12335) in the directory `results/shockbubble/cpu`. 


**3.**  Run the GPU code on 1, 2, 4, 8 and 16 processors and store the output files in the directory `results/shockbubble/gpu`. 


**4.**  Navigate to the CPU directory.  Run the file `compile_results.py` to create a `results.out` file in that directory.  

    % python compile_results.py
    
You will need to set your PYTHONPATH so that this file can find other needed files : 

    % export PYTHONPATH=<path_to_forestclaw>/scripts
    
Ignore the errors about `ELLIPTIC_GRIDS_COUNTER` not found. 

**Very important:** Edit the resulting `results.out` file to remove the `#` character at the start of the file.  


**5.**  Navigate to the GPU directory.  Run the file `compile_results.py` to create a `results.out` file in that directory.  Edit the resulting `results.out` file to **remove** the `#` character at the start of the file. 

Once you have done the above, you should be able to run this notebook to explore your data. 

**NOTE:** When running the GPU results, you will need to be sure that you run on enough nodes.  Each node on Borah only has 2 GPUs, so to run on 16 GPUs, use the following : 

    #SBATCH --ntasks=16    # 16 MPI ranks
    #SBATCH --ntasks-per-node=2
    
    ...
    
    mpirun ./shockbubble_cuda


In [4]:
# Specify your path to location of 'results.out' file for each run
#
#
#    <path_to_data>/shockbubble/gpu
#    <path_to_data>/shockbubble/cpu

# path_to_data = './results'   # Data from your own shockbubble runs.

<hr style="border-width:4px; border-color:coral"/>

# Data Tools (file)

<hr style="border-width:4px; border-color:coral"/>

In [5]:
%%file data_tools.py
import os
import pandas
import numpy as np

# Names of the timing columns.
cols = [
    'walltime', 'advance', 'ghostfill', 'regrid', 'adapt',
    'adv_steps', 'adv_step2', 'mx', 'patch_comm', 'output', 'grids_proc',
    'memcopy_h2h', 'memcopy_h2d', 'memcopy_d2h',
]

# Columns stored as integers; every other entry of `cols` is stored as float.
_int_cols = ('adv_steps', 'adv_step2', 'mx', 'grids_proc')

# Target dtype for each column, keyed in the same order as `cols`.
dtypes = {name: (int if name in _int_cols else float) for name in cols}

def read_data(dir,device):
    """Read the timing table for one device and return it with display formatters.

    Parameters
    ----------
    dir : str or os.PathLike
        Directory that contains one sub-directory per device.
    device : str
        Name of the device sub-directory (the notebook passes 'gpu' or 'cpu').

    Returns
    -------
    df : pandas.DataFrame
        Contents of ``<dir>/<device>/results.out`` (whitespace-separated, first
        line used as the header), sorted by the column ``p``.
    fstr : dict
        Maps column names to formatting callables, suitable for
        ``DataFrame.style.format``.
    """
    # os.path.join accepts str and path-like objects alike, so no string
    # formatting of the components is needed.
    fname = os.path.join(dir, device, 'results.out')

    # sep=r'\s+' is the supported replacement for the deprecated
    # delim_whitespace=True option.
    df = pandas.read_table(fname, sep=r'\s+')
    df = df.sort_values('p')

    f = '{:.2f}'.format
    fstr = {'p' : '{:3d}'.format,
            'walltime' : f,
            'advance' : f,
            'ghostfill' : f,
            'patch_comm' : f,
            'regrid' : f,
            'partition' : f,
            'adapt' : f,
            'cfl' : f,
            'grids_proc' : '{:4d}'.format,
            'DOF/s' : '{:.1e}'.format,
            'Speedup': '{:.1f}'.format,
            'Eff.' : '{:.1f}%'.format,
            'output':'{:.1f}'.format,
            'memcopy_h2h' : '{:.2f}'.format,
            'memcopy_d2h' : '{:.2f}'.format,
            'memcopy_h2d' : '{:.2f}'.format}

    return df,fstr

Overwriting data_tools.py


<hr style="border-width:4px; border-color:coral"/>

# Read data and set up Pandas MultiIndex

<hr style="border-width:4px; border-color:coral"/>

In [6]:
import data_tools
import os

idx = pandas.IndexSlice

# Process counts expected in every results.out file.
procs = [1,2,4,8,16]

# Reuse the column list from data_tools so the two definitions cannot drift apart.
cols = list(data_tools.cols)

iterables = [ex_list, ['GPU','CPU'], procs]

# One row per (example, device, procs) combination; values are filled in below.
index = pandas.MultiIndex.from_product(iterables,names=['example','device','procs'])
df = pandas.DataFrame(index=index,columns=cols).sort_index()

# Rows of `df` are sorted by process count after sort_index(), so the rows
# taken from each results file are requested in the same order.
procs_sorted = sorted(procs)

for d in ex_list:
    data_dir = os.path.join(path_to_data,d)

    # Select rows by their process count 'p' rather than by position: a results
    # file with a missing or unexpected process count then raises a KeyError
    # instead of being stored under the wrong 'procs' label.
    df_gpu,fstr = data_tools.read_data(data_dir,'gpu')
    df.loc[idx[d,'GPU',:],:] = df_gpu.set_index('p').loc[procs_sorted,cols].values

    df_cpu,fstr = data_tools.read_data(data_dir,'cpu')
    df.loc[idx[d,'CPU',:],:] = df_cpu.set_index('p').loc[procs_sorted,cols].values

# The frame was created empty, so give every column its intended dtype.
df = df.astype(data_tools.dtypes)

example_data = df
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,walltime,advance,ghostfill,regrid,adapt,adv_steps,adv_step2,mx,patch_comm,output,grids_proc,memcopy_h2h,memcopy_h2d,memcopy_d2h
example,device,procs,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
shockbubble,CPU,1,2505.1,2388.0,112.96,2.9135,1.1091,3162510,2386,32,0.036478,0.0,382,0.0,0.0,0.0
shockbubble,CPU,2,1381.8,1193.8,68.496,1.2081,0.65591,1581250,1192,32,117.4,0.0,190,0.0,0.0,0.0
shockbubble,CPU,4,775.29,598.27,45.187,0.8878,0.45943,790627,597,32,128.99,0.0,95,0.0,0.0,0.0
shockbubble,CPU,8,433.98,298.91,28.14,0.43952,0.36179,395314,298,32,103.88,0.0,47,0.0,0.0,0.0
shockbubble,CPU,16,244.74,149.29,16.434,0.2075,0.31154,197657,149,32,75.364,0.0,23,0.0,0.0,0.0
shockbubble,GPU,1,458.98,342.98,112.42,2.3,1.103,3162510,341,32,0.04646,0.0,382,35.785,10.985,10.485
shockbubble,GPU,2,272.85,186.24,70.239,1.3477,0.6573,1581250,185,32,13.867,0.0,190,19.448,5.8454,5.3197
shockbubble,GPU,4,171.32,102.78,43.592,0.73535,0.46007,790627,102,32,22.924,0.0,95,7.8659,3.0234,2.6898
shockbubble,GPU,8,109.09,59.014,25.037,0.4057,0.32909,395314,58,32,23.268,0.0,47,3.06,1.6402,1.3716
shockbubble,GPU,16,124.09,71.691,16.732,0.31207,0.30489,197657,71,32,33.504,0.0,23,1.6573,0.96905,0.73838


<hr style="border-width:4px; border-color:coral"/>

# Exploring the data

<hr style="border-width:4px; border-color:coral"/>

The following will show you how to use a MultiIndex to extract data for a variety of purposes. 

In [7]:
# Extract all of the data for one example: the one named by `example`,
# which is set to ex_list[0] near the top of the notebook.

example_data = df.loc[example]
example_data

Unnamed: 0_level_0,Unnamed: 1_level_0,walltime,advance,ghostfill,regrid,adapt,adv_steps,adv_step2,mx,patch_comm,output,grids_proc,memcopy_h2h,memcopy_h2d,memcopy_d2h
device,procs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CPU,1,2505.1,2388.0,112.96,2.9135,1.1091,3162510,2386,32,0.036478,0.0,382,0.0,0.0,0.0
CPU,2,1381.8,1193.8,68.496,1.2081,0.65591,1581250,1192,32,117.4,0.0,190,0.0,0.0,0.0
CPU,4,775.29,598.27,45.187,0.8878,0.45943,790627,597,32,128.99,0.0,95,0.0,0.0,0.0
CPU,8,433.98,298.91,28.14,0.43952,0.36179,395314,298,32,103.88,0.0,47,0.0,0.0,0.0
CPU,16,244.74,149.29,16.434,0.2075,0.31154,197657,149,32,75.364,0.0,23,0.0,0.0,0.0
GPU,1,458.98,342.98,112.42,2.3,1.103,3162510,341,32,0.04646,0.0,382,35.785,10.985,10.485
GPU,2,272.85,186.24,70.239,1.3477,0.6573,1581250,185,32,13.867,0.0,190,19.448,5.8454,5.3197
GPU,4,171.32,102.78,43.592,0.73535,0.46007,790627,102,32,22.924,0.0,95,7.8659,3.0234,2.6898
GPU,8,109.09,59.014,25.037,0.4057,0.32909,395314,58,32,23.268,0.0,47,3.06,1.6402,1.3716
GPU,16,124.09,71.691,16.732,0.31207,0.30489,197657,71,32,33.504,0.0,23,1.6573,0.96905,0.73838


In [8]:
# Compare the number of patch updates ('adv_steps') of the CPU and GPU codes:
# one row per device, one column per process count.

idx = pandas.IndexSlice
df_adv = example_data['adv_steps']
df_adv.unstack(level=-1)

procs,1,2,4,8,16
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CPU,3162510,1581250,790627,395314,197657
GPU,3162510,1581250,790627,395314,197657


In [9]:
# Keep only the GPU rows of the selected example

df_gpu = example_data.loc['GPU']
df_gpu

Unnamed: 0_level_0,walltime,advance,ghostfill,regrid,adapt,adv_steps,adv_step2,mx,patch_comm,output,grids_proc,memcopy_h2h,memcopy_h2d,memcopy_d2h
procs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,458.98,342.98,112.42,2.3,1.103,3162510,341,32,0.04646,0.0,382,35.785,10.985,10.485
2,272.85,186.24,70.239,1.3477,0.6573,1581250,185,32,13.867,0.0,190,19.448,5.8454,5.3197
4,171.32,102.78,43.592,0.73535,0.46007,790627,102,32,22.924,0.0,95,7.8659,3.0234,2.6898
8,109.09,59.014,25.037,0.4057,0.32909,395314,58,32,23.268,0.0,47,3.06,1.6402,1.3716
16,124.09,71.691,16.732,0.31207,0.30489,197657,71,32,33.504,0.0,23,1.6573,0.96905,0.73838


In [10]:
# Keep only the CPU rows of the selected example

df_cpu = example_data.loc['CPU']
df_cpu

Unnamed: 0_level_0,walltime,advance,ghostfill,regrid,adapt,adv_steps,adv_step2,mx,patch_comm,output,grids_proc,memcopy_h2h,memcopy_h2d,memcopy_d2h
procs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2505.1,2388.0,112.96,2.9135,1.1091,3162510,2386,32,0.036478,0.0,382,0.0,0.0,0.0
2,1381.8,1193.8,68.496,1.2081,0.65591,1581250,1192,32,117.4,0.0,190,0.0,0.0,0.0
4,775.29,598.27,45.187,0.8878,0.45943,790627,597,32,128.99,0.0,95,0.0,0.0,0.0
8,433.98,298.91,28.14,0.43952,0.36179,395314,298,32,103.88,0.0,47,0.0,0.0,0.0
16,244.74,149.29,16.434,0.2075,0.31154,197657,149,32,75.364,0.0,23,0.0,0.0,0.0


In [11]:
# Select the columns that account for most of the time spent.
time_cols = ['walltime','advance','ghostfill','patch_comm']
df_cols = example_data.loc[:,time_cols]

# Time spent in advance + ghostfill + patch_comm, expressed as a percentage
# of the wall time and appended as the column 'Total (%)'.
percent_of_time = df_cols['advance'] + df_cols['ghostfill'] + df_cols['patch_comm']
df_cols = df_cols.assign(**{'Total (%)': 100*percent_of_time/df_cols['walltime']})

# Register a formatter for the new column.
fstr.update({'Total (%)': '{:.1f}%'.format})

caption = ('Most of the time is spent in '
           'advance, ghostfill and patch communication.  The last column '
           'indicates what percent of total time is spent in these '
           'three columns.')
df_cols.style.format(fstr).set_caption(caption)

Unnamed: 0_level_0,Unnamed: 1_level_0,walltime,advance,ghostfill,patch_comm,Total (%)
device,procs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CPU,1,2505.1,2388.0,112.96,0.04,99.8%
CPU,2,1381.8,1193.8,68.5,117.4,99.8%
CPU,4,775.29,598.27,45.19,128.99,99.6%
CPU,8,433.98,298.91,28.14,103.88,99.3%
CPU,16,244.74,149.29,16.43,75.36,98.5%
GPU,1,458.98,342.98,112.42,0.05,99.2%
GPU,2,272.85,186.24,70.24,13.87,99.1%
GPU,4,171.32,102.78,43.59,22.92,98.8%
GPU,8,109.09,59.01,25.04,23.27,98.4%
GPU,16,124.09,71.69,16.73,33.5,98.3%


In [12]:
# Display the main timing columns with CPU and GPU results side by side.
# The selection holds both devices, so it gets its own name instead of
# overwriting the GPU-only frame `df_gpu` extracted earlier.

df_side_by_side = example_data.loc[:,['walltime','advance','ghostfill','patch_comm']]
df_side_by_side.unstack(level=0).style.format('{:.2f}'.format)

Unnamed: 0_level_0,walltime,walltime,advance,advance,ghostfill,ghostfill,patch_comm,patch_comm
device,CPU,GPU,CPU,GPU,CPU,GPU,CPU,GPU
procs,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,2505.1,458.98,2388.0,342.98,112.96,112.42,0.04,0.05
2,1381.8,272.85,1193.8,186.24,68.5,70.24,117.4,13.87
4,775.29,171.32,598.27,102.78,45.19,43.59,128.99,22.92
8,433.98,109.09,298.91,59.01,28.14,25.04,103.88,23.27
16,244.74,124.09,149.29,71.69,16.43,16.73,75.36,33.5


In [13]:
# Speed-up of the GPU over the CPU for a single example, computed as
# CPU time / GPU time per process count.  The maximum of each column is highlighted.

idx = pandas.IndexSlice
df1 = example_data.loc[:,['walltime','advance','ghostfill','patch_comm']]
speedup = df1.loc['CPU'].div(df1.loc['GPU'])
speedup.style.format('{:.1f}'.format).highlight_max()

Unnamed: 0_level_0,walltime,advance,ghostfill,patch_comm
procs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.5,7.0,1.0,0.8
2,5.1,6.4,1.0,8.5
4,4.5,5.8,1.0,5.6
8,4.0,5.1,1.1,4.5
16,2.0,2.1,1.0,2.2


In [14]:
# Wall time table: one row per process count, one column per device

idx = pandas.IndexSlice
df_scale = example_data['walltime'].unstack(level=0)
df_scale

device,CPU,GPU
procs,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2505.1,458.98
2,1381.8,272.85
4,775.29,171.32
8,433.98,109.09
16,244.74,124.09


In [15]:
# Strong scaling : T_1/T_p, i.e. the wall time on one process divided by the
# wall time on p processes, computed separately for each device.
walltime = example_data['walltime'].unstack(level=0)
df_scale = walltime.assign(
    CPU_scaling=walltime.loc[1,'CPU']/walltime['CPU'],
    GPU_scaling=walltime.loc[1,'GPU']/walltime['GPU'],
)
(
    df_scale.style
    .format('{:.1f}'.format)
    .background_gradient(subset=['CPU_scaling','GPU_scaling'],cmap='YlOrBr',low=0,high=1)
)

device,CPU,GPU,CPU_scaling,GPU_scaling
procs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2505.1,459.0,1.0,1.0
2,1381.8,272.9,1.8,1.7
4,775.3,171.3,3.2,2.7
8,434.0,109.1,5.8,4.2
16,244.7,124.1,10.2,3.7


In [16]:
# Ratio of the CPU wall time on p cores to the GPU wall time on one GPU
# and on two GPUs.
walltime = example_data['walltime'].unstack(level=0)
df_scale = walltime.assign(**{
    'CPUs/1 GPU': walltime['CPU']/walltime.loc[1,'GPU'],
    'CPUs/2 GPUs': walltime['CPU']/walltime.loc[2,'GPU'],
})
(
    df_scale.style
    .format('{:.1f}'.format)
    .background_gradient(subset=['CPUs/1 GPU','CPUs/2 GPUs'],cmap='YlOrBr',low=0,high=1)
    .set_caption("Numbers indicate speed-up of multiple CPUs vs. 1 or 2 GPUs")
)

device,CPU,GPU,CPUs/1 GPU,CPUs/2 GPUs
procs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2505.1,459.0,5.5,9.2
2,1381.8,272.9,3.0,5.1
4,775.3,171.3,1.7,2.8
8,434.0,109.1,0.9,1.6
16,244.7,124.1,0.5,0.9
