# Exploring pipeline log information using plots

In [1]:
#imports & reading data
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly
from plotly.offline import iplot, init_notebook_mode
from datetime import datetime

# config
pipeline_name = 'ardetype'
init_notebook_mode(connected=True)

# Folder holding the aggregated job logs, relative to the notebook's working directory.
log_dir = f'../../{pipeline_name}_job_logs/'

# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the selection reproducible when more than one aggregate file is present.
aggr_candidates = sorted(
    path for path in os.listdir(log_dir) if '-log_aggregate_' in path
)
if not aggr_candidates:
    # Fail with an explicit message instead of a bare IndexError on [0].
    raise FileNotFoundError(
        f"No '-log_aggregate_' file found in {log_dir}"
    )
# Lexicographically last match; presumably the newest one if the file names
# embed a sortable timestamp - TODO confirm against the naming scheme.
aggr_path = aggr_candidates[-1]

df = pd.read_csv(os.path.join(log_dir, aggr_path))
# start_time is parsed with the '%Y:%m:%d-%H:%M:%S' layout (colon-separated date).
df['start_time'] = pd.to_datetime(df['start_time'], format='%Y:%m:%d-%H:%M:%S')
# Disable pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

In [2]:
# Histogram of how many log rows exist for each job_name.
job_count_kwargs = dict(
    x='job_name',
    title='Total job count by type',
    labels={"job_name": "Job Name"},
)
fig = px.histogram(df, **job_count_kwargs)
iplot(fig)


In [3]:
# Failed job counts by job name: only rows flagged is_failed == 1 are plotted,
# with one colour per job type.
failed_rows = df[df['is_failed'] == 1]
fig = px.histogram(
    failed_rows,
    x='job_name',
    y='is_failed',
    color='job_name',
    title='Failed job count by job type',
    labels={"job_name": "Job Name", "is_failed": "failed"},
)
iplot(fig)

In [4]:
# Exploring efficiency by job_name (box plots on a log10 scale).
#
# log10(0) is -inf, so zero efficiencies are replaced by 0.01 first; they end
# up at log10(0.01) = -2 on the plot.
# The replacement is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Eff']: the in-place form operates on the
# object returned by the column selection and is not guaranteed to update df
# (pandas warns about this pattern and ignores it under Copy-on-Write).
#
# NOTE: df['Eff'] is overwritten with its log10 here, and the later scatter
# cell plots df['Eff'] labelled as log10(Eff). Run this cell exactly once per
# freshly loaded df, otherwise the transform is applied twice.
df['Eff'] = np.log10(df['Eff'].replace(to_replace=0, value=0.01))

fig = px.box(
    df[['job_name', 'Eff']],
    x='job_name',
    y='Eff',
    color='job_name',
    # The original overwrote this title with a copy-pasted "Time Series" one,
    # although this figure is a box plot; keep a single, accurate title.
    title='log10(Job efficiency) box plots by job type; Eff = 100*(time_used/time_requested)',
    labels={"job_name": "Job Name"},
)

fig.update_layout(
    xaxis_title="Job Name",
    yaxis_title="log10(Eff)",
    legend_title="Job Name",
)

iplot(fig)

In [5]:
# Exploring real runtime in seconds by job_name (box plots on a log10 scale).
#
# log10(0) is -inf, so zero runtimes are replaced by 0.01 first; they end up
# at log10(0.01) = -2 on the plot.
# The replacement is assigned back to the column instead of calling
# replace(..., inplace=True) on df['time_sec_total']: the in-place form
# operates on the object returned by the column selection and is not
# guaranteed to update df (pandas warns about this pattern and ignores it
# under Copy-on-Write).
#
# NOTE: df['time_sec_total'] is overwritten with its log10 here, and the later
# scatter cell plots it labelled as log10(time_sec_total). Run this cell
# exactly once per freshly loaded df, otherwise the transform is applied twice.
df['time_sec_total'] = np.log10(
    df['time_sec_total'].replace(to_replace=0, value=0.01)
)

fig = px.box(
    df[['job_name', 'time_sec_total']],
    x='job_name',
    y='time_sec_total',
    color='job_name',
    # The original overwrote this title with a copy-pasted "Time Series" one,
    # although this figure is a box plot; keep a single, accurate title.
    title='log10(Job run time) box plots by job type',
)

fig.update_layout(
    xaxis_title="Job Name",
    yaxis_title="log10(time_sec_total)",
    legend_title="Job Name",
)

iplot(fig)

In [6]:
# Scatter of the Eff column against job start time, one colour per job type.
# The axis is labelled log10(Eff); this assumes df['Eff'] was already
# log10-transformed by the earlier box-plot cell.
fig = px.scatter(
    df,
    x='start_time',
    y="Eff",
    color='job_name',
    labels={"job_name": "Job Name", "start_time": "Job start time"},
)

# Dashed horizontal guide lines as (y level, colour), added top to bottom.
guide_lines = [
    (4, "orange"),
    (3, "#62d5a6"),
    (2, "green"),
    (1, "#62d5a6"),
    (0.5, "orange"),
    (0, "red"),
    (-0.5, "#c061de"),
    (-1, "blue"),
    (-2, "black"),
]
for level, colour in guide_lines:
    fig.add_hline(y=level, line_dash="dash", line_color=colour, line_width=1)

fig.update_layout(
    title='Time Series: log10(Eff) by job type; Eff = 100*(time_used/time_requested)',
    xaxis_title="Job Start Time",
    yaxis_title="log10(Eff)",
    legend_title="Job Name",
)

iplot(fig)


In [7]:
# Scatter of the time_sec_total column against job start time, one colour per
# job type. The axis is labelled log10(time_sec_total); this assumes the
# column was already log10-transformed by the earlier box-plot cell.
runtime_labels = {"job_name": "Job Name", "start_time": "Job start time"}
fig = px.scatter(
    df,
    x='start_time',
    y="time_sec_total",
    color='job_name',
    labels=runtime_labels,
)

runtime_layout = dict(
    title='Time Series: log10(time_sec_total) by job type',
    xaxis_title="Job Start Time",
    yaxis_title="log10(time_sec_total)",
    legend_title="Job Name",
)
fig.update_layout(**runtime_layout)

iplot(fig)

In [8]:
# Failed job count per hour and job type.
#
# Start times are bucketed to the hour by formatting them as 'YYYY-MM-DD HH'.
# assign() builds a new frame, so nothing is written through a slice of df.
# The original `df_1.loc[:, 'start_time'] = ...` relied on the silenced
# chained-assignment warning, and whether it leaves strings or datetimes in
# the column depends on the pandas version.
df_1 = df[['start_time', 'job_name', 'is_failed']].assign(
    start_time=lambda d: d['start_time'].dt.strftime('%Y-%m-%d %H')
)
# Keep only failed jobs, then sum is_failed within each (hour, job type) group.
df_1 = df_1.loc[df_1['is_failed'] == 1]
df_1 = df_1.groupby(['start_time', 'job_name']).sum().reset_index(drop=False)


fig_f = px.scatter(
    df_1,
    x='start_time',
    y="is_failed",
    color='job_name',
    labels={"job_name": "Job Name", "start_time": "Job start time", "is_failed": "Failed count"},
)

fig_f.update_layout(
    title='Time Series: Failed job count by job type',
    xaxis_title="Job Start Time",
    yaxis_title="Failed job count",
    legend_title="Job Name",
)

iplot(fig_f)

In [9]:
# Job count per hour and job type, drawn as an area chart.
#
# Start times are bucketed to the hour by formatting them as 'YYYY-MM-DD HH'.
# assign() builds a new frame, so nothing is written through a slice of df.
# The original `df_1.loc[:, 'start_time'] = ...` relied on the silenced
# chained-assignment warning, and whether it leaves strings or datetimes in
# the column depends on the pandas version.
df_1 = df[['start_time', 'job_name']].assign(
    start_time=lambda d: d['start_time'].dt.strftime('%Y-%m-%d %H')
)
# Count rows per (hour, job type). groupby() returns the groups ordered by
# key, i.e. chronologically, whereas value_counts() orders them by frequency,
# which would feed the area traces their points out of time order.
s_1 = (
    df_1.groupby(['start_time', 'job_name'])
    .size()
    .reset_index(name='counts')
)


fig = px.area(
    s_1,
    x='start_time',
    y="counts",
    color='job_name',
    labels={"job_name": "Job Name", "start_time": "Job start time", "counts": "Job count"},
)

fig.update_layout(
    title='Time Series: Job count by Job type',
    xaxis_title="Job Start Time",
    yaxis_title="Job count",
    legend_title="Job Name",
)

iplot(fig)

In [10]:
# Requested RAM (mem_gb_req) summed per hour, per job type plus an overall total.
#
# Start times are bucketed to the hour by formatting them as 'YYYY-MM-DD HH'.
# assign() builds a new frame, so nothing is written through a slice of df.
# The original `df_1.loc[:, 'start_time'] = ...` relied on the silenced
# chained-assignment warning, and whether it leaves strings or datetimes in
# the column depends on the pandas version.
df_1 = df[['start_time', 'job_name', 'mem_gb_req']].assign(
    start_time=lambda d: d['start_time'].dt.strftime('%Y-%m-%d %H')
)

# Per (hour, job type) sums.
df_2 = df_1.groupby(['start_time', 'job_name']).sum().reset_index()
# Per-hour sums across all job types, tagged with the pseudo job name 'total'.
df_3 = df_1[['start_time', 'mem_gb_req']].groupby(by='start_time').sum().reset_index()
df_3['job_name'] = 'total'
df_2 = pd.concat([df_2, df_3], sort=False)

# NOTE(review): px.area stacks its traces, so the 'total' series is drawn on
# top of the per-job series rather than as an outline - confirm this is the
# intended reading of the chart.
fig = px.area(
    df_2,
    x='start_time',
    y="mem_gb_req",
    color='job_name',
    labels={"job_name": "Job Name", "start_time": "Job start time", "mem_gb_req": "Requested RAM(Gb/hour)"},
)
fig.update_layout(
    title='Time Series: RAM request by Job type',
    xaxis_title="Job Start Time",
    yaxis_title="Requested RAM(Gb/hour)",
    legend_title="Job Name",
)

iplot(fig)


In [11]:
# CPU limit (cpu_snakemake) summed per hour, per job type plus an overall total.
#
# Start times are bucketed to the hour by formatting them as 'YYYY-MM-DD HH'.
# assign() builds a new frame, so nothing is written through a slice of df.
# The original `df_1.loc[:, 'start_time'] = ...` relied on the silenced
# chained-assignment warning, and whether it leaves strings or datetimes in
# the column depends on the pandas version.
df_1 = df[['start_time', 'job_name', 'cpu_snakemake']].assign(
    start_time=lambda d: d['start_time'].dt.strftime('%Y-%m-%d %H')
)

# Per (hour, job type) sums.
df_2 = df_1.groupby(['start_time', 'job_name']).sum().reset_index()
# Per-hour sums across all job types, tagged with the pseudo job name 'total'.
df_3 = df_1[['start_time', 'cpu_snakemake']].groupby(by='start_time').sum().reset_index()
df_3['job_name'] = 'total'
df_2 = pd.concat([df_2, df_3], sort=False)

# NOTE(review): px.area stacks its traces, so the 'total' series is drawn on
# top of the per-job series rather than as an outline - confirm this is the
# intended reading of the chart.
fig = px.area(
    df_2,
    x='start_time',
    y="cpu_snakemake",
    color='job_name',
    labels={"job_name": "Job Name", "start_time": "Job start time", "cpu_snakemake": "CPU count limit (1/hour)"},
)
fig.update_layout(
    title='Time Series: CPU count limit (1/hour)',
    xaxis_title="Job Start Time",
    yaxis_title="CPU count limit (1/hour)",
    legend_title="Job Name",
)

iplot(fig)