## Data Understanding
Code used to support the data quality assessment and exploratory data analysis as described in the paper "Rescue Drivers Deployment".

In [10]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import dtale
import scipy.stats as sts
from plotly.subplots import make_subplots
import plotly.express as px

In [11]:
# Load the sickness table from a local OneDrive folder; adjust the path when running on another machine
df = pd.read_csv(
    r"C:\Users\BothmK01\OneDrive - FrieslandCampina\Data science master\Case Study Model Engineering\Case 2 Rescue Drivers\sickness_table.csv"
)

In [12]:
# Parse the date column and derive day-of-year and year columns to enhance the visualizations
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df['day_of_year'] = date_parts.dayofyear
df['year'] = date_parts.year

## d-tale
d-tale is a great open-source Python package that speeds up exploratory data analysis dramatically. Its drag-and-drop interface makes it simple to get quick insights. All leads found during this step were researched more extensively, and each visual that ended up in the paper was reproduced as shown in the code below. To get an idea of how d-tale helps data analysis: the function that proved most useful for this work can be found under the menu D-Tale > Visualize > Charts.

In [13]:
# Open the interactive D-Tale view of df.
# The import cell might have to be re-run before this call works.
dtale.show(df)



In [14]:
# Adaptation of the code generated by dtale to get histograms with Kernel Density Estimation
# The Describe function of dtale generates the same histograms but can only export to html or png
# This function recreates the histograms and saves them as vector plots in pdf format, which works well with LaTeX
# Please note that this only defines the function; it is called in the next cell

def histogram(column_name, df):
    """Plot a 20-bin histogram of one column with a Gaussian KDE overlay.

    The figure is displayed and also written to ``<column_name>hist.pdf`` in
    the current working directory.

    Args:
        column_name: Name of the column to plot. It is looked up after all
            column labels have been converted to strings.
        df: Data holding the column. A ``DatetimeIndex`` or ``MultiIndex`` is
            converted to a frame first. The caller's object is not modified,
            because all changes are made on a re-indexed copy.

    Returns:
        None.
    """
    if isinstance(df, (pd.DatetimeIndex, pd.MultiIndex)):
        df = df.to_frame(index=False)

    # remove any pre-existing indices for ease of use in the D-Tale code, but this is not required
    df = df.reset_index().drop('index', axis=1, errors='ignore')
    df.columns = [str(c) for c in df.columns]  # update columns to strings in case they are numbers

    # Missing values are dropped for both the histogram and the KDE.
    values = df[column_name].dropna()
    counts, edges = np.histogram(values, bins=20)

    # Place each bar at the centre of its bin and evaluate the KDE on the same
    # data scale, so both traces share one x-axis and line up with each other.
    centers = (edges[:-1] + edges[1:]) / 2
    kde_x = np.linspace(edges.min(), edges.max())
    kde_y = sts.gaussian_kde(values).pdf(kde_x)

    traces = [
        go.Bar(x=centers, y=counts, name='Histogram'),
        # The density has a different magnitude than the counts, so it gets
        # its own y-axis on the right-hand side.
        go.Scatter(
            x=kde_x, y=kde_y, name='KDE', yaxis='y2',
            line={'shape': 'spline', 'smoothing': 0.3}, mode='lines'
        ),
    ]
    figure = go.Figure(data=traces, layout=go.Layout({
        'barmode': 'group',
        'legend': {'orientation': 'h'},
        'title': {'text': column_name+' Histogram (bins: 20) w/ KDE'},
        'yaxis': {'side': 'left', 'title': {'text': 'Frequency'}},
        'yaxis2': {'overlaying': 'y', 'side': 'right', 'title': {'text': 'KDE'}}
    }))

    figure.show()
    figure.write_image(column_name+"hist.pdf")

In [15]:
# Call the histogram function on every numeric column except the date column.
# The histogram/KDE computation needs numbers, and other cells add string columns
# (e.g. "year_") to df, so non-numeric columns are skipped to keep this cell re-runnable.
for col in df.select_dtypes(include='number').columns:
    if col != 'date':
        histogram(col, df)

In [16]:
# Seasonal pattern first spotted in d-tale; reproduced here with plotly.
# The year is stored as text in "year_" and used as the colour grouping.
df["year_"] = df["year"].astype(str)
df_plot = df
fig = px.scatter(
    df_plot,
    x="day_of_year",
    y="n_sick",
    color="year_",
    title="sick rescue drivers by day of year",
)

fig.show()
fig.write_image('sick_doy.pdf')

In [17]:
# Seasonal pattern in the number of calls, first spotted in d-tale; reproduced here with plotly.
# The year is stored as text in "year_" and used as the colour grouping.
df["year_"] = df["year"].astype(str)
df_plot = df
fig = px.scatter(
    df_plot,
    x="day_of_year",
    y="calls",
    color="year_",
    title="number of calls per day of year",
)

fig.show()
fig.write_image('calls_doy.pdf')

In [18]:
# Number of calls plotted against the calendar date (instead of the day of year), coloured by year.
# Pattern first spotted in d-tale but simple to reproduce with plotly.
df_plot = df
df["year_"] = df["year"].astype(str)
# The title names the actual x-axis: this plot uses "date", not "day_of_year".
fig = px.scatter(df_plot, x="date", y="calls", color="year_",
                 title="number of calls by date")

fig.show()
fig.write_image('calls_date.pdf')

In [19]:
# ReLU-shaped relationship first spotted in d-tale; reproduced here with plotly.
# NOTE: df_plot is the same object as df, so "n_duty" is converted to text on df itself.
df["n_duty"] = df["n_duty"].astype(str)
df_plot = df

# Stand-by drivers needed versus number of calls, coloured by drivers on duty
fig = px.scatter(
    df_plot,
    x="calls",
    y="sby_need",
    color="n_duty",
    title="Use of stand-by drivers by calls",
)
fig.show()
fig.write_image('calls_sby.pdf')

# Values of the "dafted" column versus number of calls, coloured by drivers on duty
fig = px.scatter(
    df_plot,
    x="calls",
    y="dafted",
    color="n_duty",
    title="Dafted drivers by calls",
)
fig.show()
fig.write_image('calls_dafted.pdf')