# Imports and config

In [None]:
import os

import numpy as np
import pandas as pd

directory = os.getcwd().split('/')[-1]

if directory == 'notebooks':
    %cd ..

In [None]:
# Matplotlib & seaborn setup
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import seaborn as sns


# Seaborn defaults: (11, 4) figures drawn with the 'ticks' style.
sns.set(rc={'figure.figsize': (11, 4)})
sns.set_style('ticks')

# Extension appended to saved figure names; '.pdf' also selects the PDF backend.
FILE_EXTENSION = '.pdf'

# rc overrides, applied group by group after the seaborn calls above.
font = dict(family='serif', size=14)
figure = dict(autolayout=True)
for _rc_group, _rc_options in (('font', font), ('figure', figure)):
    matplotlib.rc(_rc_group, **_rc_options)

if FILE_EXTENSION == '.pdf':
    matplotlib.use('PDF')

print(f'matplotlib backend: {matplotlib.get_backend()}')

In [None]:
# Configuration variables
# should_save: flag intended to toggle writing figures to disk
# (NOTE(review): confirm which cells actually honour it).
should_save = True

In [None]:
# Both CSV files are read with their first column as a date-parsed index and
# are then sorted by that index.
_read_options = {'index_col': 0, 'parse_dates': True}
processed = pd.read_csv('proprietary_data/processed_data.csv', **_read_options).sort_index()
unprocessed = pd.read_csv('proprietary_data/half_processed_data.csv', **_read_options).sort_index()

In [None]:
# Keep only the rows whose index timestamp falls in 2018, then display them.
_is_2018 = processed.index.year == 2018
processed = processed.loc[_is_2018]
processed

In [None]:
# Display the unprocessed frame as loaded; unlike `processed`, it has not been
# restricted to a single year.
unprocessed

# Resampling

In [None]:
# Hourly incident counts: the number of non-null 'hastegrad' entries per hour,
# stored in a one-column frame named after the data set.
processed_hourly = (
    processed['hastegrad']
    .resample('H')
    .count()
    .to_frame(name='processed')
    .sort_index()
)

unprocessed_hourly = (
    unprocessed['hastegrad']
    .resample('H')
    .count()
    .to_frame(name='unprocessed')
    .sort_index()
)

## Hourly incidents

In [None]:
# Copy the calendar components of the timestamp index into plain columns so
# they can be used as grouping keys and plot axes.
for _frame in (processed_hourly, unprocessed_hourly):
    for _field in ('hour', 'day', 'month', 'year', 'weekday'):
        _frame[_field] = getattr(_frame.index, _field)

In [None]:
# Display the hourly counts together with the hour/day/month/year/weekday columns.
processed_hourly

## Daily incidents

In [None]:
# Daily totals are the sum of the hourly counts; month, year and weekday are
# taken from the daily index.
processed_daily = (
    processed_hourly['processed']
    .resample('D')
    .sum()
    .to_frame()
    .assign(
        month=lambda frame: frame.index.month,
        year=lambda frame: frame.index.year,
        weekday=lambda frame: frame.index.weekday,
    )
)

unprocessed_daily = (
    unprocessed_hourly['unprocessed']
    .resample('D')
    .sum()
    .to_frame()
    .assign(
        month=lambda frame: frame.index.month,
        year=lambda frame: frame.index.year,
        weekday=lambda frame: frame.index.weekday,
    )
)

## Aggregated weekly average

In [None]:
# Hourly timestamps for one reference week, Monday 2015-01-05 00:00 through
# Sunday 2015-01-11 23:00 (7 * 24 = 168 values). The range is built from
# `periods` rather than an end date with `closed='left'`, because the `closed`
# keyword of pd.date_range is not accepted by newer pandas releases.
reference_week = pd.date_range(start='1/5/2015', periods=7 * 24, freq='H')

# Mean and standard deviation of the hourly counts for every (weekday, hour)
# pair. The groups come out ordered by weekday then hour, so they line up with
# the reference week; set_index needs one row per timestamp, i.e. every
# (weekday, hour) combination must occur in the data.
processed_weekly_average = (
    processed_hourly['processed']
    .groupby([processed_hourly.weekday, processed_hourly.hour])
    .agg(['mean', 'std'])
    .set_index(reference_week)
)
processed_weekly_average['weekday'] = processed_weekly_average.index.weekday

unprocessed_weekly_average = (
    unprocessed_hourly['unprocessed']
    .groupby([unprocessed_hourly.weekday, unprocessed_hourly.hour])
    .agg(['mean', 'std'])
    .set_index(reference_week)
)
unprocessed_weekly_average['weekday'] = unprocessed_weekly_average.index.weekday


## Monthly incidents

In [None]:
# Monthly totals are the sum of the daily counts, tagged with the year of each
# month-end timestamp.
processed_monthly = (
    processed_daily['processed']
    .resample('M')
    .sum()
    .to_frame()
    .assign(year=lambda frame: frame.index.year)
)

unprocessed_monthly = (
    unprocessed_daily['unprocessed']
    .resample('M')
    .sum()
    .to_frame()
    .assign(year=lambda frame: frame.index.year)
)

## Yearly incidents

In [None]:
# Yearly totals are the sum of the monthly counts.
processed_yearly = processed_monthly['processed'].resample('Y').sum().to_frame()

unprocessed_yearly = unprocessed_monthly['unprocessed'].resample('Y').sum().to_frame()

# Plotting

## Incidents per day

In [None]:
# Daily incident counts for the processed data, overlaid with a centred
# rolling mean and a linear trend line.
x, y = processed_daily.index, processed_daily.processed
ax = sns.lineplot(data=processed_daily, x=x, y=y)
ax.set(xlabel='Time', ylabel='Incidents per day')

# Centred 30-day window that needs at least 28 values, so points close to
# either end of the series are left as NaN.
rolling_mean = y.rolling(window=30, center=True, min_periods=28).mean()
ax.plot(rolling_mean, 'k-', label='30-days rolling mean')

# The degree-1 polynomial fit needs numeric x values, so the dates are
# converted to Matplotlib date numbers first.
x = mdates.date2num(x)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
ax.plot(x, p(x), linestyle='-.', color='red', label='Trend line')


sns.despine()
ax.legend()

ax.set_xlim(left=processed_daily.index[0])

# Minor axis
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in newer Matplotlib releases, and the positional form works with both.
ax.grid(True, which='minor')
ax.xaxis.set_minor_locator(mdates.MonthLocator(bymonth=[4, 7, 10]))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))

# Major axis
ax.grid(True, which='major')
ax.tick_params(which='major', axis='x', length=15, width=1)
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

plt.tight_layout()

plt.plot()


In [None]:
# Daily incident counts for the unprocessed data, overlaid with a centred
# rolling mean and a linear trend line.
x, y = unprocessed_daily.index, unprocessed_daily.unprocessed
ax = sns.lineplot(data=unprocessed_daily, x=x, y=y)
ax.set(xlabel='Time', ylabel='Incidents per day')

# Centred 30-day window that needs at least 28 values, so points close to
# either end of the series are left as NaN.
rolling_mean = y.rolling(window=30, center=True, min_periods=28).mean()
ax.plot(rolling_mean, 'k-', label='30-days rolling mean')

# The degree-1 polynomial fit needs numeric x values, so the dates are
# converted to Matplotlib date numbers first.
x = mdates.date2num(x)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
ax.plot(x, p(x), linestyle='-.', color='red', label='Trend line')


sns.despine()
ax.legend()

ax.set_xlim(left=unprocessed_daily.index[0])

# Minor axis
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in newer Matplotlib releases, and the positional form works with both.
ax.grid(True, which='minor')
ax.xaxis.set_minor_locator(mdates.MonthLocator(bymonth=[4, 7, 10]))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))

# Major axis
ax.grid(True, which='major')
ax.tick_params(which='major', axis='x', length=15, width=1)
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

plt.tight_layout()

plt.plot()


## Monthly plots

In [None]:
def int_to_month(x, _):
    """Format a zero-based month tick value as a three-letter month name.

    Takes the ``(value, position)`` pair that a ``FuncFormatter`` passes to
    its callback.

    Args:
        x: Tick value; 0 maps to 'Jan' and 11 to 'Dec'. Whole-number floats
            such as ``3.0`` are accepted.
        _: Tick position supplied by the formatter; unused.

    Returns:
        The month abbreviation, or an empty string when ``x`` is not a whole
        number in the range 0-11.
    """
    months = [
        'Jan',
        'Feb',
        'Mar',
        'Apr',
        'May',
        'Jun',
        'Jul',
        'Aug',
        'Sep',
        'Oct',
        'Nov',
        'Dec'
    ]
    try:
        index = int(x)
    except (TypeError, ValueError, OverflowError):
        # NaN, infinities and non-numeric values have no month label.
        return ''
    # Reject fractional values and anything outside the list; in particular a
    # negative value must not wrap around to the end of the list.
    if index != x or not 0 <= index < len(months):
        return ''
    return months[index]


In [None]:
# Distribution of the processed daily counts, one violin per month.
ax = sns.violinplot(x='month', y='processed', data=processed_daily)
ax.set_xlabel('Month')
ax.set_ylabel('Average incidents per day')

sns.despine(trim=True)

# Label the x ticks with month abbreviations.
_month_formatter = FuncFormatter(int_to_month)
ax.xaxis.set_major_formatter(_month_formatter)

plt.plot()

In [None]:
# Distribution of the unprocessed daily counts, one violin per month.
ax = sns.violinplot(x='month', y='unprocessed', data=unprocessed_daily)
ax.set_xlabel('Month')
ax.set_ylabel('Average incidents per day')

sns.despine(trim=True)

# Label the x ticks with month abbreviations.
_month_formatter = FuncFormatter(int_to_month)
ax.xaxis.set_major_formatter(_month_formatter)

plt.plot()

In [None]:
# Per-month mean of the processed daily counts, relative to the overall
# processed daily mean.
_daily_counts = processed_daily['processed']
mean = _daily_counts.mean()
deviation = _daily_counts.groupby([processed_daily['month']]).mean() - mean

ax = sns.barplot(x=deviation.index, y=deviation)
ax.set_xlabel('Month')
ax.set_ylabel('Deviation from daily mean')
ax.axhline(0, color='k')

sns.despine(trim=True)

_month_formatter = FuncFormatter(int_to_month)
ax.xaxis.set_major_formatter(_month_formatter)

plt.plot()

In [None]:
# Per-month mean of the unprocessed daily counts, relative to the overall
# unprocessed daily mean.
_daily_counts = unprocessed_daily['unprocessed']
mean = _daily_counts.mean()
deviation = _daily_counts.groupby([unprocessed_daily['month']]).mean() - mean

ax = sns.barplot(x=deviation.index, y=deviation)
ax.set_xlabel('Month')
ax.set_ylabel('Deviation from daily mean')
ax.axhline(0, color='k')

sns.despine(trim=True)

_month_formatter = FuncFormatter(int_to_month)
ax.xaxis.set_major_formatter(_month_formatter)

plt.plot()

In [None]:
# Per-month mean of the processed daily counts, measured against the overall
# daily mean of the unprocessed data.
unprocessed_mean = unprocessed_daily['unprocessed'].mean()
_processed_monthly_mean = processed_daily['processed'].groupby([processed_daily['month']]).mean()
deviation = _processed_monthly_mean - unprocessed_mean

ax = sns.barplot(x=deviation.index, y=deviation)
ax.set_xlabel('Month')
ax.set_ylabel('Deviation from daily mean')
ax.axhline(0, color='k')

sns.despine(trim=True)

_month_formatter = FuncFormatter(int_to_month)
ax.xaxis.set_major_formatter(_month_formatter)

plt.plot()

In [None]:
# Difference between the two data sets' monthly deviations: each per-month
# mean is first centred on its own overall daily mean, then the unprocessed
# deviation is subtracted from the processed one.
processed_mean = processed_daily['processed'].mean()
unprocessed_mean = unprocessed_daily['unprocessed'].mean()

_processed_monthly_mean = processed_daily['processed'].groupby([processed_daily['month']]).mean()
_unprocessed_monthly_mean = unprocessed_daily['unprocessed'].groupby([unprocessed_daily['month']]).mean()

deviation = (_processed_monthly_mean - processed_mean) - (_unprocessed_monthly_mean - unprocessed_mean)

ax = sns.barplot(x=deviation.index, y=deviation)
ax.set_xlabel('Month')
ax.set_ylabel('Deviation from daily mean')
ax.axhline(0, color='k')

sns.despine(trim=True)

_month_formatter = FuncFormatter(int_to_month)
ax.xaxis.set_major_formatter(_month_formatter)

plt.plot()

## Weekday violin plot

In [None]:
def int_to_weekday(x, _):
    """Format a zero-based weekday tick value as a three-letter day name.

    Takes the ``(value, position)`` pair that a ``FuncFormatter`` passes to
    its callback.

    Args:
        x: Tick value; 0 maps to 'Mon' and 6 to 'Sun'. Whole-number floats
            such as ``3.0`` are accepted.
        _: Tick position supplied by the formatter; unused.

    Returns:
        The weekday abbreviation, or an empty string when ``x`` is not a whole
        number in the range 0-6.
    """
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    try:
        index = int(x)
    except (TypeError, ValueError, OverflowError):
        # NaN, infinities and non-numeric values have no weekday label.
        return ''
    # Reject fractional values and anything outside the list; in particular a
    # negative value must not wrap around to the end of the list.
    if index != x or not 0 <= index < len(weekdays):
        return ''
    return weekdays[index]

# Distribution of the processed hourly counts, one violin per weekday.
ax = sns.violinplot(x='weekday', y='processed', data=processed_hourly)
ax.set_xlabel('Weekday')
ax.set_ylabel('Average incidents per hour')

sns.despine(trim=True)

# Label the x ticks with weekday abbreviations.
_weekday_formatter = FuncFormatter(int_to_weekday)
ax.xaxis.set_major_formatter(_weekday_formatter)

plt.plot()

In [None]:
def int_to_weekday(x, _):
    """Format a zero-based weekday tick value as a three-letter day name.

    Takes the ``(value, position)`` pair that a ``FuncFormatter`` passes to
    its callback.

    Args:
        x: Tick value; 0 maps to 'Mon' and 6 to 'Sun'. Whole-number floats
            such as ``3.0`` are accepted.
        _: Tick position supplied by the formatter; unused.

    Returns:
        The weekday abbreviation, or an empty string when ``x`` is not a whole
        number in the range 0-6.
    """
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    try:
        index = int(x)
    except (TypeError, ValueError, OverflowError):
        # NaN, infinities and non-numeric values have no weekday label.
        return ''
    # Reject fractional values and anything outside the list; in particular a
    # negative value must not wrap around to the end of the list.
    if index != x or not 0 <= index < len(weekdays):
        return ''
    return weekdays[index]

# Distribution of the unprocessed hourly counts, one violin per weekday.
ax = sns.violinplot(x='weekday', y='unprocessed', data=unprocessed_hourly)
ax.set_xlabel('Weekday')
ax.set_ylabel('Average incidents per hour')

sns.despine(trim=True)

# Label the x ticks with weekday abbreviations.
_weekday_formatter = FuncFormatter(int_to_weekday)
ax.xaxis.set_major_formatter(_weekday_formatter)

plt.plot()

In [None]:
# Hourly processed counts restricted to December 2018.
_is_december_2018 = processed_hourly['month'].eq(12) & processed_hourly['year'].eq(2018)
processed_hourly_december_2018 = processed_hourly.loc[_is_december_2018]
processed_hourly_december_2018

In [None]:
# Hourly processed counts for 10-16 December 2018 (both days inclusive).
_in_w50_2018 = (
    processed_hourly['day'].between(10, 16)
    & processed_hourly['month'].eq(12)
    & processed_hourly['year'].eq(2018)
)
processed_hourly_december_w50_2018 = processed_hourly.loc[_in_w50_2018]
processed_hourly_december_w50_2018

## Hourly average

In [None]:
# Incidents per hour of the day: the unprocessed data first, then the
# processed data and two December 2018 subsets drawn on the same axes.
ax = sns.lineplot(x='hour', y='unprocessed', data=unprocessed_hourly, label='unfiltered')

_processed_series = (
    (processed_hourly, 'filtered'),
    (processed_hourly_december_2018, 'December 2018'),
    (processed_hourly_december_w50_2018, 'December w50 2018'),
)
for _frame, _label in _processed_series:
    sns.lineplot(x='hour', y='processed', data=_frame, label=_label, ax=ax)

ax.set_xlabel('Hour of the day')
ax.set_ylabel('Average incidents per hour')

sns.despine()

# Anchor both axes at zero.
ax.set_ylim(bottom=0)
ax.set_xlim(left=0)

ax.grid()
# One tick per integer from 0 to 24.
ax.xaxis.set_ticks(np.arange(25))

plt.legend()
plt.plot()

## Weekly average (hourly mean by time of week)

In [None]:

# Mean incidents per hour across the reference week, processed against
# unprocessed, drawn on shared axes.
ax = sns.lineplot(data=processed_weekly_average, x=processed_weekly_average.index, y='mean', label='processed')
sns.lineplot(data=unprocessed_weekly_average, x=unprocessed_weekly_average.index, y='mean', ax=ax, label='unprocessed')
ax.set(xlabel='Time of week', ylabel='Average incidents per hour')

sns.despine()

ax.set_xlim([processed_weekly_average.index[0], processed_weekly_average.index[-1]])
ax.set_ylim(bottom=0)

# Minor axis
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in newer Matplotlib releases, and the positional form works with both.
ax.grid(True, which='minor', color='rosybrown', linestyle='--')
ax.xaxis.set_minor_locator(mdates.HourLocator(byhour=[6, 12, 18]))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H'))

# Major axis
ax.grid(True, which='major')
ax.tick_params(which='major', axis='x', length=17.5, width=1)
ax.xaxis.set_major_locator(mdates.DayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%A'))

plt.legend()
plt.tight_layout()

# Only write the figure when saving is enabled in the configuration cell.
# NOTE(review): the path is relative to the working directory, which the first
# cell may change with `%cd ..` - confirm '../output' resolves where intended.
if should_save:
    plt.savefig(f'../output/validation/weekly_average_comparison{FILE_EXTENSION}')