In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, DateType
import pyspark.sql.functions as f
import os
from custom_utils import project_base_dir, rolling_window_size, fuel_type, visualisation_dir
import pandas as pd
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from calendar import day_name

In [None]:
# Spark session used by every cell below; getOrCreate() hands back the
# active session when one already exists.
spark = (
    SparkSession.builder
    .appName("visualisation-notebook")
    .getOrCreate()
)

In [None]:
# Explicit schema for the training CSV. Every column is declared nullable
# (third argument True).
training_schema = (
    StructType()
    # identifiers and time keys
    .add("station_uuid", StringType(), True)
    .add("date", DateType(), True)
    .add("hour", IntegerType(), True)
    .add("weekday", IntegerType(), True)
    # value that the plots below visualise
    .add("deviation", DoubleType(), True)
    # weather columns
    .add("cloudcover", IntegerType(), True)
    .add("rain", DoubleType(), True)
    .add("temperature_2m", DoubleType(), True)
    # sine/cosine columns for hour and weekday
    .add("hour_sin", DoubleType(), True)
    .add("hour_cos", DoubleType(), True)
    .add("weekday_sin", DoubleType(), True)
    .add("weekday_cos", DoubleType(), True)
)

In [None]:
# Load the training data CSV (first row is a header) with the explicit schema
# instead of letting Spark infer the column types.
training_data_path = os.path.join(project_base_dir, "outputs/training_data.csv")
train_df = (
    spark.read.format("csv")
    .option("header", True)
    .schema(training_schema)
    .load(training_data_path)
)

In [None]:
# Flatten the single-column rows into a plain RDD of deviation values and
# bucket them into 50 bins; the result is unpacked further down as
# (bin edges, frequencies).
deviation_values = train_df.select('deviation').rdd.flatMap(lambda row: row)
deviation_histogram = deviation_values.histogram(50)

# Overall mean deviation, computed on the DataFrame side.
mean_row = train_df.select(
    f.mean(f.col('deviation')).alias('mean')
).collect()[0]
deviation_mean = mean_row['mean']

In [None]:
def calculate_mean_position(mean, xticks, label_values):
    """Map a value from label space onto the axis' tick-position space.

    The value is placed by linear interpolation: the smallest entry of
    ``label_values`` corresponds to the smallest entry of ``xticks`` and the
    largest label to the largest tick. Values outside the label range are
    extrapolated, not clipped.

    Args:
        mean: Value expressed in the same units as ``label_values``.
        xticks: Non-empty iterable of numeric tick positions.
        label_values: Non-empty iterable of the numeric values the tick
            labels stand for.

    Returns:
        The position of ``mean`` in tick coordinates. When all label values
        are equal (e.g. a histogram with a single bucket) there is no range
        to interpolate over and the smallest tick position is returned.

    Raises:
        ValueError: If ``xticks`` or ``label_values`` is empty.
    """
    min_label, max_label = min(label_values), max(label_values)
    min_tick, max_tick = min(xticks), max(xticks)

    label_span = max_label - min_label
    if label_span == 0:
        # Degenerate label range: avoid dividing by zero.
        return min_tick

    # Fraction of the way from the smallest to the largest label ...
    fraction = (mean - min_label) / label_span
    # ... projected onto the tick range.
    return fraction * (max_tick - min_tick) + min_tick

def add_mean_deviation(ax, means, tick_labels, custom_labels=("mean deviation",), colours=("r",), add_legend=True):
    """Draw a vertical line for each mean and optionally list them in the legend.

    Args:
        ax: Matplotlib axes to draw on.
        means: Values to mark, in the units of ``tick_labels``.
        tick_labels: Numeric values represented by the axis' tick labels;
            used to convert each mean into a tick position.
        custom_labels: Legend text for each mean line.
        colours: Matplotlib colour for each mean line; paired positionally
            with ``means`` and ``custom_labels`` (extra entries on either
            side are ignored by ``zip``).
        add_legend: When true, (re)create the axes legend in the upper right
            corner containing the existing labelled artists plus one colour
            patch per mean line.
    """
    for colour, mean in zip(colours, means):
        position = calculate_mean_position(mean, ax.get_xticks(), tick_labels)
        ax.axvline(position, c=colour)

    if not add_legend:
        return

    # Start from whatever the axes would show in its legend already and
    # append one colour patch per mean line. Passing handles and labels to
    # ax.legend() builds the legend via the public API rather than patching
    # the private internals of an existing Legend object.
    handles, labels = ax.get_legend_handles_labels()
    for colour, label in zip(colours, custom_labels):
        handles.append(Patch(facecolor=colour, edgecolor=colour))
        labels.append(label)

    ax.legend(handles, labels, loc='upper right')

In [None]:
# Pair every bucket's left edge with its frequency (zip stops at the shorter
# sequence) and index the frame by edge so the bar plot uses the edges as
# x tick labels.
deviation_frame = pd.DataFrame(
    list(zip(*deviation_histogram)),
    columns=['bin', 'frequency'],
).set_index('bin')
ax = deviation_frame.plot.bar(
    figsize=(10, 5),
    title="Distribution of deviations",
)

# Read the numeric bucket edges back from the tick labels; `tick_labels` is
# reused by the per-weekday and per-hour plots further down.
labels = [tick.get_text() for tick in ax.get_xticklabels()]
tick_labels = [float(text) for text in labels]
ax.set_xticklabels([f"{value:.03f}" for value in tick_labels])
ax.set_xlabel(f"Deviation of the {fuel_type} price from its {rolling_window_size}-day rolling average")

# Mark the overall mean and write the figure to the visualisation directory.
add_mean_deviation(ax, [deviation_mean], tick_labels)
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(visualisation_dir, "distribution_of_deviations.svg"))

#### Plot distribution per weekday

In [None]:
# One histogram per weekday, stacked vertically with shared x and y axes.
fig, axis = plt.subplots(7, figsize=(10, 15), sharex=True, sharey=True)

# Use the bucket edges of the overall histogram for every weekday. With
# histogram(50) each weekday would get its own edges (derived from that
# weekday's min/max), so bars at the same x position in different subplots
# would cover different deviation ranges, and the shared tick labels and the
# mean line (both derived from the overall histogram) would be misplaced.
shared_bin_edges = list(deviation_histogram[0])

# NOTE(review): the titles assume weekday 1 corresponds to day_name[0]
# (Monday) - confirm against how the `weekday` column is produced.
for i in range(1, 8):
    weekday_deviation = (
        train_df
        .filter(f.col("weekday") == i)
        .select('deviation')
        .rdd.flatMap(lambda x: x)
    )
    dev_histogram = weekday_deviation.histogram(shared_bin_edges)
    ax = axis[i-1]
    ax = pd.DataFrame(
        list(zip(*dev_histogram)),
        columns=['bin', 'frequency']
    ).set_index(
        'bin'
    ).plot(
        kind='bar',
        ax=ax,
        legend=False,
        title=day_name[i-1]
    )
    # The line marks the overall mean deviation (not the weekday's own mean);
    # only the first subplot carries the legend.
    add_mean_deviation(ax, [deviation_mean], tick_labels, add_legend=i==1)

# `ax` is now the last (bottom) subplot: label the shared x axis there.
ax.set_xticklabels([f"{label:.03f}" for label in tick_labels])
ax.set_xlabel(f"Deviation of the {fuel_type} price from its {rolling_window_size}-day rolling average")

fig.suptitle(f"{fuel_type.capitalize()} price deviations per weekday", y=1.003, fontsize="xx-large")
fig.tight_layout()
fig.savefig(os.path.join(visualisation_dir, "distribution_of_deviations_per_weekday.svg"))

#### Plot distribution per hour

In [None]:
# One histogram per hour of the day, laid out as a 12 x 2 grid with shared
# x and y axes (hour i goes to row i // 2, column i % 2).
fig, axis = plt.subplots(12, 2, figsize=(15, 30), sharex=True, sharey=True)

# Use the bucket edges of the overall histogram for every hour. With
# histogram(50) each hour would get its own edges (derived from that hour's
# min/max), so bars at the same x position in different subplots would cover
# different deviation ranges, and the shared tick labels and the mean line
# (both derived from the overall histogram) would be misplaced.
shared_bin_edges = list(deviation_histogram[0])

for i in range(0, 24):
    hour_deviation = (
        train_df
        .filter(f.col("hour") == i)
        .select('deviation')
        .rdd.flatMap(lambda x: x)
    )
    dev_histogram = hour_deviation.histogram(shared_bin_edges)
    ax = axis[i//2][i%2]
    ax = pd.DataFrame(
        list(zip(*dev_histogram)),
        columns=['bin', 'frequency']
    ).set_index(
        'bin'
    ).plot(
        kind='bar',
        ax=ax,
        legend=False,
        title=str(i)
    )

    # The line marks the overall mean deviation. A per-hour mean used to be
    # computed here but was never plotted; that extra Spark action was
    # dropped. NOTE(review): if the per-hour mean was meant to be shown,
    # pass hour_deviation.mean() here instead.
    # The legend is attached to hour 1 only (top-right subplot).
    add_mean_deviation(ax, [deviation_mean], tick_labels, add_legend=i==1)
    if i>=22:
        # Bottom row: show formatted bin labels and clear the per-axes
        # x label in favour of the figure-level label set below.
        ax.set_xticklabels([f"{label:.03f}" for label in tick_labels])
        ax.set_xlabel("")

fig.supxlabel(f"Deviation of the {fuel_type} price from its {rolling_window_size}-day rolling average", y=-0.001)
fig.suptitle(f"{fuel_type.capitalize()} price deviations per hour", y=1.002, fontsize="xx-large")
fig.tight_layout()
fig.savefig(os.path.join(visualisation_dir, "distribution_of_deviations_per_hour.svg"))