In [None]:
# Set up
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from preprocess_data import *

# Load the working DataFrame and print a quick overview of it.
# get_df is not defined in this cell; presumably it comes from the
# preprocess_data star import above — verify.
df = get_df()
print(f'Data Size: {df.shape}')
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output).
df.info()
print(df.describe())

In [None]:
def combined_plot(df: pd.DataFrame, columns, func):
    """Draw one hour-of-day panel per column on a shared two-column figure.

    For every name in ``columns`` the hour component of ``df[col]`` is
    counted, the counts are ordered by hour, and the resulting Series is
    handed to ``func`` together with the panel to draw on.

    Args:
        df: Frame holding the columns to plot. Each selected column must
            support the ``.dt`` accessor (datetime-like values).
        columns: Iterable of column names in ``df``.
        func: Plotting callable invoked as ``func(hourly_counts, ax=axis)``,
            e.g. ``sns.barplot``.

    The grid keeps the 3x2 / 20x16 layout for up to six columns and gains
    extra rows (with proportionally more height) beyond that, so a long
    column list no longer runs past the last panel with an IndexError.
    """
    columns = list(columns)
    ncols = 2
    # Ceiling division without importing math; never fewer than three rows so
    # the layout for six or fewer columns is unchanged.
    nrows = max(3, -(-len(columns) // ncols))
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16 * nrows / 3))
    axes = axes.flatten()

    # Remove the panels that no column will be drawn on
    for unused_ax in axes[len(columns):]:
        fig.delaxes(unused_ax)

    for ax, col in zip(axes, columns):
        # .dt.hour keeps only the hour component, i.e. every timestamp is
        # truncated (not rounded) to its whole hour before counting.
        hourly_counts = df[col].dt.hour.value_counts().sort_index()
        func(hourly_counts, ax=ax)
        ax.set_title(f'{col} distribution')
        ax.set_xlabel('Hour of a day')
        ax.set_ylabel('Count')

    fig.tight_layout()
    plt.show()

In [None]:
# plot time
# Render the hour-of-day panels for the time columns twice: bars, then boxes.
# NOTE(review): combined_plot hands the plotter per-hour counts, so the box
# variant summarises the spread of those counts rather than of the raw
# times — confirm this is the intended reading.
for plotter in (sns.barplot, sns.boxplot):
    combined_plot(df, TIME_COLS, plotter)

In [None]:
# plot other columns
# One pie chart per column in FEW_UNIQUE_VALS: the three most frequent values
# each get their own wedge and every remaining value is pooled into 'Others'.
ncols = 2
# Ceiling division; never fewer than three rows, so up to six columns keep the
# 3x2 / 20x16 layout while longer lists get extra rows instead of an IndexError.
nrows = max(3, -(-len(FEW_UNIQUE_VALS) // ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16 * nrows / 3))
axes = axes.flatten()

# Remove the panels that would otherwise stay empty (same as combined_plot)
for unused_ax in axes[len(FEW_UNIQUE_VALS):]:
    fig.delaxes(unused_ax)

for ax, col in zip(axes, FEW_UNIQUE_VALS):
    val_counts = df[col].value_counts()  # sorted, most frequent first

    if len(val_counts) > 3:
        # .iloc slices by position whatever the index dtype is; plain [] slicing
        # is label-based on a float index and would pick the wrong rows there.
        others = pd.Series({'Others': val_counts.iloc[3:].sum()})
        shares = pd.concat([val_counts.iloc[:3], others])
    else:
        shares = val_counts

    wedges, texts, autotexts = ax.pie(shares, autopct='%1.1f%%')
    ax.legend(wedges, shares.index, loc="center left", bbox_to_anchor=(0.5, -0.1))
    ax.set_title('Distribution of ' + col)
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()