In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr, spearmanr

In [None]:
# Load the preprocessed parquet file from the parent directory.
# A forward slash is used as separator: the former "..\e" relied on an invalid
# escape sequence and only resolved to the parent directory on Windows.
df = pd.read_parquet("../escooter_history_2022_preprocessed.parquet")

In [None]:
def _first_mode(values):
    """Return a single most-frequent value of ``values``.

    ``Series.mode`` returns every tied value; storing that array in a cell would
    make the resulting column unusable as a later group key. Ties are resolved
    by taking the first entry of the (sorted) mode result, and NaN is returned
    when there is no mode at all (e.g. the group holds only missing values).
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Collapse the raw rows into one row per (date, hour, weekday, daytype) group.
df_agg = df.groupby(["date", "hour", "weekday", "daytype"], as_index=False).agg(
    # Number of non-null "datetime" entries in the group.
    total_rental=("datetime", "count"),
    temp_mode=("temp", _first_mode),
    temp_mean=("temp", "mean"),
    atemp_mode=("atemp", _first_mode),
    atemp_mean=("atemp", "mean"),
    humidity_mode=("humidity", _first_mode),
    humidity_mean=("humidity", "mean"),
    weather_mode=("weather", _first_mode),
    windspeed_mode=("windspeed", _first_mode),
    windspeed_mean=("windspeed", "mean"),
    # Presumably "registered_customer" is a 0/1 flag, so the sum is a count — TODO confirm.
    total_registered=("registered_customer", "sum"),
)
df_agg["total_unregistered"] = df_agg["total_rental"] - df_agg["total_registered"]
# Timestamp of the group: the date plus the hour as an offset.
# The lowercase unit "h" is used because the uppercase alias "H" is deprecated.
df_agg["day_hour"] = (
    pd.to_datetime(df_agg["date"], dayfirst=False)
    + pd.to_timedelta(df_agg["hour"].astype(int), unit="h")
)

In [None]:
def normalized_df(param, aggregation, df=None):
    """
    Build a frame with the average rentals per row for each value of one attribute.

    The input is grouped by the column ``f"{param}_{aggregation}"`` (for example
    ``temp_mode``). For every distinct value of that column the result holds the
    number of rows with a non-null ``date`` (``count``), the summed
    ``total_rental`` and their ratio in ``mean_rental_per_{param}_hour``.
    With the hourly aggregated frame as input, each row is one hour, so the
    ratio reads as "average rentals per hour at this attribute value".

    Parameters
    ----------
    param : str
        Attribute prefix of the grouping column, e.g. ``"temp"``.
    aggregation : str
        Aggregation suffix of the grouping column, e.g. ``"mode"`` or ``"mean"``.
    df : pandas.DataFrame, optional
        Frame containing the grouping column plus ``date`` and ``total_rental``.
        Defaults to the module-level ``df_agg``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of the grouping column.
    """
    if df is None:
        # Looked up at call time so a re-computed df_agg is picked up; a default
        # of ``df=df_agg`` would freeze the frame that existed when this cell ran.
        df = df_agg

    group_col = f"{param}_{aggregation}"
    df_new = df.groupby(group_col, as_index=False).agg(
        count=("date", "count"),
        total_rental=("total_rental", "sum"),
    )
    df_new[f"mean_rental_per_{param}_hour"] = df_new["total_rental"] / df_new["count"]
    return df_new

In [None]:
# Build one normalized frame per attribute, first for the "mean" and then for
# the "mode" aggregation of that attribute.
df_temp_mean, df_temp_mode = (
    normalized_df("temp", kind) for kind in ("mean", "mode")
)
df_atemp_mean, df_atemp_mode = (
    normalized_df("atemp", kind) for kind in ("mean", "mode")
)
df_humidity_mean, df_humidity_mode = (
    normalized_df("humidity", kind) for kind in ("mean", "mode")
)
df_windspeed_mean, df_windspeed_mode = (
    normalized_df("windspeed", kind) for kind in ("mean", "mode")
)

In [None]:
# Mean rentals per hour against the modal temperature, with an OLS trendline.
# Reads df_temp_mode (the frame that holds "temp_mode"); the previously used
# name df_temp is not defined in the cells above.
px.scatter(
    df_temp_mode,
    x="temp_mode",
    y="mean_rental_per_temp_hour",
    color="count",
    trendline="ols",
    trendline_color_override="darkgrey",
)

In [None]:
# Spearman rank correlation between modal temperature and mean rentals per hour.
# Reads df_temp_mode; the previously used name df_temp is not defined in the cells above.
spearmanr(df_temp_mode["temp_mode"], df_temp_mode["mean_rental_per_temp_hour"])

In [None]:
# Mean rentals per hour against the modal "atemp" value, with an OLS trendline.
# Reads df_atemp_mode (the frame that holds "atemp_mode"); the previously used
# name df_atemp is not defined in the cells above.
px.scatter(
    df_atemp_mode,
    x="atemp_mode",
    y="mean_rental_per_atemp_hour",
    color="count",
    hover_name="count",
    trendline="ols",
    trendline_color_override="darkgrey",
)

In [None]:
# Spearman rank correlation between the modal "atemp" value and mean rentals per hour.
# Reads df_atemp_mode; the previously used name df_atemp is not defined in the cells above.
spearmanr(df_atemp_mode["atemp_mode"], df_atemp_mode["mean_rental_per_atemp_hour"])

In [None]:
# Mean rentals per hour against the modal wind speed, with an OLS trendline.
# Reads df_windspeed_mode (the frame that holds "windspeed_mode"); the previously
# used name df_windspeed is not defined in the cells above.
px.scatter(
    df_windspeed_mode,
    x="windspeed_mode",
    y="mean_rental_per_windspeed_hour",
    color="count",
    trendline="ols",
    trendline_color_override="darkgrey",
)

In [None]:
# Spearman rank correlation between modal wind speed and mean rentals per hour.
# Reads df_windspeed_mode; the previously used name df_windspeed is not defined in the cells above.
spearmanr(df_windspeed_mode["windspeed_mode"], df_windspeed_mode["mean_rental_per_windspeed_hour"])