In [26]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import jinja2
import matplotlib.dates as mdates

In [27]:
# Load the source CSV into a DataFrame
ACCIDENTS_CSV = 'monatszahlen2307_verkehrsunfaelle_10_07_23_nosum.csv'
df = pd.read_csv(ACCIDENTS_CSV)

In [28]:
# Translate the German column headers into English names
german_to_english = [
    ("MONATSZAHL", "Category"),
    ("AUSPRAEGUNG", "Accident-type"),
    ("JAHR", "Year"),
    ("MONAT", "Month"),
    ("WERT", "Value"),
    ("VORJAHRESWERT", "Previous_Year_Value"),
    ("VERAEND_VORMONAT_PROZENT", "Change_From_Previous_Month_Percentage"),
    ("VERAEND_VORJAHRESMONAT_PROZENT", "Change_From_Previous_Year_Month_Percentage"),
    ("ZWOELF_MONATE_MITTELWERT", "Twelve_Month_Average"),
]
column_name_mapping = dict(german_to_english)
df = df.rename(columns=column_name_mapping)

In [29]:
# Keep only the rows whose Year is earlier than 2021
is_before_2021 = df['Year'] < 2021
df = df.loc[is_before_2021]

In [30]:
# The columns listed below are derived from "Value" (see the column-renaming
# step), and further Value-based features are computed later on, so they are
# left out of the exploration for now.
columns_to_exclude = [
    'Previous_Year_Value',
    'Change_From_Previous_Month_Percentage',
    'Change_From_Previous_Year_Month_Percentage',
    'Twelve_Month_Average',
]
# errors='ignore' skips any listed column that is not present in the frame
df = df.drop(columns=columns_to_exclude, errors='ignore')


In [31]:
# Inspect the frame after filtering and column removal; the rendered
# output abbreviates the middle rows.
df

Unnamed: 0,Category,Accident-type,Year,Month,Value
36,Alkoholunfälle,insgesamt,2020,202001,28.0
37,Alkoholunfälle,insgesamt,2020,202002,40.0
38,Alkoholunfälle,insgesamt,2020,202003,27.0
39,Alkoholunfälle,insgesamt,2020,202004,26.0
40,Alkoholunfälle,insgesamt,2020,202005,40.0
...,...,...,...,...,...
2011,Verkehrsunfälle,Verletzte und Getötete,2000,200008,647.0
2012,Verkehrsunfälle,Verletzte und Getötete,2000,200009,675.0
2013,Verkehrsunfälle,Verletzte und Getötete,2000,200010,615.0
2014,Verkehrsunfälle,Verletzte und Getötete,2000,200011,578.0


In [79]:
def extract_advanced_time_features(df, month_column, value_column, filters, rolling_functions, rolling_windows, year_windows):
    """Build calendar, rolling-window and yearly-lag features for a monthly series.

    The input frame is not modified: filtering, sorting and index reset all
    produce new frames, and the new columns are added to that result.

    Rolling and lag features are computed positionally (row shifts), so they
    are only meaningful when the filtered frame holds exactly one row per
    consecutive month.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing ``month_column`` and ``value_column``.
    month_column : str
        Column holding the month as ``YYYYMM`` (converted to ``str`` in the
        returned frame).
    value_column : str
        Column the rolling and lag features are derived from.
    filters : dict or None
        ``{column: value}`` equality filters applied before feature
        extraction. Falsy means "no filtering".
    rolling_functions : iterable of str or None
        Aggregation names passed to ``Rolling.agg`` (e.g. ``'mean'``).
    rolling_windows : iterable of int or None
        Window lengths in rows (months). Each window only sees *previous*
        rows because the values are shifted by one before rolling.
    year_windows : iterable of int or None
        For each ``k`` a column ``value_{k}_years_ago`` is added holding the
        value from ``k * 12`` rows earlier.

    Returns
    -------
    pandas.DataFrame
        Filtered frame sorted by ``month_column`` with a fresh 0..n-1 index
        and the columns ``Quarter``, ``Month_Start_Weekday``,
        ``Month_End_Weekday`` (Monday=1 ... Sunday=7), ``{func}_{window}m``
        and ``value_{k}_years_ago``.

    Raises
    ------
    AssertionError
        If a filter key is not a column of ``df``.
    """
    # Apply filters if provided
    if filters:
        for key, value in filters.items():
            assert key in df.columns, f"Filter key {key} not in DataFrame."
            df = df[df[key] == value]

    # Sort by month and reset index
    df = df.sort_values(by=month_column).reset_index(drop=True)
    df[month_column] = df[month_column].astype(str)

    # Parse the month once; '%Y%m' yields the first day of each month.
    month_start = pd.to_datetime(df[month_column], format='%Y%m')
    # Vectorised last-day-of-month; unlike a row-wise apply this also works
    # when the filters leave no rows at all.
    month_end = month_start + pd.to_timedelta(month_start.dt.daysinmonth - 1, unit='D')

    # Add quarter and weekday features (Monday=1, Sunday=7)
    df['Quarter'] = month_start.dt.quarter
    df['Month_Start_Weekday'] = month_start.dt.dayofweek + 1
    df['Month_End_Weekday'] = month_end.dt.dayofweek + 1

    # Calculate rolling features; shift(1) keeps the current month's value
    # out of its own window so the features only use past information.
    previous_values = df[value_column].shift(1)
    for func_name in rolling_functions or ():
        for window in rolling_windows or ():
            col_name = f'{func_name}_{window}m'
            df[col_name] = previous_values.rolling(window=window, min_periods=window).agg(func_name)

    # Calculate historical features based on the windows
    for year_window in year_windows or ():
        shift_periods = year_window * 12
        lag_col_name = f'value_{year_window}_years_ago'
        df[lag_col_name] = df[value_column].shift(shift_periods)

    return df




# Select a single series: category 'Alkoholunfälle', accident type 'insgesamt'
filters = {'Category': 'Alkoholunfälle', 'Accident-type': 'insgesamt'}

# Rolling aggregations and their window lengths in months.
# Note: a window of size 1 would make a rolling std meaningless.
rolling_functions = ['mean']
rolling_windows = [2, 3, 6, 9]

# Year lags: each entry k adds a column with the value from k * 12 rows earlier,
# so the first k years of the series are NaN in that column.
year_windows = [1]

feature_settings = dict(
    month_column='Month',
    value_column='Value',
    filters=filters,
    rolling_functions=rolling_functions,
    rolling_windows=rolling_windows,
    year_windows=year_windows,
)
model_ready_data = extract_advanced_time_features(df, **feature_settings)




In [80]:
# Preview the first 20 rows of the engineered feature table.
model_ready_data.head(20)

Unnamed: 0,Category,Accident-type,Year,Month,Value,Quarter,Month_Start_Weekday,Month_End_Weekday,mean_2m,mean_3m,mean_6m,mean_9m,value_1_years_ago
0,Alkoholunfälle,insgesamt,2000,200001,78.0,1,6,1,,,,,
1,Alkoholunfälle,insgesamt,2000,200002,53.0,1,2,2,,,,,
2,Alkoholunfälle,insgesamt,2000,200003,73.0,1,3,5,65.5,,,,
3,Alkoholunfälle,insgesamt,2000,200004,78.0,2,6,7,63.0,68.0,,,
4,Alkoholunfälle,insgesamt,2000,200005,96.0,2,1,3,75.5,68.0,,,
5,Alkoholunfälle,insgesamt,2000,200006,57.0,2,4,5,87.0,82.333333,,,
6,Alkoholunfälle,insgesamt,2000,200007,99.0,3,6,1,76.5,77.0,72.5,,
7,Alkoholunfälle,insgesamt,2000,200008,77.0,3,2,4,78.0,84.0,76.0,,
8,Alkoholunfälle,insgesamt,2000,200009,84.0,3,5,6,88.0,77.666667,80.0,,
9,Alkoholunfälle,insgesamt,2000,200010,83.0,4,7,2,80.5,86.666667,81.833333,77.222222,
