In [26]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import jinja2
import matplotlib.dates as mdates

In [27]:
# Load the raw monthly traffic-accident figures from the CSV export.
csv_path = 'monatszahlen2307_verkehrsunfaelle_10_07_23_nosum.csv'
df = pd.read_csv(csv_path)

In [28]:
# Translate the German column headers into English names.
# Built from (German, English) pairs; columns absent from the frame are ignored by rename.
column_name_mapping = dict([
    ("MONATSZAHL", "Category"),
    ("AUSPRAEGUNG", "Accident-type"),
    ("JAHR", "Year"),
    ("MONAT", "Month"),
    ("WERT", "Value"),
    ("VORJAHRESWERT", "Previous_Year_Value"),
    ("VERAEND_VORMONAT_PROZENT", "Change_From_Previous_Month_Percentage"),
    ("VERAEND_VORJAHRESMONAT_PROZENT", "Change_From_Previous_Year_Month_Percentage"),
    ("ZWOELF_MONATE_MITTELWERT", "Twelve_Month_Average"),
])
df = df.rename(column_name_mapping, axis="columns")

In [29]:
# Keep only the rows whose Year is earlier than 2021.
is_before_2021 = df['Year'] < 2021
df = df.loc[is_before_2021]

In [30]:
# The columns below are derived from "Value" (see the column-renaming cell),
# and further Value-based features are computed later in the notebook,
# so they are dropped here rather than studied.
columns_to_exclude = [
    'Previous_Year_Value',
    'Change_From_Previous_Month_Percentage',
    'Change_From_Previous_Year_Month_Percentage',
    'Twelve_Month_Average',
]
# errors='ignore' skips any listed column that is not present in the frame.
df = df.drop(columns=columns_to_exclude, errors='ignore')


In [31]:
# Show the filtered frame (pandas abbreviates the middle rows in the rendered output).
df

Unnamed: 0,Category,Accident-type,Year,Month,Value
36,Alkoholunfälle,insgesamt,2020,202001,28.0
37,Alkoholunfälle,insgesamt,2020,202002,40.0
38,Alkoholunfälle,insgesamt,2020,202003,27.0
39,Alkoholunfälle,insgesamt,2020,202004,26.0
40,Alkoholunfälle,insgesamt,2020,202005,40.0
...,...,...,...,...,...
2011,Verkehrsunfälle,Verletzte und Getötete,2000,200008,647.0
2012,Verkehrsunfälle,Verletzte und Getötete,2000,200009,675.0
2013,Verkehrsunfälle,Verletzte und Getötete,2000,200010,615.0
2014,Verkehrsunfälle,Verletzte und Getötete,2000,200011,578.0


In [59]:
def extract_advanced_time_features(df, month_column='Month', value_column='Value',
                                   filters=None, rolling_functions=None, rolling_windows=None,
                                   year_windows=None):
    """Build lagged, leakage-free time-series features for one monthly series.

    The frame is optionally filtered, sorted by ``month_column`` and re-indexed;
    the input frame itself is not modified. All features for a row are computed
    from earlier rows only, so the row's own ``value_column`` entry never
    contributes to its features. Lags are row-based: the series is assumed to
    hold exactly one row per month with no gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing ``month_column`` and ``value_column``.
    month_column : str
        Column used for chronological sorting; converted to ``str`` in the result.
    value_column : str
        Column holding the observed values.
    filters : dict or None
        ``{column: value}`` equality filters applied before feature extraction.
    rolling_functions : list of str or None
        Any of ``'mean'``, ``'std'``, ``'var'``, ``'max'``, ``'min'``. ``std`` and
        ``var`` use the NumPy defaults (population statistics, ``ddof=0``).
    rolling_windows : list of int or None
        Window sizes in months; yields columns named ``'<func>_<window>m'``.
    year_windows : list of int or None
        For each ``N`` adds ``'value_N_years_ago'`` (value ``12 * N`` rows earlier)
        and ``'change_from_N_years_ago'`` (relative year-over-year change of the
        previous month: ``(V[t-1] - V[t-1-12N]) / V[t-1-12N]``). A zero baseline
        produces ``inf``/``NaN`` as usual for float division.

    Returns
    -------
    pandas.DataFrame
        Filtered, chronologically sorted copy with the feature columns appended.

    Raises
    ------
    KeyError
        If ``rolling_functions`` contains an unsupported name or a filter /
        column name does not exist in ``df``.
    """
    func_dict = {'mean': np.mean, 'std': np.std, 'var': np.var, 'max': np.max, 'min': np.min}

    # Fail early with a descriptive message instead of a bare KeyError mid-loop.
    unknown = [name for name in rolling_functions or [] if name not in func_dict]
    if unknown:
        raise KeyError(
            f"Unknown rolling function(s) {unknown}; expected one of {sorted(func_dict)}"
        )

    for key, value in (filters or {}).items():
        df = df[df[key] == value]

    # sort_values/reset_index return a new frame, so the caller's data stays untouched.
    df = df.sort_values(by=month_column).reset_index(drop=True)
    df[month_column] = df[month_column].astype(str)

    # Value of the previous row; basing every feature on it excludes the current month.
    previous_values = df[value_column].shift(1)

    for func_name in rolling_functions or []:
        func = func_dict[func_name]
        for window in rolling_windows or []:
            # min_periods=window leaves NaN until a full window of history exists.
            df[f'{func_name}_{window}m'] = (
                previous_values
                .rolling(window=window, min_periods=window)
                .apply(func, raw=True)
            )

    for year_window in year_windows or []:
        lagged = df[value_column].shift(year_window * 12)
        df[f'value_{year_window}_years_ago'] = lagged
        # Compare the previous month with the same month N years before it, so that
        # numerator and denominator refer to the same baseline observation.
        previous_lagged = lagged.shift(1)
        df[f'change_from_{year_window}_years_ago'] = (
            (previous_values - previous_lagged) / previous_lagged
        )

    return df


# Select the series to model: one category / accident-type combination.
filters = {
    'Category': 'Alkoholunfälle',
    'Accident-type': 'insgesamt',
}

# Rolling statistics to compute and the window sizes (in months) to use.
# A window of size 1 is left out because std over a single value is not meaningful.
rolling_functions = ['mean', 'std']
rolling_windows = [2, 3, 6, 9]

# Each entry N adds features based on the value N years back; the first
# N years of rows therefore have NaN in those expanded features.
year_windows = [1]

feature_options = dict(
    month_column='Month',
    value_column='Value',
    filters=filters,
    rolling_functions=rolling_functions,
    rolling_windows=rolling_windows,
    year_windows=year_windows,
)
model_ready_data = extract_advanced_time_features(df, **feature_options)

In [60]:
# Preview the first 20 rows; the earliest rows show NaN where a window or lag lacks enough history.
model_ready_data.head(20)

Unnamed: 0,Category,Accident-type,Year,Month,Value,mean_2m,mean_3m,mean_6m,mean_9m,std_2m,std_3m,std_6m,std_9m,value_1_years_ago,change_from_1_years_ago
0,Alkoholunfälle,insgesamt,2000,200001,78.0,,,,,,,,,,
1,Alkoholunfälle,insgesamt,2000,200002,53.0,,,,,,,,,,
2,Alkoholunfälle,insgesamt,2000,200003,73.0,65.5,,,,12.5,,,,,
3,Alkoholunfälle,insgesamt,2000,200004,78.0,63.0,68.0,,,10.0,10.801234,,,,
4,Alkoholunfälle,insgesamt,2000,200005,96.0,75.5,68.0,,,2.5,10.801234,,,,
5,Alkoholunfälle,insgesamt,2000,200006,57.0,87.0,82.333333,,,9.0,9.877022,,,,
6,Alkoholunfälle,insgesamt,2000,200007,99.0,76.5,77.0,72.5,,19.5,15.937377,14.338177,,,
7,Alkoholunfälle,insgesamt,2000,200008,77.0,78.0,84.0,76.0,,21.0,19.131126,17.47379,,,
8,Alkoholunfälle,insgesamt,2000,200009,84.0,88.0,77.666667,80.0,,11.0,17.152907,14.189198,,,
9,Alkoholunfälle,insgesamt,2000,200010,83.0,80.5,86.666667,81.833333,77.222222,3.5,9.177267,13.873436,14.482002,,
