In [1]:
# DataPrep.ipynb
import pandas as pd

# Read the merged weather / sales / KiWo / holiday table from the data-preparation folder
wetter_umsatzdaten_kiwo = pd.read_csv("../0_DataPreparation/wetter_umsatzdaten_kiwo_hol_id.csv")

# Step 1: Inspect the data -- first rows, dimensions and per-column null counts
print("Data Overview:", wetter_umsatzdaten_kiwo.head(), sep="\n")
print("Shape of Data:", wetter_umsatzdaten_kiwo.shape)
print("Missing values per column:", wetter_umsatzdaten_kiwo.isna().sum(), sep="\n")


Data Overview:
        Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-01         6.0     17.8375                   15        20.0   
1  2013-07-01         6.0     17.8375                   15        20.0   
2  2013-07-01         6.0     17.8375                   15        20.0   
3  2013-07-01         6.0     17.8375                   15        20.0   
4  2013-07-01         6.0     17.8375                   15        20.0   

   Warengruppe      Umsatz  KiWo  Holiday            ID  
0            1  148.828353     0        0  2013-07-01_1  
1            2  535.856285     0        0  2013-07-01_2  
2            3  201.198426     0        0  2013-07-01_3  
3            4   65.890169     0        0  2013-07-01_4  
4            5  317.475875     0        0  2013-07-01_5  
Shape of Data: (9318, 10)
Missing values per column:
Datum                     0
Bewoelkung               54
Temperatur                0
Windgeschwindigkeit       0
Wettercode             2309


# Step 2: Feature Engineering

In [2]:

# Parse 'Datum' once (unparseable values become NaT), then derive the
# weekday name and a 0/1 weekend flag from the parsed series
datum_parsed = pd.to_datetime(wetter_umsatzdaten_kiwo['Datum'], errors='coerce')
wetter_umsatzdaten_kiwo['Datum'] = datum_parsed
wetter_umsatzdaten_kiwo['Wochentag'] = datum_parsed.dt.day_name()
# dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday
wetter_umsatzdaten_kiwo['Is_Weekend'] = datum_parsed.dt.dayofweek.isin([5, 6]).astype(int)

# Add a temperature category
def classify_temperature(temp):
    """
    Bucket a temperature reading into an ordinal category code.

    Parameters
    ----------
    temp : float
        Temperature value. The 0 / 10 / 20 thresholds suggest degrees
        Celsius (NOTE(review): the unit is not stated here -- confirm).

    Returns
    -------
    int or float
        -2  Very Cold  (temp < 0)
        -1  Cold       (0 <= temp <= 10)
         1  Warm       (10 < temp <= 20)
         2  Very Warm  (temp > 20)
        NaN when ``temp`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # reading would fall through to the last branch and be labelled "Very Warm".
    if pd.isna(temp):
        return float("nan")

    # Guard-clause ladder: each test only needs the upper bound because the
    # lower bound is already excluded by the checks above it.
    if temp < 0:
        return -2  # Very Cold
    if temp <= 10:
        return -1  # Cold
    if temp <= 20:
        return 1   # Warm
    return 2       # Very Warm

# Map every temperature reading to its category code, element by element
wetter_umsatzdaten_kiwo['Temperature_Category'] = wetter_umsatzdaten_kiwo['Temperatur'].map(classify_temperature)

# Add a Beaufort scale categorization function
def classify_wind_speed_beaufort(wind_speed):
    """
    Classify wind speed according to the Beaufort scale

    Beaufort Number | Description | Wind Speed (m/s)
    0              | Calm        | 0-0.2
    1              | Light Air   | 0.3-1.5
    2              | Light Breeze| 1.6-3.3
    3              | Gentle Breeze| 3.4-5.4
    4              | Moderate Breeze| 5.5-7.9
    5              | Fresh Breeze| 8.0-10.7
    6              | Strong Breeze| 10.8-13.8
    7              | Near Gale   | 13.9-17.1
    8              | Gale        | 17.2-20.7
    9              | Strong Gale | 20.8-24.4
    10             | Storm       | 24.5-28.4
    11             | Violent Storm| 28.5-32.6
    12             | Hurricane   | >32.6

    A value that falls between two table rows (e.g. 0.25) is assigned to the
    higher row, because only the inclusive upper bound of each row is tested.
    Values at or below 0.2, including negative ones, are classified as 0.

    Parameters
    ----------
    wind_speed : float
        Wind speed, interpreted in m/s per the table above.
        NOTE(review): confirm that the 'Windgeschwindigkeit' column this is
        applied to is actually recorded in m/s.

    Returns
    -------
    int or float
        Beaufort number 0-12, or NaN when ``wind_speed`` is missing.
    """
    # NaN compares False against every bound, so an unguarded missing value
    # would fall through all checks and be reported as 12 (Hurricane).
    if pd.isna(wind_speed):
        return float("nan")

    # Inclusive upper bound (m/s) for Beaufort numbers 0..11; the position in
    # the tuple is the Beaufort number itself.
    upper_bounds = (0.2, 1.5, 3.3, 5.4, 7.9, 10.7, 13.8, 17.1, 20.7, 24.4, 28.4, 32.6)
    for beaufort_number, upper_bound in enumerate(upper_bounds):
        if wind_speed <= upper_bound:
            return beaufort_number
    return len(upper_bounds)  # 12: Hurricane, anything above 32.6

# Translate each wind-speed reading into its Beaufort number, element by element
wetter_umsatzdaten_kiwo['Windgeschwindigkeit_Beaufort'] = wetter_umsatzdaten_kiwo['Windgeschwindigkeit'].map(classify_wind_speed_beaufort)

# Perform one-hot encoding for 'Warengruppe' (one indicator column per product group)
warengruppe_encoded = pd.get_dummies(wetter_umsatzdaten_kiwo['Warengruppe'], prefix='Warengruppe')

# Drop any indicator columns left over from an earlier run of this cell before
# concatenating; otherwise re-running the cell appends a duplicate set of
# 'Warengruppe_*' columns. On a fresh kernel nothing is dropped.
wetter_umsatzdaten_kiwo = pd.concat(
    [
        wetter_umsatzdaten_kiwo.drop(columns=warengruppe_encoded.columns, errors='ignore'),
        warengruppe_encoded,
    ],
    axis=1,
)

# Dynamically create a list of feature columns
# NOTE(review): 'Holiday' and 'Windgeschwindigkeit_Beaufort' exist in the frame
# but are not listed here -- confirm that leaving them out is intentional.
feature_columns = ['KiWo', 'Is_Weekend', 'Temperature_Category'] + list(warengruppe_encoded.columns)


# Step 3: Save Processed Data (CSV without the index column)
wetter_umsatzdaten_kiwo.to_csv("processed_data.csv", index=False)

# Persist the feature names, one per line with no trailing newline,
# so the modeling stage can read them back
with open("feature_columns.txt", "w") as feature_file:
    print(*feature_columns, sep="\n", end="", file=feature_file)

print("Data preparation and characterization complete. Processed data and feature columns saved.")


Data preparation and characterization complete. Processed data and feature columns saved.
