In [9]:
# DataPrep.ipynb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged dataset from the data-preparation folder
wetter_umsatzdaten_kiwo = pd.read_csv("../0_DataPreparation/wetter_umsatzdaten_kiwo.csv")

# Step 1: Inspect the data -- first rows, dimensions, and null count per column.
# Each header and its table are emitted on separate lines via sep="\n".
print("Data Overview:", wetter_umsatzdaten_kiwo.head(), sep="\n")
print("Shape of Data:", wetter_umsatzdaten_kiwo.shape)
print("Missing values per column:", wetter_umsatzdaten_kiwo.isna().sum(), sep="\n")

# Step 2: Feature Engineering
# Build the date-derived columns in one chained step. The callables are
# evaluated in order, so 'Wochentag' and 'Is_Weekend' see the already-parsed
# 'Datum'. Unparseable dates become NaT because of errors='coerce'.
wetter_umsatzdaten_kiwo = wetter_umsatzdaten_kiwo.assign(
    Datum=lambda frame: pd.to_datetime(frame['Datum'], errors='coerce'),
    # English weekday name of each date
    Wochentag=lambda frame: frame['Datum'].dt.day_name(),
    # weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are Saturday and Sunday
    Is_Weekend=lambda frame: frame['Datum'].dt.weekday.ge(5).astype(int),
)

# Add a temperature category
def classify_temperature(temp):
    """Bucket a temperature reading into an ordinal category code.

    Parameters
    ----------
    temp : float
        A single temperature value (taken from the 'Temperatur' column;
        the unit is presumably degrees Celsius -- confirm against the data source).

    Returns
    -------
    int or float
        -2 (very cold)  if temp < 0
        -1 (cold)       if 0 <= temp <= 10
         1 (warm)       if 10 < temp <= 20
         2 (very warm)  if temp > 20
        NaN if ``temp`` is missing (None / NaN / pd.NA), so that a missing
        reading stays missing instead of being assigned a category.
    """
    # Every ordering comparison against NaN is False, so without this guard a
    # missing reading would fall through all branches and be labelled
    # "very warm" (2).
    if pd.isna(temp):
        return float("nan")
    if temp < 0:
        return -2  # Very Cold
    if temp <= 10:
        return -1  # Cold
    if temp <= 20:
        return 1   # Warm
    return 2       # Very Warm

# Map every temperature reading to its ordinal category code
wetter_umsatzdaten_kiwo['Temperature_Category'] = wetter_umsatzdaten_kiwo['Temperatur'].map(classify_temperature)

# One-hot encode 'Warengruppe' and append the indicator columns to the table
warengruppe_encoded = pd.get_dummies(
    wetter_umsatzdaten_kiwo['Warengruppe'],
    prefix='Warengruppe',
)
wetter_umsatzdaten_kiwo = pd.concat(
    [wetter_umsatzdaten_kiwo, warengruppe_encoded],
    axis=1,
)

# Feature list: fixed base features followed by whatever indicator columns
# the encoding produced, so the list adapts to the categories in the data
feature_columns = [
    'KiWo',
    'Is_Weekend',
    'Temperature_Category',
    'Windgeschwindigkeit',
    *warengruppe_encoded.columns,
]

# Step 3: Save Processed Data
# Write the enriched table to the working directory without the row index
wetter_umsatzdaten_kiwo.to_csv("processed_data.csv", index=False)

# Save the feature names for the modeling stage: one name per line,
# no trailing newline after the last entry
feature_columns_text = "\n".join(feature_columns)
with open("feature_columns.txt", mode="w") as f:
    f.write(feature_columns_text)

print("Data preparation and characterization complete. Processed data and feature columns saved.")


Data Overview:
        Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-01         6.0     17.8375                   15        20.0   
1  2013-07-01         6.0     17.8375                   15        20.0   
2  2013-07-01         6.0     17.8375                   15        20.0   
3  2013-07-01         6.0     17.8375                   15        20.0   
4  2013-07-01         6.0     17.8375                   15        20.0   

   Warengruppe      Umsatz  KiWo  
0            1  148.828353     0  
1            2  535.856285     0  
2            3  201.198426     0  
3            4   65.890169     0  
4            5  317.475875     0  
Shape of Data: (9318, 8)
Missing values per column:
Datum                     0
Bewoelkung               54
Temperatur                0
Windgeschwindigkeit       0
Wettercode             2309
Warengruppe               0
Umsatz                    0
KiWo                      0
dtype: int64
Data preparation and characterization com