In [5]:
# DataPrep.ipynb
import pandas as pd

# Read the merged dataset from the data-preparation folder
wetter_umsatzdaten_kiwo = pd.read_csv(
    "../0_DataPreparation/wetter_umsatzdaten_kiwo_hol_id.csv"
)

# Step 1: Inspect the data -- last rows, overall shape, and missing values per column
print("Data Overview:", wetter_umsatzdaten_kiwo.tail(), sep="\n")
print("Shape of Data:", wetter_umsatzdaten_kiwo.shape)
print("Missing values per column:", wetter_umsatzdaten_kiwo.isna().sum(), sep="\n")


Data Overview:
            Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
10098  2019-07-28         3.0     23.3500                   14         5.0   
10099  2019-07-29         6.0     25.2500                    7        61.0   
10100  2019-07-30         7.0     20.7375                    8        61.0   
10101  2019-07-31         6.0     20.4500                    7        61.0   
10102  2019-08-01         5.0     21.0625                    9        61.0   

       id  Warengruppe  Umsatz  KiWo  Holiday       ID  
10098 NaN            0     0.0     0        0  1907280  
10099 NaN            0     0.0     0        0  1907290  
10100 NaN            0     0.0     0        0  1907300  
10101 NaN            0     0.0     0        0  1907310  
10102 NaN            0     0.0     0        0  1908010  
Shape of Data: (10103, 11)
Missing values per column:
Datum                     0
Bewoelkung               55
Temperatur                0
Windgeschwindigkeit       0
Wetterco


# Step 2: Feature Engineering

In [6]:
import pandas as pd

# Parse 'Datum' once (values that cannot be parsed become NaT) and derive the
# weekday name and a weekend flag from the parsed dates
parsed_datum = pd.to_datetime(wetter_umsatzdaten_kiwo['Datum'], errors='coerce')
wetter_umsatzdaten_kiwo['Datum'] = parsed_datum
wetter_umsatzdaten_kiwo['Wochentag'] = parsed_datum.dt.day_name()
# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday
wetter_umsatzdaten_kiwo['Is_Weekend'] = (parsed_datum.dt.dayofweek >= 5).astype(int)

# Add a temperature category
def classify_temperature(temp):
    """Bucket a temperature reading into a signed ordinal category.

    Returns:
        -2 (very cold) for temp < 0,
        -1 (cold)      for 0 <= temp <= 10,
         1 (warm)      for 10 < temp <= 20,
         2 (very warm) for everything else.

    A value that fails every comparison (for example NaN) also ends up in
    the last bucket, because no earlier guard matches it.
    """
    if temp < 0:
        return -2
    if temp <= 10:
        return -1
    if temp <= 20:
        return 1
    return 2

# Element-wise temperature bucketing of the 'Temperatur' column
wetter_umsatzdaten_kiwo['Temperature_Category'] = wetter_umsatzdaten_kiwo['Temperatur'].map(classify_temperature)


# Add a Beaufort scale categorization function
def classify_wind_speed_beaufort(wind_speed):
    """Translate a wind speed into its Beaufort number (0-12).

    The result is the position of the first upper bound that is greater than
    or equal to ``wind_speed``; a speed above every bound -- or a value that
    compares false against all of them, such as NaN -- yields 12.

    NOTE(review): the bounds look like the Beaufort limits expressed in m/s;
    confirm that 'Windgeschwindigkeit' uses the same unit.
    """
    upper_bounds = (
        0.2,   # 0  Calm
        1.5,   # 1  Light Air
        3.3,   # 2  Light Breeze
        5.4,   # 3  Gentle Breeze
        7.9,   # 4  Moderate Breeze
        10.7,  # 5  Fresh Breeze
        13.8,  # 6  Strong Breeze
        17.1,  # 7  Near Gale
        20.7,  # 8  Gale
        24.4,  # 9  Strong Gale
        28.4,  # 10 Storm
        32.6,  # 11 Violent Storm
    )
    for beaufort_number, limit in enumerate(upper_bounds):
        if wind_speed <= limit:
            return beaufort_number
    return 12  # Hurricane

# Element-wise Beaufort classification of the 'Windgeschwindigkeit' column
wetter_umsatzdaten_kiwo['Windgeschwindigkeit_Beaufort'] = wetter_umsatzdaten_kiwo['Windgeschwindigkeit'].map(classify_wind_speed_beaufort)

# Add a column for Rain_Status based on Wettercode
def map_rain_status(wettercode):
    """Look up the Rain_Status value for a weather code.

    Codes 0, 20 and 21 map to 0; codes 61, 63 and 65 map to 1; code 95 maps
    to 2. Any other value -- including a missing code -- maps to -1.

    NOTE(review): the codes look like WMO present-weather codes; confirm
    against the data source before extending the table.
    """
    codes_by_status = {
        0: (0, 20, 21),
        1: (61, 63, 65),
        2: (95,),
    }
    # Invert the grouping into a flat code -> status lookup
    status_by_code = {
        code: status
        for status, codes in codes_by_status.items()
        for code in codes
    }
    unknown = -1
    return status_by_code.get(wettercode, unknown)

# Element-wise lookup of the rain status for each 'Wettercode' value
wetter_umsatzdaten_kiwo['Rain_Status'] = wetter_umsatzdaten_kiwo['Wettercode'].map(map_rain_status)

# Add a column for Cloud_Status based on Bewoelkung (cloud cover)
def map_cloud_status(bewoelkung):
    """Bucket a cloud-cover reading into an ordinal Cloud_Status code.

    Returns:
        -1 when the reading is missing (NaN / None),
         0 (sunny)          for bewoelkung < 1,
         1 (clear)          for 1 <= bewoelkung <= 3,
         2 (partly cloudy)  for 3 < bewoelkung <= 6,
         3 (cloudy)         for 6 < bewoelkung <= 7,
         4 (totally cloudy) for anything above 7.

    Missing readings need their own code: NaN fails every comparison below,
    so without the guard it would fall through to the last branch and be
    labelled "totally cloudy". -1 matches the sentinel map_rain_status uses
    for unknown weather codes.

    NOTE(review): the thresholds suggest an okta-style 0-8 scale -- confirm.
    """
    if pd.isna(bewoelkung):
        return -1  # missing reading
    if bewoelkung < 1:
        return 0  # sunny
    if bewoelkung <= 3:
        return 1  # clear
    if bewoelkung <= 6:
        return 2  # partly cloudy
    if bewoelkung <= 7:
        return 3  # cloudy
    return 4  # totally cloudy

# Element-wise cloud-status bucketing of the 'Bewoelkung' column
wetter_umsatzdaten_kiwo['Cloud_Status'] = wetter_umsatzdaten_kiwo['Bewoelkung'].map(map_cloud_status)


# Every row whose Warengruppe is 0 is duplicated once per non-zero
# Warengruppe value found in the data.
unique_warengruppen = wetter_umsatzdaten_kiwo['Warengruppe'].unique()
is_zero_group = wetter_umsatzdaten_kiwo['Warengruppe'] == 0
rows_with_zero_warengruppe = wetter_umsatzdaten_kiwo[is_zero_group]

# Build the copies row by row, keeping the source-row order
expanded_rows = []
for _, row in rows_with_zero_warengruppe.iterrows():
    for warengruppe in unique_warengruppen:
        if warengruppe == 0:
            continue  # the 0 variant already exists in the dataset
        new_row = row.copy()
        new_row['Warengruppe'] = warengruppe
        expanded_rows.append(new_row)

# Stack the copies underneath the original rows and renumber the index
expanded_df = pd.DataFrame(expanded_rows)
wetter_umsatzdaten_kiwo = pd.concat(
    [wetter_umsatzdaten_kiwo, expanded_df],
    ignore_index=True,
)

# Perform one-hot encoding for 'Warengruppe'.
# dtype=bool is set explicitly because the default dummy dtype depends on the
# pandas version (uint8 before 2.0, bool from 2.0 on), and the later
# `~Warengruppe_0` row filter is only a valid boolean mask for bool columns.
warengruppe_encoded = pd.get_dummies(
    wetter_umsatzdaten_kiwo['Warengruppe'],
    prefix='Warengruppe',
    dtype=bool,
)
wetter_umsatzdaten_kiwo = pd.concat([wetter_umsatzdaten_kiwo, warengruppe_encoded], axis=1)

# Adjust IDs based on Warengruppe
def adjust_id(row):
    """Return the row's ID with its last character replaced by the Warengruppe.

    The one-hot columns 'Warengruppe_1' .. 'Warengruppe_6' are checked in
    ascending order and the first truthy one decides the new suffix. The
    adjusted ID is returned as a string; if none of the six columns is set,
    the original 'ID' value is returned unchanged.
    """
    active_group = next(
        (group for group in range(1, 7) if row[f'Warengruppe_{group}']),
        None,
    )
    if active_group is None:
        return row['ID']
    id_text = str(row['ID'])
    return id_text[:-1] + str(active_group)

# Apply ID adjustments
wetter_umsatzdaten_kiwo['ID'] = wetter_umsatzdaten_kiwo.apply(adjust_id, axis=1)

# Remove the rows flagged as Warengruppe 0 and then the flag column itself.
# The mask is cast to bool so `~` is a logical negation whatever dtype the
# dummy column has, and the filter and drop are chained without
# inplace=True: an in-place drop on a filtered slice can trigger
# SettingWithCopyWarning.
is_group_zero = wetter_umsatzdaten_kiwo['Warengruppe_0'].astype(bool)
wetter_umsatzdaten_kiwo = (
    wetter_umsatzdaten_kiwo
    .loc[~is_group_zero]
    .drop(columns=['Warengruppe_0'])
)

# Dynamically create a list of feature columns
# Engineered features followed by the one-hot Warengruppe columns
# (Warengruppe_0 is excluded because that column is dropped from the data).
# Each name needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously merged 'Rain_Status' and
# 'Cloud_Status' into the single bogus entry 'Rain_StatusCloud_Status'.
feature_columns = [
    'KiWo',
    'Is_Weekend',
    'Temperature_Category',
    'Windgeschwindigkeit_Beaufort',
    'Rain_Status',
    'Cloud_Status',
] + [col for col in warengruppe_encoded.columns if col != 'Warengruppe_0']

# Step 3: Save Processed Data
# Write the prepared frame to CSV without the index column
wetter_umsatzdaten_kiwo.to_csv("processed_data.csv", index=False)

# Store the feature names, one per line with no trailing newline, so the
# modeling stage can read them back
with open("feature_columns.txt", "w") as f:
    feature_listing = "\n".join(feature_columns)
    f.write(feature_listing)

print("Data preparation and characterization complete. Processed data and feature columns saved.")


Data preparation and characterization complete. Processed data and feature columns saved.
