In [7]:
##############################################################################
# Training
##############################################################################

import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import os

# Load and preprocess data
data = pd.read_csv("/workspaces/Team_Raum-3_BakerySalesPredictions/0_DataPreparation/processed_data_imputed.csv")

# Filter out rows with Umsatz = 0
data = data[data['Umsatz'] != 0]

# Ensure 'Datum' is in datetime format
data['Datum'] = pd.to_datetime(data['Datum'], errors='coerce')
data = data.sort_values('Datum')

# Add time series features
data['Umsatz_7day_avg'] = data['Umsatz'].rolling(window=7).mean()
data['Umsatz_last_year'] = data['Datum'].apply(lambda x: x - pd.DateOffset(years=1) if pd.notnull(x) else pd.NaT)
data = data.merge(data[['Datum', 'Umsatz']], how='left', left_on='Umsatz_last_year', right_on='Datum', suffixes=('', '_last_year'))
data.rename(columns={'Umsatz_last_year': 'Umsatz_last_year'}, inplace=True)
data.drop(columns=['Datum_last_year'], inplace=True)

# Drop rows with NaN values introduced by rolling and shifting
data.dropna(subset=['Umsatz_7day_avg', 'Umsatz_last_year'], inplace=True)

# Define feature columns
feature_columns = [
    'KiWo',
    'Is_Ferien',
    'Holiday',
    'Is_Weekend',
    'Weihnachten_Sommer',
    'Christmas_Sales',
    'Temperature_Category',
    'Windgeschwindigkeit_Beaufort',
    'Rain_Status',
    'Cloud_Status',
    'Warengruppe_1',
    'Warengruppe_2',
    'Warengruppe_3',
    'Warengruppe_4',
    'Warengruppe_5',
    'Warengruppe_6',
    'Umsatz_7day_avg',
    'Umsatz_last_year'
]

print("Final feature columns used in training:", feature_columns)

# Split data into training and validation sets
training_start_date = '2013-07-01'
training_end_date = '2017-07-31'
validation_start_date = '2017-08-01'
validation_end_date = '2018-07-31'

train_data = data[(data['Datum'] >= training_start_date) & (data['Datum'] <= training_end_date)]
val_data = data[(data['Datum'] >= validation_start_date) & (data['Datum'] <= validation_end_date)]

# Ensure all features are numeric
valid_features = []
for col in feature_columns:
    if col in train_data.columns and col in val_data.columns:
        try:
            train_data[col] = pd.to_numeric(train_data[col], errors='coerce').fillna(0)
            val_data[col] = pd.to_numeric(val_data[col], errors='coerce').fillna(0)
            valid_features.append(col)
        except Exception as e:
            print(f"Warning: Could not process column {col} - {e}")
    else:
        print(f"Warning: Column {col} is missing in train_data or val_data.")

X_train = train_data[valid_features].to_numpy()
y_train = train_data['Umsatz'].to_numpy()
X_val = val_data[valid_features].to_numpy()
y_val = val_data['Umsatz'].to_numpy()

# Reshape data for RNN input
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Convert to 3D shape (samples, timesteps, features)
def create_sequences(data, target, timesteps=1):
    X, y = [], []
    for i in range(len(data) - timesteps):
        X.append(data[i:i+timesteps])
        y.append(target[i+timesteps])
    return np.array(X), np.array(y)

timesteps = 7  # Using 7 days of history
X_train, y_train = create_sequences(X_train, y_train, timesteps)
X_val, y_val = create_sequences(X_val, y_val, timesteps)

# Build the RNN model
model = Sequential([
    LSTM(64, input_shape=(timesteps, X_train.shape[2]), activation='relu', return_sequences=True, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    LSTM(32, activation='relu', return_sequences=False, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Callbacks for early stopping and learning rate adjustment
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
lr_adjustment = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, lr_adjustment],
    verbose=1
)

# Evaluate the model
y_val_pred = model.predict(X_val).flatten()
mse = mean_squared_error(y_val, y_val_pred)
mape = mean_absolute_percentage_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
validation_cost = mse * len(y_val) / 2

print(f"Best R²: {r2:.4f}")
print(f"Validation cost: {validation_cost:.4e}")
print(f"Mean Squared Error (MSE) on validation set: {mse:.4e}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Save the trained model
model_save_path = "/workspaces/Team_Raum-3_BakerySalesPredictions/3_Model/rnn_model.h5"
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
model.save(model_save_path)
print(f"Model saved to: {model_save_path}")

##############################################################################
# Simulation
##############################################################################

from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError

# Paths to the files
processed_data_path = "/workspaces/Team_Raum-3_BakerySalesPredictions/0_DataPreparation/processed_data_imputed.csv"
sample_submission_path = "/workspaces/Team_Raum-3_BakerySalesPredictions/0_DataPreparation/sample_submission.csv"
final_submission_path = "/workspaces/Team_Raum-3_BakerySalesPredictions/0_DataPreparation/final_submission.csv"
model_path = "/workspaces/Team_Raum-3_BakerySalesPredictions/3_Model/rnn_model.h5"

# Load the processed data and sample submission
processed_data = pd.read_csv(processed_data_path)
sample_submission = pd.read_csv(sample_submission_path)

# Dynamically rebuild feature columns
feature_columns = [
    'KiWo',
    'Is_Ferien',
    'Holiday',
    'Is_Weekend',
    'Weihnachten_Sommer',
    'Christmas_Sales',
    'Temperature_Category',
    'Windgeschwindigkeit_Beaufort',
    'Rain_Status',
    'Cloud_Status',
    'Warengruppe_1',
    'Warengruppe_2',
    'Warengruppe_3',
    'Warengruppe_4',
    'Warengruppe_5',
    'Warengruppe_6',
    'Umsatz_7day_avg',
    'Umsatz_last_year'
]

# Add time series features to processed_data
processed_data['Datum'] = pd.to_datetime(processed_data['Datum'], errors='coerce')
processed_data['Umsatz_7day_avg'] = processed_data['Umsatz'].rolling(window=7).mean()
processed_data['Umsatz_last_year'] = processed_data['Datum'].apply(lambda x: x - pd.DateOffset(years=1) if pd.notnull(x) else pd.NaT)
processed_data = processed_data.merge(
    processed_data[['Datum', 'Umsatz']], how='left', left_on='Umsatz_last_year', right_on='Datum', suffixes=('', '_last_year')
)
processed_data.rename(columns={'Umsatz_last_year': 'Umsatz_last_year'}, inplace=True)
processed_data.drop(columns=['Datum_last_year'], inplace=True)
processed_data.dropna(subset=['Umsatz_7day_avg', 'Umsatz_last_year'], inplace=True)

# Extract features for prediction
valid_features = []
for col in feature_columns:
    if col in processed_data.columns:
        try:
            processed_data[col] = pd.to_numeric(processed_data[col], errors='coerce').fillna(0)
            valid_features.append(col)
        except Exception as e:
            print(f"Warning: Could not process column {col} - {e}")
    else:
        print(f"Warning: Column {col} is missing in processed_data.")

X_new = processed_data[valid_features].to_numpy()

# Normalize features using the same scaler used during training
X_new = scaler.transform(X_new)

# Convert to 3D shape for RNN input
X_new, _ = create_sequences(X_new, np.zeros(X_new.shape[0]), timesteps=timesteps)

# Load the trained RNN model
model = load_model(
    model_path,
    custom_objects={'mse': MeanSquaredError()}  # Ensure compatibility with saved model
)

# Predict the output using the trained RNN model
y_pred = model.predict(X_new).flatten()

# Add predictions to the processed data DataFrame
processed_data['Predicted_Umsatz'] = np.nan
processed_data.iloc[-len(y_pred):, processed_data.columns.get_loc('Predicted_Umsatz')] = y_pred

# Merge predictions with sample submission to ensure matching structure
final_submission = sample_submission[['id']].copy()
final_submission = final_submission.merge(
    processed_data[['id', 'Predicted_Umsatz']],
    how='left',
    left_on='id',
    right_on='id'
)

# Rename 'Predicted_Umsatz' to 'Umsatz'
final_submission.rename(columns={'Predicted_Umsatz': 'Umsatz'}, inplace=True)

# Replace null values in the Umsatz column with 0
final_submission['Umsatz'] = final_submission['Umsatz'].fillna(0)

# Save the final submission file
os.makedirs(os.path.dirname(final_submission_path), exist_ok=True)
final_submission.to_csv(final_submission_path, index=False)

print(f"Final submission saved to: {final_submission_path}")


Final feature columns used in training: ['KiWo', 'Is_Ferien', 'Holiday', 'Is_Weekend', 'Weihnachten_Sommer', 'Christmas_Sales', 'Temperature_Category', 'Windgeschwindigkeit_Beaufort', 'Rain_Status', 'Cloud_Status', 'Warengruppe_1', 'Warengruppe_2', 'Warengruppe_3', 'Warengruppe_4', 'Warengruppe_5', 'Warengruppe_6', 'Umsatz_7day_avg', 'Umsatz_last_year']
Epoch 1/100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[col] = pd.to_numeric(train_data[col], errors='coerce').fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data[col] = pd.to_numeric(val_data[col], errors='coerce').fillna(0)
  super().__init__(**kwargs)


[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - loss: 27590.8242 - mae: 117.5950 - val_loss: 8639.7559 - val_mae: 67.7635 - learning_rate: 0.0010
Epoch 2/100
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 14117.2002 - mae: 84.4527 - val_loss: 6469.8628 - val_mae: 57.7515 - learning_rate: 0.0010
Epoch 3/100
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 11416.5928 - mae: 75.5525 - val_loss: 4984.8013 - val_mae: 48.2040 - learning_rate: 0.0010
Epoch 4/100
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 10527.0293 - mae: 70.2679 - val_loss: 5429.3999 - val_mae: 49.8260 - learning_rate: 0.0010
Epoch 5/100
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 10351.3291 - mae: 69.8987 - val_loss: 5207.4155 - val_mae: 49.5431 - learning_rate: 0.0010
Epoch 6/100
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5



Best R²: 0.7946
Validation cost: 1.5954e+07
Mean Squared Error (MSE) on validation set: 3.4336e+03
Mean Absolute Percentage Error (MAPE): 0.27%
Model saved to: /workspaces/Team_Raum-3_BakerySalesPredictions/3_Model/rnn_model.h5




[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
Final submission saved to: /workspaces/Team_Raum-3_BakerySalesPredictions/0_DataPreparation/final_submission.csv
