# Datenvorbereitung

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Load and prepare the data
# NOTE(review): absolute '/content/...' paths (presumably Colab uploads) — adjust when running elsewhere.
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Attach the target column; both files are aligned by their default row index
bakery_data['demand'] = bakery_target['demand']

# Drop the raw 'date' column
bakery_data = bakery_data.drop('date', axis=1)

# Categorical variables
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label-encode each categorical column in place; the fitted encoders are kept
# in `label_encoders` so the integer codes can be mapped back to the labels.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# Feature matrix (everything except the target) and target vector
X = bakery_data.drop(['demand'], axis=1)
y = bakery_data['demand']

# Split BEFORE scaling: fitting the scaler on all rows would leak the mean and
# variance of the test rows into the training features. The row partition is
# the same as before because it only depends on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics of the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so that code relying on `X_scaled` continues to work
X_scaled = scaler.transform(X)

# Training eines XGBoost-Modells

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# 2. Baseline: gradient-boosted trees on the scaled feature matrix
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
    random_state=42,
)
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

# Predict on the held-out rows and score the predictions
y_pred_xgb = xg_reg.predict(X_test)
mse_xgb, r2_xgb = (
    mean_squared_error(y_test, y_pred_xgb),
    r2_score(y_test, y_pred_xgb),
)

# Report both metrics rounded to two decimals
print(f"XGBoost Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost R^2 Score: {r2_xgb:.2f}")

XGBoost Mean Squared Error: 6853.27
XGBoost R^2 Score: 0.64


# Deep Learning-Modell zur Erstellung von Embeddings

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation is repeatable
tf.keras.utils.set_random_seed(42)

# Output dimension of the embedding vector per categorical variable (illustrative values)
embedding_sizes = {'weekday': 7, 'month': 12, 'store': 10, 'product': 50}

input_layers = []
embedding_layers = []

# One scalar input -> Embedding -> Flatten branch per categorical variable
for col in categorical_columns:
    input_layer = Input(shape=(1,))
    input_layers.append(input_layer)
    vocab_size = bakery_data[col].nunique() + 1  # number of distinct codes + 1
    embed_size = embedding_sizes[col]
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_size)(input_layer)
    embedding_layer = Flatten()(embedding_layer)
    embedding_layers.append(embedding_layer)

# Numeric branch: receives the complete scaled feature matrix
input_numeric = Input(shape=(X_train.shape[1],))
input_layers.append(input_numeric)

# Concatenate the embedding branches with the numeric features
all_features = Concatenate()(embedding_layers + [input_numeric])

# Dense regression head; its last hidden layer is reused below as feature extractor
x = Dense(64, activation='relu')(all_features)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# End-to-end regression model used to learn the representation
embedding_model = Model(inputs=input_layers, outputs=output)
embedding_model.compile(optimizer='adam', loss='mse')

# Integer category codes for the train/test rows. These frames were previously
# never defined (NameError on a fresh kernel). They are taken from the
# label-encoded `bakery_data`, because the scaled matrix holds standardized
# floats that are not valid Embedding indices. `y_train` / `y_test` keep the
# original row index, so `.loc` returns the rows in the same order as
# `X_train` / `X_test`.
X_train_cat = bakery_data.loc[y_train.index, categorical_columns]
X_test_cat = bakery_data.loc[y_test.index, categorical_columns]

# Train the network: one array per categorical input, then the numeric matrix
X_train_inputs = [X_train_cat[col].values for col in categorical_columns] + [X_train]
X_test_inputs = [X_test_cat[col].values for col in categorical_columns] + [X_test]
# NOTE(review): the test split doubles as validation data here; it is only
# monitored, but it should not be used to pick epochs or architecture.
embedding_model.fit(X_train_inputs, y_train, epochs=20, batch_size=32, validation_data=(X_test_inputs, y_test))

# 3. Extract learned features: activations of the second-to-last layer,
# i.e. the 32-unit Dense layer, not the raw embedding vectors
feature_extractor = Model(inputs=embedding_model.inputs, outputs=embedding_model.layers[-2].output)
X_train_embeddings = feature_extractor.predict(X_train_inputs)
X_test_embeddings = feature_extractor.predict(X_test_inputs)

Epoch 1/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - loss: 9255.6152 - val_loss: 3965.5815
Epoch 2/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 3902.4695 - val_loss: 3695.6963
Epoch 3/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 3688.5000 - val_loss: 3473.3945
Epoch 4/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 3526.9077 - val_loss: 3247.4031
Epoch 5/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 3289.4629 - val_loss: 3216.5601
Epoch 6/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 3172.7214 - val_loss: 2822.5076
Epoch 7/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 2688.6375 - val_loss: 2109.0791
Epoch 8/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 1

# Training eines XGBoost-Modells mit den Embeddings

In [None]:
# 4. XGBoost regressor fitted on the features extracted from the neural network
xgb_embedding_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
    random_state=42,
)
xg_reg_embeddings = xgb.XGBRegressor(**xgb_embedding_params)
xg_reg_embeddings.fit(X_train_embeddings, y_train)

# Predict on the extracted test features and score the predictions
y_pred_xgb_embeddings = xg_reg_embeddings.predict(X_test_embeddings)
mse_xgb_embeddings, r2_xgb_embeddings = (
    mean_squared_error(y_test, y_pred_xgb_embeddings),
    r2_score(y_test, y_pred_xgb_embeddings),
)

# Report both metrics rounded to two decimals
print(f"XGBoost (mit Embeddings) Mean Squared Error: {mse_xgb_embeddings:.2f}")
print(f"XGBoost (mit Embeddings) R^2 Score: {r2_xgb_embeddings:.2f}")

XGBoost (mit Embeddings) Mean Squared Error: 1233.56
XGBoost (mit Embeddings) R^2 Score: 0.94


# LightGBM ohne Embeddings

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Search space: 2 x 2 x 2 = 8 candidate settings
param_grid_lgb = dict(
    num_leaves=[31, 50],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200],
)

# LightGBM regressor with library defaults, tuned on the scaled features
lgb_reg = lgb.LGBMRegressor()

# 3-fold cross-validated grid search, ranked by negative MSE
grid_search_lgb = GridSearchCV(
    estimator=lgb_reg,
    param_grid=param_grid_lgb,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)
grid_search_lgb.fit(X_train, y_train)

# Show the winning parameter combination
print("Beste Parameter für LightGBM:", grid_search_lgb.best_params_)

# The best estimator has been refitted on the whole training split
lgb_reg_best = grid_search_lgb.best_estimator_

# Evaluate it on the held-out rows
y_pred_lgb = lgb_reg_best.predict(X_test)
mse_lgb, r2_lgb = (
    mean_squared_error(y_test, y_pred_lgb),
    r2_score(y_test, y_pred_lgb),
)

print(f"LightGBM Mean Squared Error: {mse_lgb:.2f}")
print(f"LightGBM R^2 Score: {r2_lgb:.2f}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 558
[LightGBM] [Info] Number of data points in the train set: 68040, number of used features: 12
[LightGBM] [Info] Start training from score 100.194637
[CV] END learning_rate=0.01, n_estimators=100, num_leaves=31; total time=   1.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 561
[LightGBM] [Info] Number of data points in the train set: 68040, number of used features: 12
[LightGBM] [Info] Start training from score 99.687671
[CV] END learning_rate=0.01, n_estimators=100,

# LightGBM mit Embeddings

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# LightGBM regressor with library defaults, tuned on the extracted network features
lgb_reg_embeddings = lgb.LGBMRegressor()

# Same search space as the run without embeddings (`param_grid_lgb`),
# again 3-fold cross-validated and ranked by negative MSE
grid_search_lgb_embeddings = GridSearchCV(
    estimator=lgb_reg_embeddings,
    param_grid=param_grid_lgb,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)
grid_search_lgb_embeddings.fit(X_train_embeddings, y_train)

# Show the winning parameter combination
print("Beste Parameter für LightGBM (mit Embeddings):", grid_search_lgb_embeddings.best_params_)

# The best estimator has been refitted on the whole training split
lgb_reg_best_embeddings = grid_search_lgb_embeddings.best_estimator_

# Evaluate it on the extracted test features
y_pred_lgb_embeddings = lgb_reg_best_embeddings.predict(X_test_embeddings)
mse_lgb_embeddings, r2_lgb_embeddings = (
    mean_squared_error(y_test, y_pred_lgb_embeddings),
    r2_score(y_test, y_pred_lgb_embeddings),
)

print(f"LightGBM (mit Embeddings) Mean Squared Error: {mse_lgb_embeddings:.2f}")
print(f"LightGBM (mit Embeddings) R^2 Score: {r2_lgb_embeddings:.2f}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 68040, number of used features: 30
[LightGBM] [Info] Start training from score 100.194637
[CV] END learning_rate=0.01, n_estimators=100, num_leaves=31; total time=   1.8s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 68040, number of used features: 30
[LightGBM] [Info] Start training from score 99.687671
[CV] END learning_rate=0.01, n_estimators=10

# XGBoost besser ohne Embeddings? Hyperparameter-Tuning eines XGBoost-Modells mit Grid-Search-Kreuzvalidierung (CV)

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Reload and prepare the data so this experiment starts from a clean state
# NOTE(review): absolute '/content/...' paths (presumably Colab uploads) — adjust when running elsewhere.
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')
bakery_data['demand'] = bakery_target['demand']
bakery_data = bakery_data.drop('date', axis=1)

# Label-encode the categorical variables in place, keeping the fitted encoders
categorical_columns = ['weekday', 'month', 'store', 'product']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# Feature matrix (everything except the target) and target vector
X = bakery_data.drop(['demand'], axis=1)
y = bakery_data['demand']

# Split BEFORE scaling: fitting the scaler on all rows would leak the mean and
# variance of the test rows into the training features. The row partition is
# the same as before because it only depends on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics of the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so that code relying on `X_scaled` continues to work
X_scaled = scaler.transform(X)

# XGBoost regressor whose remaining hyperparameters are chosen by the search
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Search space: 2 x 3 x 3 x 3 = 54 candidates, each evaluated with 3-fold CV
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'colsample_bytree': [0.3, 0.5, 0.7]
}
grid_search_xgb = GridSearchCV(estimator=xg_reg, param_grid=param_grid_xgb, cv=3, scoring='neg_mean_squared_error', verbose=2)
grid_search_xgb.fit(X_train, y_train)

# The best estimator has been refitted on the whole training split
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Metrics on the held-out rows
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost R^2 Score: {r2_xgb:.2f}")


Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   2.1s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.9s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.5s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=5, n_estimators=100; total time=   1.5s
[CV] END colsample_bytree=0.3

# XGBoost mit Embeddings

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation is repeatable
tf.keras.utils.set_random_seed(42)

# Output dimension of the embedding vector per categorical variable
embedding_sizes = {'weekday': 7, 'month': 12, 'store': 10, 'product': 50}

input_layers = []
embedding_layers = []

# One scalar input -> Embedding -> Flatten branch per categorical variable
for col in categorical_columns:
    input_layer = Input(shape=(1,))
    input_layers.append(input_layer)
    vocab_size = bakery_data[col].nunique() + 1  # number of distinct codes + 1
    embed_size = embedding_sizes[col]
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_size)(input_layer)
    embedding_layer = Flatten()(embedding_layer)
    embedding_layers.append(embedding_layer)

# Numeric branch: only the non-categorical columns of the feature matrix
input_numeric = Input(shape=(X_train.shape[1] - len(categorical_columns),))
input_layers.append(input_numeric)

# Concatenate the embedding branches with the numeric features
all_features = Concatenate()(embedding_layers + [input_numeric])

# Dense regression head; its last hidden layer is reused below as feature extractor
x = Dense(64, activation='relu')(all_features)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# End-to-end regression model used to learn the representation
embedding_model = Model(inputs=input_layers, outputs=output)
embedding_model.compile(optimizer='adam', loss='mse')

# Prepare the network inputs.
# `X_train` / `X_test` are the arrays returned by the scaler, so they cannot be
# indexed by column name and have no `.drop`. Wrap them in labelled frames
# (columns from `X`, row index from the matching target split) to select the
# numeric part by name.
X_train_frame = pd.DataFrame(X_train, columns=X.columns, index=y_train.index)
X_test_frame = pd.DataFrame(X_test, columns=X.columns, index=y_test.index)
X_train_num = X_train_frame.drop(columns=categorical_columns)
X_test_num = X_test_frame.drop(columns=categorical_columns)

# The categorical inputs must be the integer label codes, not the standardized
# floats stored in the scaled matrix; read them from the label-encoded
# `bakery_data` using the row index preserved by the target splits.
X_train_cat = [bakery_data.loc[y_train.index, col].values for col in categorical_columns]
X_test_cat = [bakery_data.loc[y_test.index, col].values for col in categorical_columns]

X_train_inputs = X_train_cat + [X_train_num.values]
X_test_inputs = X_test_cat + [X_test_num.values]

# Train the network
# NOTE(review): the test split doubles as validation data here; it is only
# monitored, but it should not be used to pick epochs or architecture.
embedding_model.fit(X_train_inputs, y_train, epochs=20, batch_size=32, validation_data=(X_test_inputs, y_test))

# 4. Extract learned features: activations of the second-to-last layer,
# i.e. the 32-unit Dense layer, not the raw embedding vectors
feature_extractor = Model(inputs=embedding_model.inputs, outputs=embedding_model.layers[-2].output)
X_train_embeddings = feature_extractor.predict(X_train_inputs)
X_test_embeddings = feature_extractor.predict(X_test_inputs)

# 5. XGBoost regressor fitted on the extracted features
xg_reg_embeddings = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                                     max_depth=5, alpha=10, n_estimators=100, random_state=42)
xg_reg_embeddings.fit(X_train_embeddings, y_train)

# Predict on the extracted test features
y_pred_xgb_embeddings = xg_reg_embeddings.predict(X_test_embeddings)

# Metrics on the held-out rows
mse_xgb_embeddings = mean_squared_error(y_test, y_pred_xgb_embeddings)
r2_xgb_embeddings = r2_score(y_test, y_pred_xgb_embeddings)

print(f"XGBoost (mit Embeddings) Mean Squared Error: {mse_xgb_embeddings:.2f}")
print(f"XGBoost (mit Embeddings) R^2 Score: {r2_xgb_embeddings:.2f}")

Epoch 1/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 10080.4268 - val_loss: 4012.2776
Epoch 2/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 3918.5989 - val_loss: 3654.6050
Epoch 3/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 3616.8767 - val_loss: 3496.3254
Epoch 4/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 3434.4839 - val_loss: 3295.3628
Epoch 5/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 3327.0933 - val_loss: 3151.4429
Epoch 6/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: 3223.4062 - val_loss: 3030.6929
Epoch 7/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 3118.7805 - val_loss: 2913.2332
Epoch 8/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss:

# Ergebnisse

In [None]:
import pandas as pd

# Final metrics of every experiment as (model label, MSE, R^2) records.
# NOTE(review): the numbers are hard-coded, not read from the metric variables;
# the first two rows match the outputs printed by the first XGBoost cells.
# The labels "XGBoost" and "XGBoost (mit Embeddings)" occur twice — presumably
# the second pair belongs to the later grid-search / second embedding run; confirm.
result_rows = [
    ("XGBoost", 6853.27, 0.64),
    ("XGBoost (mit Embeddings)", 1233.56, 0.94),
    ("LightGBM", 1329.10, 0.93),
    ("LightGBM (mit Embeddings)", 1197.59, 0.94),
    ("XGBoost", 1353.44, 0.93),
    ("XGBoost (mit Embeddings)", 1330.67, 0.93),
]
result_columns = ("Model", "Mean Squared Error", "R^2 Score")

# Pivot the records into a column-name -> list-of-values mapping
results = {
    column: list(values)
    for column, values in zip(result_columns, zip(*result_rows))
}

# Build the comparison table from the mapping
df_results = pd.DataFrame(results)

# Show the table as plain text
print(df_results)

                       Model  Mean Squared Error  R^2 Score
0                    XGBoost             6853.27       0.64
1   XGBoost (mit Embeddings)             1233.56       0.94
2                   LightGBM             1329.10       0.93
3  LightGBM (mit Embeddings)             1197.59       0.94
4                    XGBoost             1353.44       0.93
5   XGBoost (mit Embeddings)             1330.67       0.93
