In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
import xgboost as xgb

# 1. Load and prepare the data
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Attach the target column.
# NOTE(review): this assignment aligns on the row index, so both files are
# assumed to list the same rows in the same order — confirm.
bakery_data['demand'] = bakery_target['demand']

# The raw 'date' column is dropped and not used as a feature
bakery_data = bakery_data.drop('date', axis=1)

# Columns treated as categorical (each gets its own embedding later on)
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label-encode every categorical column to integer codes; the fitted encoders
# are kept so the codes can be mapped back to the original labels.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# Numeric features are all remaining columns; 'demand' is the target
X_numeric = bakery_data.drop(categorical_columns + ['demand'], axis=1)
y = bakery_data['demand']

# Split BEFORE scaling so the scaler never sees the test rows. With the same
# test_size / random_state this selects the same rows as splitting scaled data.
X_train_num_raw, X_test_num_raw, X_train_cat, X_test_cat, y_train, y_test = train_test_split(
    X_numeric, bakery_data[categorical_columns], y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it unchanged to the
# test rows; fitting on the full data would leak test statistics into training.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num_raw)
X_test_num = scaler.transform(X_test_num_raw)

# Full scaled matrix (using the train-fitted scaler), kept so that code
# referring to X_numeric_scaled keeps working.
X_numeric_scaled = scaler.transform(X_numeric)

# 2. Learn categorical embeddings with a neural network
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs (some GPU kernels may still be
# non-deterministic).
tf.keras.utils.set_random_seed(42)

# Embedding OUTPUT dimension (length of the learned vector) per categorical
# column. These are example values, not tuned.
embedding_sizes = {'weekday': 7, 'month': 12, 'store': 10, 'product': 50}

input_layers = []
embedding_layers = []

# One integer input + one embedding layer per categorical column
for col in categorical_columns:
    input_layer = Input(shape=(1,))
    input_layers.append(input_layer)
    # Number of distinct codes in the column, plus one spare row
    vocab_size = bakery_data[col].nunique() + 1
    embed_size = embedding_sizes[col]
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_size)(input_layer)
    # Collapse the length-1 sequence axis so each sample is a flat vector
    embedding_layer = Flatten()(embedding_layer)
    embedding_layers.append(embedding_layer)

# Numeric features enter the network as one dense input
input_numeric = Input(shape=(X_train_num.shape[1],))
input_layers.append(input_numeric)

# Concatenate all embedding vectors with the numeric features
all_features = Concatenate()(embedding_layers + [input_numeric])

# Dense regression head; the 32-unit layer is what gets extracted later
x = Dense(64, activation='relu')(all_features)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# End-to-end regression model used to train the embeddings
embedding_model = Model(inputs=input_layers, outputs=output)
embedding_model.compile(optimizer='adam', loss='mse')

# Inputs must be passed in the same order as input_layers:
# one array per categorical column, followed by the numeric matrix.
X_train_inputs = [X_train_cat[col].values for col in categorical_columns] + [X_train_num]
X_test_inputs = [X_test_cat[col].values for col in categorical_columns] + [X_test_num]

# NOTE(review): the test split doubles as validation data here. val_loss is
# only reported (no callbacks act on it), but tuning epochs or architecture
# against it would bias the test metrics computed afterwards.
embedding_model.fit(X_train_inputs, y_train, epochs=20, batch_size=32, validation_data=(X_test_inputs, y_test))

# 3. Extract learned features
# The extractor re-uses the trained network up to its second-to-last layer, so
# the "embeddings" handed to XGBoost are that layer's activations rather than
# the raw per-column embedding vectors.
penultimate_output = embedding_model.layers[-2].output
feature_extractor = Model(inputs=embedding_model.inputs, outputs=penultimate_output)
X_train_embeddings = feature_extractor.predict(X_train_inputs)
X_test_embeddings = feature_extractor.predict(X_test_inputs)

# 4. Train XGBoost on the extracted features
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
    random_state=42,
)
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train_embeddings, y_train)

# Predict on the held-out split and score the predictions
y_pred_xgb = xg_reg.predict(X_test_embeddings)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)

print(f"XGBoost (mit Embeddings) Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost (mit Embeddings) R^2 Score: {r2_xgb:.2f}")

Epoch 1/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 9968.5986 - val_loss: 4061.6184
Epoch 2/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 3964.0850 - val_loss: 3741.4451
Epoch 3/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 3770.2537 - val_loss: 3518.1062
Epoch 4/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 3533.0977 - val_loss: 3315.0466
Epoch 5/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 3495.5117 - val_loss: 3229.4778
Epoch 6/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 3199.5391 - val_loss: 3068.3818
Epoch 7/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 3069.8728 - val_loss: 2889.0525
Epoch 8/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 2758

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# 1. Load and prepare the data
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Attach the target column.
# NOTE(review): this assignment aligns on the row index, so both files are
# assumed to list the same rows in the same order — confirm.
bakery_data['demand'] = bakery_target['demand']

# The raw 'date' column is dropped and not used as a feature
bakery_data = bakery_data.drop('date', axis=1)

# Columns treated as categorical
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label-encode every categorical column to integer codes; the fitted encoders
# are kept so the codes can be mapped back to the original labels.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# All columns except the target are features (encoded categoricals included)
X = bakery_data.drop(['demand'], axis=1)
y = bakery_data['demand']

# Split BEFORE scaling so the scaler never sees the test rows. With the same
# test_size / random_state this selects the same rows as splitting scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it unchanged to the
# test rows; fitting on the full data would leak test statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix (using the train-fitted scaler), kept so that code
# referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# 2. Train the XGBoost model on the scaled feature matrix
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100,
    'random_state': 42,
}
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

# Predict on the held-out split and score the predictions
y_pred_xgb = xg_reg.predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)

print(f"XGBoost Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost R^2 Score: {r2_xgb:.2f}")

XGBoost Mean Squared Error: 6853.27
XGBoost R^2 Score: 0.64


# Neuer Versuch

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
import xgboost as xgb

# 1. Load and prepare the data
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Attach the target column.
# NOTE(review): this assignment aligns on the row index, so both files are
# assumed to list the same rows in the same order — confirm.
bakery_data['demand'] = bakery_target['demand']

# The raw 'date' column is dropped and not used as a feature
bakery_data = bakery_data.drop('date', axis=1)

# Columns treated as categorical (each gets its own embedding later on)
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label-encode every categorical column to integer codes; the fitted encoders
# are kept so the codes can be mapped back to the original labels.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# Numeric features are all remaining columns; 'demand' is the target
X_numeric = bakery_data.drop(categorical_columns + ['demand'], axis=1)
y = bakery_data['demand']

# Split BEFORE scaling so the scaler never sees the test rows. With the same
# test_size / random_state this selects the same rows as splitting scaled data.
X_train_num_raw, X_test_num_raw, X_train_cat, X_test_cat, y_train, y_test = train_test_split(
    X_numeric, bakery_data[categorical_columns], y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it unchanged to the
# test rows; fitting on the full data would leak test statistics into training.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num_raw)
X_test_num = scaler.transform(X_test_num_raw)

# Full scaled matrix (using the train-fitted scaler), kept so that code
# referring to X_numeric_scaled keeps working.
X_numeric_scaled = scaler.transform(X_numeric)


In [2]:
# 2. Learn categorical embeddings with a neural network
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs (some GPU kernels may still be
# non-deterministic).
tf.keras.utils.set_random_seed(42)

# Embedding OUTPUT dimension (length of the learned vector) per categorical
# column. These are example values, not tuned.
embedding_sizes = {'weekday': 7, 'month': 12, 'store': 10, 'product': 50}

input_layers = []
embedding_layers = []

# One integer input + one embedding layer per categorical column
for col in categorical_columns:
    input_layer = Input(shape=(1,))
    input_layers.append(input_layer)
    # Number of distinct codes in the column, plus one spare row
    vocab_size = bakery_data[col].nunique() + 1
    embed_size = embedding_sizes[col]
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_size)(input_layer)
    # Collapse the length-1 sequence axis so each sample is a flat vector
    embedding_layer = Flatten()(embedding_layer)
    embedding_layers.append(embedding_layer)

# Numeric features enter the network as one dense input
input_numeric = Input(shape=(X_train_num.shape[1],))
input_layers.append(input_numeric)

# Concatenate all embedding vectors with the numeric features
all_features = Concatenate()(embedding_layers + [input_numeric])

# Dense regression head; the 32-unit layer is what gets extracted later
x = Dense(64, activation='relu')(all_features)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# End-to-end regression model used to train the embeddings
embedding_model = Model(inputs=input_layers, outputs=output)
embedding_model.compile(optimizer='adam', loss='mse')

# Inputs must be passed in the same order as input_layers:
# one array per categorical column, followed by the numeric matrix.
X_train_inputs = [X_train_cat[col].values for col in categorical_columns] + [X_train_num]
X_test_inputs = [X_test_cat[col].values for col in categorical_columns] + [X_test_num]

# NOTE(review): the test split doubles as validation data here. val_loss is
# only reported (no callbacks act on it), but tuning epochs or architecture
# against it would bias the test metrics computed afterwards.
embedding_model.fit(X_train_inputs, y_train, epochs=20, batch_size=32, validation_data=(X_test_inputs, y_test))


Epoch 1/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step - loss: 9905.9180 - val_loss: 4017.6182
Epoch 2/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - loss: 3985.2932 - val_loss: 3673.2241
Epoch 3/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - loss: 3677.1362 - val_loss: 3500.9512
Epoch 4/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - loss: 3553.4971 - val_loss: 3304.6511
Epoch 5/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - loss: 3462.8237 - val_loss: 3137.6987
Epoch 6/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - loss: 3259.5811 - val_loss: 3003.6157
Epoch 7/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - loss: 2990.0562 - val_loss: 2619.5757
Epoch 8/20
[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 

<keras.src.callbacks.history.History at 0x7b886d7ad480>

In [3]:
def _fit_and_score_xgb(X_fit, y_fit, X_eval, y_eval):
    """Fit an XGBoost regressor with the notebook's fixed hyperparameters and score it.

    Args:
        X_fit: feature matrix used for training.
        y_fit: training targets.
        X_eval: feature matrix used for evaluation.
        y_eval: evaluation targets.

    Returns:
        Tuple ``(model, predictions, mse, r2)`` where ``predictions`` are the
        model outputs for ``X_eval`` and ``mse`` / ``r2`` compare them to ``y_eval``.
    """
    model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                             max_depth=5, alpha=10, n_estimators=100, random_state=42)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return model, predictions, mean_squared_error(y_eval, predictions), r2_score(y_eval, predictions)


# 3. Extract learned features
# The extractor re-uses the trained network up to its second-to-last layer, so
# the "embeddings" handed to XGBoost are that layer's activations rather than
# the raw per-column embedding vectors.
feature_extractor = Model(inputs=embedding_model.inputs, outputs=embedding_model.layers[-2].output)
X_train_embeddings = feature_extractor.predict(X_train_inputs)
X_test_embeddings = feature_extractor.predict(X_test_inputs)

# 4. XGBoost trained on the extracted features
xg_reg, y_pred_xgb, mse_xgb, r2_xgb = _fit_and_score_xgb(
    X_train_embeddings, y_train, X_test_embeddings, y_test
)

print(f"XGBoost (mit Embeddings) Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost (mit Embeddings) R^2 Score: {r2_xgb:.2f}")

# 5. Baseline: XGBoost on the scaled numeric features only.
# Despite the "_all" suffix, the categorical columns are NOT part of this
# baseline; it sees nothing but X_train_num / X_test_num.
X_train_all = X_train_num
X_test_all = X_test_num

xg_reg_all, y_pred_xgb_all, mse_xgb_all, r2_xgb_all = _fit_and_score_xgb(
    X_train_all, y_train, X_test_all, y_test
)

print(f"XGBoost (nur numerische Features) Mean Squared Error: {mse_xgb_all:.2f}")
print(f"XGBoost (nur numerische Features) R^2 Score: {r2_xgb_all:.2f}")

[1m3190/3190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
XGBoost (mit Embeddings) Mean Squared Error: 1310.45
XGBoost (mit Embeddings) R^2 Score: 0.93
XGBoost (nur numerische Features) Mean Squared Error: 18628.96
XGBoost (nur numerische Features) R^2 Score: 0.03


In [4]:
# 1. Load and prepare the data
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Attach the target column.
# NOTE(review): this assignment aligns on the row index, so both files are
# assumed to list the same rows in the same order — confirm.
bakery_data['demand'] = bakery_target['demand']

# The raw 'date' column is dropped and not used as a feature
bakery_data = bakery_data.drop('date', axis=1)

# Columns treated as categorical
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label-encode every categorical column to integer codes; the fitted encoders
# are kept so the codes can be mapped back to the original labels.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# All columns except the target are features (encoded categoricals included)
X = bakery_data.drop(['demand'], axis=1)
y = bakery_data['demand']

# Split BEFORE scaling so the scaler never sees the test rows. With the same
# test_size / random_state this selects the same rows as splitting scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it unchanged to the
# test rows; fitting on the full data would leak test statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix (using the train-fitted scaler), kept so that code
# referring to X_scaled keeps working.
X_scaled = scaler.transform(X)


In [5]:
# 2. Train the XGBoost model on the scaled feature matrix
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100,
    'random_state': 42,
}
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

# Predict on the held-out split and score the predictions
y_pred_xgb = xg_reg.predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)

print(f"XGBoost Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost R^2 Score: {r2_xgb:.2f}")

XGBoost Mean Squared Error: 6853.27
XGBoost R^2 Score: 0.64
