# Datenverarbeitung

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Funktion zum Einlesen und Vorbereiten der Daten für einen einzelnen Datensatz
def prepare_data(data_path, target_path, target_col, categorical_columns):
    # Daten einlesen
    data = pd.read_csv(data_path)
    target = pd.read_csv(target_path)

    # Zielvariable hinzufügen
    data[target_col] = target[target_col]

    # Entfernen der 'date' Spalte, falls vorhanden
    if 'date' in data.columns:
        data = data.drop('date', axis=1)

    # Label Encoding der kategorischen Variablen
    label_encoders = {}
    for column in categorical_columns:
        if column in data.columns:
            label_encoders[column] = LabelEncoder()
            data[column] = label_encoders[column].fit_transform(data[column])

    # Alle Features und Zielvariable
    X = data.drop([target_col], axis=1)
    y = data[target_col]

    # Normalisierung der Features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Aufteilen der Daten in Trainings- und Testsets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

# SID

In [6]:
sid_path = '/content/SID_data.csv'
sid_target_path = '/content/SID_target.csv'
sid_categorical_columns = []


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Pfade für den SID-Datensatz
sid_path = '/content/SID_data.csv'
sid_target_path = '/content/SID_target.csv'

# Daten einlesen
sid_data = pd.read_csv(sid_path)
sid_target = pd.read_csv(sid_target_path)

# Überprüfen der Spalten in den Ziel-Daten
print("SID Target Columns:")
print(sid_target.columns)

print("\nSID Target Data Types:")
print(sid_target.dtypes)

# Zielvariable hinzufügen
sid_data['sales'] = sid_target['sales']

# Entfernen von nicht benötigten Spalten (Beispiel: 'date', falls vorhanden)
if 'date' in sid_data.columns:
    sid_data = sid_data.drop('date', axis=1)

# Kategorische Variablen
sid_categorical_columns = ['weekday', 'month']  # 'date' wurde entfernt

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in sid_categorical_columns:
    label_encoders[column] = LabelEncoder()
    sid_data[column] = label_encoders[column].fit_transform(sid_data[column])

# Alle Features und Zielvariable
X_sid = sid_data.drop(['sales'], axis=1)
y_sid = sid_data['sales']

# Normalisierung der Features
scaler = StandardScaler()
X_sid_scaled = scaler.fit_transform(X_sid)

# Aufteilen der Daten in Trainings- und Testsets
X_sid_train, X_sid_test, y_sid_train, y_sid_test = train_test_split(X_sid_scaled, y_sid, test_size=0.2, random_state=42)

# Zeige die vorbereiteten Daten
print("\nSID Data (Training Set):")
print(pd.DataFrame(X_sid_train, columns=X_sid.columns).head())

print("\nSID Data (Test Set):")
print(pd.DataFrame(X_sid_test, columns=X_sid.columns).head())

print("\nSID Target (Training Set):")
print(y_sid_train.head())

print("\nSID Target (Test Set):")
print(y_sid_test.head())


SID Target Columns:
Index(['sales'], dtype='object')

SID Target Data Types:
sales    int64
dtype: object

SID Data (Training Set):
    weekday     month      year     store      item
0  0.000000 -1.307378 -1.414795 -0.174078  0.519719
1 -2.085094 -0.437381  1.414020 -1.218544 -0.589015
2  3.127642 -1.017379 -1.414795  1.218544  1.697749
3  2.085094  0.722616 -1.414795 -1.218544 -0.935495
4  0.000000 -0.437381 -0.707591  0.174078 -0.103944

SID Data (Test Set):
   weekday     month      year     store      item
0      0.0 -0.437381 -1.414795  1.566699  0.103944
1      0.0  1.592613 -0.000387  0.174078 -1.489862
2      0.0 -1.307378  1.414020  1.566699 -1.628453
3      0.0  0.142617 -0.707591 -0.522233 -1.212678
4      0.0  1.302614  1.414020  0.522233  0.658311

SID Target (Training Set):
591855    41
295447    34
909684    39
202807    73
429503    28
Name: sales, dtype: int64

SID Target (Test Set):
491212     21
64903      15
36378     100
133834     93
633538     33
Name: sales, dt

# XGBoost-Modell ohne Embeddings

In [16]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Trainieren eines XGBoost-Modells ohne Embeddings
xgb_model_no_embeddings = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model_no_embeddings.fit(X_sid_train, y_sid_train)

# Vorhersagen und Bewertung
y_sid_pred_no_embeddings = xgb_model_no_embeddings.predict(X_sid_test)
print("\nMSE (SID, ohne Embeddings):", mean_squared_error(y_sid_test, y_sid_pred_no_embeddings))
print("R2 Score (SID, ohne Embeddings):", r2_score(y_sid_test, y_sid_pred_no_embeddings))


MSE (SID, ohne Embeddings): 104.76146167347257
R2 Score (SID, ohne Embeddings): 0.8735065046573893


# XGBoost-Modell mit Embeddings

In [17]:
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Definition eines einfachen MLP-Modells für Embeddings
input_dim = X_sid_train.shape[1]
embedding_dim = 10

input_layer = Input(shape=(input_dim,))
dense_layer = Dense(64, activation='relu')(input_layer)
embedding_layer = Dense(embedding_dim, activation='relu')(dense_layer)
output_layer = Dense(1, activation='linear')(embedding_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')

# Training des Modells
model.fit(X_sid_train, y_sid_train, epochs=10, batch_size=32, validation_split=0.2)

# Extrahieren der Embeddings
embedding_model = Model(inputs=model.input, outputs=model.layers[-2].output)
X_sid_train_embeddings = embedding_model.predict(X_sid_train)
X_sid_test_embeddings = embedding_model.predict(X_sid_test)

# XGBoost Modell mit den Embeddings trainieren
xgb_model_with_embeddings = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model_with_embeddings.fit(X_sid_train_embeddings, y_sid_train)

# Vorhersagen und Bewertung
y_sid_pred_with_embeddings = xgb_model_with_embeddings.predict(X_sid_test_embeddings)
print("\nMSE (SID, mit Embeddings):", mean_squared_error(y_sid_test, y_sid_pred_with_embeddings))
print("R2 Score (SID, mit Embeddings):", r2_score(y_sid_test, y_sid_pred_with_embeddings))


Epoch 1/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 2ms/step - loss: 917.7158 - val_loss: 643.9894
Epoch 2/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 2ms/step - loss: 641.9636 - val_loss: 628.8307
Epoch 3/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2ms/step - loss: 620.6719 - val_loss: 600.4889
Epoch 4/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 3ms/step - loss: 595.4899 - val_loss: 586.2525
Epoch 5/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - loss: 574.7885 - val_loss: 565.3560
Epoch 6/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - loss: 558.7922 - val_loss: 549.6039
Epoch 7/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 3ms/step - loss: 548.2467 - val_loss: 548.1833
Epoch 8/10
[1m18260/18260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss

# YAZ

In [7]:
yaz_path = '/content/yaz_data (3).csv'
yaz_target_path = '/content/yaz_target (3).csv'
yaz_categorical_columns = []


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Pfade für den YAZ-Datensatz
yaz_path = '/content/yaz_data (3).csv'
yaz_target_path = '/content/yaz_target (3).csv'

# Daten einlesen
yaz_data = pd.read_csv(yaz_path)
yaz_target = pd.read_csv(yaz_target_path)

# Überprüfen der Spalten in den YAZ-Daten
print("YAZ Data Columns:")
print(yaz_data.columns)

print("\nYAZ Data Types:")
print(yaz_data.dtypes)

# Überprüfen der Spalten in den YAZ-Ziel-Daten
print("\nYAZ Target Columns:")
print(yaz_target.columns)

print("\nYAZ Target Data Types:")
print(yaz_target.dtypes)

# Zielvariablen hinzufügen
# Da es mehrere Zielvariablen gibt, speichern wir sie in einem DataFrame
yaz_data = pd.concat([yaz_data, yaz_target], axis=1)

# Entfernen von nicht benötigten Spalten (Beispiel: 'date', falls vorhanden)
if 'date' in yaz_data.columns:
    yaz_data = yaz_data.drop('date', axis=1)

# Kategorische Variablen
yaz_categorical_columns = ['weekday', 'month']  # Beispielhafte Annahme

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in yaz_categorical_columns:
    label_encoders[column] = LabelEncoder()
    yaz_data[column] = label_encoders[column].fit_transform(yaz_data[column])

# Alle Features und Zielvariablen
X_yaz = yaz_data.drop(yaz_target.columns, axis=1)
y_yaz = yaz_data[yaz_target.columns]

# Normalisierung der Features
scaler = StandardScaler()
X_yaz_scaled = scaler.fit_transform(X_yaz)

# Aufteilen der Daten in Trainings- und Testsets
X_yaz_train, X_yaz_test, y_yaz_train, y_yaz_test = train_test_split(X_yaz_scaled, y_yaz, test_size=0.2, random_state=42)

# Multi-Label-Klassifikator vorbereiten (Beispiel: RandomForestClassifier)
model = MultiOutputClassifier(RandomForestClassifier())

# Modell trainieren
model.fit(X_yaz_train, y_yaz_train)

# Vorhersagen machen
y_yaz_pred = model.predict(X_yaz_test)

# Metriken berechnen
for i, column in enumerate(yaz_target.columns):
    print(f"Accuracy for {column}: {accuracy_score(y_yaz_test[column], y_yaz_pred[:, i])}")


YAZ Data Columns:
Index(['date', 'weekday', 'month', 'year', 'is_holiday', 'is_closed',
       'weekend', 'wind', 'clouds', 'rain', 'sunshine', 'temperature'],
      dtype='object')

YAZ Data Types:
date            object
weekday         object
month           object
year             int64
is_holiday       int64
is_closed        int64
weekend          int64
wind           float64
clouds         float64
rain           float64
sunshine         int64
temperature    float64
dtype: object

YAZ Target Columns:
Index(['calamari', 'fish', 'shrimp', 'chicken', 'koefte', 'lamb', 'steak'], dtype='object')

YAZ Target Data Types:
calamari    int64
fish        int64
shrimp      int64
chicken     int64
koefte      int64
lamb        int64
steak       int64
dtype: object
Accuracy for calamari: 0.1437908496732026
Accuracy for fish: 0.11764705882352941
Accuracy for shrimp: 0.0784313725490196
Accuracy for chicken: 0.026143790849673203
Accuracy for koefte: 0.0784313725490196
Accuracy for lamb: 0.065359477

# XGBoost-Modell ohne Embeddings

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Pfade für den YAZ-Datensatz
yaz_path = '/content/yaz_data (3).csv'
yaz_target_path = '/content/yaz_target (3).csv'

# Daten einlesen
yaz_data = pd.read_csv(yaz_path)
yaz_target = pd.read_csv(yaz_target_path)

# Zielvariablen hinzufügen
yaz_data = pd.concat([yaz_data, yaz_target], axis=1)

# Entfernen von nicht benötigten Spalten (Beispiel: 'date', falls vorhanden)
if 'date' in yaz_data.columns:
    yaz_data = yaz_data.drop('date', axis=1)

# Kategorische Variablen
yaz_categorical_columns = ['weekday', 'month']  # Beispielhafte Annahme

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in yaz_categorical_columns:
    label_encoders[column] = LabelEncoder()
    yaz_data[column] = label_encoders[column].fit_transform(yaz_data[column])

# Alle Features und Zielvariablen
X_yaz = yaz_data.drop(yaz_target.columns, axis=1)
y_yaz = yaz_data[yaz_target.columns]

# Normalisierung der Features
scaler = StandardScaler()
X_yaz_scaled = scaler.fit_transform(X_yaz)

# Aufteilen der Daten in Trainings- und Testsets
X_yaz_train, X_yaz_test, y_yaz_train, y_yaz_test = train_test_split(X_yaz_scaled, y_yaz, test_size=0.2, random_state=42)

# Multi-Output-Regressionsmodell vorbereiten
model_no_embeddings = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', n_estimators=100))

# Modell trainieren
model_no_embeddings.fit(X_yaz_train, y_yaz_train)

# Vorhersagen machen
y_yaz_pred_no_embeddings = model_no_embeddings.predict(X_yaz_test)

# Metriken berechnen
for i, column in enumerate(yaz_target.columns):
    mse_no_embeddings = mean_squared_error(y_yaz_test[column], y_yaz_pred_no_embeddings[:, i])
    print(f"Mean Squared Error für {column} (ohne Embeddings): {mse_no_embeddings}")


Mean Squared Error für calamari (ohne Embeddings): 7.156469408848473
Mean Squared Error für fish (ohne Embeddings): 9.19195603823591
Mean Squared Error für shrimp (ohne Embeddings): 18.713473084944567
Mean Squared Error für chicken (ohne Embeddings): 112.33111819097934
Mean Squared Error für koefte (ohne Embeddings): 61.0968887534827
Mean Squared Error für lamb (ohne Embeddings): 94.04143190533074
Mean Squared Error für steak (ohne Embeddings): 67.84180353348987


# XGBoost-Modell mit Embeddings

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Pfade für den YAZ-Datensatz
yaz_path = '/content/yaz_data (3).csv'
yaz_target_path = '/content/yaz_target (3).csv'

# Daten einlesen
yaz_data = pd.read_csv(yaz_path)
yaz_target = pd.read_csv(yaz_target_path)

# Zielvariablen hinzufügen
yaz_data = pd.concat([yaz_data, yaz_target], axis=1)

# Entfernen von nicht benötigten Spalten (Beispiel: 'date', falls vorhanden)
if 'date' in yaz_data.columns:
    yaz_data = yaz_data.drop('date', axis=1)

# Kategorische Variablen
yaz_categorical_columns = ['weekday', 'month']  # Beispielhafte Annahme

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in yaz_categorical_columns:
    label_encoders[column] = LabelEncoder()
    yaz_data[column] = label_encoders[column].fit_transform(yaz_data[column])

# Alle Features und Zielvariablen
X_yaz = yaz_data.drop(yaz_target.columns, axis=1)
y_yaz = yaz_data[yaz_target.columns]

# Normalisierung der Features
scaler = StandardScaler()
X_yaz_scaled = scaler.fit_transform(X_yaz)

# Aufteilen der Daten in Trainings- und Testsets
X_yaz_train, X_yaz_test, y_yaz_train, y_yaz_test = train_test_split(X_yaz_scaled, y_yaz, test_size=0.2, random_state=42)

# Einfaches MLP-Modell für Embeddings
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

input_dim = X_yaz_train.shape[1]
embedding_dim = 10

input_layer = Input(shape=(input_dim,))
dense_layer = Dense(64, activation='relu')(input_layer)
embedding_layer = Dense(embedding_dim, activation='relu')(dense_layer)
output_layer = Dense(1, activation='linear')(embedding_layer)

mlp_model = Model(inputs=input_layer, outputs=output_layer)
mlp_model.compile(optimizer='adam', loss='mean_squared_error')

# Training des MLP-Modells
mlp_model.fit(X_yaz_train, y_yaz_train, epochs=10, batch_size=32, validation_split=0.2)

# Extrahieren der Embeddings
embedding_model = Model(inputs=mlp_model.input, outputs=mlp_model.layers[-2].output)
X_yaz_train_embeddings = embedding_model.predict(X_yaz_train)
X_yaz_test_embeddings = embedding_model.predict(X_yaz_test)

# XGBoost Modell mit den Embeddings trainieren
model_with_embeddings = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', n_estimators=100))
model_with_embeddings.fit(X_yaz_train_embeddings, y_yaz_train)

# Vorhersagen machen
y_yaz_pred_with_embeddings = model_with_embeddings.predict(X_yaz_test_embeddings)

# Metriken berechnen
for i, column in enumerate(yaz_target.columns):
    mse_with_embeddings = mean_squared_error(y_yaz_test[column], y_yaz_pred_with_embeddings[:, i])
    print(f"Mean Squared Error für {column} (mit Embeddings): {mse_with_embeddings}")


Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 481.9826 - val_loss: 568.6637
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 481.3294 - val_loss: 545.0606
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 445.7200 - val_loss: 518.6108
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 427.5568 - val_loss: 488.0193
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 400.6920 - val_loss: 453.6962
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 365.0493 - val_loss: 415.8508
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 317.8142 - val_loss: 375.3152
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 291.4059 - val_loss: 335.4407
Epoch 9/10
[1m16/16[0

# MLP ohne Embeddings

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score

# Daten einlesen
sid_data = pd.read_csv('/content/SID_data.csv')
sid_target = pd.read_csv('/content/SID_target.csv')

# Überprüfen der Spalten in den Ziel-Daten
print("SID Target Columns:")
print(sid_target.columns)

print("\nSID Target Data Types:")
print(sid_target.dtypes)

# Zielvariable hinzufügen
sid_data['sales'] = sid_target['sales']

# Entfernen von nicht benötigten Spalten (Beispiel: 'date', falls vorhanden)
if 'date' in sid_data.columns:
    sid_data = sid_data.drop('date', axis=1)

# Kategorische Variablen (Beispielhaft 'weekday', 'month')
sid_categorical_columns = ['weekday', 'month']

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in sid_categorical_columns:
    label_encoders[column] = LabelEncoder()
    sid_data[column] = label_encoders[column].fit_transform(sid_data[column])

# Features (X) und Zielvariable (y)
X_sid = sid_data.drop(['sales'], axis=1)
y_sid = sid_data['sales']

# Normalisierung der Features
scaler = StandardScaler()
X_sid_scaled = scaler.fit_transform(X_sid)

# Aufteilen der Daten in Trainings- und Testsets
X_sid_train, X_sid_test, y_sid_train, y_sid_test = train_test_split(X_sid_scaled, y_sid, test_size=0.2, random_state=42)

# Definieren des MLP-Modells ohne Embeddings
class MLPWithoutEmbeddings(nn.Module):
    def __init__(self, input_size):
        super(MLPWithoutEmbeddings, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialisiere das MLP-Modell
input_size = X_sid_train.shape[1]
mlp_model = MLPWithoutEmbeddings(input_size)

# Optimizer und Loss-Funktion
criterion = nn.MSELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

# Konvertiere die Trainingsdaten in Torch-Tensoren
X_sid_train_tensor = torch.tensor(X_sid_train, dtype=torch.float32)
y_sid_train_tensor = torch.tensor(y_sid_train.values, dtype=torch.float32).view(-1, 1)

# Trainingsloop
epochs = 100
for epoch in range(epochs):
    mlp_model.train()

    # Vorhersagen und Verlust berechnen
    optimizer.zero_grad()
    outputs = mlp_model(X_sid_train_tensor)
    loss = criterion(outputs, y_sid_train_tensor)

    # Rückwärtspropagation
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Evaluierung auf Testset
mlp_model.eval()
X_sid_test_tensor = torch.tensor(X_sid_test, dtype=torch.float32)
y_sid_test_tensor = torch.tensor(y_sid_test.values, dtype=torch.float32).view(-1, 1)

# Vorhersagen
with torch.no_grad():
    y_pred = mlp_model(X_sid_test_tensor).numpy()

# Berechnung des Mean Squared Error (MSE) und R²-Wert
mse = mean_squared_error(y_sid_test, y_pred)
r2 = r2_score(y_sid_test, y_pred)

print(f'Mean Squared Error (MSE) auf Testset: {mse:.4f}')
print(f'R²-Wert auf Testset: {r2:.4f}')


SID Target Columns:
Index(['sales'], dtype='object')

SID Target Data Types:
sales    int64
dtype: object
Epoch [10/100], Loss: 3500.9707
Epoch [20/100], Loss: 3399.0120
Epoch [30/100], Loss: 3256.3379
Epoch [40/100], Loss: 3062.7607
Epoch [50/100], Loss: 2811.4595
Epoch [60/100], Loss: 2503.3369
Epoch [70/100], Loss: 2151.3074
Epoch [80/100], Loss: 1783.7434
Epoch [90/100], Loss: 1443.7964
Epoch [100/100], Loss: 1179.0358
Mean Squared Error (MSE) auf Testset: 1158.5664
R²-Wert auf Testset: -0.3989


MLP ohne Ebeddings Bakery

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Daten einlesen und vorbereiten
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Zielvariable hinzufügen
bakery_data['demand'] = bakery_target['demand']

# Entfernen der 'date' Spalte
bakery_data = bakery_data.drop('date', axis=1)

# Kategorische Variablen
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# Alle Features und Zielvariable
X = bakery_data.drop(['demand'], axis=1)
y = bakery_data['demand']

# Normalisierung der Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Aufteilen der Daten in Trainings- und Testsets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score

# 1. Daten einlesen und vorbereiten
bakery_data = pd.read_csv('/content/bakery_data (3).csv')
bakery_target = pd.read_csv('/content/bakery_target (3).csv')

# Zielvariable hinzufügen
bakery_data['demand'] = bakery_target['demand']

# Entfernen der 'date' Spalte (falls vorhanden)
if 'date' in bakery_data.columns:
    bakery_data = bakery_data.drop('date', axis=1)

# Kategorische Variablen
categorical_columns = ['weekday', 'month', 'store', 'product']

# Label Encoding der kategorischen Variablen
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    bakery_data[column] = label_encoders[column].fit_transform(bakery_data[column])

# Features (X) und Zielvariable (y)
X = bakery_data.drop(['demand'], axis=1)
y = bakery_data['demand']

# Normalisierung der Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Aufteilen der Daten in Trainings- und Testsets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Definieren des MLP-Modells ohne Embeddings
class MLPWithoutEmbeddings(nn.Module):
    def __init__(self, input_size):
        super(MLPWithoutEmbeddings, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialisiere das MLP-Modell
input_size = X_train.shape[1]
mlp_model = MLPWithoutEmbeddings(input_size)

# Optimizer und Loss-Funktion
criterion = nn.MSELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

# Konvertiere die Trainingsdaten in Torch-Tensoren
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

# Trainingsloop
epochs = 100
for epoch in range(epochs):
    mlp_model.train()

    # Vorhersagen und Verlust berechnen
    optimizer.zero_grad()
    outputs = mlp_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Rückwärtspropagation
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Evaluierung auf dem Testset
mlp_model.eval()
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Vorhersagen
with torch.no_grad():
    y_pred = mlp_model(X_test_tensor).numpy()

# Berechnung des Mean Squared Error (MSE) und R²-Wert
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE) auf Testset: {mse:.4f}')
print(f'R²-Wert auf Testset: {r2:.4f}')


Epoch [10/100], Loss: 29254.3301
Epoch [20/100], Loss: 29066.5996
Epoch [30/100], Loss: 28788.8398
Epoch [40/100], Loss: 28382.2305
Epoch [50/100], Loss: 27806.1836
Epoch [60/100], Loss: 27023.6426
Epoch [70/100], Loss: 26009.2188
Epoch [80/100], Loss: 24760.3652
Epoch [90/100], Loss: 23309.4375
Epoch [100/100], Loss: 21732.7812
Mean Squared Error (MSE) auf Testset: 21479.6050
R²-Wert auf Testset: -0.1163
