# Regressionsanalyse für Endgewicht-Vorhersage
## Aufgabe 12.3: Regressionsmodell für Endgewicht

Dieses Notebook dokumentiert die Entwicklung eines linearen Regressionsmodells zur Vorhersage des Endgewichts von Flaschen basierend auf IoT-Sensordaten, orientiert am Beispiel aus `docs/8_Regression_Python.ipynb`.

In [None]:
# Libraries used throughout the notebook (same stack as the Iris example).

# Standard library
import os

# Third-party: data handling, plotting and modelling
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Visible confirmation that the import cell ran without errors.
print("Bibliotheken erfolgreich importiert")

## 1. Daten laden (analog zu sns.load_dataset('iris'))

In [None]:
# Read the training set from disk; this plays the role that
# sns.load_dataset('iris') has in the reference example.
data_path = '../database/data.csv'
df = pd.read_csv(data_path)

# Report the table dimensions, then preview the first rows as the cell output.
print(
    "Training data loaded:",
    f"Shape: {df.shape}",
    "\nFirst 5 rows:",
    sep="\n",
)
df.head()

In [None]:
# Dataset overview: column dtypes / non-null counts, then summary statistics.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()
print("\nDataset Description:")
# Last expression of the cell: rendered by the notebook as a table.
df.describe()

## 2. Features und Zielvariable definieren (analog zum Iris-Beispiel)

In [None]:
# Separate the regression target from the predictors (mirrors the Iris example).
y = df['final_weight']  # target, the counterpart of 'petal_length' in the Iris example
X = df.drop(columns=['bottle', 'final_weight'])  # everything except the ID column and the target

print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Numbered list of the predictor columns, counting from 1.
print("\nFeatures:")
for position, column_name in enumerate(X.columns, start=1):
    print(f"{position}. {column_name}")

## 3. Train-Test Split (wie im Iris-Beispiel)

In [None]:
# Hold out 30 % of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the dimensions of each of the four resulting pieces.
for description, part in (
    ("Training features", X_train),
    ("Test features", X_test),
    ("Training target", y_train),
    ("Test target", y_test),
):
    print(f"{description} shape:", part.shape)

## 4. Modell erstellen und trainieren

In [None]:
# Fit an ordinary least-squares linear model on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

print("Modell erfolgreich trainiert!")
# The intercept is the constant term (β₀) of the fitted equation.
print(f"Intercept (β₀): {model.intercept_:.4f}")

In [None]:
# List every fitted weight next to the feature column it belongs to.
print("Model coefficients:")
named_weights = list(zip(X.columns, model.coef_))
for column_name, weight in named_weights:
    print(f"{column_name}: {weight:.4f}")

## 5. Modell evaluieren

In [None]:
from sklearn.metrics import r2_score

# Predict on both splits so training and test errors can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Mean squared error: the metric reported in the results table.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# Coefficient of determination as an additional metric for the analysis.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print("Model Performance:")
print(f"Training MSE: {mse_train:.4f}")
print(f"Test MSE: {mse_test:.4f}")
print(f"R² Training: {r2_train:.4f}")
print(f"R² Test: {r2_test:.4f}")

## 6. Ergebnistabelle für Report (genau wie gefordert)

In [None]:
# Results table for the report: one row describing this model, printed as a
# Markdown table and also kept as a DataFrame for rich notebook display.
table_columns = [
    'Genutzte Spalten',
    'Modell-Typ',
    'MSE-Wert (Training)',
    'MSE-Wert (Test)',
]
table_row = ['All features', 'Linear', f"{mse_train:.4f}", f"{mse_test:.4f}"]

print("=== RESULTS TABLE ===")
print("| " + " | ".join(table_columns) + " |")
print("|------------------|------------|---------------------|-----------------|")
print("| " + " | ".join(table_row) + " |")

# Same content as a single-row DataFrame; the MSE values stay pre-formatted strings.
results_df = pd.DataFrame([table_row], columns=table_columns)
results_df

## 7. Modellformel (verallgemeinerte y = mx + b Form: y = β₀ + β₁x₁ + … + βₙxₙ)

In [None]:
# Model formula: the fitted equation with numeric coefficients, followed by
# its symbolic general form.
print("=== MODEL FORMULA ===")

# One signed term per feature. The sign is emitted separately and the
# magnitude formatted with abs(), so a negative coefficient reads "- 1.2345"
# and a negative zero can never render as "+ -0.0000".
signed_terms = [
    f"{'+' if coef >= 0 else '-'} {abs(coef):.4f} * {feature}"
    for feature, coef in zip(X.columns, model.coef_)
]
formula = " ".join([f"final_weight = {model.intercept_:.4f}", *signed_terms])

print("Complete formula:")
print(formula)

# Symbolic representation, derived from the columns the model was actually
# fitted on so it cannot drift out of sync with the data (names, order, count).
subscript_digits = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
symbolic_terms = [
    f"β{str(index).translate(subscript_digits)}×{feature}"
    for index, feature in enumerate(X.columns, start=1)
]
print("\nGeneral form:")
print("y = " + " + ".join(["β₀", *symbolic_terms]))

## 8. Vorhersagen für X.csv (wie gefordert)

In [None]:
# Read the unlabeled rows (X.csv) for which final weights must be predicted.
pred_path = '../X.csv'
X_pred_df = pd.read_csv(pred_path)

# Report the table dimensions, then preview the first rows as the cell output.
print(
    f"Prediction data loaded. Shape: {X_pred_df.shape}",
    "First 5 rows:",
    sep="\n",
)
X_pred_df.head()

In [None]:
# Build the prediction matrix from X.csv.
# 1) Drop the bottle ID, which is an identifier and not a feature.
# 2) Select exactly the columns the model was fitted on, in the same order.
#    scikit-learn validates feature names/order against the training frame,
#    and a missing column raises a KeyError here instead of failing later.
X_pred = X_pred_df.drop(columns=['bottle'])[list(X_train.columns)]

# Fill gaps with the TRAINING means: imputation statistics must come from the
# data the model was fitted on, not from the rows being predicted.
X_pred = X_pred.fillna(X_train.mean())

# Final predictions for every row of X.csv.
final_predictions = model.predict(X_pred)

print(f"First 5 predictions: {final_predictions[:5]}")
print(f"Total predictions made: {len(final_predictions)}")

In [None]:
# Assemble the submission table: one row per bottle with its predicted weight.
predictions_df = pd.DataFrame({
    'Flaschen_ID': X_pred_df['bottle'],
    'y_hat': final_predictions,
})

# Write the CSV in the required format. DataFrame.to_csv does not create
# missing directories, so make sure the target folder exists first.
output_path = '../linear_reg/reg_student1-student2-student3.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
predictions_df.to_csv(output_path, index=False)

print(f"Predictions saved to: {output_path}")
print("Sample predictions (Flaschen_ID, y_hat):")
# itertuples() keeps each column's own dtype. Row-wise access via iloc[i]
# would upcast the mixed int/float row, printing integer IDs as e.g. "1.0".
for row in predictions_df.head(5).itertuples(index=False):
    print(f"{row.Flaschen_ID}, {row.y_hat:.1f}")

## 9. Visualisierungen (Optional)

In [None]:
# Two diagnostic plots side by side, both based on the held-out test split.
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: predicted vs. actual values. The dashed red diagonal marks a
# perfect prediction (predicted == actual).
ax_fit.scatter(y_test, y_test_pred, alpha=0.6)
actual_low, actual_high = y_test.min(), y_test.max()
ax_fit.plot([actual_low, actual_high], [actual_low, actual_high], 'r--', lw=2)
ax_fit.set_xlabel('Actual Values')
ax_fit.set_ylabel('Predicted Values')
ax_fit.set_title('Predictions vs Actual Values')
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the predictions,
# with a dashed zero line for reference.
residuals = y_test - y_test_pred
ax_resid.scatter(y_test_pred, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Values')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')
ax_resid.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the predictions made for X.csv.
print("Prediction Statistics:")
summary_lines = (
    ("Mean predicted weight", final_predictions.mean()),
    ("Min predicted weight", final_predictions.min()),
    ("Max predicted weight", final_predictions.max()),
    ("Standard deviation", final_predictions.std()),
)
for caption, value in summary_lines:
    print(f"{caption}: {value:.2f}")

# Histogram showing how the predicted final weights are distributed.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(final_predictions, bins=30, alpha=0.7, edgecolor='black')
hist_ax.set_xlabel('Predicted Final Weight')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Predicted Final Weights')
hist_ax.grid(True, alpha=0.3)
plt.show()

## 10. Zusammenfassung

### Ergebnisse der Regressionsanalyse

**Modell-Performance:**
- Training MSE: [Wert wird bei Ausführung angezeigt]
- Test MSE: [Wert wird bei Ausführung angezeigt]
- Anzahl Features: 9 (alle IoT-Sensordaten)
- Modell-Typ: Lineare Regression

**Implementierung:**
- Einfacher, transparenter Ansatz (wie Iris-Beispiel)
- Alle Sensordaten als Features verwendet
- Standardmäßige Train-Test-Aufteilung (70/30)
- Direkte Anwendung ohne komplexe Vorverarbeitung

**Deliverables erfüllt:**
- ✅ Lineares Regressionsmodell implementiert
- ✅ Ergebnistabelle erstellt
- ✅ Modellformel in verallgemeinerter y=mx+b Form (y = β₀ + β₁x₁ + … + βₙxₙ) dokumentiert
- ✅ Vorhersagen für X.csv generiert
- ✅ CSV-Datei mit Vorhersagen gespeichert
- ✅ Dokumentation erstellt

**Nächste Schritte:**
1. Benennen Sie die CSV-Datei mit Ihren Matrikelnummern um
2. Kopieren Sie die MSE-Werte in Ihren Report
3. Fügen Sie die Modellformel zu Ihrer Dokumentation hinzu