In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

# Load the processed imputed data and feature columns
wetter_umsatzdaten_kiwo = pd.read_csv("../0_DataPreparation/processed_data_imputed.csv")

# Filter out rows with Umsatz = 0
wetter_umsatzdaten_kiwo = wetter_umsatzdaten_kiwo[wetter_umsatzdaten_kiwo['Umsatz'] != 0]

# One feature column name per line.
with open("../0_DataPreparation/feature_columns.txt", "r") as f:
    feature_columns = f.read().splitlines()

# Split dataset into training and validation sets (both bounds inclusive)
training_start_date = '2013-07-01'
training_end_date = '2017-07-31'
validation_start_date = '2017-08-01'
validation_end_date = '2018-07-31'

# Compare real timestamps instead of raw strings: a lexicographic comparison
# silently drops end-date rows when 'Datum' carries a time component
# ('2017-07-31 00:00:00' > '2017-07-31' as text) and mis-orders any non-ISO
# format. The parsed dates live in a separate Series so the DataFrame column
# itself keeps its original dtype. Unparseable dates raise here rather than
# being assigned to an arbitrary split.
datum = pd.to_datetime(wetter_umsatzdaten_kiwo['Datum']).dt.normalize()

training_data = wetter_umsatzdaten_kiwo[
    datum.between(pd.Timestamp(training_start_date), pd.Timestamp(training_end_date))
]
validation_data = wetter_umsatzdaten_kiwo[
    datum.between(pd.Timestamp(validation_start_date), pd.Timestamp(validation_end_date))
]


def _to_feature_matrix(frame):
    """Return frame[feature_columns] as a float64 matrix.

    Values that cannot be parsed as numbers, and missing values, become 0.
    """
    return (
        frame[feature_columns]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .to_numpy(dtype=np.float64)
    )


X_train = _to_feature_matrix(training_data)
y_train = training_data['Umsatz'].to_numpy()

X_val = _to_feature_matrix(validation_data)
y_val = validation_data['Umsatz'].to_numpy()


# Normalize features using statistics of the training set only, so no
# information from the validation set leaks into the scaling.
scaler = {
    "mean": np.mean(X_train, axis=0),
    "std": np.std(X_train, axis=0)
}
scaler['std'][scaler['std'] == 0] = 1  # Avoid division by zero for constant columns
X_train = (X_train - scaler['mean']) / scaler['std']
X_val = (X_val - scaler['mean']) / scaler['std']

# Train a regularized linear regression model
def compute_cost_reg(X, y, w, b, lambda_):
    """Return the L2-regularized squared-error cost of a linear model.

    The cost is ``sum((X @ w + b - y)^2) / (2m) + lambda_ * sum(w^2) / (2m)``
    where ``m`` is the number of rows in ``X``. The bias ``b`` is not
    penalized.

    Args:
        X: 2-D feature array with one row per sample.
        y: Target values, one per row of ``X``.
        w: Weight vector, one entry per column of ``X``.
        b: Scalar bias.
        lambda_: Regularization strength.

    Returns:
        The scalar cost.
    """
    n_samples = X.shape[0]
    residuals = np.dot(X, w) + b - y
    data_term = (1 / (2 * n_samples)) * np.sum(residuals ** 2)
    penalty_term = (lambda_ / (2 * n_samples)) * np.sum(w ** 2)
    return data_term + penalty_term

def gradient_descent_reg(X, y, w_in, b_in, alpha, num_iters, lambda_):
    """Fit an L2-regularized linear model with batch gradient descent.

    Each iteration takes one full-batch step on the cost
    ``sum((X @ w + b - y)^2) / (2m) + lambda_ * sum(w^2) / (2m)``;
    the bias is not regularized.

    Args:
        X: 2-D feature array with one row per sample.
        y: Target values, one per row of ``X``.
        w_in: Initial weight vector. It is copied, never modified.
        b_in: Initial scalar bias.
        alpha: Learning rate.
        num_iters: Number of gradient steps to take.
        lambda_: Regularization strength.

    Returns:
        Tuple ``(w, b)`` with the fitted float64 weight vector and the bias.
    """
    m = len(X)
    # Work on a float64 copy: the in-place updates below must not write through
    # to the caller's array, and an integer-dtype w_in could not hold the
    # fractional steps at all.
    w = np.array(w_in, dtype=np.float64)
    b = float(b_in)
    for _ in range(num_iters):
        # Both gradients use the residual of the current (w, b), so compute it once.
        residual = np.dot(X, w) + b - y
        dj_dw = (1 / m) * np.dot(residual, X) + (lambda_ / m) * w
        dj_db = (1 / m) * np.sum(residual)
        w -= alpha * dj_dw
        b -= alpha * dj_db
    return w, b

# Hyperparameter tuning
# Gradient-descent settings shared by every candidate run.
learning_rate = 0.01
num_iterations = 1000
lambda_values = [0.01, 0.1, 1, 10]
best_r_squared = -np.inf
best_lambda = None
best_weights, best_bias = None, None

# Total sum of squares of the validation target; identical for every lambda.
ss_total_val = np.sum((y_val - np.mean(y_val)) ** 2)

for lambda_ in lambda_values:
    w_init = np.zeros(X_train.shape[1])
    b_init = 0
    w_candidate, b_candidate = gradient_descent_reg(
        X_train, y_train, w_init, b_init, learning_rate, num_iterations, lambda_
    )
    y_val_pred = np.dot(X_val, w_candidate) + b_candidate
    r_squared = 1 - (np.sum((y_val - y_val_pred) ** 2) / ss_total_val)

    if r_squared > best_r_squared:
        best_r_squared = r_squared
        best_lambda = lambda_
        best_weights, best_bias = w_candidate, b_candidate

# A NaN R^2 never compares greater than -inf, so best_lambda can stay None
# (e.g. NaNs in the targets or a diverged fit). Fail here with a clear message
# instead of with an opaque TypeError further down.
if best_lambda is None:
    raise RuntimeError(
        "No lambda produced a finite validation R^2; check the data for NaNs "
        "or lower the learning rate."
    )

print(f"Best Lambda: {best_lambda}, Best R^2: {best_r_squared:.4f}")

# Final model with best lambda. Training is deterministic (zero initialisation,
# full-batch updates), so the weights already fitted for best_lambda in the
# loop are exactly what a second training run would produce; reuse them.
w_final, b_final = best_weights, best_bias

# Validate the model
y_val_pred = np.dot(X_val, w_final) + b_final
# Note: this cost includes the regularization penalty, unlike the plain MSE below.
validation_cost = compute_cost_reg(X_val, y_val, w_final, b_final, best_lambda)
mse_val = np.mean((y_val - y_val_pred) ** 2)
mape_val = np.mean(np.abs((y_val - y_val_pred) / y_val)) * 100
r_squared_val = 1 - (np.sum((y_val - y_val_pred) ** 2) / ss_total_val)
print(f"Validation cost: {validation_cost:.4e}")
print(f"Mean Squared Error (MSE) on validation set: {mse_val:.4e}")
print(f"Mean Absolute Percentage Error (MAPE) on validation set: {mape_val:.2f}%")
print(f"R^2 on validation set: {r_squared_val:.4f}")

# Save the best model and scaler (the scaler is needed to normalize new inputs
# the same way before applying the weights)
joblib.dump({"weights": w_final, "bias": b_final, "scaler": scaler}, "best_model.joblib")
print("Best model saved to 'best_model.joblib'")

# Residual visualization
residuals = y_val - y_val_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_val, residuals, alpha=0.6)
plt.axhline(0, color='red', linestyle='--')
plt.title("Residuals Plot")
plt.xlabel("Actual Values")
plt.ylabel("Residuals")
plt.tight_layout()
plt.show()

# Feature importance visualization: the coefficients are on standardized
# features, so their magnitudes are comparable across features.
plt.figure(figsize=(10, 6))
plt.bar(feature_columns, w_final)
plt.title("Feature Importance")
plt.xlabel("Features")
plt.ylabel("Coefficients")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# Output final model parameters
print("\nFinal Model Parameters:")
for feature, coefficient in zip(feature_columns, w_final):
    print(f"Feature: {feature}, Coefficient: {coefficient:.4f}")
print(f"Intercept (b): {b_final:.4f}")


# Use model on test data

In [None]:
# Load the test dataset and sample submission
# NOTE(review): this is the same CSV the training cell reads; presumably it
# also contains the test-period rows keyed by 'id' — confirm.
test_data_path = "../0_DataPreparation/processed_data_imputed.csv"
sample_submission_path = "../0_DataPreparation/sample_submission.csv"
final_submission_path = "final_submission_linear_regression.csv"

test_data = pd.read_csv(test_data_path)
sample_submission = pd.read_csv(sample_submission_path)

# Extract features for prediction (relies on `feature_columns` from the
# training cell); non-numeric and missing values become 0, as in training.
X_test = test_data[feature_columns].apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float64)

# Load the best model
# joblib.load unpickles the file, so only load model files you produced yourself.
model_data = joblib.load("best_model.joblib")
w_final = model_data["weights"]
b_final = model_data["bias"]
scaler = model_data["scaler"]

# Normalize test features with the training-set statistics stored in the model
X_test = (X_test - scaler["mean"]) / scaler["std"]

# Predict `Umsatz` for the test dataset
y_test_pred = np.dot(X_test, w_final) + b_final

# Add predictions to the test dataset
test_data['Predicted_Umsatz'] = y_test_pred

# Merge predictions with sample submission to match structure
final_submission = sample_submission.merge(
    test_data[['id', 'Predicted_Umsatz']], on='id', how='left'
)

# Duplicate ids in the test data would silently multiply submission rows.
if len(final_submission) != len(sample_submission):
    print(
        f"Warning: submission has {len(final_submission)} rows but the sample "
        f"submission has {len(sample_submission)}; check for duplicate ids."
    )

# Submission ids without a matching test row get no prediction. They are still
# filled with 0 below, but report how many so a join problem is not hidden.
missing_predictions = int(final_submission['Predicted_Umsatz'].isna().sum())
if missing_predictions:
    print(f"Warning: {missing_predictions} submission rows have no prediction and are filled with 0.")

# Replace the 'Umsatz' column with the predictions: nulls become 0 and values
# are rounded to 2 decimal places
final_submission['Umsatz'] = final_submission['Predicted_Umsatz'].fillna(0).round(2)

# Drop the helper 'Predicted_Umsatz' column
final_submission = final_submission.drop(columns=['Predicted_Umsatz'])

# Save the final submission file
final_submission.to_csv(final_submission_path, index=False)

print(f"Final submission saved to: {final_submission_path}")
