In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

# Cell 2: Load Cleaned Data
data_path = "data/processed/cleaned_data.csv"
df = pd.read_csv(data_path, parse_dates=["Date"])
print("Cleaned data loaded:")
print(df.columns.tolist())

# Cell 3: Add Time-Based Features
df["Hour"] = df["Date"].dt.hour
df["DayOfWeek"] = df["Date"].dt.dayofweek
df["Month"] = df["Date"].dt.month

# Cell 4: Feature Selection
features = ["Temperature", "Humidity", "WindSpeed", "Hour", "DayOfWeek", "Month"]
target = "Demand"

X = df[features]
y = df[target]

# Cell 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cell 6: Train Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
print(f"Linear Regression RMSE: {lr_rmse:.2f}")

# Save Linear Model
joblib.dump(lr_model, "models/lr_model.pkl")

# Cell 7: Train XGBoost
xgb_model = XGBRegressor(random_state=42, n_estimators=100)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_preds))
print(f"XGBoost RMSE: {xgb_rmse:.2f}")

# Save XGBoost Model
joblib.dump(xgb_model, "models/xgb_model.pkl")

# Done
print("Models trained and saved to 'models/' folder.")

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/cleaned_data.csv'