In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# Load the dataset, parse the "date" column into datetimes and order the rows
# chronologically (with a fresh 0..n-1 index) so that the TimeSeriesSplit
# used further down always trains on earlier rows than it tests on.
df = (
    pd.read_csv("csv_files/final_dataset.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)


# Predictor columns passed to every model.
feature_cols = [
    "steps",
    "calories",
    "resting_hr",
    "average_hr",
    "sleep_hours",
    "sleep_quality",
    "gym_minutes",
    "gym_intensity",
]

# Feature matrix and regression target, both in the same row order as df.
X = df.loc[:, feature_cols]
y = df.loc[:, "study_motivation"]


# Five-fold time-series splitter: each fold tests on rows that come after
# the rows it was trained on, so no future information leaks into training.
tscv = TimeSeriesSplit(n_splits=5)

# scikit-learn scorers follow "higher is better", so the two error metrics
# are requested in their negated form; the sign is flipped back when the
# results are collected.
scoring = dict(
    mae="neg_mean_absolute_error",
    rmse="neg_root_mean_squared_error",
    r2="r2",
)


# Candidate regressors, keyed by the label shown in the results table.
models = {
    # Unregularised baseline.
    "LinearRegression (baseline)": LinearRegression(),
    # L2- and L1-regularised linear models.
    "Ridge (alpha=1.0)": Ridge(alpha=1.0),
    "Lasso (alpha=0.05)": Lasso(alpha=0.05),
    # Tree ensembles; seeds are fixed so repeated runs give the same scores.
    "RandomForest (depth=4)": RandomForestRegressor(
        n_estimators=300,
        max_depth=4,
        random_state=42,
    ),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}


# Cross-validate every candidate and collect one summary row per model.
results = []

for name, model in models.items():
    # The scaler lives inside the pipeline so it is re-fitted on each
    # training fold only; scaling matters for the linear models.
    pipe = Pipeline(steps=[("scaler", StandardScaler()), ("model", model)])
    cv = cross_validate(pipe, X, y, cv=tscv, scoring=scoring)

    # Average each metric over the folds.
    mae, rmse, r2 = (cv[f"test_{key}"].mean() for key in ("mae", "rmse", "r2"))

    # MAE and RMSE come back negated from the scorers; flip the sign so the
    # table shows ordinary (positive) errors.
    results.append({"model": name, "MAE": -mae, "RMSE": -rmse, "R2": r2})

# Rank the models from lowest to highest mean absolute error.
res_df = (
    pd.DataFrame(results)
    .sort_values("MAE")
    .reset_index(drop=True)
)
print(res_df)


# res_df is sorted by MAE with a reset index, so its first row is the winner.
best_name = res_df["model"].iloc[0]
print("\nBest model by MAE:", best_name)

# Refit the winning model, with the same scaling step, on the full dataset.
best_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", models[best_name]),
    ]
)
best_pipe.fit(X, y)
