In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


df = pd.read_csv("../csv_files/final_dataset.csv")
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)


feature_cols = [
    "steps","calories","resting_hr","average_hr",
    "sleep_hours","sleep_quality","gym_minutes","gym_intensity"
]
X = df[feature_cols]
y = df["study_motivation"]


tscv = TimeSeriesSplit(n_splits=5)

scoring = {
    "mae": "neg_mean_absolute_error",
    "rmse": "neg_root_mean_squared_error",
    "r2": "r2"
}


models = {
    "LinearRegression (baseline)": LinearRegression(),
    "Ridge (alpha=1.0)": Ridge(alpha=1.0),
    "Lasso (alpha=0.05)": Lasso(alpha=0.05),
    "RandomForest (depth=4)": RandomForestRegressor(
        n_estimators=300, random_state=42, max_depth=4
    ),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}


results = []

for name, model in models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),   # linear modeller için önemli
        ("model", model)
    ])
    cv = cross_validate(pipe, X, y, cv=tscv, scoring=scoring)

    results.append({
        "model": name,
        "MAE": -cv["test_mae"].mean(),
        "RMSE": -cv["test_rmse"].mean(),
        "R2": cv["test_r2"].mean()
    })

res_df = pd.DataFrame(results).sort_values("MAE").reset_index(drop=True)
print(res_df)


best_name = res_df.loc[0, "model"]
print("\nBest model by MAE:", best_name)

best_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", models[best_name])
])
best_pipe.fit(X, y)


                         model       MAE      RMSE        R2
0           Lasso (alpha=0.05)  0.626312  0.761949 -0.573915
1  LinearRegression (baseline)  0.655380  0.747011 -0.835685
2       RandomForest (depth=4)  0.674499  0.805405 -0.783792
3             GradientBoosting  0.704461  0.856336 -1.191685
4            Ridge (alpha=1.0)  0.732368  0.909770 -1.223976

Best model by MAE: Lasso (alpha=0.05)


In [16]:
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor(strategy="mean")

dummy_pipe = Pipeline([
    ("scaler", StandardScaler()),  
    ("model", dummy)
])

cv_dummy = cross_validate(dummy_pipe, X, y, cv=tscv, scoring=scoring)

dummy_results = {
    "model": "DummyMean (naive baseline)",
    "MAE": -cv_dummy["test_mae"].mean(),
    "RMSE": -cv_dummy["test_rmse"].mean(),
    "R2": cv_dummy["test_r2"].mean()
}

print(dummy_results)


{'model': 'DummyMean (naive baseline)', 'MAE': np.float64(0.7883333333333333), 'RMSE': np.float64(0.8907933566904285), 'R2': np.float64(-0.880705387205387)}


In [17]:


import pandas as pd

lasso_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Lasso(alpha=0.05))
])

lasso_pipe.fit(X, y)

coef = lasso_pipe.named_steps["model"].coef_

coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": coef,
    "abs_coef": abs(coef)
}).sort_values("abs_coef", ascending=False).reset_index(drop=True)

print(coef_df)


         feature      coef  abs_coef
0     average_hr  0.717068  0.717068
1    gym_minutes  0.415107  0.415107
2          steps  0.000000  0.000000
3       calories  0.000000  0.000000
4     resting_hr -0.000000  0.000000
5    sleep_hours  0.000000  0.000000
6  sleep_quality  0.000000  0.000000
7  gym_intensity  0.000000  0.000000


In [18]:
top5 = coef_df.head(5)
print("Top 5 features by |coef|:")
print(top5[["feature", "coef"]])


Top 5 features by |coef|:
       feature      coef
0   average_hr  0.717068
1  gym_minutes  0.415107
2        steps  0.000000
3     calories  0.000000
4   resting_hr -0.000000


In [19]:
ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0))
])
ridge_pipe.fit(X, y)

ridge_coef = ridge_pipe.named_steps["model"].coef_
ridge_coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": ridge_coef,
    "abs_coef": abs(ridge_coef)
}).sort_values("abs_coef", ascending=False).reset_index(drop=True)

print(ridge_coef_df)


         feature      coef  abs_coef
0     average_hr  0.945441  0.945441
1    gym_minutes  0.397539  0.397539
2       calories  0.257687  0.257687
3    sleep_hours  0.169001  0.169001
4  sleep_quality -0.099132  0.099132
5  gym_intensity -0.056166  0.056166
6          steps -0.043687  0.043687
7     resting_hr  0.006186  0.006186


In [20]:
lasso2 = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Lasso(alpha=0.01))
])
lasso2.fit(X, y)

coef2 = lasso2.named_steps["model"].coef_
coef2_df = pd.DataFrame({"feature": feature_cols, "coef": coef2, "abs_coef": abs(coef2)}) \
            .sort_values("abs_coef", ascending=False).reset_index(drop=True)
print(coef2_df)



         feature      coef  abs_coef
0     average_hr  1.175421  1.175421
1    gym_minutes  0.269018  0.269018
2       calories  0.264755  0.264755
3    sleep_hours  0.247551  0.247551
4     resting_hr -0.027672  0.027672
5          steps -0.000000  0.000000
6  sleep_quality -0.000000  0.000000
7  gym_intensity -0.000000  0.000000
