
# Predicting Work-Life Balance Score from Lifestyle

Dataset: [Lifestyle and Wellbeing Data (Kaggle)](https://www.kaggle.com/datasets/ydalat/lifestyle-and-wellbeing-data)


In [1]:

import os, json, math, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import lightgbm as lgb
import shap
import matplotlib.pyplot as plt



In [5]:
# Path to your CSV (change this to your local file path)
CSV_PATH = "Wellbeing_and_lifestyle_data_Kaggle.csv"  # <-- change to your actual CSV path




In [6]:
df = pd.read_csv(CSV_PATH)
df

Unnamed: 0,Timestamp,FRUITS_VEGGIES,DAILY_STRESS,PLACES_VISITED,CORE_CIRCLE,SUPPORTING_OTHERS,SOCIAL_NETWORK,ACHIEVEMENT,DONATION,BMI_RANGE,...,SLEEP_HOURS,LOST_VACATION,DAILY_SHOUTING,SUFFICIENT_INCOME,PERSONAL_AWARDS,TIME_FOR_PASSION,WEEKLY_MEDITATION,AGE,GENDER,WORK_LIFE_BALANCE_SCORE
0,7/7/15,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,7/7/15,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,7/7/15,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,7/7/15,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,7/7/15,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15967,3/14/21 5:42,3,3,0,4,0,10,0,4,2,...,7,0,1,1,5,2,5,51 or more,Female,644.5
15968,3/14/21 6:30,3,3,6,8,7,4,6,3,1,...,6,0,0,2,10,5,8,21 to 35,Female,714.9
15969,3/14/21 8:35,4,3,0,10,10,8,6,5,1,...,7,0,1,2,10,1,10,21 to 35,Male,716.6
15970,3/14/21 8:43,1,1,10,8,2,7,3,2,1,...,8,7,2,2,1,6,8,21 to 35,Female,682.0


In [9]:
assert df.isna().sum().sum() == 0, "Data contains missing values!"

In [11]:
df.columns = [col.lower() for col in df.columns]

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15972 entries, 0 to 15971
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                15972 non-null  object 
 1   fruits_veggies           15972 non-null  int64  
 2   daily_stress             15972 non-null  object 
 3   places_visited           15972 non-null  int64  
 4   core_circle              15972 non-null  int64  
 5   supporting_others        15972 non-null  int64  
 6   social_network           15972 non-null  int64  
 7   achievement              15972 non-null  int64  
 8   donation                 15972 non-null  int64  
 9   bmi_range                15972 non-null  int64  
 10  todo_completed           15972 non-null  int64  
 11  flow                     15972 non-null  int64  
 12  daily_steps              15972 non-null  int64  
 13  live_vision              15972 non-null  int64  
 14  sleep_hours           

In [13]:
df.describe()

Unnamed: 0,fruits_veggies,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,todo_completed,flow,daily_steps,live_vision,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,work_life_balance_score
count,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0,15972.0
mean,2.922677,5.23297,5.508077,5.616454,6.474267,4.000751,2.715314,1.410656,5.745993,3.194778,5.703606,3.752129,7.042888,2.898886,2.930879,1.728963,5.711558,3.326572,6.233346,666.751503
std,1.442694,3.311912,2.840334,3.242021,3.086672,2.755837,1.851586,0.491968,2.624097,2.357518,2.891013,3.230987,1.199044,3.69218,2.676301,0.444509,3.08963,2.729293,3.016571,45.019868
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,480.0
25%,2.0,2.0,3.0,3.0,4.0,2.0,1.0,1.0,4.0,1.0,3.0,1.0,6.0,0.0,1.0,1.0,3.0,1.0,4.0,636.0
50%,3.0,5.0,5.0,5.0,6.0,3.0,3.0,1.0,6.0,3.0,5.0,3.0,7.0,0.0,2.0,2.0,5.0,3.0,7.0,667.7
75%,4.0,8.0,8.0,10.0,10.0,6.0,5.0,2.0,8.0,5.0,8.0,5.0,8.0,5.0,4.0,2.0,9.0,5.0,10.0,698.5
max,5.0,10.0,10.0,10.0,10.0,10.0,5.0,2.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,2.0,10.0,10.0,10.0,820.2


## Pandas profiling

In [15]:
!uv pip install ydata_profiling

[2mUsing Python 3.11.13 environment at: c:\Users\hayk_\OneDrive\Desktop\01_python_math_ml_course\ma[0m
[2mResolved [1m50 packages[0m [2min 1.74s[0m[0m
[36m[1mDownloading[0m[39m matplotlib [2m(7.7MiB)[0m
[36m[1mDownloading[0m[39m minify-html [2m(3.0MiB)[0m
[36m[1mDownloading[0m[39m numba [2m(2.7MiB)[0m
[36m[1mDownloading[0m[39m llvmlite [2m(28.9MiB)[0m
[36m[1mDownloading[0m[39m pywavelets [2m(4.0MiB)[0m
[36m[1mDownloading[0m[39m statsmodels [2m(9.2MiB)[0m
 [32m[1mDownloading[0m[39m numba
 [32m[1mDownloading[0m[39m minify-html
 [32m[1mDownloading[0m[39m pywavelets
 [32m[1mDownloading[0m[39m matplotlib
 [32m[1mDownloading[0m[39m statsmodels
 [32m[1mDownloading[0m[39m llvmlite
[2mPrepared [1m16 packages[0m [2min 36.19s[0m[0m
[2mUninstalled [1m3 packages[0m [2min 471ms[0m[0m
[2mInstalled [1m18 packages[0m [2min 618ms[0m[0m
 [32m+[39m [1mdacite[0m[2m==1.9.2[0m
 [32m+[39m [1mfiletype[0m[2m==1.2.0

In [16]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Health Data Profiling Report", explorative=True)

profile.to_file("health_data_profiling_report.html")

100%|██████████| 24/24 [00:01<00:00, 12.12it/s]2<00:00,  5.63it/s, Describe variable: work_life_balance_score]
Summarize dataset: 100%|██████████| 357/357 [00:51<00:00,  6.98it/s, Completed]                                               
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.22s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 10.74it/s]


In [18]:
df[df.duplicated()]

Unnamed: 0,timestamp,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,...,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,age,gender,work_life_balance_score
77,7/20/15,5,2,10,1,4,5,0,5,1,...,8,0,6,2,10,1,6,36 to 50,Female,694.0
82,7/20/15,0,5,0,2,0,0,3,0,2,...,6,10,10,1,2,1,10,21 to 35,Male,538.8
89,7/20/15,5,0,8,3,10,7,6,3,1,...,8,0,1,2,10,5,10,36 to 50,Female,749.7
115,7/23/15,5,2,10,7,4,3,5,2,1,...,9,8,3,2,10,5,10,Less than 20,Female,709.4
120,7/24/15,1,2,6,8,4,5,2,1,1,...,10,0,3,2,1,2,4,21 to 35,Female,665.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10475,3/14/19 3:28,3,3,6,9,6,7,8,3,1,...,6,4,6,1,8,7,10,Less than 20,Female,696.6
12414,1/18/20 7:59,2,4,5,3,3,4,0,5,2,...,6,0,9,1,1,2,2,21 to 35,Female,594.6
12758,2/25/20 3:12,4,5,3,6,4,10,1,5,2,...,7,4,7,1,2,1,2,36 to 50,Female,600.4
14014,7/20/20 18:02,5,3,6,3,6,10,6,2,1,...,7,10,3,1,6,2,4,21 to 35,Male,658.1


In [19]:
len(df[df.duplicated()]) / len(df)

0.030177811169546705

In [20]:
df.drop_duplicates(inplace=True)

In [23]:
df["timestamp"].value_counts()

timestamp
7/23/18          162
3/22/16          136
5/29/18          112
8/13/18           48
9/7/15            39
                ... 
11/5/19 8:21       1
11/5/19 0:27       1
11/4/19 23:20      1
11/4/19 19:32      1
3/14/21 9:03       1
Name: count, Length: 7002, dtype: int64

In [24]:
COLS_TO_DROP = ["timestamp"]

# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError once "timestamp" has
# been dropped.
df = df.drop(columns=COLS_TO_DROP, errors="ignore")

In [29]:
# Representative numeric age for each survey bracket: the closed brackets use
# their midpoint, the two open-ended brackets use hand-picked values (18, 65).
age_mapping = {
    "Less than 20": 18,
    "21 to 35": (21 + 35) / 2,
    "36 to 50": (36 + 50) / 2,
    "51 or more": 65
}

# Only map while the column still holds the bracket labels: mapping an already
# numeric column would silently turn every value into NaN when the cell is re-run.
if not pd.api.types.is_numeric_dtype(df["age"]):
    age_numeric = df["age"].map(age_mapping)
    # Fail loudly on labels missing from the mapping instead of producing NaN.
    unmapped = df.loc[age_numeric.isna(), "age"].unique()
    assert len(unmapped) == 0, f"Unmapped age categories: {list(unmapped)}"
    df["age"] = age_numeric

In [30]:
df.corr()

ValueError: could not convert string to float: '1/1/00'

In [27]:
df.columns

Index(['fruits_veggies', 'daily_stress', 'places_visited', 'core_circle',
       'supporting_others', 'social_network', 'achievement', 'donation',
       'bmi_range', 'todo_completed', 'flow', 'daily_steps', 'live_vision',
       'sleep_hours', 'lost_vacation', 'daily_shouting', 'sufficient_income',
       'personal_awards', 'time_for_passion', 'weekly_meditation', 'age',
       'gender', 'work_life_balance_score'],
      dtype='object')

In [31]:
val = "1/1/00"

# Look for the offending literal, checking only the object-typed columns.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    if val in df[col].unique():
        print(f"Column '{col}' contains the value '{val}'")

Column 'daily_stress' contains the value '1/1/00'


In [33]:
df["daily_stress"].value_counts()

daily_stress
3         4230
2         3302
4         2903
1         2409
5         1987
0          658
1/1/00       1
Name: count, dtype: int64

In [None]:
df[df["daily_stress"] == "1/1/00"]

In [35]:
# Drop the corrupted "1/1/00" row, then restore a numeric dtype for the column
# (df.info() above reports it as `object`).
# `.copy()` makes `df` an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[df["daily_stress"] != "1/1/00"].copy()
df["daily_stress"] = pd.to_numeric(df["daily_stress"])

In [36]:
df.corr()

ValueError: could not convert string to float: 'Female'

In [39]:
# Encode gender as a binary indicator (Male = 1, Female = 0).
gender_mapping = {"Male": 1, "Female": 0}

# Guard against re-runs: mapping an already-encoded column would turn every
# value into NaN. `assign` builds a new frame, so this is also safe when `df`
# is a filtered slice of an earlier frame.
if not pd.api.types.is_numeric_dtype(df["gender"]):
    gender_encoded = df["gender"].map(gender_mapping)
    # Fail loudly on labels missing from the mapping instead of producing NaN.
    unmapped = df.loc[gender_encoded.isna(), "gender"].unique()
    assert len(unmapped) == 0, f"Unmapped gender categories: {list(unmapped)}"
    df = df.assign(gender=gender_encoded.astype(int))

In [49]:
corr_type = "pearson"  # Options: 'pearson', 'spearman', 'kendall'

corr_matrix = df.corr(method=corr_type)

# Zero the diagonal (self-correlation is always 1 and would dominate the colour
# scale). Work on an explicit copy: writing through `.values` depends on pandas
# handing back a writable view, which is not guaranteed.
corr_values = corr_matrix.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
corr_matrix = pd.DataFrame(corr_values, index=corr_matrix.index, columns=corr_matrix.columns)

fig, ax = plt.subplots(figsize=(10, 8))
# Correlations live in [-1, 1]; a symmetric range keeps negative correlations
# visible (vmin=0 would paint all of them with the same colour).
im = ax.imshow(corr_values, cmap='coolwarm', vmin=-1, vmax=1)
ax.set_xticks(range(len(corr_matrix.columns)))
ax.set_xticklabels(corr_matrix.columns, rotation=90)
ax.set_yticks(range(len(corr_matrix.index)))
ax.set_yticklabels(corr_matrix.index)
ax.set_title(f"{corr_type.title()} correlation (diagonal zeroed)")
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()

## 2) Train/test split + simple preprocessing (impute + scale numerics; one-hot categoricals)

In [51]:
df.columns

Index(['fruits_veggies', 'daily_stress', 'places_visited', 'core_circle',
       'supporting_others', 'social_network', 'achievement', 'donation',
       'bmi_range', 'todo_completed', 'flow', 'daily_steps', 'live_vision',
       'sleep_hours', 'lost_vacation', 'daily_shouting', 'sufficient_income',
       'personal_awards', 'time_for_passion', 'weekly_meditation', 'age',
       'gender', 'work_life_balance_score'],
      dtype='object')

### Target

In [52]:
# Name of the regression target column. `target_col` is the name printed by the
# SHAP cells further down; `y` is kept as an alias for the cells that use it.
# NOTE(review): if `target_col` is also defined in a cell not shown here, make
# sure both definitions agree.
target_col = "work_life_balance_score"
y = target_col

ax = df[target_col].hist()
ax.set(title=f"Distribution of {target_col}", xlabel=target_col, ylabel="count");

<Axes: >

In [53]:
# One scatter plot per feature, each against the target column named by `y`.
feature_cols = [col for col in df.columns if col != y]
for col in feature_cols:
    fig, ax = plt.subplots()
    ax.scatter(df[col], df[y])
    ax.set_xlabel(col)
    ax.set_ylabel(y)
    ax.set_title(f"{col} vs {y}")
    plt.show()

## 3) Preprocessing and Models: Lasso (linear) and LightGBM (nonlinear)

In [None]:


# NOTE(review): X_train, X_test, y_train, X_train_cat, X_test_cat, numeric_cols,
# categorical_cols, num_imputer, cat_imputer and RANDOM_STATE are used below but
# are not defined in the visible part of this notebook — confirm the cell that
# creates them runs before this one.

# One-hot encoder for the categorical features: fitted on the training
# categoricals only, then applied unchanged to the test categoricals.
# min_frequency=0.01 groups rare categories together; handle_unknown="ignore"
# avoids errors on categories unseen during fit; sparse_output=False returns
# dense arrays so they can be stacked with np.hstack below.
encoder = OneHotEncoder(handle_unknown="ignore", min_frequency=0.01, sparse_output=False)
X_train_cat_encoded = encoder.fit_transform(X_train_cat)
X_test_cat_encoded = encoder.transform(X_test_cat)


# Preprocessing for LightGBM (no scaling needed, but needs encoding)
# 1. Impute numeric columns (use same imputer as before)
X_train_num_lgb = num_imputer.transform(X_train[numeric_cols])
X_test_num_lgb = num_imputer.transform(X_test[numeric_cols])

# 2. Impute and encode categorical (use same encoder as before)
X_train_cat_lgb = cat_imputer.transform(X_train[categorical_cols])
X_test_cat_lgb = cat_imputer.transform(X_test[categorical_cols])

X_train_cat_encoded_lgb = encoder.transform(X_train_cat_lgb)
X_test_cat_encoded_lgb = encoder.transform(X_test_cat_lgb)

# 3. Combine for LightGBM: numeric block first, one-hot block second
X_train_lgb = np.hstack([X_train_num_lgb, X_train_cat_encoded_lgb])
X_test_lgb = np.hstack([X_test_num_lgb, X_test_cat_encoded_lgb])

print(f"LightGBM features shape: {X_train_lgb.shape}")

# Train LightGBM (verbose=-1 silences LightGBM's training log)
lgb_model = lgb.LGBMRegressor(
    learning_rate=0.05, n_estimators=400, random_state=RANDOM_STATE,
    verbose=-1
)
lgb_model.fit(X_train_lgb, y_train)

# Evaluation function
def eval_model(model, X_tr, y_tr, X_te, y_te, label):
    """Print and return train/test MAE, RMSE and R² for a fitted regressor.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_tr, y_tr: Training features and targets.
        X_te, y_te: Test features and targets.
        label: Heading printed above the metrics.

    Returns:
        dict mapping ``"MAE_train"``, ``"RMSE_train"``, ``"R2_train"``,
        ``"MAE_test"``, ``"RMSE_test"`` and ``"R2_test"`` to their values.
    """
    pred_tr = model.predict(X_tr)
    pred_te = model.predict(X_te)
    metrics = {
        "MAE_train": mean_absolute_error(y_tr, pred_tr),
        # RMSE is computed as sqrt(MSE): the `squared=False` argument was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        "RMSE_train": float(np.sqrt(mean_squared_error(y_tr, pred_tr))),
        "R2_train": r2_score(y_tr, pred_tr),
        "MAE_test": mean_absolute_error(y_te, pred_te),
        "RMSE_test": float(np.sqrt(mean_squared_error(y_te, pred_te))),
        "R2_test": r2_score(y_te, pred_te),
    }
    print(f"\n== {label} ==")
    for k,v in metrics.items():
        print(f"{k:>10}: {v:8.4f}")
    return metrics

m_lin = eval_model(lasso_model, X_train_lasso, y_train, X_test_lasso, y_test, "Lasso")
m_tree = eval_model(lgb_model, X_train_lgb, y_train, X_test_lgb, y_test, "LightGBM")



## 4) SHAP explanations (local + simple global view)
- We'll explain the **LightGBM model** prediction for one test row.
- For background data, we sample 200 clones of the row with light noise on numeric columns.


In [None]:

# Choose the test instance to explain: the first row of X_test.
# The double brackets keep x_row a one-row DataFrame rather than a Series.
idx = X_test.index[0]
x_row = X_test.loc[[idx]]

def preprocess_and_predict(raw_df):
    """Apply the LightGBM preprocessing to raw rows and return the predictions.

    Numeric columns go through ``num_imputer``; categorical columns go through
    ``cat_imputer`` and then ``encoder``. The two blocks are stacked side by
    side (numeric first) and passed to ``lgb_model.predict``.
    """
    numeric_part = num_imputer.transform(raw_df[numeric_cols])
    categorical_part = encoder.transform(
        cat_imputer.transform(raw_df[categorical_cols])
    )
    features = np.hstack([numeric_part, categorical_part])
    return lgb_model.predict(features)

# Background data for the explainer: 200 copies of the explained row, with
# Gaussian noise (5% of each column's training std) added to numeric columns.
# A seeded generator keeps the background — and therefore the SHAP values —
# identical from run to run.
rng = np.random.default_rng(RANDOM_STATE)
bg = pd.concat([x_row]*200, ignore_index=True)
for c in numeric_cols:
    if c in bg.columns:
        # Floor the std so a constant column still gets a (tiny) positive scale.
        sd = max(1e-6, X_train[c].std(skipna=True))
        bg[c] = bg[c] + rng.normal(scale=0.05*sd, size=len(bg))

# The explainer is given plain arrays, so the wrapper rebuilds a DataFrame with
# the original column names before calling the preprocessing + model function.
explainer = shap.Explainer(lambda A: preprocess_and_predict(pd.DataFrame(A, columns=X_test.columns)), bg.values, algorithm="auto")
sv = explainer(x_row.values)

pred_val = float(preprocess_and_predict(x_row)[0])
print(f"Explaining row {idx} — predicted {pred_val:.3f} for target '{target_col}'")

# Rank features by absolute SHAP value and keep the ten largest.
shap_values = sv.values.reshape(-1)
top_idx = np.argsort(-np.abs(shap_values))[:10]
top_feats = np.array(X_test.columns)[top_idx]
top_vals = np.array(x_row.iloc[0])[top_idx]
top_shap = shap_values[top_idx]

display(pd.DataFrame({"feature": top_feats, "value": top_vals, "shap": top_shap}))

plt.figure(figsize=(7,4))
# Reversed so the largest contribution is drawn at the top of the chart.
plt.barh(range(len(top_feats)), top_shap[::-1])
plt.yticks(range(len(top_feats)), top_feats[::-1])
plt.xlabel("SHAP value")
plt.title("Top SHAP contributions (single row)")
plt.tight_layout()
plt.show()


In [None]:

# Choose the test instance to explain: the first row of X_test.
# The double brackets keep x_row a one-row DataFrame rather than a Series.
# NOTE(review): this cell repeats the previous SHAP cell with `pipe_tree` as the
# model — consider keeping only one of the two.
idx = X_test.index[0]
x_row = X_test.loc[[idx]]

def predict_from_raw(raw_df):
    """Return ``pipe_tree`` predictions for a raw (unprocessed) feature frame.

    NOTE(review): ``pipe_tree`` is not defined in the visible part of this
    notebook — confirm it exists before running this cell.
    """
    predictions = pipe_tree.predict(raw_df)
    return predictions

# Background data for the explainer: 200 copies of the explained row, with
# Gaussian noise (5% of each column's training std) added to numeric columns.
# A seeded generator keeps the background — and therefore the SHAP values —
# identical from run to run.
rng = np.random.default_rng(RANDOM_STATE)
bg = pd.concat([x_row]*200, ignore_index=True)
for c in numeric_cols:
    if c in bg.columns:
        # Floor the std so a constant column still gets a (tiny) positive scale.
        sd = max(1e-6, X_train[c].std(skipna=True))
        bg[c] = bg[c] + rng.normal(scale=0.05*sd, size=len(bg))

# The explainer is given plain arrays, so the wrapper rebuilds a DataFrame with
# the original column names before calling the prediction function.
explainer = shap.Explainer(lambda A: predict_from_raw(pd.DataFrame(A, columns=X_test.columns)), bg.values, algorithm="auto")
sv = explainer(x_row.values)

pred_val = float(predict_from_raw(x_row)[0])
print(f"Explaining row {idx} — predicted {pred_val:.3f} for target '{target_col}'")

# Rank features by absolute SHAP value and keep the ten largest.
shap_values = sv.values.reshape(-1)
top_idx = np.argsort(-np.abs(shap_values))[:10]
top_feats = np.array(X_test.columns)[top_idx]
top_vals = np.array(x_row.iloc[0])[top_idx]
top_shap = shap_values[top_idx]

display(pd.DataFrame({"feature": top_feats, "value": top_vals, "shap": top_shap}))

plt.figure(figsize=(7,4))
# Reversed so the largest contribution is drawn at the top of the chart.
plt.barh(range(len(top_feats)), top_shap[::-1])
plt.yticks(range(len(top_feats)), top_feats[::-1])
plt.xlabel("SHAP value")
plt.title("Top SHAP contributions (single row)")
plt.tight_layout()
plt.show()



## 5) Minimal counterfactual: greedy tweaks within bounds
- Goal: move prediction by `CF_TARGET_DELTA` (default -5)
- Only moves **numeric** features within 5th–95th percentile ranges
- At each step, try a small step on each actionable feature, pick the best improvement


In [None]:

def numeric_bounds(df, cols):
    """Return the 5th–95th percentile range of each usable numeric column.

    Values are coerced to numbers first (non-numeric entries become NaN).
    A column is skipped when it has no numeric values, when either percentile
    is not finite, or when the upper percentile does not exceed the lower one.

    Returns:
        dict mapping column name to ``{"min": float, "max": float}``.
    """
    limits = {}
    for name in cols:
        values = pd.to_numeric(df[name], errors="coerce")
        if values.notna().sum() <= 0:
            continue
        low = values.quantile(0.05)
        high = values.quantile(0.95)
        if not (np.isfinite(low) and np.isfinite(high) and high > low):
            continue
        limits[name] = {"min": float(low), "max": float(high)}
    return limits

# Per-feature 5th–95th percentile limits, computed from the training features.
BOUNDS = numeric_bounds(X_train, numeric_cols)

def unit_effect(predict_func, row_df, feat, step):
    """Finite-difference effect on the prediction per unit change of one feature.

    Args:
        predict_func: Function taking a DataFrame and returning predictions.
        row_df: Single-row DataFrame; it is not modified.
        feat: Name of the column to perturb.
        step: Amount added to the feature value.

    Returns:
        ``(prediction_after - prediction_before) / step``; when ``step`` is 0
        the divisor is 1.0, so the result is 0.0 for a deterministic predictor.
    """
    base = float(predict_func(row_df)[0])
    x1 = row_df.copy()
    # Cast before writing: putting a fractional value into an integer column
    # is deprecated since pandas 2.1.
    x1[feat] = x1[feat].astype(float)
    col = x1.columns.get_loc(feat)
    x1.iloc[0, col] = x1.iloc[0, col] + step
    newp = float(predict_func(x1)[0])
    return (newp - base) / (step if step!=0 else 1.0)

def greedy_counterfactual(predict_func, row_df, target_delta=-5.0, max_iters=80, step_fraction=0.15,
                          actionable=None, bounds=None):
    """
    Find a counterfactual by greedily adjusting one feature at a time.

    At every iteration each actionable feature is tried one step down and one
    step up (clipped to its bounds). The single move that shifts the prediction
    furthest towards the goal is applied. The search stops when the goal is
    reached, when no move improves the prediction, or after ``max_iters`` moves.

    Args:
        predict_func: Function that takes a raw dataframe and returns predictions
        row_df: Single row dataframe to modify (a copy is modified, not the input)
        target_delta: Desired change in prediction (negative = lower it)
        max_iters: Maximum iterations
        step_fraction: Step size as fraction of feature range (at least 1% of it)
        actionable: List of features that can be modified. When None, the
            module-level ``LIKELY_ACTIONABLE_NUMERIC`` list is used if it exists,
            falling back to every column that has bounds.
        bounds: Dictionary of min/max bounds for features; features without
            bounds are never modified.

    Returns:
        Tuple ``(row, actions, start_pred, end_pred)`` where ``row`` is the
        modified single-row dataframe and ``actions`` is a list of dicts with
        keys ``"feature"``, ``"from"``, ``"to"`` and ``"pred_after"``.
    """
    bounds = bounds or {}
    xw = row_df.copy()
    start = float(predict_func(xw)[0])
    goal = start + target_delta
    # -1 when the prediction has to go down, +1 when it has to go up.
    sign = -1.0 if target_delta < 0 else 1.0
    actions = []

    if actionable is None:
        # Looked up lazily so the function still works when the constant is absent.
        preferred = globals().get("LIKELY_ACTIONABLE_NUMERIC", ())
        actionable = [c for c in preferred if c in xw.columns and c in bounds]
        if not actionable:
            actionable = [c for c in xw.columns if c in bounds]
    usable = [f for f in actionable if f in xw.columns and f in bounds]

    # Steps are fractional, so make the columns float before writing into them
    # (assigning a float into an integer column is deprecated in pandas >= 2.1).
    for f in usable:
        xw[f] = xw[f].astype(float)

    cur = start
    for _ in range(max_iters):
        # Goal reached (or overshot) in the requested direction.
        if sign * (cur - goal) >= 0:
            break

        best = None
        for f in usable:
            lo, hi = bounds[f]["min"], bounds[f]["max"]
            rng = hi - lo
            if rng <= 0:
                continue
            curv = float(xw.iloc[0][f])
            step = max(0.01*rng, step_fraction*rng)
            col = xw.columns.get_loc(f)
            # A feature may help by going down or up, so try both directions.
            for direction in (-1.0, 1.0):
                trial = float(np.clip(curv + direction*step, lo, hi))
                if abs(trial - curv) < 1e-9:
                    continue
                x_try = xw.copy()
                x_try.iloc[0, col] = trial
                pred = float(predict_func(x_try)[0])
                # Gain is the actual prediction change towards the goal.
                improve = sign * (pred - cur)
                if improve > 0 and (best is None or improve > best["improve"]):
                    best = {"feat": f, "trial": trial, "curv": curv,
                            "improve": improve, "pred": pred}

        # No candidate moves the prediction in the right direction.
        if best is None:
            break

        f = best["feat"]
        xw.iloc[0, xw.columns.get_loc(f)] = best["trial"]
        cur = best["pred"]
        actions.append({"feature": f, "from": best["curv"], "to": float(best["trial"]), "pred_after": cur})

    return xw, actions, start, float(predict_func(xw)[0])

# Search for a counterfactual for the row held in x_row, scoring candidates
# with the LightGBM prediction function.
x_new, actions, p0, p1 = greedy_counterfactual(
    preprocess_and_predict,
    x_row.copy(),
    target_delta=CF_TARGET_DELTA,
    max_iters=CF_MAX_ITERS,
    step_fraction=CF_STEP_FRACTION,
    bounds=BOUNDS,
)

print(f"CF start {p0:.2f} → end {p1:.2f} (Δ {p1-p0:+.2f})")
if not actions:
    print("No feasible changes within bounds. Try smaller |target_delta| or adjust step_fraction.")
else:
    # One line per applied move, in the order the moves were made.
    for a in actions:
        print(f"- {a['feature']}: {a['from']:.3f} → {a['to']:.3f}   (pred → {a['pred_after']:.2f})")



## 6) Caveats
- This is **correlational**, not causal or medical advice.
- Counterfactuals are limited to numeric features within the empirical 5th–95th percentiles.
- For reproducibility across machines, set `RANDOM_STATE` and consider saving pipelines if needed.
