In [4]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset; the "timestamp" column is parsed into datetimes on read.
# NOTE(review): pandas' default NA strings include "None", which is also one
# of the exercise_freq category labels used further down -- confirm whether
# those cells are being read as missing values here.
df = pd.read_csv(
    "data/mindpulse_synthetic.csv",
    parse_dates=["timestamp"],
)

# ---------------------------
# 1. Feature Engineering
# ---------------------------

def coping_count(s):
    """Count the comma-separated entries in a list-like value.

    The value is converted to text, surrounding square brackets and spaces
    are stripped, and the non-blank comma-separated pieces are counted, so
    "[a, b]" gives 2 and "[]" gives 0.

    Parameters
    ----------
    s : object
        Cell value to inspect (typically a string such as "['x', 'y']").

    Returns
    -------
    int
        Number of non-blank entries; 0 for missing values (None / NaN),
        empty input, or values that cannot be converted to text.
    """
    # Missing cells arrive as None or float NaN. Converting them with str()
    # would yield "None" / "nan", which would be miscounted as one entry.
    if s is None or (isinstance(s, float) and s != s):
        return 0

    try:
        text = str(s).strip("[] ")
    except Exception:
        # Best effort: an object that cannot be rendered as text counts as
        # empty. Unlike a bare `except`, this lets KeyboardInterrupt through.
        return 0

    if not text:
        return 0
    return sum(1 for piece in text.split(",") if piece.strip())

df["coping_count"] = df["coping_methods"].apply(coping_count)


def _clean_label(series, drop_non_ascii=False):
    """Return `series` as whitespace-stripped text, keeping missing entries as NaN.

    Parameters
    ----------
    series : pandas.Series
        Column of raw category labels.
    drop_non_ascii : bool, default False
        When True, every non-ASCII character is removed before stripping.
        This removes emoji markers whether they arrive as real emoji or as
        mis-decoded (mojibake) byte sequences.

    Returns
    -------
    pandas.Series
        Cleaned labels on the same index. Entries that were missing in
        `series` stay NaN rather than becoming the literal string "nan".
    """
    text = series.astype(str)
    if drop_non_ascii:
        text = text.str.replace(r"[^\x00-\x7F]+", "", regex=True)
    text = text.str.strip()
    # astype(str) turned NaN into "nan"; restore real NaN so a downstream
    # imputer can recognise those rows as missing.
    return text.where(series.notna(), np.nan)


# Clean diet & WLB to remove emoji markers in front of the label
for col in ["diet_quality", "work_life_balance"]:
    df[col] = _clean_label(df[col], drop_non_ascii=True)

df["exercise_freq"] = _clean_label(df["exercise_freq"])

# ---------------------------
# 2. Select Features + Target
# ---------------------------
features = [
    "age",
    "sleep_hours",
    "coping_count",
    "exercise_freq",
    "diet_quality",
    "social_interaction",
    "work_life_balance",
]

# Work on copies so later changes to X / y do not write back into df.
X = df.loc[:, features].copy()
y = df.loc[:, "risk_score"].copy()  # target for the linear regression

# ---------------------------
# 3. Preprocessing Pipeline
# ---------------------------
# NOTE(review): "social_interaction" is selected as a feature but appears in
# neither list below, so the ColumnTransformer (default remainder="drop")
# discards it. Confirm whether it should be numeric or ordinal.
numeric_cols = ["age", "sleep_hours", "coping_count"]
ordinal_cols = ["exercise_freq", "diet_quality", "work_life_balance"]

# One ordered category list per entry of ordinal_cols, in the same order.
# The leading "" is the value the ordinal imputer fills in for missing cells;
# labels outside a list are encoded as -1.
ord_enc = OrdinalEncoder(
    categories=[
        ["", "None", "1-2 days/week", "3-5 days/week", "Daily"],
        ["", "Poor", "Average", "Good", "Excellent"],
        ["", "Poor", "Fair", "Good", "Excellent"],
    ],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Numeric columns: median-impute, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Ordinal columns: fill missing with "", then map labels to integer ranks.
ordinal_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="")),
        ("ordinal", ord_enc),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("ord", ordinal_pipeline, ordinal_cols),
    ]
)

# ---------------------------
# 4. Train-Test Split
# ---------------------------
# Split the raw rows BEFORE fitting the preprocessor. Fitting it on the full
# dataset would let the held-out rows influence the imputer medians and the
# scaler mean/std (data leakage). With the same random_state and row count
# the train/test membership matches a split of the preprocessed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn preprocessing statistics from the training rows only, then apply the
# fitted transformer unchanged to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full processed matrix, kept under its original name for any later cell that
# uses it; produced with the train-fitted preprocessor.
X_processed = preprocessor.transform(X)

# ---------------------------
# 5. Linear Regression Model
# ---------------------------
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = lr.predict(X_test)

