In [29]:
import os
import sys
from pathlib import Path

root = Path(os.getcwd()).parent
sys.path.append(str(root))

import pandas as pd

# Load morphological data
morph_df = pd.read_csv("../data/morphology_data_cleaned.csv")
morph_df.rename(columns={"id": "neighborhood_id"}, inplace=True)

# Load health data
health_df_soc = pd.read_excel(
    "../data/synthetic_health_data.xlsx", sheet_name="Participant_SocioDemograph_Data"
)
health_df_clin = pd.read_excel(
    "../data/synthetic_health_data.xlsx", sheet_name="Participant_HEALTH_Data"
)

# Merge health data
health_df = pd.merge(
    health_df_soc, health_df_clin, on=["participant_id", "neighborhood_id"], how="inner"
)

# Merge morphology and health data
df = pd.merge(morph_df, health_df, on="neighborhood_id", how="inner")

print(f"Final dataset shape: {df.shape}")

Final dataset shape: (2631, 33)


In [30]:
# Create quantile-based age groups so each bin has comparable counts
age_bin_series = pd.qcut(
    df["age"],
    q=min(4, df["age"].nunique()),
    duplicates="drop",
)

age_labels = []
age_intervals = age_bin_series.cat.categories
for idx, interval in enumerate(age_intervals):
    left_edge = interval.left
    right_edge = interval.right
    if idx == 0:
        label = f"<= {int(round(right_edge))}"
    elif idx == len(age_intervals) - 1:
        label = f"> {int(round(left_edge))}"
    else:
        label = f"{int(round(left_edge))}-{int(round(right_edge))}"
    age_labels.append(label)

df["age_bin"] = age_bin_series.cat.rename_categories(age_labels)

In [None]:
numerical_cols = [
    "PC1",
    "PC2",
    "PC3",
    "PC4",
    "PC5",
    "PC6",
    "z_distance",
    "lst_mean",
    "solar_summ",
    "solar_wint",
    "pm10_mean",
    "pm25_mean",
    "no2_mean",
    "noiseday_m",
    "noisenight",
    "points_sleep_deprivation",
    "sleeping_hours",
    "bedtime_hour",
]

categorical_cols = [
    "typology",
    "sex",
    "income",
    "education_level",
    "age_bin",
    "GHQ12_score",
]

binary_cols = [
    "heart_failure",
    "heart_rhythm",
    "sleep_disorder_hot",
    "d_breath_respiratory",
    "d_breath_asthma",
]

possible_features = [
    "points_sleep_deprivation",
    "sleeping_hours",
    "bedtime_hour",
    "GHQ12_score",
    "heart_failure",
    "heart_rhythm",
    "sleep_disorder_hot",
    "d_breath_respiratory",
    "d_breath_asthma",
]

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Drop columns that are neither numerical nor categorical nor binary
features = df.drop(
    columns=[
        col
        for col in df.columns
        if col not in numerical_cols + categorical_cols + binary_cols
    ]
)

# Encode categorical variables
for col in categorical_cols:
    dummies = pd.get_dummies(features[col], prefix=col)
    features = pd.concat([features.drop(columns=[col]), dummies], axis=1)

In [33]:
features.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,lst_mean,solar_summ,solar_wint,pm10_mean,...,GHQ12_score_3,GHQ12_score_4,GHQ12_score_5,GHQ12_score_6,GHQ12_score_7,GHQ12_score_8,GHQ12_score_9,GHQ12_score_10,GHQ12_score_11,GHQ12_score_12
0,0.526055,0.33848,0.846051,-1.420268,-0.217506,0.687111,-0.111053,1.130211,0.982144,-1.361464,...,False,False,False,False,False,False,False,True,False,False
1,0.526055,0.33848,0.846051,-1.420268,-0.217506,0.687111,-0.111053,1.130211,0.982144,-1.361464,...,False,False,False,False,True,False,False,False,False,False
2,0.526055,0.33848,0.846051,-1.420268,-0.217506,0.687111,-0.111053,1.130211,0.982144,-1.361464,...,True,False,False,False,False,False,False,False,False,False
3,0.526055,0.33848,0.846051,-1.420268,-0.217506,0.687111,-0.111053,1.130211,0.982144,-1.361464,...,False,False,False,False,False,False,False,False,False,False
4,0.526055,0.33848,0.846051,-1.420268,-0.217506,0.687111,-0.111053,1.130211,0.982144,-1.361464,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Define target variable
target = df["GHQ12_score"]
# Normalize target
target = (target - target.mean()) / target.std()

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Initialize and train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

ValueError: could not convert string to float: '20:55'

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 1.1078594386787426
R^2 Score: -0.0724979649299149
