In [21]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [22]:
diary_df = pd.read_pickle('/home/s232713/data/final_merged_data.pkl')

tress_df = pd.read_csv('/home/s232713/data/gps_trip_stress.csv')
env_df    = pd.read_csv('/home/s232713/data/foot_time_env.csv')


In [23]:
individ_list = [
    "MMM21201","MMM15201","MMM15601","MMM13401","MMM12601",
    "MMM12701","MMM12501","MMM24301","MMM22401","MMM23601",
    "MMM23901","MMM14101","MMM15801","MMM16301","MMM22501",
    "MMM14001","MMM14401","MMM15001","MMM14301"
] # best data infdividuals

participant_ids = (
    diary_df.loc[diary_df["INDIVID"].isin(individ_list), "Participant ID"]
    .unique()
)

print(f"Participant IDs: {participant_ids}")

Participant IDs: [133 118 112 130 139 129 117 104 152 109 148 159 138 339 315 303 366 313
 390]


In [24]:
stress_df = tress_df[tress_df["INDIVID"].isin(individ_list)]
env_df    = env_df[env_df["INDIVID"].isin(individ_list)]

In [25]:
display(stress_df.head())
print(stress_df['stress_label'].value_counts())
print(stress_df['stress_prob'].describe())
print(stress_df['stress_binary'].value_counts())
display(env_df.head())

Unnamed: 0,participant ID,INDIVID,Timestamp,Latitude,Longitude,Accuracy,Altitude,Speed,Interval ID,Activity_concat,Milliseconds,stress_label,stress_prob,stress_binary
0,133,MMM12501,2023-11-08 13:53:57,55.814739,12.380711,4.746234,40.079327,1.594638,265109,Foot,1699451637000,1.0,1.0,1
1,133,MMM12501,2023-11-08 13:53:58,55.814738,12.380678,4.745669,40.046085,1.401877,265109,Foot,1699451638000,1.0,1.0,1
2,133,MMM12501,2023-11-08 13:53:59,55.814739,12.380655,4.744096,38.615249,1.371988,265109,Foot,1699451639000,1.0,1.0,1
3,133,MMM12501,2023-11-08 13:54:00,55.814738,12.380632,4.741475,37.977603,1.204911,265109,Foot,1699451640000,1.0,1.0,1
4,133,MMM12501,2023-11-08 13:54:01,55.814739,12.380608,4.738739,38.215747,1.293555,265109,Foot,1699451641000,1.0,1.0,1


stress_label
1.0    223612
2.0      9624
Name: count, dtype: int64
count    233236.000000
mean          0.426098
std           0.481210
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: stress_prob, dtype: float64
stress_binary
0    143371
1     99698
Name: count, dtype: int64


Unnamed: 0,INDIVID,Interval ID,GRID_ID,t_entry,t_exit,type,Start Time_x,stress,start_dt,year,...,Wall,Utility Pole,Sky,Pole,Bicyclist,Bus,Lane Marking - General,Other Rider,Curb,Bicycle
0,MMM12501,265109,190047,2023-11-08 13:54:08,2023-11-08 13:55:34,line_cross,2023-11-08 13:53:57,1,2023-11-08 13:53:57,2023,...,0.30304,0.03212,45.786037,0.131664,0.0,0.007706,1.840706,0.0,1.30888,0.02718
1,MMM12501,265109,190703,2023-11-08 13:55:34,2023-11-08 13:56:37,line_cross,2023-11-08 13:53:57,1,2023-11-08 13:53:57,2023,...,0.186329,0.008888,45.51153,0.134697,0.0,0.034084,0.584087,0.0,1.017246,0.0
2,MMM12501,265109,190704,2023-11-08 13:53:57,2023-11-08 13:54:08,line_cross,2023-11-08 13:53:57,1,2023-11-08 13:53:57,2023,...,0.078812,0.029297,45.058765,0.178432,0.000134,0.0,0.967426,0.0,0.170422,0.017872
3,MMM12501,265109,191360,2023-11-08 13:56:37,2023-11-08 13:56:37,buffer_only,2023-11-08 13:53:57,1,2023-11-08 13:53:57,2023,...,1.113256,0.002041,35.056755,0.093575,0.0,0.0,0.000305,0.0,0.768579,0.0
4,MMM12501,266118,187406,2023-11-09 11:34:35,2023-11-09 11:34:36,line_cross,2023-11-09 11:28:42,0,2023-11-09 11:28:42,2023,...,3.148632,0.001736,36.127605,0.046902,0.802364,0.0,0.446949,0.033226,1.607914,0.262966


In [26]:
def time_weighted_mean(x: pd.Series) -> float:
    if len(x) < 2:
        return np.nan
    dt = x.index.to_series().diff().dt.total_seconds()
    dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
    dt = dt.clip(lower=0.0)
    wsum = dt.sum()
    return float((x.values * dt.values).sum() / wsum) if wsum > 0 else float(x.mean())


In [27]:
all_episodes = []

all_individs = stress_df["INDIVID"].unique()

for INDIVID in all_individs:

    # ----------------------------
    # Stress data for individual
    # ----------------------------
    df = stress_df.loc[stress_df["INDIVID"] == INDIVID].copy()

    if len(df) < 10:
        continue

    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    df = df.set_index("Timestamp").sort_index()

    # ----------------------------
    # Environmental intervals
    # ----------------------------
    env_cols = [
        "INDIVID",
        "Interval ID",
        "t_entry",
        "t_exit",
        "grid_ndvi_mean",
        "noise_L_mean",
        "roads_transportation",
        "buildings_facilities",
    ]

    env_sub = env_df.loc[env_df["INDIVID"] == INDIVID, env_cols].copy()
    if len(env_sub) == 0:
        continue

    env_sub["t_entry"] = pd.to_datetime(env_sub["t_entry"])
    env_sub["t_exit"] = pd.to_datetime(env_sub["t_exit"])

    env_vars = [
        "grid_ndvi_mean",
        "noise_L_mean",
        "roads_transportation",
        "buildings_facilities",
    ]
    trip_col = "Interval ID"

    # ----------------------------
    # Build cell-episodes
    # ----------------------------
    episodes = []

    for _, row in env_sub.iterrows():
        t0 = row["t_entry"]
        t1 = row["t_exit"]

        mask = (df.index >= t0) & (df.index < t1)  # t1 escluso
        stress_seg = df.loc[mask, "stress_prob"].dropna()

        if len(stress_seg) < 3:
            continue

        duration_sec = (t1 - t0).total_seconds()
        if duration_sec < 10:
            continue

        episodes.append(
            {
                "INDIVID": INDIVID,
                trip_col: row[trip_col],
                "duration_sec": duration_sec,
                "stress_mean": time_weighted_mean(stress_seg),
                "grid_ndvi_mean": row["grid_ndvi_mean"],
                "noise_L_mean": row["noise_L_mean"],
                "roads_transportation": row["roads_transportation"],
                "buildings_facilities": row["buildings_facilities"],
            }
        )

    if len(episodes) < 20:
        continue

    df_ep = pd.DataFrame(episodes)

    # ----------------------------
    # Trip-centering (BEFORE z-scoring)
    # ----------------------------
    df_ep["stress_trip_c"] = df_ep["stress_mean"] - df_ep.groupby(["INDIVID", trip_col])[
        "stress_mean"
    ].transform("mean")

    for v in env_vars:
        df_ep[v + "_trip_c"] = df_ep[v] - df_ep.groupby(["INDIVID", trip_col])[v].transform("mean")

    # ----------------------------
    # Within-individual standardization
    # ----------------------------
    # (A) raw within-person z (optional, keep if you still want it)
    raw_std = df_ep[env_vars].std()
    df_ep[env_vars] = (df_ep[env_vars] - df_ep[env_vars].mean()) / raw_std.replace(0, np.nan)
    df_ep["stress_mean_z"] = (df_ep["stress_mean"] - df_ep["stress_mean"].mean()) / df_ep[
        "stress_mean"
    ].std()

    # (B) trip-centered within-person z (this is the one youâ€™ll use for the trip-centered model)
    trip_centered_vars = [v + "_trip_c" for v in env_vars]
    tc_std = df_ep[trip_centered_vars].std()
    df_ep[trip_centered_vars] = (df_ep[trip_centered_vars] - df_ep[trip_centered_vars].mean()) / tc_std.replace(0, np.nan)
    df_ep["stress_trip_c_z"] = (df_ep["stress_trip_c"] - df_ep["stress_trip_c"].mean()) / df_ep[
        "stress_trip_c"
    ].std()

    all_episodes.append(df_ep)

df_ep_all = pd.concat(all_episodes, ignore_index=True)

print("Total episodes:", len(df_ep_all))
print("Total individuals:", df_ep_all["INDIVID"].nunique())

display(df_ep_all.head())


  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0


Total episodes: 4459
Total individuals: 13


  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0
  dt.iloc[0] = dt.iloc[1] if len(dt) > 1 else 0.0


Unnamed: 0,INDIVID,Interval ID,duration_sec,stress_mean,grid_ndvi_mean,noise_L_mean,roads_transportation,buildings_facilities,stress_trip_c,grid_ndvi_mean_trip_c,noise_L_mean_trip_c,roads_transportation_trip_c,buildings_facilities_trip_c,stress_mean_z,stress_trip_c_z
0,MMM12501,265109,86.0,0.966992,-1.128177,0.702517,0.058113,-0.021323,-0.008806,-0.080095,5.649312e-16,-0.18743,2.040336e-17,0.556525,-0.098308
1,MMM12501,265109,63.0,0.960403,-1.522144,0.702517,-0.137246,-0.021323,-0.015396,-0.553839,5.649312e-16,-0.468575,2.040336e-17,0.503137,-0.171872
2,MMM12501,265109,11.0,1.0,-0.534387,0.702517,0.644189,-0.021323,0.024202,0.633934,5.649312e-16,0.656005,2.040336e-17,0.823945,0.27018
3,MMM12501,266118,14.0,1.0,-0.332733,0.702517,1.034906,0.820946,0.029452,-0.694762,5.649312e-16,0.105429,0.7939747,0.823945,0.328799
4,MMM12501,266118,26.0,0.969778,-0.479095,0.702517,2.793133,1.663215,-0.000769,-0.870762,5.649312e-16,2.635735,1.701374,0.579096,-0.008586


In [28]:
print(df_ep_all[[
    "stress_mean_z",
    "grid_ndvi_mean",
    "noise_L_mean",
    "roads_transportation",
    "buildings_facilities"
]].describe())

print(df_ep_all.groupby("INDIVID").size().describe())

       stress_mean_z  grid_ndvi_mean  noise_L_mean  roads_transportation  \
count   4.459000e+03    4.459000e+03  4.459000e+03          4.458000e+03   
mean   -1.513827e-15    3.346355e-17  2.900174e-16         -2.231404e-17   
std     9.986532e-01    9.986532e-01  9.986532e-01          9.986529e-01   
min    -1.577624e+01   -3.177945e+00 -3.406530e+00         -1.978447e+00   
25%    -4.549688e-01   -7.254056e-01 -7.555754e-01         -6.368060e-01   
50%    -9.740686e-02   -1.389250e-01  3.698111e-02         -2.291965e-01   
75%     4.133056e-01    6.221786e-01  7.385475e-01          3.194055e-01   
max     1.355053e+01    6.108927e+00  2.985632e+00          7.371466e+00   

       buildings_facilities  
count          4.458000e+03  
mean          -4.781580e-18  
std            9.986529e-01  
min           -1.105460e+00  
25%           -7.014338e-01  
50%           -3.449136e-01  
75%            5.202516e-01  
max            7.162436e+00  
count      13.000000
mean      343.000000
std

In [29]:
# remove rows with NaN or inf in predictors or outcome
model_vars = [
    "stress_mean_z",
    "grid_ndvi_mean",
    "noise_L_mean",
    "roads_transportation",
    "buildings_facilities",
    "duration_sec"
]

df_model = df_ep_all[model_vars + ["INDIVID", "Interval ID"]].copy()

# replace inf with NaN
df_model = df_model.replace([np.inf, -np.inf], np.nan)

# drop rows with any NaN
df_model = df_model.dropna()

print("Episodes before cleaning:", len(df_ep_all))
print("Episodes after cleaning:", len(df_model))
print("Individuals after cleaning:", df_model["INDIVID"].nunique())

df_ep_all.groupby("INDIVID").size().describe(), \
df_model.groupby("INDIVID").size().describe()



Episodes before cleaning: 4459
Episodes after cleaning: 4458
Individuals after cleaning: 13


(count      13.000000
 mean      343.000000
 std       325.607637
 min        71.000000
 25%       124.000000
 50%       185.000000
 75%       500.000000
 max      1160.000000
 dtype: float64,
 count      13.000000
 mean      342.923077
 std       325.652284
 min        71.000000
 25%       124.000000
 50%       185.000000
 75%       500.000000
 max      1160.000000
 dtype: float64)

In [30]:
env_vars = [
    "grid_ndvi_mean",
    "noise_L_mean",
    "roads_transportation",
    "buildings_facilities",
]

y = df_model["stress_mean_z"]

X = df_model[env_vars]
X = sm.add_constant(X)

w = np.sqrt(df_model["duration_sec"])

pooled_model = sm.WLS(y, X, weights=w).fit(
    cov_type="cluster",
    cov_kwds={"groups": df_model["INDIVID"]}
)

print('BEST DATA POOLED WEIGHTED LEAST SQUARES MODEL')
print(pooled_model.summary())

BEST DATA POOLED WEIGHTED LEAST SQUARES MODEL
                            WLS Regression Results                            
Dep. Variable:          stress_mean_z   R-squared:                       0.001
Model:                            WLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.826
Date:                Thu, 15 Jan 2026   Prob (F-statistic):              0.189
Time:                        15:22:08   Log-Likelihood:                -5942.0
No. Observations:                4458   AIC:                         1.189e+04
Df Residuals:                    4453   BIC:                         1.193e+04
Df Model:                           4                                         
Covariance Type:              cluster                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------