### Import relevant libraries

In [1]:
from semopy import Model, calc_stats, semplot
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from scipy.stats import f_oneway
import pandas as pd
import numpy as np
import seaborn as sns
from semopy.inspector import inspect


### Import the dataset

In [11]:
df = pd.read_csv("Welzijnsmonitor2025_scaled_normalised_UTF8.csv", sep=";")

### Looking for correlations between variables

In [16]:
numeric_cols = [col for col in df if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col])]

# Compute correlation matrix for numeric columns
corr_matrix = df[numeric_cols].corr()

# Take only upper triangle (no self- or duplicate pairs)
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

# Turn into a Series of pairs and filter |corr| > 0.7
high_corr_pairs = (
    upper
    .stack()                       # (col1, col2) index, correlation as values
    .rename("correlation")
)

# Keep only strong correlations and sort by |correlation|
high_corr_pairs = high_corr_pairs[
    high_corr_pairs.abs() > 0.7
].sort_values(key=lambda s: s.abs(), ascending=False)

print(high_corr_pairs)

StudentID    Bekendgebruik_1    0.848417
Extr_eng2_2  Extr_eng2_3        0.705477
Name: correlation, dtype: float64


### Creating the model

In [None]:
model_desc = """
# Measurement model

Engagement_and_Burnout_1 =~ Bevl_1 + Bevl_2 + Bevl_3
Behavior_1 =~ Cogn_Eng1_5 + Cogn_Eng1_6 + Cogn_Eng2_2 + Cogn_Eng2_3 + Cogn_Eng2_8
Motivation =~ Mot_Stress_1 + Mot_Stress_4
Resilience_1 =~ Veer_1 + Veer_5

Knowledge_of_Interventions =~ Bekendgebruik_1 + Bekendgebruik_2 + Bekendgebruik_3 + Bekendgebruik_4 + Bekendgebruik_5 + Bekendgebruik_6 + Bekendgebruik_7
Behavior_2 =~ Cogn_Eng1_1 + Cogn_Eng1_2 + Cogn_Eng1_3 + Cogn_Eng1_4 + Cogn_Eng2_1 + Cogn_Eng2_4 + Cogn_Eng2_5 + Cogn_Eng2_6 + Cogn_Eng2_7
Extra_Engagement =~ Cogregiedocent_1 + Cogregiedocent_2 + Cogregiedocent_3 +  Cogregiedocent_4 + Cogregiedocent_5 + Cogregiedocent_6 + Extr_eng2_1 + Extr_eng2_2 + Extr_eng2_3
Need_for_Help =~ Hulp_1 + Hulp_2 + Hulp_3
Participation =~ Partici1_1 + Partici1_2 + Partici1_3 + Partici1_4

Study_Performance =~ Cijfer_huidig_1 + StPunt_beh

Lifestyle =~ Cantrill_1 + Leefst + Q297_1
Stress_1 =~ Onnodige_stress_1 + Onnodige_stress_2 + Onnodige_stress_3 + Onnodige_stress_4 + Onnodige_stress_5 + Onnodige_stress_6 + Onnodige_stress_7 + Onnodige_stress_8 + Onnodige_stress_9 + Onnodige_stress_10
Pressure_to_Perform =~ Pres_1 + Pres_2 + Pres_12
Anxiety =~ Zorg_1 + Zorg_2 + Zorg_3 + Zorg_4 + Zorg_5 + Zorg_6


Engagement_and_Burnout_2 =~ Burn1_1 + Burn1_2 + Burn1_3 + Burn1_4
Depression =~ Depr_1 + Depr_2 + Depr_3 + Depr_4 + Depr_5 + Depr_6 + Depr_7 + Depr_8
Resilience_2 =~ Veer_2 + Veer_3 + Veer_4 + Veer_6

"""

In [4]:
model = Model(model_desc)
model.fit(df)
latent_scores = model.predict_factors(df)

# Export the model with the new latent variables

In [5]:
latent_scores = model.predict_factors(df)

latent_scores.to_csv("latent_variable_scores.csv", index=False)

print("Latent variable scores extracted:")
print(latent_scores.head())

Latent variable scores extracted:
    Anxiety  Background_Features_1  Background_Features_2  \
0  0.401656               0.218794               0.027222   
1 -0.630363               0.460891               0.004495   
2 -0.713828               0.639477               0.003870   
3 -0.428887              -0.030688               0.013762   
4  1.001586              -0.044500               0.009224   

   Background_Features_3  Behavior_1  Behavior_2  Depression  \
0              -0.226861   -0.238883   -0.005434   -0.232868   
1              -0.400695   -0.684537   -0.420016   -0.993097   
2              -0.169663    0.271817    0.012886   -0.291897   
3              -0.194726   -0.419932   -0.225418   -0.717810   
4              -0.530319   -0.465803   -0.136794    1.442951   

   Engagement_and_Burnout_1  Engagement_and_Burnout_2  Extra_Engagement  ...  \
0                  0.352428                 -0.093128         -0.027322  ...   
1                  0.002849                 -0.853727 

### Create a minmax normalization for latent variables (Temporary solution)

In [6]:
scaler = MinMaxScaler()

latent_scores_normalized = pd.DataFrame(
    scaler.fit_transform(latent_scores),
    columns=latent_scores.columns,
    index=latent_scores.index
)

In [7]:
df_with_latent_norm = df.copy()
for col in latent_scores_normalized.columns:
    df_with_latent_norm[col + "_norm"] = latent_scores_normalized[col]

# Save if needed
df_with_latent_norm.to_csv("data_with_normalized_latent_variables.csv", index=False)

### Calculate fit indices

In [8]:
stats = calc_stats(model)

print("=== Model Fit Statistics ===")
print(stats.T)

=== Model Fit Statistics ===
                      Value
DoF             3530.000000
DoF Baseline    3828.000000
chi2           17351.006917
chi2 p-value       0.000000
chi2 Baseline  55384.753160
CFI                0.731926
GFI                0.686719
AGFI               0.660272
NFI                0.686719
TLI                0.709296
RMSEA              0.050887
AIC              749.064102
BIC             2803.298091
LogLik            11.467949


### Create and save the path diagram
Doesn't work yet

In [9]:
g = semplot(model, "pd.png")
print(g)



digraph G {
	overlap=scale splines=true
	edge [fontsize=12]
	node [fillcolor="#cae6df" shape=circle style=filled]
	Background_Features_1 [label=Background_Features_1]
	Behavior_2 [label=Behavior_2]
	Knowledge_of_Interventions [label=Knowledge_of_Interventions]
	Study_Performance [label=Study_Performance]
	Stress_2 [label=Stress_2]
	Background_Features_2 [label=Background_Features_2]
	Engagement_and_Burnout_1 [label=Engagement_and_Burnout_1]
	Anxiety [label=Anxiety]
	Pressure_to_Perform [label=Pressure_to_Perform]
	Resilience_1 [label=Resilience_1]
	Behavior_1 [label=Behavior_1]
	Need_for_Help [label=Need_for_Help]
	Resilience_2 [label=Resilience_2]
	Depression [label=Depression]
	Motivation [label=Motivation]
	Participation [label=Participation]
	Lifestyle [label=Lifestyle]
	Background_Features_3 [label=Background_Features_3]
	Extra_Engagement [label=Extra_Engagement]
	Engagement_and_Burnout_2 [label=Engagement_and_Burnout_2]
	Stress_1 [label=Stress_1]
	node [shape=box style=""]
	Beken

In [None]:
#Wellbeing =~ Engagement_and_Burnout_1 + Behavior_1 + Motivation + Resilience_1
#Energy_Sources =~ Knowledge_of_Interventions + Betrok_Ouders + Behavior_2 + Extra_Engagement + Need_for_Help + Participation
#Negative_Outcomes =~ StopInt + Vertr
#Positive_Outcomes =~ Study_Performance + Stopint2
#Stressors =~ Lifestyle + Stress_1 + Pressure_to_Perform + Werk_1 + Anxiety
#Response_to_Stress =~ Engagement_and_Burnout_2 + Depression + Mot_Stress_2 + Resilience_2

# regressions
# Stressors ~ Energy_Sources
# Energy_Sources ~ Stressors

# Response_to_Stress ~ Stressors + Energy_Sources + Wellbeing
# Wellbeing ~ Stressors + Energy_Sources + Response_to_Stress

# Positive_Outcomes ~ Wellbeing + Response_to_Stress
# Negative_Outcomes ~ Wellbeing + Response_to_Stress