In [29]:
import numpy as np
import pandas as pd
from IPython.display import display
import gc
import pyreadstat
from sklearn.preprocessing import MinMaxScaler
from semopy import Model, calc_stats, semplot, report
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [30]:
def load_data():
    kyrbs = pd.read_csv('c:\\data\\education\\kyrbs2024_sas\\kyrbs\\kyrbs2024.csv', encoding='cp949')
    pop = pd.read_csv('c:\\data\\education\\kyrbs2024_sas\\kyrbs\\pop24.csv', encoding='cp949')
    return kyrbs, pop

In [31]:
def check_missing_values(df):
    missing_cols = df.columns[df.isna().any()].tolist()
    print("Missing cols:", missing_cols)
    return df[missing_cols].isna().sum()

In [32]:
def create_derived_variables(df):
    df['disturbed'] = df[[f'M_GAD_{i}' for i in range(1, 8)]].apply(lambda x: 2 if (x == 4).any() else 1, axis=1)
    df['violent'] = df['V_TRT'].apply(lambda x: 1 if x == 1 else 2)
    df = df[df['E_FM_F_1'] != 8888]
    def next_of_kin(row):
        return 1 if (row['E_FM_F_1'] == 1 or row['E_FM_M_3'] == 3) else 2
    
    def livingwith(row):
        return 1 if (row['E_LT_F'] == 1 or row['E_LT_M'] == 1) else 2
    
    df['nextofkin'] = df.apply(next_of_kin, axis=1)
    df['livingwith'] = df.apply(livingwith, axis=1)
    return df

In [33]:
def preprocess_data(df):
    observed_vars = ['M_STR', 'M_SAD', 'M_LON', 'disturbed', 'M_SUI_CON', 'E_S_RCRD', 'E_SES', 'nextofkin', 'violent']
    df = df[['MH'] + observed_vars].dropna()
    
    df['M_STR'] = 6 - df['M_STR']
    df['E_S_RCRD'] = 6 - df['E_S_RCRD']
    df['E_SES'] = 6 - df['E_SES']
    
    scaler = MinMaxScaler()
    cols_to_scale = ['M_STR', 'M_LON', 'E_S_RCRD', 'E_SES', 'violent']
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    numeric_cols = cols_to_scale
    categorical_cols = ['M_SAD', 'disturbed', 'M_SUI_CON', 'nextofkin']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    df[categorical_cols] = df[categorical_cols].astype("category")
    
    return df

In [34]:
def split_data(df):
    mid = df[df['MH'] == '중학교']
    high = df[df['MH'] == '고등학교']
    return mid, high

In [35]:
def define_sem_model():
    model_desc = """
    Emotion =~ M_SAD + M_LON + disturbed
    Stress =~ M_STR
    Suicide =~ M_SUI_CON
    Stress ~ violent + nextofkin + E_S_RCRD + E_SES
    Suicide ~ Stress + Emotion
    Emotion ~~ Stress
    """
    return Model(model_desc)

In [36]:
def weighted_cov(X, weights):
    average = np.average(X, axis=0, weights=weights)
    X_centered = X - average
    cov_matrix = np.dot((X_centered * weights[:, None]).T, X_centered) / (weights.sum() - 1)
    return cov_matrix

In [37]:
def calculate_covariance(df, observed_vars):
    data_for_cov = df[observed_vars].apply(pd.to_numeric, errors='coerce')
    weights = df['wt'].to_numpy()
    w_cov = weighted_cov(data_for_cov.to_numpy(), weights)
    return pd.DataFrame(w_cov, index=observed_vars, columns=observed_vars)

In [38]:
def compute_vif(df, observed_vars):
    X = df[observed_vars].apply(pd.to_numeric, errors='coerce').dropna()
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

In [39]:
def fit_sem_model(model, df, w_cov_df):
    model.fit(df, cov=w_cov_df)
    return model

In [40]:
def main():
    kyrbs, pop = load_data()
    kyrbs = create_derived_variables(kyrbs)
    kyrbs = preprocess_data(kyrbs)
    mid, high = split_data(kyrbs)
    model = define_sem_model()
    
    observed_vars = ['M_STR', 'M_SAD', 'M_LON', 'disturbed', 'M_SUI_CON', 'E_S_RCRD', 'E_SES', 'nextofkin', 'violent']
    
    high = high.copy()
    high.loc[:, 'wt'] = 1.0
    
    w_cov_df = calculate_covariance(high, observed_vars)
    print("공분산 행렬 랭크:", np.linalg.matrix_rank(w_cov_df))
    print("공분산 행렬 크기:", w_cov_df.shape)
    print("데이터 샘플 개수:", high.shape[0])
    print("가중치 합:", high['wt'].sum())
    
    model = fit_sem_model(model, high, w_cov_df)
    stats = calc_stats(model)
    estimates = model.inspect()
    print("적합도 지표:\n", stats.T, "\n\n")
    print("모수 추정치:\n", estimates)
    
    semplot(model, 'C:\\git_files\\education_analysis\\base\\semplot.png')
    report(model, 'C:\\git_files\\education_analysis\\base\\report')

In [41]:
if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['nextofkin'] = df.apply(next_of_kin, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['livingwith'] = df.apply(livingwith, axis=1)


공분산 행렬 랭크: 9
공분산 행렬 크기: (9, 9)
데이터 샘플 개수: 22413
가중치 합: 22413.0
적합도 지표:
                       Value
DoF               28.000000
DoF Baseline      40.000000
chi2             881.982691
chi2 p-value       0.000000
chi2 Baseline  24361.878997
CFI                0.964888
GFI                0.963797
AGFI               0.948281
NFI                0.963797
TLI                0.949840
RMSEA              0.036890
AIC               33.921297
BIC              170.217036
LogLik             0.039351 


모수 추정치:
          lval  op       rval  Estimate      Std. Err    z-value   p-value
0      Stress   ~    violent  0.018191      0.009491   1.916627  0.055285
1      Stress   ~  nextofkin  0.003475      0.002997   1.159347  0.246315
2      Stress   ~   E_S_RCRD -0.001355      0.001036    -1.3071  0.191179
3      Stress   ~      E_SES -0.003713      0.002155  -1.722837  0.084918
4     Suicide   ~     Stress  9.283963      4.830796   1.921829  0.054627
5     Suicide   ~    Emotion -4.929944      2.913065