In [74]:
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split

### Data Loading & Split

In [None]:
# Shared random seed, reused by the stochastic steps further down
STATE = 42

# Load the preprocessed survey table
dataset = pd.read_csv('processed_data/depression_data.csv')

# The nine DPQ questionnaire items; they also form the multi-output target (y_embed)
target_embed_cols = ['DPQ010','DPQ020','DPQ030','DPQ040','DPQ050','DPQ060','DPQ070','DPQ080','DPQ090']

# Binary target based on DSM-V criteria. An item counts as a present symptom when it
# was answered with 2 or 3 (i.e. more than half the days).
# NOTE(review): the published PHQ-9 scoring rule reportedly counts item 9 (DPQ090)
# when present at all — confirm that the stricter 2/3 threshold is intended here.
symptom_present = dataset[target_embed_cols].isin([2, 3])
has_core_symptom = symptom_present['DPQ010'] | symptom_present['DPQ020']   # little interest OR feeling down
has_five_symptoms = symptom_present.sum(axis=1).ge(5)                      # at least 5 of the 9 items
depression_criteria = has_core_symptom & has_five_symptoms

dataset['depressed'] = depression_criteria.astype(int)

# Column names of the raw depression questionnaire file (SEQN excluded), so that
# they can be removed from the feature matrix
depression_file = pd.read_sas('raw_data/targets/DPQ_L_Target_Depression.xpt', format='xport')
features_from_depression_file = depression_file.columns.drop('SEQN').tolist()

# Define sets: features without questionnaire columns, identifier and label
non_feature_cols = features_from_depression_file + ['SEQN', 'depressed']
X = dataset.drop(columns=non_feature_cols)
y_embed = dataset[target_embed_cols]
y_binary = dataset['depressed']

['ACD010A', 'ACD010B', 'ACD010C', 'ACD040', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ270', 'ALQ280', 'ALQ151', 'ALQ170', 'BPQ020', 'BPQ030', 'BPQ150', 'BPQ080', 'BPQ101D', 'DBQ010', 'DBD030', 'DBD041', 'DBD050', 'DBD055', 'DBD061', 'DBQ073A', 'DBQ073B', 'DBQ073C', 'DBQ073D', 'DBQ073E', 'DBQ073U', 'DBQ301', 'DBQ330', 'DBQ360', 'DBQ370', 'DBD381', 'DBQ390', 'DBQ400', 'DBD411', 'DBQ421', 'DBQ424', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DID040', 'DIQ160', 'DIQ180', 'DIQ050', 'DID060', 'DIQ060U', 'DIQ070', 'FNQ021', 'FNQ041', 'FNQ050', 'FNQ060', 'FNQ080', 'FNQ160', 'FNQ100', 'FNQ110', 'FNQ120', 'FNQ170', 'FNQ180', 'FNQ190', 'FNQ130', 'FNQ200', 'FNQ140', 'FNQ150', 'FNDCDI', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNQ530', 'FNQ540', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSD041', 'FSD052', 'FSD061', 'FSD071', 'FSD081', 'FSD092', 'FSD102', 'FSDAD', 'FSD151', 'FSQ165', 'FSD165N', 'FSQ012', 'FSD012N', 'FS

In [76]:
# Sanity checks on the feature/target sets.
# Row counts alone do not prove that the targets describe the same respondents,
# so the row indices are compared as well.
assert X.shape[0] == y_embed.shape[0], "Feature and target embedding row counts do not match"
assert X.shape[0] == y_binary.shape[0], "Feature and target binary row counts do not match"
assert X.index.equals(y_embed.index), "Feature and target embedding row indices are not aligned"
assert X.index.equals(y_binary.index), "Feature and target binary row indices are not aligned"

# The SEQN identifier column must not end up in the features or in the targets
assert "SEQN" not in X.columns, "Feature set should not contain SEQN column"
assert "SEQN" not in y_embed.columns, "Target embedding set should not contain SEQN column"

In [77]:
# Hold out 20% of the rows as test set. Stratifying on the binary label preserves
# the class balance in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=STATE, stratify=y_binary)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, **split_options)

# The multi-output targets follow the row assignment of the feature splits
y_embed_train, y_embed_test = (y_embed.loc[part.index] for part in (X_train, X_test))

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Missing-value filtering and preprocessing.
# 1. Drop columns, then rows, that are more than 50% missing.
# 2. Propagate that filtering to the train/test splits made earlier in the notebook.
# 3. Build a ColumnTransformer (impute + scale / impute + one-hot), fit it on the
#    training split only, and transform both splits.

# Columns with >50% missing
cols_before = X.shape[1]
cols_to_drop = X.columns[X.isnull().mean() > 0.5]
X = X.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} columns with >50% missing. Remaining: {X.shape[1]} columns (was {cols_before})")

# Rows with >50% missing
rows_before = X.shape[0]
rows_to_drop = X.index[X.isnull().mean(axis=1) > 0.5]
X = X.drop(index=rows_to_drop)
y_binary = y_binary.loc[X.index]  # Align target
y_embed = y_embed.loc[X.index]
print(f"Dropped {len(rows_to_drop)} rows with >50% missing. Remaining: {X.shape[0]} rows (was {rows_before})")

# The train/test splits already exist at this point, so the filtering above would
# otherwise never reach the data that is fitted and evaluated. Restrict the splits
# to the surviving rows/columns and re-align every target with them.
# NOTE(review): the missingness thresholds are computed on the full dataset (train
# and test rows together) — consider deriving them from the training split only.
X_train = X_train.loc[X_train.index.isin(X.index), X.columns]
X_test = X_test.loc[X_test.index.isin(X.index), X.columns]
y_train, y_test = y_train.loc[X_train.index], y_test.loc[X_test.index]
y_embed_train, y_embed_test = y_embed.loc[X_train.index], y_embed.loc[X_test.index]

# Identify categorical vs numeric
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
print(f"Categorical features: {categorical_cols}")
print(f"Numeric features: {numeric_cols}")

# Column transformer with imputation
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric pipeline: median imputation → scaling
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_cols),

        # Categorical pipeline: most frequent imputation → one-hot encoding
        # (categories unseen during fit are encoded as all zeros)
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)

# Fit on the training split only, so no test-set statistics enter the imputers/scaler
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed  = preprocessor.transform(X_test)
print(f"Preprocessed feature matrix shape: {X_train_preprocessed.shape}")

# Features and targets must still describe the same rows after filtering
assert X_train_preprocessed.shape[0] == len(y_train) == len(y_embed_train), "Train features/targets out of sync"
assert X_test_preprocessed.shape[0] == len(y_test) == len(y_embed_test), "Test features/targets out of sync"

Dropped 0 columns with >50% missing. Remaining: 113 columns (was 113)
Dropped 0 rows with >50% missing. Remaining: 4167 rows (was 4167)
Categorical features: ['PAD790U', 'PAD810U', 'SLQ300', 'SLQ310', 'SLQ320', 'SLQ330']
Numeric features: ['ACD010A', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ151', 'BPQ020', 'BPQ080', 'BPQ101D', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DIQ160', 'DIQ180', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNQ530', 'FNQ540', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSDAD', 'FSD151', 'FSQ165', 'FSD162', 'HIQ011', 'HIQ032A', 'HIQ210', 'HOD051', 'HSQ590', 'HUQ010', 'HUQ030', 'HUQ042', 'HUQ055', 'HUQ090', 'INDFMMPI', 'INDFMMPC', 'INQ300', 'KIQ022', 'KIQ005', 'KIQ042', 'KIQ044', 'KIQ052', 'KIQ481', 'MCQ010', 'AGQ030', 'MCQ053', 'MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ160P', 'MCQ160L', 'MCQ550', 'MCQ560', 'MCQ220', 'OSQ230', 'OCD150', 'OCQ180', '

### Random forests

In [None]:
# All imports only used in this subsection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, classification_report

In [None]:
# Baseline: a random forest made of a single tree, fitted on the preprocessed training split
rfc_baseline_model = RandomForestClassifier(random_state=STATE, n_estimators=1).fit(
    X_train_preprocessed, y_train
)

# Hard labels feed F1 and the report; column 1 of predict_proba feeds ROC-AUC
y_pred = rfc_baseline_model.predict(X_test_preprocessed)
y_proba = rfc_baseline_model.predict_proba(X_test_preprocessed)[:, 1]

for metric_label, metric_value in (
    ("F1 score:", f1_score(y_test, y_pred)),
    ("ROC-AUC:", roc_auc_score(y_test, y_proba)),
):
    print(metric_label, metric_value)
print(classification_report(y_test, y_pred))

F1 score: 0.25
ROC-AUC: 0.6013996719518863
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       775
           1       0.23      0.27      0.25        59

    accuracy                           0.88       834
   macro avg       0.59      0.60      0.59       834
weighted avg       0.89      0.88      0.89       834



In [None]:
# Ours
# Multi-output random forest regression on the DPQ item targets (y_embed_train /
# y_embed_test) instead of the binary label.
# NOTE(review): the whole experiment below is commented out, so running this cell
# only performs the import. Any MSE output displayed under the cell presumably stems
# from an earlier run — either re-enable the code or clear the stale output.
from sklearn.ensemble import RandomForestRegressor


# # Multi-output Random Forest
# rfr = RandomForestRegressor(
#     n_estimators=200,
#     random_state=STATE,
#     n_jobs=-1
# )

# rfr.fit(X_train_preprocessed, y_embed_train)

# # Predict
# y_pred_embed = rfr.predict(X_test_preprocessed)

# # Evaluate
# from sklearn.metrics import mean_squared_error

# mse_per_item = mean_squared_error(y_embed_test, y_pred_embed, multioutput='raw_values')
# print("MSE per DPQ item:", mse_per_item)
# print("Average MSE:", mse_per_item.mean())



MSE per DPQ item: [0.58665767 0.49892269 0.83707749 0.67195899 0.68924811 0.59399463
 0.65525156 0.45110989 0.18155387]
Average MSE: 0.5739749900079936


### Logistic Model

### Bayesian Model

### MLP (with torch)