In [381]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [382]:
# Steps for preprocessing:
# 1. Drop columns with more than 50% missing values
# 2. Replace NHANES special codes (7/77/777, 9/99/999) with NaN
# 3. Binary variables: convert to binary Yes/No (1/2) to (1/0)
#    For example, DBQ930 - Main meal planner/preparer (1=Yes, 2=No, 7=Refused, 9=Don't know)
# 4. Categorical variables:
#    - Ordinal variables, e.g. FNQ410 - Food security status: 1=Full food security, 2=Marginal food security, 3=Low food security, 4=Very low food security
#       - Preserve ordinality
#       - Convert to 0-indexed (0-3)
# 5. Continous variables:
#    - Frequency variables, e.g. ALQ121 - Alcohol frequency past 12 months), 0=Never in the past year, 1=Every day, 2=5-6 days/week,...
#      Convert to days per year for continuous analysis
        # alq121_mapping = {
        #     0: 0,      # Never
        #     1: 365,    # Every day
        #     2: 286,    # 5-6 days/week → ~5.5*52
        #     3: 182,    # 3-4 days/week → ~3.5*52
        #     4: 104,    # 2 days/week
        #     5: 78,     # 1 day/week
        #     6: 52,     # 2-3 days/month
        #     7: 36,     # 1 day/month
        #     8: 24,     # 7-11 times/year
        #     9: 15,     # 3-6 times/year
        #     10: 6      # 1-2 times/year
        # }
        # df['ALQ121_days'] = df['ALQ121'].map(alq121_mapping)
#   - Numerical measurements (WHD010=height, WHD020=weight, WHD050=desired weight): 
#     - Check for outliers and reasonable ranges
#     - Standardize units if necessary
#     - Create new features, e.g., BMI from height and weight
# 6. Target variable: DSM-V based depression diagnosis

# Missing values?
# Combining features?
# Encode categorical variables?

### Load dataset

In [383]:
df_depression_data = pd.read_csv('processed_data/depression_data.csv')
numerical_cols = df_depression_data.select_dtypes(include=['number', 'float64', 'int64', 'datetime']).columns.tolist()
categorical_cols = df_depression_data.select_dtypes(include=['category', 'object']).columns.tolist()

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

Numerical columns: ['SEQN', 'DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100', 'ACD010A', 'ACD010B', 'ACD010C', 'ACD040', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ270', 'ALQ280', 'ALQ151', 'ALQ170', 'BPQ020', 'BPQ030', 'BPQ150', 'BPQ080', 'BPQ101D', 'DBQ010', 'DBD030', 'DBD041', 'DBD050', 'DBD055', 'DBD061', 'DBQ073A', 'DBQ073B', 'DBQ073C', 'DBQ073D', 'DBQ073E', 'DBQ073U', 'DBQ301', 'DBQ330', 'DBQ360', 'DBQ370', 'DBD381', 'DBQ390', 'DBQ400', 'DBD411', 'DBQ421', 'DBQ424', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DID040', 'DIQ160', 'DIQ180', 'DIQ050', 'DID060', 'DIQ060U', 'DIQ070', 'FNQ021', 'FNQ041', 'FNQ050', 'FNQ060', 'FNQ080', 'FNQ160', 'FNQ100', 'FNQ110', 'FNQ120', 'FNQ170', 'FNQ180', 'FNQ190', 'FNQ130', 'FNQ200', 'FNQ140', 'FNQ150', 'FNDCDI', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNQ530', 'FNQ540', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSD041

### Handle columns with missing values
Replace NHANES codes for refuse/missing wih NaN and drop columns with >45% missing values.

In [384]:
def replace_nhanes_special_codes(df):
    """
    Replace NHANES special codes with NaN.
    - 7, 77, 777, 7777, 77777 = Refused
    - 9, 99, 999, 9999, 99999, 55555 = Don't know / Missing
    """
    # Define special codes to replace
    refused_codes = [7, 77, 777, 7777, 77777]
    dont_know_codes = [9, 99, 999, 9999, 99999, 55555]
    special_codes = refused_codes + dont_know_codes
    
    # Also handle implausible values in specific columns
    implausible_values = {
        'WHD010': [9999],      # Height
        'WHD020': [9999],      # Weight  
        'WHD050': [9999, 7777], # Desired weight
        'PAD680': [9999],      # Sedentary minutes
        'OCQ180': [99999, 77777], # Work hours
        'DID040': [999],       # Age diabetes diagnosed
        'RHQ332': [999],       # Age ovaries removed
    }
    
    replaced_count = 0
    for col in df.select_dtypes(include=[np.number]).columns:
        # Replace standard special codes
        mask = df[col].isin(special_codes)
        replaced_count += mask.sum()
        df.loc[mask, col] = np.nan
        
        # Replace column-specific implausible values
        if col in implausible_values:
            mask = df[col].isin(implausible_values[col])
            replaced_count += mask.sum()
            df.loc[mask, col] = np.nan
    
    print(f"Replaced {replaced_count} special code values with NaN")
    return df

In [385]:
def drop_high_missing_columns(df, threshold=50):
    """Drop columns with missing percentage above threshold."""
    missing_pct = df.isna().sum() / len(df) * 100
    cols_to_drop = missing_pct[missing_pct > threshold].index.tolist()
    df = df.drop(columns=cols_to_drop)
    print(f"Dropped {len(cols_to_drop)} columns with >{threshold}% missing values")
    print(f"Shape after dropping high-missing columns: {df.shape}")
    return df

In [386]:
# replace NHANES special codes with NaN
df_depression_data = replace_nhanes_special_codes(df_depression_data)
# drop columns with more than 50% missing values
df_depression_data = drop_high_missing_columns(df_depression_data, threshold=45)
# drop id column, unnecessary flag column and target leakage columns
df_depression_data.drop(columns=['SEQN', 'SMAQUEX2', 'FNQ530', 'FNQ540'], inplace=True, errors='ignore')
df_depression_data

Replaced 8750 special code values with NaN
Dropped 146 columns with >45% missing values
Shape after dropping high-missing columns: (4167, 119)


Unnamed: 0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,...,SMQ681,SMQ846,SMQ851,SMQ863,SMDANY,SMQ020,WHD010,WHD020,WHD050,WHQ070
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,70.0,220.0,220.0,2.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,60.0,150.0,165.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,68.0,200.0,180.0,2.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,2.0,...,1.0,1.0,2.0,2.0,1.0,1.0,69.0,220.0,265.0,2.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,61.0,228.0,235.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,67.0,204.0,234.0,2.0
4163,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,67.0,245.0,290.0,1.0
4164,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,62.0,169.0,172.0,1.0
4165,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,1.0,65.0,180.0,180.0,1.0


In [387]:
numerical_cols = df_depression_data.select_dtypes(include=['number', 'float64', 'int64', 'datetime']).columns.tolist()
categorical_cols = df_depression_data.select_dtypes(include=['category', 'object']).columns.tolist()

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

Numerical columns: ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100', 'ACD010A', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ151', 'BPQ020', 'BPQ080', 'BPQ101D', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DIQ160', 'DIQ180', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSDAD', 'FSD151', 'FSQ165', 'FSD162', 'HIQ011', 'HIQ210', 'HOD051', 'HSQ590', 'HUQ010', 'HUQ030', 'HUQ042', 'HUQ055', 'HUQ090', 'INDFMMPI', 'INDFMMPC', 'INQ300', 'KIQ022', 'KIQ005', 'KIQ042', 'KIQ044', 'KIQ481', 'MCQ010', 'AGQ030', 'MCQ053', 'MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ160P', 'MCQ160L', 'MCQ550', 'MCQ560', 'MCQ220', 'OSQ230', 'OCD150', 'OHQ845', 'OHQ620', 'OHQ630', 'OHQ640', 'OHQ660', 'OHQ670', 'OHQ680', 'PAD790Q', 'PAD800', 'PAD810Q', 'PAD680', 'RHQ010', 'RHQ031', 'RHD280', 'RXQ510', 'RXQ033', 'RXQ050', 

### Identify categorical and numerical columns

In [388]:
def identify_column_types(df, target_cols):
    """
    Separate columns into ordinal, nominal, binary, object (categorical) and numerical types.
    """
    exclude_cols = set(target_cols)
    object_cols = df.select_dtypes(include='object').columns.tolist()
    exclude_cols.update(object_cols)

    # Known ordinal and nominal columns based on NHANES codebooks
    ordinal_cols = [
        'HUQ010',
        'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 
        'FNQ470', 'FNQ480', 'FNQ490',
        'FNQ510', 'FNQ520',
        'FSD032A', 'FSD032B', 'FSD032C', 'FSDAD',
        'ALQ121', 'ALQ142',
        'OHQ845', 'OHQ620', 'OHQ630', 'OHQ640', 'OHQ660', 'OHQ670', 'OHQ680',
        'DIQ010', 'INDFMMPC', 'KIQ005', 'SMD460',
    ]
    
    nominal_cols = [
        'HUQ042', 'OCD150', 'HUQ030',
    ]

    # Identify binary columns (Yes=1, No=2)
    binary_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        unique_vals = set(df[col].dropna().unique())
        if col not in exclude_cols and unique_vals and unique_vals.issubset({1, 2, 1.0, 2.0}):
            binary_cols.append(col)

    known_categorical = set(ordinal_cols + nominal_cols + binary_cols)

    # Build numerical list
    numerical_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        if col not in known_categorical and col not in exclude_cols:
            numerical_cols.append(col)
    
    return ordinal_cols, nominal_cols, binary_cols, numerical_cols, object_cols

In [389]:
(ordinal_cols, nominal_cols, 
 binary_cols, numerical_cols, 
 object_cols) = identify_column_types(
    df_depression_data,
    target_cols=['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100'])

In [390]:
print(f"Ordinal columns: {len(ordinal_cols)}")
print(f"Nominal columns: {len(nominal_cols)}")
print(f"Binary columns: {len(binary_cols)}")
print(f"Numerical columns: {len(numerical_cols)}")
print(f"Object columns (excluded): {len(object_cols)}")
print(f"Total columns identified: {len(ordinal_cols) + len(nominal_cols) + len(binary_cols) + len(numerical_cols) + len(object_cols)}")

Ordinal columns: 28
Nominal columns: 3
Binary columns: 53
Numerical columns: 15
Object columns (excluded): 6
Total columns identified: 105


In [391]:
for col in object_cols:
    print(f"Unique values in {col}: {df_depression_data[col].unique()[:10]}")  # Print first 10 unique values

Unique values in PAD790U: ["b'W'" "b'D'" "b''" "b'Y'" "b'M'"]
Unique values in PAD810U: ["b'W'" "b''" "b'M'" "b'D'" "b'Y'"]
Unique values in SLQ300: ["b'21:00'" "b'00:00'" "b'03:00'" "b'22:30'" "b'23:30'" "b'22:00'"
 "b'02:00'" "b'23:00'" "b''" "b'07:30'"]
Unique values in SLQ310: ["b'06:00'" "b'08:00'" "b'07:30'" "b'10:30'" "b'06:30'" "b'10:00'"
 "b'07:00'" "b'05:30'" "b''" "b'14:30'"]
Unique values in SLQ320: ["b'21:00'" "b'00:00'" "b'03:00'" "b'22:30'" "b'00:30'" "b'01:00'"
 "b'22:00'" "b'02:00'" "b'23:00'" "b'20:30'"]
Unique values in SLQ330: ["b'06:00'" "b'09:00'" "b'08:00'" "b'10:30'" "b'07:30'" "b'07:00'"
 "b'10:00'" "b'11:00'" "b'08:30'" "b'09:30'"]


### Handle object columns
Transform times into decimal hours and add them to numerical columns.

In [392]:
def time_to_hours(time_str):
    """Convert 'HH:MM' to decimal hours (0-24)"""
    if pd.isna(time_str):
        return np.nan
    try:
        h, m = time_str.split(':')
        return int(h) + int(m) / 60
    except:
        return np.nan

df_depression_data['SLQ300'] = df_depression_data['SLQ300'].apply(time_to_hours)
df_depression_data['SLQ310'] = df_depression_data['SLQ310'].apply(time_to_hours)
df_depression_data['SLQ320'] = df_depression_data['SLQ320'].apply(time_to_hours)
df_depression_data['SLQ330'] = df_depression_data['SLQ330'].apply(time_to_hours)

numerical_cols.extend(['SLQ300', 'SLQ310', 'SLQ320', 'SLQ330'])

### Special ordinal cases
- Most ordinal features have 1: Yes, 2: No. We need to convert 2 to 0 for proper encoding
- Some features (e.g., DIQ010) are ordinal but unordered: 1: Yes, 2: No, 3: Borderline
- Some features are in reverse order (1: every day, 2: nearly every day, etc.), when it should be higher number = higher frequency/severity

In [393]:
# Convert 2 to 0 for binary columns
for col in binary_cols:
    df_depression_data[col] = df_depression_data[col].replace({2: 0, 2.0: 0})

In [394]:
# for some ordinal features, higher value = less frequent/severe => should be reversed
reverse_ordinal = {
    # FNQ510: 1=daily → 5=never (should be: higher = more frequent)
    'FNQ510': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},  # daily→4, weekly→3, monthly→2, few times→1, never→0
    
    # OHQ620-OHQ680: 1=very often → 5=never (should be: higher = more problems)
    'OHQ620': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
    'OHQ630': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
    'OHQ640': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
    'OHQ660': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
    'OHQ670': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
    'OHQ680': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
    
    # FSD032A-C: 1=often true → 3=never true (should be: higher = more food insecurity)
    'FSD032A': {1: 2, 2: 1, 3: 0},
    'FSD032B': {1: 2, 2: 1, 3: 0},
    'FSD032C': {1: 2, 2: 1, 3: 0},

    # ALQ121, ALQ142: 0=Never, 1=Every day → 10=1-2 times a year (should be: higher = more frequent)
    'ALQ121': {0: 0, 10: 1, 9: 2, 8: 3, 7: 4, 6: 5, 5: 6, 4: 7, 3: 8, 2: 9, 1: 10},
    'ALQ142': {0: 0, 10: 1, 9: 2, 8: 3, 7: 4, 6: 5, 5: 6, 4: 7, 3: 8, 2: 9, 1: 10},
}

for col, mapping in reverse_ordinal.items():
    df_depression_data[col] = df_depression_data[col].map(mapping)

In [395]:
# DIQ010: 1=Yes, 2=No, 3=Borderline
# Correct order: No(0) → Borderline(1) → Yes(2)
diq010_remap = {2: 0, 3: 1, 1: 2}

# FNQ520: 1=a little, 2=a lot, 3=somewhere in between
# Correct order: little(0) → between(1) → a lot(2)
fnq520_remap = {1: 0, 3: 1, 2: 2}

df_depression_data['DIQ010'] = df_depression_data['DIQ010'].map(diq010_remap)
df_depression_data['FNQ520'] = df_depression_data['FNQ520'].map(fnq520_remap)

### Shift ordinal columns to 0-indexed

In [396]:
def shift_to_zero_indexed(df, columns):
    """
    Shift ordinal categorical columns that start at 1 to start at 0.
    
    Args:
        df: DataFrame to modify
        columns: List of columns to check and shift
    
    Returns:
        df: Modified DataFrame
        shifted_cols: List of columns that were shifted
    """
    shifted_cols = []
    
    for col in columns:
        if col not in df.columns:
            continue
            
        # Find minimum value, ignoring NaNs
        min_val = df[col].dropna().min()
        
        # If minimum value is 1 (or higher), shift down
        if min_val >= 1:
            df[col] = df[col] - min_val
            shifted_cols.append(col)
    
    print(f"Shifted {len(shifted_cols)} columns to 0-indexed")
    return df, shifted_cols

In [397]:
df_depression_data, shifted_cols = shift_to_zero_indexed(df_depression_data, ordinal_cols)

Shifted 13 columns to 0-indexed


### Preprocessing pipeline

In [398]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

In [399]:
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ]), numerical_cols),
    
    ('ord', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]), ordinal_cols),
    
    ('nom', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), nominal_cols),
    
    ('bin', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]), binary_cols),
])

In [400]:
target_cols = ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
               'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 
               'DPQ090', 'DPQ100']
y = df_depression_data[target_cols].copy()
X = df_depression_data.drop(columns=target_cols)

In [401]:
preprocessor.fit_transform(X)



array([[ 1. ,  0. ,  1. , ...,  0. ,  1. ,  0. ],
       [-1. ,  0.5, -1. , ...,  0. ,  0. ,  1. ],
       [ 0. , -1. , -1. , ...,  0. ,  1. ,  0. ],
       ...,
       [ 4. ,  0. , -0.5, ...,  0. ,  1. ,  1. ],
       [ 0. ,  0. , -1.5, ...,  0. ,  1. ,  1. ],
       [ 1. ,  0. ,  1. , ...,  0. ,  1. ,  1. ]], shape=(4167, 106))