In [170]:
import pandas as pd
import ppscore as pps
import numpy as np

In [171]:
depression_df = pd.read_csv("data.tsv", sep="\t", header=0)

### Add Somatic Depression Column

In [172]:
somatic_dep_cols = [
    "AD_MDEA3",  # apetite
    "AD_MDEA4",  # sleep
    "AD_MDEA6",  # fatigue
]
depression_df["SOMATICDEP"] = -9
any_2 = depression_df[somatic_dep_cols].eq(2).any(axis=1)
all_1 = depression_df[somatic_dep_cols].eq(1).all(axis=1)
depression_df.loc[any_2, "SOMATICDEP"] = 0
depression_df.loc[all_1, "SOMATICDEP"] = 1

### Choose Relevant Columns

In [173]:
age_col = ["CATAG2", "CATAG3", "CATAG7"][0]

# fmt: off
categorical_nominal_cols = [
    "NEWRACE2", "MJANDCOKE", "ILLICITDRUGUSE",  "LSYRILLICIT", "COKECRACK", 
    "OTHERILLICIT", "MJCOKELY", "COCCRKLY", "MJGT12MO", "COCGT12MO", "ALCFMFPB", 
    "ANYGT12MO", "IRPRVHLT", "WORKFORCE", "EMPSTAT4", "CACHAR", "CATYPE", "CRIMEHIST",
    "ANYSDRUG", "ANYATTACK", "ANYTHEFT", "TXLCAD", "YOWRSATP", "YOWRSPLN", "ADWRDLOT",
    "ADWRSTHK", "YO_MDEA4", "ADWRSLEP", "YO_MDEA3", "ADWRELES", "IRMARIT", "RKIDSHH",
    "MARRIED", "CHILDRENINHOME", "SOMATICDEP", "IRSEX",
]

categorical_ordinal_cols = [
    "ANYINDEX", "IREDUC2", "INCOME", "INCOME_R", "POVERTY", "REVERSEPOP", 
    "MOVESPY2", "NUMARREST", "HEALTH2", "DSTNCALM", "DSTTIRE", "DSTSITST",
    "DSTDEPRS", "DSTCHEER", "DSTNRVOS", "NOMARR2", "DEPRESSIONINDEX",
    age_col
]
numerical_discrete_cols = ["SCHDSICK"]
numerical_continuous_cols = []
# fmt: on


depression_df = depression_df[
    categorical_nominal_cols
    + categorical_ordinal_cols
    + numerical_discrete_cols
    + numerical_continuous_cols
]

In [174]:
def swap_to_nan(df, cols, range):
    df = df.copy()
    lr, ur = range
    df[cols] = df[cols].where((df[cols] >= lr) & (df[cols] <= ur), np.nan)
    return df


depression_df = depression_df[depression_df["DEPRESSIONINDEX"] != -9]  # Remove unknown
depression_df.reset_index(drop=True, inplace=True)

depression_df = swap_to_nan(depression_df, categorical_ordinal_cols, (0, 20))
depression_df = swap_to_nan(depression_df, numerical_discrete_cols, (0, 30))
depression_df = swap_to_nan(depression_df, numerical_continuous_cols, (0, 0)) # No range
depression_df

Unnamed: 0,NEWRACE2,MJANDCOKE,ILLICITDRUGUSE,LSYRILLICIT,COKECRACK,OTHERILLICIT,MJCOKELY,COCCRKLY,MJGT12MO,COCGT12MO,...,DSTNCALM,DSTTIRE,DSTSITST,DSTDEPRS,DSTCHEER,DSTNRVOS,NOMARR2,DEPRESSIONINDEX,CATAG2,SCHDSICK
0,7,1,1,1,1,0,1,0,1,1,...,,,,,5.0,4.0,,0,2,
1,1,1,1,0,0,0,0,0,1,0,...,,,,,4.0,4.0,,0,2,
2,1,0,0,0,0,0,0,0,0,0,...,,,,,5.0,5.0,1.0,0,3,
3,1,1,1,1,0,1,1,0,1,0,...,,,,,5.0,4.0,,0,2,
4,1,1,1,0,0,0,0,0,1,0,...,,,,,4.0,3.0,1.0,0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36967,1,0,0,0,0,0,0,0,0,0,...,,,,,5.0,3.0,,0,2,0.0
36968,2,0,0,0,0,0,0,0,0,0,...,,,,,5.0,5.0,,0,2,0.0
36969,1,0,1,0,0,1,0,0,0,0,...,,,,,5.0,3.0,,0,2,2.0
36970,1,1,1,0,1,1,0,0,1,1,...,,,,,5.0,4.0,1.0,0,2,


### Preliminary Analysis
Find N significantly different factors influencing the depression level.

DEPRESSIONINDEX - categorical ordinal variable 

In [175]:
def analyse(df):
    print("Pearson")
    pearson_df = df.corr("pearson")

    print("Spearman")
    spearman_df = df.corr("spearman")

    print("PPS")
    pps_df = pps.matrix(df)[["x", "y", "ppscore"]].pivot(
        columns="x", index="y", values="ppscore"
    )
    pps_df.index.name = None
    pps_df.columns.name = None

    return pearson_df, spearman_df, pps_df

In [176]:
pearson_df, spearman_df, pps_df = analyse(depression_df)

Pearson
Spearman
PPS


In [177]:
N = 8
COL = "DEPRESSIONINDEX"
print(pearson_df[COL].drop(COL).nlargest(N))
print(spearman_df[COL].drop(COL).nlargest(N))
print(pps_df[COL].drop(COL).nlargest(N))

SOMATICDEP        0.683601
OTHERILLICIT      0.183496
ILLICITDRUGUSE    0.182700
MJANDCOKE         0.149675
IRSEX             0.148019
LSYRILLICIT       0.136487
ANYINDEX          0.130616
HEALTH2           0.112902
Name: DEPRESSIONINDEX, dtype: float64
SOMATICDEP        0.693303
OTHERILLICIT      0.177845
ILLICITDRUGUSE    0.177730
MJANDCOKE         0.144724
IRSEX             0.140869
ANYGT12MO         0.133392
LSYRILLICIT       0.132218
ANYINDEX          0.108115
Name: DEPRESSIONINDEX, dtype: float64
ADWRDLOT      0.121851
ADWRSTHK      0.121824
ADWRSLEP      0.118432
ADWRELES      0.112131
SOMATICDEP    0.103422
DSTCHEER      0.098245
DSTNRVOS      0.035293
IREDUC2       0.003888
Name: DEPRESSIONINDEX, dtype: float64


### Analysis Performed For Different Groups

In [178]:
def split_by_col(df, col):
    categories = df[col].unique()
    categories.sort()
    dfs = {}
    for category in categories:
        dfs[f"{category}"] = df[df[col] == category].reset_index(drop=True)
    return dfs


def split_by_cols(df, cols):
    dfs = {}
    for col in cols:
        dfs[col] = split_by_col(df, col)
    return dfs

In [24]:
depression_dfs = split_by_cols(depression_df, ["IRSEX", age_col])