In [36]:
import os.path
import pandas as pd
import ppscore as pps

In [37]:
# Load the tab-separated survey export; the first row supplies the column names.
depression_df = pd.read_csv(
    filepath_or_buffer="data.tsv",
    delimiter="\t",
    header=0,
)

### Add Somatic Depression Column

In [38]:
# Columns that drive the SOMATICDEP flag.
somatic_dep_cols = [
    "AD_MDEA3",  # appetite
    "AD_MDEA4",  # sleep
    "AD_MDEA6",  # fatigue
]

# Row-wise conditions: every listed column equals 1 / at least one equals 2.
# The two conditions can never hold for the same row.
all_1 = (depression_df[somatic_dep_cols] == 1).all(axis="columns")
any_2 = (depression_df[somatic_dep_cols] == 2).any(axis="columns")

# Start from the -9 placeholder, then overwrite it with 0 where any column
# equals 2 and with 1 where all columns equal 1; other rows keep -9.
depression_df["SOMATICDEP"] = (
    pd.Series(-9, index=depression_df.index).mask(any_2, 0).mask(all_1, 1)
)

### Choose Relevant Columns

In [39]:
# Age-category column used for grouping. "CATAG3" and "CATAG7" are the other
# candidates that can be substituted here.
age_col = "CATAG2"

# Candidate explanatory columns kept for the analysis.
# fmt: off
potential_causes_cols = [
    "NEWRACE2", "ANYINDEX", "MJANDCOKE", "ILLICITDRUGUSE", "LSYRILLICIT",
    "COKECRACK", "OTHERILLICIT", "MARJLTYR", "MJCOKELY", "COCCRKLY",
    "MJGT12MO", "COCGT12MO", "ANYGT12MO", "ALCFMFPB", "IREDUC2",
    "EDU_DUMMY", "INCOME", "INCOME_R", "POVERTY", "IRPRVHLT",
    "WORKFORCE", "EMPSTAT4", "REVERSEPOP", "MOVESPY2", "CACHAR",
    "CATYPE", "CRIMEHIST", "ANYSDRUG", "ANYATTACK", "ANYTHEFT",
    "NUMARREST", "HEALTH2", "SCHDSICK", "SCHDSKIP", "TXLCAD",
    "DSTNCALM", "DSTTIRE", "DSTSITST", "DSTDEPRS", "DSTCHEER",
    "DSTNRVOS", "YOWRSATP", "YOWRSPLN", "ADWRDLOT", "ADWRSTHK",
    "YO_MDEA4", "ADWRSLEP", "ADWRSMOR", "YO_MDEA3", "ADWRELES",
    "IRMARIT", "NOMARR2", "RKIDSHH", "MARRIED", "CHILDRENINHOME",
]
# fmt: on

# Columns the dataset is later partitioned by.
split_cols = ["IRSEX", age_col, "SOMATICDEP"]

# Keep only the target column, the candidate causes and the split columns,
# in that order.
depression_df = depression_df[
    ["DEPRESSIONINDEX", *potential_causes_cols, *split_cols]
]

In [40]:
# Keep only the rows whose DEPRESSIONINDEX is not the -9 code.
depression_df = depression_df.loc[depression_df["DEPRESSIONINDEX"].ne(-9)]

### Preliminary Analysis
Find the N significantly different factors that influence the depression level, using Pearson correlation, Spearman correlation and the Predictive Power Score (PPS).

In [41]:
def analyse(df):
    """Build three pairwise association matrices for the columns of ``df``.

    The name of each stage ("Pearson", "Spearman", "PPS") is printed right
    before that stage runs.

    Args:
        df: DataFrame whose columns are compared with one another.

    Returns:
        A ``(pearson_df, spearman_df, pps_df)`` tuple. The first two are the
        results of ``df.corr`` with the respective method. ``pps_df`` is the
        ``ppscore`` column of ``pps.matrix(df)`` pivoted into a wide frame
        with the ``y`` values as rows and the ``x`` values as columns.
    """
    correlation_frames = []
    for method in ("pearson", "spearman"):
        print(method.capitalize())
        correlation_frames.append(df.corr(method))
    pearson_df, spearman_df = correlation_frames

    print("PPS")
    long_scores = pps.matrix(df)[["x", "y", "ppscore"]]
    pps_df = long_scores.pivot(index="y", columns="x", values="ppscore")
    # pivot leaves "y"/"x" behind as axis names; clear them.
    pps_df.columns.name = None
    pps_df.index.name = None

    return pearson_df, spearman_df, pps_df

In [42]:
# Run the Pearson, Spearman and PPS analyses on the whole filtered dataset.
pearson_df, spearman_df, pps_df = analyse(depression_df)

Pearson
Spearman
PPS


### Analysis Performed For Different Groups

In [43]:
def split_by_col(df, col):
    """Partition ``df`` into one sub-frame per distinct value of ``col``.

    Args:
        df: DataFrame to partition.
        col: Name of the column whose distinct values define the groups.

    Returns:
        A dict mapping ``f"{value}"`` to the rows of ``df`` holding that
        value, with the index reset to ``0..n-1``. Keys follow the sorted
        order of the distinct values. Rows whose value is missing are
        collected under the string form of that missing value (``"nan"``
        for a float NaN).
    """
    values = df[col]
    categories = values.unique()
    categories.sort()
    dfs = {}
    for category in categories:
        if pd.isna(category):
            # NaN never compares equal to itself, so an equality mask would
            # select nothing; pick the missing rows explicitly instead.
            mask = values.isna()
        else:
            mask = values == category
        dfs[f"{category}"] = df[mask].reset_index(drop=True)
    return dfs


def split_by_cols(df, cols):
    """Partition ``df`` separately by each column named in ``cols``.

    Args:
        df: DataFrame to partition.
        cols: Iterable of column names.

    Returns:
        A dict mapping each column name to the result of
        ``split_by_col(df, column)``.
    """
    return {col: split_by_col(df, col) for col in cols}

In [44]:
# Partition the dataset once per column in split_cols; the result maps
# column name -> {category label -> sub-frame}.
depression_dfs = split_by_cols(depression_df, split_cols)