In [83]:
import os.path
import pandas as pd

In [84]:
depression_df = pd.read_csv("data.tsv", sep="\t", header=0)

In [85]:
# TODO: Check if there are more columns that can be used to identify somatic depression
somatic_dep_cols = [
    "AD_MDEA3",  # apetite
    "AD_MDEA4",  # sleep
    "AD_MDEA6",  # fatigue
]
all_1 = depression_df[somatic_dep_cols].eq(1).all(axis=1)
all_2 = depression_df[somatic_dep_cols].eq(2).all(axis=1)
depression_df["SOMATICDEP"] = -9
depression_df.loc[all_1, "SOMATICDEP"] = 1
depression_df.loc[all_2, "SOMATICDEP"] = 0

In [96]:
age_col = ["CATAG2", "CATAG3", "CATAG7"][0]

# fmt: off
potential_causes_cols = [
    "NEWRACE2", "ANYINDEX", "MJANDCOKE", "ILLICITDRUGUSE", "LSYRILLICIT", "COKECRACK", "OTHERILLICIT",
    "MARJLTYR", "MJCOKELY", "COCCRKLY", "MJGT12MO", "COCGT12MO", "ANYGT12MO", "ALCFMFPB", "IREDUC2", "EDU_DUMMY",
    "INCOME", "INCOME_R", "POVERTY", "IRPRVHLT", "WORKFORCE", "EMPSTAT4", "REVERSEPOP", "MOVESPY2", "CACHAR", 
    "CATYPE", "CRIMEHIST", "ANYSDRUG", "ANYATTACK", "ANYTHEFT", "NUMARREST", "HEALTH2", "SCHDSICK", "SCHDSKIP", 
    "TXLCAD", "DSTNCALM", "DSTTIRE", "DSTSITST", "DSTDEPRS", "DSTCHEER", "DSTNRVOS", "YOWRSATP", "YOWRSPLN", 
    "ADWRDLOT", "ADWRSTHK", "YO_MDEA4", "ADWRSLEP", "ADWRSMOR", "YO_MDEA3", "ADWRELES", "IRMARIT", "NOMARR2", 
    "RKIDSHH", "MARRIED", "CHILDRENINHOME"
]
# fmt: on
depression_metric_cols = [
    "DEPRESSIONINDEX",  # 0 - No depression to 9 - High depression
    "DEPEPISODE",  # MDE lifetime
    "MDELASTYR",  # MDE last year
    "ANYTXRXMDE",  # treatment or medication last year
]

# split_cols = ["IRSEX", age_col, "SOMATICDEP"]
split_cols = ["IRSEX", age_col]


depression_df = depression_df[
    potential_causes_cols + depression_metric_cols + split_cols
]

In [97]:
def split_by_cols(df, cols):
    if not cols:
        return df
    col = cols[0]
    categories = df[col].unique()
    categories.sort()
    dfs = {}
    for category in categories:
        sub_df = df[df[col] == category].reset_index(drop=True)
        dfs[f"{col}({category})"] = split_by_cols(sub_df, cols[1:])

    return dfs

In [100]:
split_depression_dfs = split_by_cols(depression_df, split_cols)

In [7]:
def get_corr(df, key, method, frac=1.0):
    fn = f"corr_{key}_{method}.csv"
    if not os.path.exists(fn):
        df.sample(frac=frac).corr(method).to_csv(fn)
    return pd.read_csv(fn, index_col=0)

In [None]:
corr_pearson_df = dict()
corr_spearman_df = dict()
for key, age_df in deppression_age_dfs.items():
    print(f"Processing age group: {key}")
    print("Processing pearson")
    corr_pearson_df[key] = get_corr(age_df, key, "pearson")
    print("Processing spearman")
    corr_spearman_df[key] = get_corr(age_df, key, "spearman")

Processing age group: 1
Processing pearson
Processing spearman
Processing age group: 2
Processing pearson
