In [308]:
import os.path
import pandas as pd

In [316]:
# Load the tab-separated data file; the first row supplies the column names.
depression_df = pd.read_table("data.tsv", header=0)

### Add Depression Indicator Column

In [317]:
# TODO: Add depression index empty handling??
depression_metric_cols = [
    "DEPRESSIONINDEX",  # 0 - No depression to 9 - High depression
    "DEPEPISODE",  # MDE lifetime
    "MDELASTYR",  # MDE last year
    "ANYTXRXMDE",  # treatment or medication last year
]
# TODO: Check this logic once again
# DEPINDICATOR takes one of three values:
#   1  -> at least one metric column is >= 1
#   0  -> no metric is >= 1, but at least one metric column equals 0
#   -9 -> neither of the above (the default)
metric_values = depression_df[depression_metric_cols]
# NOTE: despite the "all_" prefix, both masks are row-wise *any* checks.
all_0 = (metric_values == 0).any(axis=1)
all_1 = (metric_values >= 1).any(axis=1)
depression_df["DEPINDICATOR"] = -9
# Order matters: the ">= 1" assignment runs last, so it overrides the 0
# assigned to rows that match both masks.
depression_df.loc[all_0, "DEPINDICATOR"] = 0
depression_df.loc[all_1, "DEPINDICATOR"] = 1

### Add Somatic Depression Column

In [318]:
# TODO: Check if there are more columns that can be used to identify somatic depression
somatic_dep_cols = [
    "AD_MDEA3",  # appetite
    "AD_MDEA4",  # sleep
    "AD_MDEA6",  # fatigue
]
# SOMATICDEP takes one of three values:
#   1  -> every somatic column equals 1
#   0  -> every somatic column equals 2
#   -9 -> any other combination (the default)
somatic_values = depression_df[somatic_dep_cols]
all_1 = (somatic_values == 1).all(axis=1)
all_2 = (somatic_values == 2).all(axis=1)
depression_df["SOMATICDEP"] = -9
depression_df.loc[all_1, "SOMATICDEP"] = 1
depression_df.loc[all_2, "SOMATICDEP"] = 0

### Choose Relevant Columns

In [319]:
# Column used for the age split; change the index below to switch to one of
# the other listed options.
age_col_options = ["CATAG2", "CATAG3", "CATAG7"]
age_col = age_col_options[0]

# fmt: off
potential_causes_cols = [
    "NEWRACE2", "ANYINDEX", "MJANDCOKE", "ILLICITDRUGUSE", "LSYRILLICIT", "COKECRACK", "OTHERILLICIT",
    "MARJLTYR", "MJCOKELY", "COCCRKLY", "MJGT12MO", "COCGT12MO", "ANYGT12MO", "ALCFMFPB", "IREDUC2", "EDU_DUMMY",
    "INCOME", "INCOME_R", "POVERTY", "IRPRVHLT", "WORKFORCE", "EMPSTAT4", "REVERSEPOP", "MOVESPY2", "CACHAR",
    "CATYPE", "CRIMEHIST", "ANYSDRUG", "ANYATTACK", "ANYTHEFT", "NUMARREST", "HEALTH2", "SCHDSICK", "SCHDSKIP",
    "TXLCAD", "DSTNCALM", "DSTTIRE", "DSTSITST", "DSTDEPRS", "DSTCHEER", "DSTNRVOS", "YOWRSATP", "YOWRSPLN",
    "ADWRDLOT", "ADWRSTHK", "YO_MDEA4", "ADWRSLEP", "ADWRSMOR", "YO_MDEA3", "ADWRELES", "IRMARIT", "NOMARR2",
    "RKIDSHH", "MARRIED", "CHILDRENINHOME",
]
# fmt: on

# Columns the data is later partitioned on.
split_cols = ["IRSEX", age_col, "SOMATICDEP"]

# Keep only the indicator, the candidate-cause columns and the split columns,
# in that order. This rebinds `depression_df`, so every other column is
# dropped from here on.
selected_cols = ["DEPINDICATOR", *potential_causes_cols, *split_cols]
depression_df = depression_df[selected_cols]

### Preliminary Analysis
Find the N factors with the most significant influence on the depression level.

In [320]:
def get_corr(df, key, method, frac=1.0):
    """Return the correlation matrix of ``df``, cached on disk as a CSV file.

    On a cache miss the matrix is computed with ``DataFrame.corr(method)``,
    written to a CSV file in the current working directory, and then read
    back; on a cache hit only the file is read. Either way the returned frame
    comes from ``pd.read_csv(..., index_col=0)``.

    Args:
        df: DataFrame whose columns are correlated.
        key: Label identifying the data set; becomes part of the cache file name.
        method: Correlation method forwarded to ``DataFrame.corr``; also part
            of the cache file name.
        frac: Fraction of rows to sample (without a fixed seed) before
            computing the correlation. The default ``1.0`` uses every row.

    Returns:
        The correlation matrix as a DataFrame read from the cache file.
    """
    # The cache name must depend on `frac`; otherwise a matrix computed on a
    # sample would be silently returned for a later call with a different
    # fraction. The default fraction keeps the original file name so existing
    # caches stay valid.
    suffix = "" if frac == 1.0 else f"_frac{frac}"
    fn = f"corr_{key}_{method}{suffix}.csv"
    if not os.path.exists(fn):
        # Sampling 100% of the rows would only shuffle them, so skip it.
        sampled = df if frac == 1.0 else df.sample(frac=frac)
        sampled.corr(method).to_csv(fn)
    return pd.read_csv(fn, index_col=0)


def analyse(df):
    """Placeholder for the per-frame analysis; currently does nothing and returns None."""

### Analysis Performed For Different Groups

In [321]:
def split_by_col(df, col):
    """Partition ``df`` into one sub-frame per distinct value of ``col``.

    Args:
        df: DataFrame to partition.
        col: Name of the column whose distinct values define the partitions.

    Returns:
        A dict mapping each distinct value, formatted as a string, to the rows
        of ``df`` equal to that value, with the index reset. Keys are inserted
        in ascending order of the values.
    """
    column = df[col]
    levels = column.unique()
    levels.sort()  # in-place ascending sort fixes the key order
    return {
        f"{level}": df[column == level].reset_index(drop=True)
        for level in levels
    }


def split_by_cols(df, cols):
    """Partition ``df`` separately by each column in ``cols``.

    Args:
        df: DataFrame to partition.
        cols: Iterable of column names; each is split independently.

    Returns:
        A dict mapping each column name to the result of
        ``split_by_col(df, column_name)``.
    """
    return {name: split_by_col(df, name) for name in cols}

In [322]:
# One group of sub-frames per split column, keyed first by column name.
depression_dfs = split_by_cols(df=depression_df, cols=split_cols)