In [None]:
from os.path import join
import numpy as np
import pandas as pd

In [None]:
# Project root, given relative to the current working directory.
PROJ_DIR = "./"
# Folder under the project root from which the input tables are read.
DATA_DIR = join(PROJ_DIR, "dset")

In [None]:
# Load the two tab-separated input tables from DATA_DIR:
# the participants table and the preprocessed group-level table.
participant_df = pd.read_csv(
    join(DATA_DIR, "participants.tsv"),
    sep="\t",
)
preprocessed_df = pd.read_csv(
    join(DATA_DIR, "sub-group_task-rest_desc-1S2StTesthabenula_table.txt"),
    sep="\t",
)


In [None]:
# Show the participants table exactly as loaded from participants.tsv.
participant_df

In [None]:
# Subject IDs that appear in the preprocessed table (one entry per distinct ID).
valid_subjects = preprocessed_df["Subj"].unique()

# Boolean mask over participant_df rows whose participant_id is one of those IDs.
# NOTE(review): this assumes "Subj" and "participant_id" use the same ID format
# (e.g. both with or without a "sub-" prefix) — confirm against the data.
in_preprocessed = participant_df["participant_id"].isin(valid_subjects)

# Participants that also have preprocessed data.
preproc_participant_df = participant_df[in_preprocessed]


### First, let's count how many participants have data for each combination of ABIDE phenotypic variables

In [None]:
# Columns carried forward from the participants table, grouped by
# column-name prefix: identifier/group/age, then SRS, BRIEF, ADI-R, CBCL.
measures = (
    ["participant_id", "DX_GROUP", "AGE_AT_SCAN"]
    + [
        "SRS_COMMUNICATION",
        "SRS_COMMUNICATION_RAW",
        "SRS_MOTIVATION",
        "SRS_MOTIVATION_RAW",
    ]
    + ["BRIEF_GEC_T", "BRIEF_MI_T", "BRIEF_BRI_T"]
    + ["ADI_R_RRB_TOTAL_C", "ADI_R_SOCIAL_TOTAL_A"]
    + ["CBCL_6-18_TOTAL_PROBLEM_T"]
)

# Subset to those columns, then turn every -9999 into NaN so that pandas
# treats it as missing (presumably -9999 is the dataset's missing-data code).
measure_df = preproc_participant_df[measures].replace(-9999, np.nan)

In [None]:
# Show the selected columns after -9999 has been replaced with NaN.
measure_df

In [None]:
# For each SRS subscale, build a combined column: take the first-listed column
# and fill its missing entries from the matching *_RAW column.
srs_sources = {
    "SRS_COMMUNICATION_COMB": ("SRS_COMMUNICATION", "SRS_COMMUNICATION_RAW"),
    "SRS_MOTIVATION_COMB": ("SRS_MOTIVATION", "SRS_MOTIVATION_RAW"),
}
for combined_name, (primary_name, fallback_name) in srs_sources.items():
    measure_df[combined_name] = measure_df[primary_name].combine_first(
        measure_df[fallback_name]
    )

# Keep the ID, group and age columns, the two combined SRS columns, and the
# BRIEF / ADI-R / CBCL columns; the four source SRS columns are dropped.
columns_to_keep = [
    "participant_id",
    "DX_GROUP",
    "AGE_AT_SCAN",
    "SRS_COMMUNICATION_COMB",
    "SRS_MOTIVATION_COMB",
    "BRIEF_GEC_T",
    "BRIEF_MI_T",
    "BRIEF_BRI_T",
    "ADI_R_RRB_TOTAL_C",
    "ADI_R_SOCIAL_TOTAL_A",
    "CBCL_6-18_TOTAL_PROBLEM_T",
]
measure_df = measure_df[columns_to_keep]


In [None]:
# Print the distinct non-missing values of every column except the ID column.
for col in measure_df.columns:
    if col == "participant_id":
        continue  # IDs are not a measure; nothing to summarise
    unique_vals = measure_df[col].dropna().unique()
    print(f"{col} — {len(unique_vals)} unique values:\n{unique_vals}\n")


In [None]:
# Show the table after the SRS columns have been combined and the source columns dropped.
measure_df

In [None]:
# Identifier / group / age columns that must be present for every participant.
required_info_cols = [
    "participant_id",
    "DX_GROUP",
    "AGE_AT_SCAN",
]

# Score columns that must all be non-missing.
# CBCL_6-18_TOTAL_PROBLEM_T is not in this list, so it is not required.
required_measure_cols = [
    "SRS_COMMUNICATION_COMB",
    "SRS_MOTIVATION_COMB",
    "BRIEF_GEC_T",
    "BRIEF_MI_T",
    "BRIEF_BRI_T",
    "ADI_R_RRB_TOTAL_C",
    "ADI_R_SOCIAL_TOTAL_A",
]

# Columns we require data for (same contents and order as before).
cols_required = required_info_cols + required_measure_cols

# Drop rows with any missing values in those columns
complete_df = measure_df.dropna(subset=cols_required)

# The number of measures in the messages is derived from the list so that it
# cannot drift from what the filter actually enforces (it used to be a
# hard-coded "4" although seven score columns are required).
n_measures = len(required_measure_cols)

# Overall count
print(f"Total participants with all {n_measures} measures: {complete_df.shape[0]}")

# Count by DX_GROUP
counts_by_group = complete_df["DX_GROUP"].value_counts()
print(f"\nParticipants with all {n_measures} measures by DX_GROUP:")
print(counts_by_group)

### Let's create the CSV files that will be used in the PLSC
#### 1. Phenotypic data
#### 2. Covariate data
#### 3. RSFC data

In [None]:
# Phenotype score columns, grouped by column-name prefix.
srs_scores = ["SRS_COMMUNICATION_COMB", "SRS_MOTIVATION_COMB"]
brief_scores = ["BRIEF_GEC_T", "BRIEF_MI_T", "BRIEF_BRI_T"]
adi_r_scores = ["ADI_R_RRB_TOTAL_C", "ADI_R_SOCIAL_TOTAL_A"]
phenotypes = [*srs_scores, *brief_scores, *adi_r_scores]

# Score columns only, for the complete-case participants. participant_id is not
# selected here, so rows are identified solely by complete_df's index.
phenotype_df = complete_df[phenotypes]

print(phenotype_df)

## TODO: double-check that the participant ID column should be left out, then save as a CSV