# R Notebook - code to exclude children with psychopathology
### Variables of interest, element names, structnames
1. CBCL - cuttof T score > 63 (based on clinical threshold: https://aseba.org/wp-content/uploads/2019/02/cbclprofile.pdf)
    1. cbcl_scr_syn_internal_t - abcd_cbcls01
    1. cbcl_scr_syn_external_t - abcd_cbcls01
    1. cbcl_scr_syn_totprob_t - abcd_cbcls01
1. KSADS Diagnosis Parent - exclude if == 1
    1. ksads_*_p - abcd_ksad01.txt 
1. Short Social Responsiveness Scale (Not clear what cutoff to apply: Nguyen et al. 2019; short version raw cutoff of 15, but is 16 items while ABCD has 11)
     1. ssrs_p_ss_sum - abcd_mhp02.txt

In [1]:
## install dependendies
install.packages("naniar")

also installing the dependencies ‘gridExtra’, ‘visdat’, ‘viridis’, ‘UpSetR’


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [3]:
#set working directory
#setwd("~/project-brain-parcellation/")
getwd()

In [4]:
# Retrieve CBCL baseline data
cbcl <- read.table("data/input/sourcedata/ABCD3/abcd_cbcls01.txt", sep="\t", header=TRUE)
cbcl <- cbcl[which(cbcl$eventname == "baseline_year_1_arm_1"), ]
cbcl <- cbcl[ , c("subjectkey", "cbcl_scr_syn_internal_t", "cbcl_scr_syn_external_t", "cbcl_scr_syn_totprob_t")] 
names1 <- c(2:4)
cbcl[names1] <- sapply(cbcl[names1], as.numeric)
str(cbcl)

'data.frame':	11878 obs. of  4 variables:
 $ subjectkey             : chr  "NDAR_INV003RTV85" "NDAR_INV00LH735Y" "NDAR_INV01NAYMZH" "NDAR_INV030W95VP" ...
 $ cbcl_scr_syn_internal_t: num  39 50 41 46 33 41 33 50 69 54 ...
 $ cbcl_scr_syn_external_t: num  41 44 50 51 53 48 34 61 51 40 ...
 $ cbcl_scr_syn_totprob_t : num  36 41 45 46 39 45 25 59 57 44 ...


In [8]:
# Retrieve KSADS Diagnosis variable names
abcd <- read.csv("data/input/sourcedata/ABCD3/data_elements.tsv", sep="\t", header=TRUE) #all element, description and structures
abcd <- abcd[grep("abcd_ksad01", abcd$structure),]    #keep abcd_ksad01 variables only
abcd <- abcd[grep("^ksads.*_p$", abcd$element), ]     #keep rows with element that starts with ksads and ends with _p
abcd <- abcd[grep("^Diagnosis", abcd$description), ]  #keep rows with description that starts with Diagnosis
ksads_list <- abcd$element #list of all variable names with parental ksad diagnosis

# Create dataframe KSADS diagnosis items
ksads <- read.table("data/input/sourcedata/ABCD3/abcd_ksad01.txt", sep="\t", header=TRUE)
ksads <- ksads[which(ksads$eventname == "baseline_year_1_arm_1"), ]
subjectkey <- ksads$subjectkey
ksads <- subset(ksads, select = ksads_list)

In [9]:
# change value 555 and 888 to NA
ksads[(ksads == 555) | (ksads==888)] <- NA

# datatype to factor 
names2 <- c(1:145)
ksads[names2] <- lapply(ksads[names2], factor) #categorical variables

#remove colums with single value in all rows
ksads <- ksads[, sapply(ksads, nlevels) > 1]


In [10]:
df1 <- cbind(subjectkey, ksads)
str(df1)

'data.frame':	11878 obs. of  103 variables:
 $ subjectkey    : chr  "NDAR_INV005V6D2C" "NDAR_INV019DXLU4" "NDAR_INV00HEV6HB" "NDAR_INV00J52GPG" ...
 $ ksads_1_845_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_840_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_841_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_842_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_847_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_846_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_837_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_836_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_831_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_832_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_830_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_833_p : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...


In [14]:
# Retrieve SSRS items variable names
items <- read.csv("data/input/sourcedata/ABCD3/data_elements.tsv", sep="\t", header=TRUE) #all element, description and structures
items <- items[grep("^ssrs.*_p$", items$element),]    #keep abcd_ksad01 variables only
items_list <- items$element #list of all individual item names ssrs 

# Create dataframe ssrs item data
ssrs <- read.table("data/input/sourcedata/ABCD3/abcd_pssrs01.txt", sep="\t", header=TRUE)
subjectkey <- ssrs$subjectkey
ssrs1 <- subset(ssrs, select = items_list)
df2 <- cbind(subjectkey, ssrs1)
df2 <- df2[-1,] #remove first row with descriptions

# Create ssrs_sum variable
names3 <- c(2:12) #ssrs columns
df2[names3] <- sapply(df2[names3], as.numeric) #change to numeric
df2$ssrs_sum <-rowSums(df2[names3]) #create ssrs_sum 
df2 <- df2[, c("subjectkey", "ssrs_sum")]

In [15]:
# Merge ksads, cbcl and ssrs dataframes
data <- merge(cbcl, df1, by = "subjectkey")
data2 <- merge(data, df2, by = "subjectkey")
str(data2)

'data.frame':	11235 obs. of  107 variables:
 $ subjectkey             : chr  "NDAR_INV003RTV85" "NDAR_INV005V6D2C" "NDAR_INV00BD7VDC" "NDAR_INV00CY2MDM" ...
 $ cbcl_scr_syn_internal_t: num  39 39 52 52 54 34 50 45 73 50 ...
 $ cbcl_scr_syn_external_t: num  41 34 46 66 53 40 44 68 47 51 ...
 $ cbcl_scr_syn_totprob_t : num  36 32 49 58 51 44 41 62 62 48 ...
 $ ksads_1_845_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_840_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_841_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_842_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_847_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_1_846_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_837_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ ksads_2_836_p          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $

In [18]:
# cut offs for CBCL data
df3 <- data2[(data2$cbcl_scr_syn_internal_t < 63) & (data2$cbcl_scr_syn_external_t < 63) & 
             (data2$cbcl_scr_syn_totprob_t < 63),]
nrow(df3)

In [27]:
na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count$Measure <- row.names(na_count)
na_count <- na_count[which(na_count$na_count > 150), ] #Problem - many NAs!!
na_count
#ksads_3_848_p - Disruptive Mood Dysregulation Disorder (DMDD)
#ksads_6_859_p - Diagnosis - Agoraphobia Present
#ksads_6_860_p - Diagnosis - Agoraphobia (F40.00) PAST

Unnamed: 0_level_0,na_count,Measure
Unnamed: 0_level_1,<int>,<chr>
ksads_3_848_p,4614,ksads_3_848_p
ksads_6_859_p,4511,ksads_6_859_p
ksads_6_860_p,4511,ksads_6_860_p


In [33]:
df4 <- df3[ , grepl( "ksads" , names( df3 ) ) ]

ind <- apply( df4 , 1 , function(x) any( x != '0') ) # any rows that aren't 0
             
df.na<- df3[ind , ]
check <- df.na[complete.cases(df.na), ]
             
getwd()      
nrow(df.na)
nrow(check)
             
write.csv(check, file="adolescent-brain-parcellation/code/qc/subject_lists/No_psychopathology_nomissing.csv")

In [32]:

getwd() 