In [None]:
#!/usr/bin/Rscript
#############################################################
options(stringsAsFactors = FALSE)
##############################################################

suppressMessages(
    {

    library(tidyverse)
    library(data.table)
    library(dynamicTreeCut)
    library(fastcluster)
    library(WGCNA)
    library(flashClust)
    library(lattice)
    library(latticeExtra)
    library(gridExtra)
    library(grid)
    library(ggplot2)
    library(IRdisplay)

    #enableWGCNAThreads(15)
    #change that on rf    
    allowWGCNAThreads(15)

    }
                )


# Save plot `p` twice, as "<fn>.pdf" and "<fn>.png", using width `w` and
# height `h` (passed straight through to ggsave()).
save_plot <- function(p, fn, w, h){
    write_one <- function(suffix) {
        ggsave(filename = paste0(fn, suffix), plot = p, width = w, height = h)
    }
    lapply(c(".pdf", ".png"), write_one)
    invisible(NULL)
}


# Create the output directory and make it the working directory; every
# relative path used afterwards is resolved against it.
# showWarnings = FALSE keeps re-runs quiet once the directory already exists.
dir.create('../_m/', showWarnings = FALSE)
setwd('../_m/')

In [None]:

# One residualized expression table per analysis sub-directory.
counts_path <- Sys.glob('../../../../jhpce_data/analysis/*/residualized_expression.tsv')
counts_path


# Region label = name of the directory that holds each expression table.
# Path helpers are used instead of a greedy regex, which would strip a
# directory name that itself ends in "analysis".
region <- basename(dirname(counts_path))
region

In [None]:
# One voomSVA.RData file per analysis sub-directory.
rdata_path <- Sys.glob('../../../../jhpce_data/analysis/*/voomSVA.RData')
rdata_path
#load(rdata_path)

# counts_path[i] and rdata_path[i] are used together for the same region, but
# they come from two independent globs. Fail early if a region is missing one
# of the two files, otherwise expression and metadata would be paired wrongly.
if (!identical(basename(dirname(rdata_path)), basename(dirname(counts_path)))) {
    stop("voomSVA.RData files and residualized_expression.tsv files do not ",
         "cover the same analysis directories in the same order", call. = FALSE)
}

In [None]:
# Subset an expression table to one group of samples and drop the samples and
# genes that WGCNA's goodSamplesGenes() does not flag as good.
#
# data:  expression table with genes in rows and samples in columns
#        (column names are sample IDs).
# group: metadata table whose row names are the sample IDs to keep.
#
# Returns a genes x samples matrix containing only the retained samples/genes.
process_data <- function(data, group) {
    sample_ids <- row.names(group)

    # samples x genes, the orientation goodSamplesGenes() is called with
    expr <- t(as.data.frame(data)[, sample_ids, drop = FALSE])

    # Run the QC once and reuse both masks (it used to be run twice per call).
    gsg <- goodSamplesGenes(expr, verbose = 3)

    # drop = FALSE keeps a matrix even when one sample or one gene survives.
    t(expr[gsg$goodSamples, gsg$goodGenes, drop = FALSE])
    }

# Split the expression data between groups (e.g. case vs control, male vs
# female): return the samples of `data` that belong to `group`.
#
# data:  expression table with genes in rows and samples in columns.
# group: metadata table whose row names are the sample IDs to keep.
#
# Returns a data frame with samples in rows and genes in columns.
separate_data <- function(data, group) {
    # This function used to ignore `data` and read the global `table_all`, so
    # call sites that pass a misspelled or undefined object still ran. Use
    # `data` whenever it can be evaluated; otherwise fall back to the old
    # behaviour, with a warning, rather than breaking those call sites.
    expr <- tryCatch(
        data,
        error = function(e) {
            if (!exists("table_all")) {
                stop(e)
            }
            warning("`data` could not be evaluated (", conditionMessage(e),
                    "); falling back to the global `table_all`", call. = FALSE)
            table_all
        }
    )

    samples_by_gene <- as.data.frame(t(expr))
    keep <- rownames(samples_by_gene) %in% row.names(group)
    samples_by_gene[keep, , drop = FALSE]
    }

# Of two vectors, return the one whose maximum is strictly larger;
# `y` is returned when the maxima tie.
max_vector <- function(x, y) {
    x_has_larger_max <- max(x) > max(y)
    if (x_has_larger_max) {
        return(x)
    }
    y
        }

In [None]:
# Labels for the two groups being compared, i.e. c('group_A','group_B').
# They are used in plot titles and output file names.
setLabels <- c('Male','Female')
setA <- setLabels[[1]]
setB <- setLabels[[2]]

In [None]:
# Empty accumulator for the Power == 15 scale-free fit rows.
scale_free_df <- data.frame()

In [None]:

# Scale-free topology scan over every brain region.
#
# For each region (entry i of `counts_path` / `rdata_path` / `region`) and each
# diagnosis subset (CTL, SZD, CTL_SZD) this loop:
#   1. splits the samples by sex (group A: Sex == 'M', group B: everything else),
#   2. filters samples/genes per group with process_data() and keeps the genes
#      present in both groups (inner merge on gene ID),
#   3. saves sample-clustering dendrograms as PDFs under <region>/<subset>/,
#   4. runs pickSoftThreshold() for powers 1..30 on each group,
#   5. writes the full fit tables to TSV and adds the Power == 15 rows to
#      `scale_free_df`.

# Console output of pickSoftThreshold() is diverted to this file. It is
# truncated once here and appended to inside the loop, so the log keeps every
# region/subset run instead of only the last one.
soft_threshold_log <- 'pickSoftThreshold.log'
invisible(file.create(soft_threshold_log))

for (i in seq_along(counts_path)){

    # load() is expected to bring an object `v` into the workspace; its
    # `targets` element is used as the per-sample metadata.
    load(rdata_path[i])

    dir.create(region[i], showWarnings = FALSE)

    metadata <- v$targets %>%
                        as.data.frame()

    CTL <- metadata %>%
                    filter(Dx == 'Control')
    SZD <- metadata %>%
                    filter(Dx != 'Control')
    CTL_SZD <- metadata
    sample_sets <- list(CTL = CTL, SZD = SZD, CTL_SZD = CTL_SZD)

    # The expression table does not depend on the diagnosis subset, so it is
    # read once per region rather than once per subset.
    vsd <- fread(counts_path[i], header = TRUE, na.strings = "", check.names = FALSE) %>%
                tibble::column_to_rownames(var = "feature_id") #%>% slice_head(n=1000)

    for (sample_filter in names(sample_sets)){

        save_path <- paste0(region[i],'/',sample_filter,'/')
        print(save_path)

        dir.create(save_path, showWarnings = FALSE)

        filtered_metadata <- sample_sets[[sample_filter]]

        group_a <- filtered_metadata %>%
                                    filter(Sex == 'M') #male only
        group_b <- filtered_metadata %>%
                                    filter(Sex != 'M') #female only

        # Process data for group_a and group_b
        vsd_group_a <- process_data(vsd, group_a)
        vsd_group_b <- process_data(vsd, group_b)

        # Merge the processed data (genes kept in both groups)
        table_all <- merge(vsd_group_a, vsd_group_b, by = "row.names") %>%
                                                tibble::column_to_rownames('Row.names')

        # Samples x genes tables per group and for all samples together.
        datExprA <- separate_data(table_all, group_a)
        datExprB <- separate_data(table_all, group_b)
        datExprAll <- table_all %>%
                                t() %>%
                                as.data.frame()

        sampleTreeA <- flashClust(dist(datExprA), method="average")
        sampleTreeB <- flashClust(dist(datExprB), method="average")
        sampleTreeAll <- flashClust(dist(datExprAll), method="average")


        ###save as .PDF file
        # NOTE(review): ylim=20 is a scalar here; confirm it has the intended effect.
        pdf(paste0(save_path,'1a-Dist_clust_',sample_filter,'.pdf'),height=10,width = 15)
        # finally = dev.off() closes the device even if plotting fails.
        tryCatch({
            par(mar=c(2,4,1,0), mfrow=c(1,2), oma=c(2,0,4,0), cex=0.5)
            plot(sampleTreeA, main=paste0(setA, ' ', sample_filter), sub="", cex.lab=1.2, cex.axis=1.2, cex.main=1.7, ylim=20)
            plot(sampleTreeB, main=paste0(setB, ' ',sample_filter), sub="", cex.lab=1.2, cex.axis=1.2, cex.main=1.7, ylim=20)
            par(cex=1)
            title(main="Sample Clustering Based on Distance ",outer=TRUE)
        }, finally = dev.off())


        ###save as .PDF file
        pdf(paste0(save_path,'1a-Dist_clust_all_',sample_filter,'.pdf'),height=10,width = 15)
        tryCatch({
            par(cex=0.5)
            plot(sampleTreeAll, main='', sub="", cex.lab=1.2, cex.axis=1.2, cex.main=1.7, ylim=20)
            title(main="Sample Clustering Based on Distance ",outer=TRUE)
        }, finally = dev.off())


        powers1 <- seq(1, 30, by = 1)
        sink(soft_threshold_log, append = TRUE)
        # finally = sink() restores console output even if the fit fails.
        soft_fits <- tryCatch({
            cat('### ', save_path, '\n', sep = '')
            list(
                A = pickSoftThreshold(datExprA, powerVector = powers1, RsquaredCut = 0.85, verbose = 1),
                B = pickSoftThreshold(datExprB, powerVector = powers1, RsquaredCut = 0.85, verbose = 1)
            )
        }, finally = sink())
        RpowerTableA <- soft_fits$A
        RpowerTableB <- soft_fits$B


        male_sfa <- RpowerTableA$fitIndices
        male_sfa$brain_region <- region[i]
        male_sfa$sex <- 'male'
        male_sfa$status <- sample_filter

        female_sfa <- RpowerTableB$fitIndices
        female_sfa$brain_region <- region[i]
        female_sfa$sex <- 'female'
        female_sfa$status <- sample_filter

        # Only the Power == 15 row of each group goes into the summary table.
        male_female_scalefree <- rbind(male_sfa, female_sfa) %>% filter(Power == 15)


        display('male')
        display(RpowerTableA$fitIndices)
        display('female')
        display(RpowerTableB$fitIndices)


        #save results into a dataframe
        RpowerTableA$fitIndices %>%
                fwrite(paste0(save_path,setA,'_scale_free_metrics.tsv'),quote=FALSE,sep='\t',row.names=FALSE)
        RpowerTableB$fitIndices %>%
                fwrite(paste0(save_path,setB,'_scale_free_metrics.tsv'),quote=FALSE,sep='\t',row.names=FALSE)


        # Newest rows are placed first, as before.
        scale_free_df <- rbind(male_female_scalefree,scale_free_df)


        print(dim(datExprAll))

        }

    }

In [None]:
# Show the Power == 15 fit rows accumulated in scale_free_df.
scale_free_df

In [None]:
# Close a graphics device left open by an interrupted plotting cell.
# dev.off() raises an error when only the null device (device 1) is active,
# so only call it when a real device is open.
if (dev.cur() > 1L) dev.off()

### We did not achieve a scale-free topology index (R^2 >= 0.85) for some DLPFC networks when using all samples from this brain region. Based on the DLPFC hierarchical clustering dendrograms ('_m/dlpfc/{CTL,SZD}/*.pdf'), let's remove the samples that look like outliers and check whether a scale-free network can then be achieved.

In [None]:
# Run the scale-free topology scan again, but only on the DLPFC data and with
# three outlier samples removed (R3555 among the males; R12351 and R12371
# among the females). The Power == 15 fit rows are collected in
# `scale_free_df2`; no plots or metric tables are written in this re-run.

scale_free_df2 <- data.frame()

male_outliers <- c('R3555')              # huge outlier
female_outliers <- c('R12351', 'R12371') # outliers

# Look the region up by name instead of assuming it is the second glob result.
i <- match('dlpfc', region)
if (is.na(i)) {
    stop("no 'dlpfc' entry found in `region`", call. = FALSE)
}

# load() is expected to bring an object `v` into the workspace; its `targets`
# element is used as the per-sample metadata.
load(rdata_path[i])

dir.create(region[i], showWarnings = FALSE)

metadata <- v$targets %>%
                    as.data.frame()

CTL <- metadata %>%
                filter(Dx == 'Control')
SZD <- metadata %>%
                filter(Dx != 'Control')
CTL_SZD <- metadata
sample_sets <- list(CTL = CTL, SZD = SZD, CTL_SZD = CTL_SZD)

# The expression table does not depend on the diagnosis subset: read it once.
vsd <- fread(counts_path[i], header = TRUE, na.strings = "", check.names = FALSE) %>%
            tibble::column_to_rownames(var = "feature_id") #%>% slice_head(n=1000)

for (sample_filter in names(sample_sets)){

    save_path <- paste0(region[i],'/',sample_filter,'/')
    print(save_path)

    dir.create(save_path, showWarnings = FALSE)

    filtered_metadata <- sample_sets[[sample_filter]]

    group_a <- filtered_metadata %>%
                                filter(Sex == 'M') %>% #male only
                                filter(!RNum %in% male_outliers)
    group_b <- filtered_metadata %>%
                                filter(Sex != 'M') %>% #female only
                                filter(!RNum %in% female_outliers)

    # Process data for group_a and group_b
    vsd_group_a <- process_data(vsd, group_a)
    vsd_group_b <- process_data(vsd, group_b)

    # Merge the processed data (genes kept in both groups)
    table_all <- merge(vsd_group_a, vsd_group_b, by = "row.names") %>%
                                            tibble::column_to_rownames('Row.names')

    # Samples x genes tables per group and for all samples together.
    datExprA <- separate_data(table_all, group_a)
    datExprB <- separate_data(table_all, group_b)
    datExprAll <- table_all %>%
                            t() %>%
                            as.data.frame()

    # Not plotted in this cell; the tree objects are still refreshed so they
    # describe the outlier-filtered samples.
    sampleTreeA <- flashClust(dist(datExprA), method="average")
    sampleTreeB <- flashClust(dist(datExprB), method="average")
    sampleTreeAll <- flashClust(dist(datExprAll), method="average")


    # Output is not diverted with sink() here, so the fit log is printed.
    powers1 <- seq(1, 30, by = 1)
    RpowerTableA <- pickSoftThreshold(datExprA, powerVector = powers1, RsquaredCut = 0.85, verbose = 1)
    RpowerTableB <- pickSoftThreshold(datExprB, powerVector = powers1, RsquaredCut = 0.85, verbose = 1)


    male_sfa <- RpowerTableA$fitIndices
    male_sfa$brain_region <- region[i]
    male_sfa$sex <- 'male'
    male_sfa$status <- sample_filter

    female_sfa <- RpowerTableB$fitIndices
    female_sfa$brain_region <- region[i]
    female_sfa$sex <- 'female'
    female_sfa$status <- sample_filter

    # Only the Power == 15 row of each group goes into the summary table.
    male_female_scalefree <- rbind(male_sfa, female_sfa) %>% filter(Power == 15)


    display('male')
    display(RpowerTableA$fitIndices)
    display('female')
    display(RpowerTableB$fitIndices)


    # The *_scale_free_metrics.tsv files are deliberately not rewritten here:
    #fwrite(RpowerTableA$fitIndices, paste0(save_path,setA,'_scale_free_metrics.tsv'),quote=F,sep='\t',row.names=F)
    #fwrite(RpowerTableB$fitIndices, paste0(save_path,setB,'_scale_free_metrics.tsv'),quote=F,sep='\t',row.names=F)


    # Newest rows are placed first, as before.
    scale_free_df2 <- rbind(male_female_scalefree,scale_free_df2)


    print(dim(datExprAll))

    }

In [None]:
# Show the Power == 15 fit rows collected in scale_free_df2 by the DLPFC-only re-run.
scale_free_df2

# Main Results

In [None]:
# With three DLPFC samples removed (R3555, R12351, R12371) we were able to get
# a scale-free network at Power 15 (R2 >= 0.85) on all of the DLPFC data too.
# List the key covariates of those three samples.

metadata %>%
        filter(RNum %in% c('R3555', 'R12351', 'R12371')) %>%
        select(RNum, Sex, Dx, RIN, Region)

In [None]:
# Raise the notebook's table display limits, then show every metadata column
# for the three removed DLPFC samples.
options(repr.matrix.max.rows=100, repr.matrix.max.cols=100)

metadata %>%
        filter(RNum %in% c('R3555', 'R12351', 'R12371'))

### Scale free topology index final results from Power 15

In [None]:
# Caudate & Hippocampus: Power == 15 fit rows for every region except dlpfc
scale_free_df %>%
    filter(Power == 15, brain_region != 'dlpfc')

In [None]:
# DLPFC: Power == 15 fit rows from the re-run with the outlier samples removed
scale_free_df2

In [None]:
# Record the R version and loaded package versions for reproducibility.
sessionInfo()