# Generate stratified folds for cross-validation

In [1]:
suppressMessages(library(limma))
suppressMessages(library(dplyr))
suppressMessages(library(tidyverse))

In [2]:
# Regress the null-model covariates out of the expression matrix and z-score
# the residuals, using parameters estimated on the training samples only.
#
# The null model is v$design without its "Schizo" column. Coefficients and the
# per-feature residual mean / sd are estimated on the training samples and then
# re-used unchanged for the test samples.
#
# Args:
#   v: list-like voom result; v$E, v$weights and v$design are read.
#   train_indices: selector (names or positions) applied to the columns of
#       v$E / v$weights and to the rows of v$design for the training samples.
#   test_indices: same kind of selector for the test samples.
#
# Returns:
#   list(residuals_train_norm, residuals_test_norm); each is a matrix with one
#   row per row of v$E and one column per selected sample.
sz_residuals <- function(v, train_indices, test_indices)
{
                    # colnames() is used because names() is NULL for a matrix
                    # design, which would silently keep zero columns
    keep_cols <- !(colnames(v$design) %in% c("Schizo"))
    null_model <- v$design[, keep_cols, drop = FALSE]
                    # Give the weights the same dimnames as the expression so
                    # both can be subset with the same selectors
    dimnames(v$weights) <- dimnames(v$E)
                    # Extract log2(CPM + 0.5) normalized expression (train)
    expression_train <- v$E[, train_indices, drop = FALSE]
    null_model_train <- null_model[train_indices, , drop = FALSE]
    weights_train <- v$weights[, train_indices, drop = FALSE]

    fit_train <- lmFit(expression_train, design = null_model_train,
                       weights = weights_train)
                    # Calculate residuals from training data
    residuals_train <- expression_train -
        (fit_train$coefficients %*% t(null_model_train))
    residuals_train_sd <- apply(residuals_train, 1, sd)
    residuals_train_mean <- apply(residuals_train, 1, mean)
                    # Normalize residuals; the length-nrow vectors recycle down
                    # each column, so every row is centred and scaled by its
                    # own training mean / sd
    residuals_train_norm <- (residuals_train - residuals_train_mean) /
        residuals_train_sd
                    # Extract log2(CPM + 0.5) normalized expression (test)
    expression_test <- v$E[, test_indices, drop = FALSE]
    null_model_test <- null_model[test_indices, , drop = FALSE]
                    # Apply training fit to test data and normalize with the
                    # training mean / sd
    residuals_test <- expression_test -
        (fit_train$coefficients %*% t(null_model_test))
    residuals_test_norm <- (residuals_test - residuals_train_mean) /
        residuals_train_sd

    list(residuals_train_norm = residuals_train_norm,
         residuals_test_norm = residuals_test_norm)
}

In [3]:
# Build a BrNum-keyed lookup table from a saved voom object.
#
# The .RData file is expected to define `v`. The result has one row per row of
# v$targets: row names are the BrNum values, column `BrNum` repeats them, and
# column `rnum` holds the original row names of v$targets.
read_brnum_voom_output <- function(input_filename)
{
                    # load() creates `v` inside this function's environment
    load(input_filename)
    lookup <- v$targets[, "BrNum", drop = FALSE]
                    # Keep the original row names before they are replaced
    lookup[["rnum"]] <- rownames(lookup)
    rownames(lookup) <- lookup[["BrNum"]]
    lookup
}

In [4]:
# Return the BrNum lookup table for one feature type.
#
# Reads ../../differential_expression/_m/<feature>/voomSVA.RData through
# read_brnum_voom_output() and relabels its two columns as BrNum / caudate.
get_index_table <- function(feature)
{
    voom_path <- paste0('../../differential_expression/_m/', feature,'/voomSVA.RData')
    index_table <- read_brnum_voom_output(voom_path)
                    # Second column (the voom row names) is exposed as 'caudate'
    colnames(index_table) <- c('BrNum', 'caudate')
    index_table
}

In [5]:
# Write the train / test matrices and labels of one cross-validation fold.
#
# Creates <feature>/Fold-<fold_number - 1>/ and writes four tab-separated
# files into it, in this order: Y_train.csv, X_train.csv, Y_test.csv,
# X_test.csv. Y files hold the Dx column of v$targets with BrNum as row names;
# X files hold the normalized residuals from sz_residuals() with BrNum as
# column names.
#
# Args:
#   feature: output directory name (also the feature type).
#   fold_number: 1-based fold number; the directory suffix is 0-based.
#   v: voom result passed on to sz_residuals(); v$targets is read here.
#   train_indices: row selector into v$targets for the training samples.
#   test_indices: row selector into v$targets for the test samples.
#
# Returns:
#   NULL, invisibly; called for its side effects.
write_fold <- function(feature, fold_number, v, train_indices, test_indices)
{
    print(fold_number)
    fold_dir <- paste0(feature, "/", "Fold-", fold_number - 1)
                    # recursive + quiet so a rerun, or a missing parent
                    # directory, does not warn or fail here
    dir.create(fold_dir, recursive = TRUE, showWarnings = FALSE)

    res_df <- sz_residuals(v, train_indices, test_indices)
    partitions <- list(
        train = list(indices = train_indices, x = res_df$residuals_train_norm),
        test = list(indices = test_indices, x = res_df$residuals_test_norm))

    for (part in names(partitions))
    {
        idx <- partitions[[part]]$indices
        brnum <- as.character(v$targets[idx, 'BrNum'])
                    # col.names=NA adds a blank header cell above the row names
        write.table(v$targets[idx, 'Dx', drop=FALSE],
                    file=paste0(fold_dir, '/', 'Y_', part, '.csv'),
                    row.names=brnum,
                    sep="\t", quote=FALSE, col.names=NA)
                    # Header here has no cell for the row-name column, as in
                    # the original output, so existing readers keep working
        write.table(partitions[[part]]$x,
                    file=paste0(fold_dir, '/', 'X_', part, '.csv'),
                    col.names=brnum,
                    sep="\t", quote=FALSE)
    }
    invisible(NULL)
}


In [6]:
# Split the samples of one feature type into cross-validation folds and write
# every fold to disk.
#
# Loads `v` from ../../differential_expression/_m/<feature>/voomSVA.RData,
# shuffles sample positions within each (Dx, Sex) group, cuts the shuffled
# positions into n_folds ranges and calls write_fold() once per fold.
#
# Args:
#   n_folds: number of folds (passed to cut() as the number of intervals).
#   feature: feature type; selects the input file and names the output dir.
#
# Returns:
#   NULL, invisibly; called for its side effects. The shuffle uses the current
#   random number generator state, so call set.seed() first for a repeatable split.
process_voom_output <- function(n_folds, feature)
{
                    # Get indexes
    index_table <- get_index_table(feature)
                    # Load phenotypes for stratification (defines `v`)
    fn <- paste0('../../differential_expression/_m/', feature,'/voomSVA.RData')
    load(fn)
                    # Subset for disorder and sex
    pheno <- subset(v$targets, select=c(Dx, Sex, BrNum))
                    # merge() re-sorts rows by BrNum, so every row position
                    # computed below refers to `dd`, not to index_table
    dd <- merge(pheno, index_table, by='BrNum')
    n_examples <- nrow(dd)
    dd['index'] <- seq_len(n_examples)
                    # Randomize samples within each (Dx, Sex) group.
                    # index[sample.int(...)] instead of sample(index): for a
                    # group of size one, sample(k) would draw from 1:k
                    # NOTE(review): per-fold group counts still depend on how
                    # the groups are spread over the BrNum-sorted rows --
                    # confirm this gives the balance intended
    out <- dd %>%
        group_by(Dx, Sex) %>%
        mutate(strat_index = index[sample.int(length(index))]) %>%
        ungroup()
    shuffled_indices <- out$strat_index
                    # Cut folds
    folds <- cut(shuffled_indices, breaks=n_folds, labels=FALSE)

    dir.create(feature, showWarnings = FALSE)
    for (ii in seq_len(n_folds))
    {
        train_rows <- which(folds != ii)
        test_rows <- which(folds == ii)

        print(feature)

                    # Look sample names up in `dd`, the table the fold
                    # positions were computed on
        write_fold(feature, fold_number=ii, v=v,
                   train_indices=as.character(dd[train_rows, 'caudate']),
                   test_indices=as.character(dd[test_rows, 'caudate']))
    }
}

In [None]:
# Generate the 10-fold split for the 'genes' feature type
feature <- 'genes'
process_voom_output(n_folds = 10, feature = feature)