Run the following cell on the main machine

In [None]:
# Upload the local InferCNV inputs to S3. Run this cell on the main machine.
# InferCNV is used for the samples that Numbat had a hard time separating.
local_seurat="/home/ubuntu/data/cxcr4-pdac/seurat"
s3_input="s3://cxcr4-pdac/infercnv/input"

for i in 22 24 29 34 35 36; do
    pat="GM${i}"
    echo "${pat}"
    # One Seurat object per sample
    aws s3 cp "${local_seurat}/${pat}/${pat}_cb.rds" "${s3_input}/${pat}_cb.rds"
done

# Reference object shared by all samples
aws s3 cp "${local_seurat}/reference.RDS" "${s3_input}/reference.RDS"

Run the following cells on the worker instances

In [None]:
# Worker setup: create the infercnv conda env, then fetch the input data.
# Prerequisite: copy infercnv.env.yml and 06b_InferCNV.R onto this instance
# manually before running this cell.

# --- Environment and gene-position file (taken from setup.sh) ---
mamba env create -f infercnv.env.yml
INFERCNV="/home/ubuntu/InstallTemp/infercnv"
S3_INPUT="s3://cxcr4-pdac/infercnv/input"
GENE_POS="refdata-gex-GRCh38-2020-A_gen_pos.txt"
mkdir -p "${INFERCNV}"
aws s3 cp "${S3_INPUT}/${GENE_POS}" "${INFERCNV}/${GENE_POS}"

# --- Per-sample Seurat objects and the shared reference ---
SEURAT="/home/ubuntu/data/cxcr4-pdac/seurat/"
mkdir -p "${SEURAT}"

for i in 22 24 29 34 35 36; do
    pat="GM${i}"
    mkdir -p "${SEURAT}/${pat}"
    echo "${pat}"
    aws s3 cp "${S3_INPUT}/${pat}_cb.rds" "${SEURAT}/${pat}/${pat}_cb.rds"
done
aws s3 cp "${S3_INPUT}/reference.RDS" "${SEURAT}/reference.RDS"


In [None]:
conda activate infercnv

# Run 06b_InferCNV.R once per sample, writing a timestamped log per sample.
# A `done.checkpoint` file marks a sample as finished so that re-running this
# cell skips it. A sample is only checkpointed when Rscript exits with status 0
# AND the tail of the log does not contain "Execution halted".
projectdir="/home/ubuntu/projects/edsouza-summer2023/cxcr4-pdac"
for i in 22 24 29 34 35 36
do
    if [[ $i -ne 1000000 ]] # Change this line for the sample depending on which worker instance you're on
    then
        sample="GM${i}"
        outdir="${projectdir}/data/infercnv/${sample}"
        logfile="${outdir}/${sample}_infercnv_log.txt"
        donefile="${outdir}/done.checkpoint"

        mkdir -p "${outdir}"

        if [ ! -f "${donefile}" ]
        then
            echo "Starting sample ${sample}. Log at ${logfile}" | ts '[%Y-%m-%d %H:%M:%S]'
            Rscript "${projectdir}/06b_InferCNV.R" "${sample}" 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' > "${logfile}"
            # Exit status of Rscript itself (first pipeline stage); `$?` would
            # report the status of `ts`. Must be read right after the pipeline.
            rscript_status=${PIPESTATUS[0]}

            # The exit status catches runs that die without R printing
            # "Execution halted" (e.g. the process being killed by a signal).
            if [[ ${rscript_status} -eq 0 ]] && ! grep -q "Execution halted" <(tail -n 5 "${logfile}")
            then
                echo "Sample ${sample} finished successfully"  | ts '[%Y-%m-%d %H:%M:%S]'
                touch "${donefile}"
            else
                echo "Sample ${sample} finished unsuccessfully (Rscript exit status ${rscript_status})" | ts '[%Y-%m-%d %H:%M:%S]'
            fi
        else
            echo "Sample ${sample} already finished; skipping" | ts '[%Y-%m-%d %H:%M:%S]'
        fi
    fi
done

In [None]:
# Step 1 - run this line on the remote (worker) machine where InferCNV was run.
# It uploads the whole local infercnv directory to S3.
# NOTE(review): the InferCNV run cell writes its output under
# /home/ubuntu/projects/edsouza-summer2023/cxcr4-pdac/data/infercnv/ - confirm
# that this is the same directory as the one uploaded here (e.g. via a symlink).
aws s3 cp /home/ubuntu/data/cxcr4-pdac/infercnv/ s3://cxcr4-pdac/infercnv/ --recursive

# Step 2 - run this line on the local analysis machine.
# It downloads the same S3 prefix into the local infercnv directory.
aws s3 cp s3://cxcr4-pdac/infercnv/ /home/ubuntu/data/cxcr4-pdac/infercnv/ --recursive

# Threshold filtering

In [35]:
# Use the `numbat` env for this part

library(numbat)
library(dplyr)
library(Seurat)
library(ggplot2)
library(stringr)
library(patchwork)
library(gifski)

# Negated `%in%`: TRUE for elements of the left operand absent from the right.
`%notin%` <- Negate(`%in%`)

# Emit `m` through message(), prefixed with the current system time.
message_ <- function(m) {
    stamped <- paste(Sys.time(), ': ', m)
    message(stamped)
}

################################################################################

# Read-in numbat output
# Load the InferCNV-annotated Seurat object for one sample and keep only the
# cells whose `is_ref` metadata value is 'not_ref'.
#
# Args:
#   pat: sample name (e.g. 'GM22'); selects the sample sub-directory and the
#        `<pat>_infercnv.rds` file inside it.
#
# Returns:
#   A list with two elements:
#     'seurat'  - the Seurat object restricted to non-reference cells
#     'gif_dir' - path of the `create_gif` directory for this sample
#                 (the directory is not created here)
produce_seurat <- function(pat) {
    projectdir <- file.path('/home/ubuntu/data/cxcr4-pdac')
    outdir <- file.path(projectdir, 'infercnv', pat)
    seu_file <- file.path(outdir, paste0(pat, '_infercnv.rds'))
    gif_dir <- file.path(outdir, 'create_gif')

    seu <- readRDS(seu_file)

    # Subset the whole object rather than overwriting only `seu@meta.data`:
    # replacing the metadata slot alone leaves assays and reductions with the
    # full set of cells, so the metadata no longer lines up with the object.
    # `which()` drops cells whose `is_ref` is NA.
    keep_cells <- rownames(seu@meta.data)[which(seu@meta.data$is_ref == 'not_ref')]
    seu <- subset(seu, cells = keep_cells)

    list('seurat' = seu, 'gif_dir' = gif_dir)
}

# Per-cluster percentage of cells whose `score_col` value exceeds `thresh`.
# Only clusters with prevalence above `min_prevalence` (in %) are kept; the
# result has columns `seurat_clusters`, `prevalence` and `threshold`.
.cluster_prevalence <- function(meta, score_col, thresh, min_prevalence) {
    data.frame(seurat_clusters = meta$seurat_clusters,
               selected = meta[[score_col]] > thresh) %>%
        group_by(seurat_clusters) %>%
        summarize(prevalence = 100 * sum(selected) / n()) %>%
        filter(prevalence > min_prevalence) %>%
        mutate(threshold = thresh)
}

# Row-bind `.cluster_prevalence()` over a grid running from the minimum to the
# maximum observed score in steps of `step`.
.prevalence_sweep <- function(meta, score_col, min_prevalence, step = 0.01) {
    grid <- seq(min(meta[[score_col]]), max(meta[[score_col]]), step)
    per_threshold <- lapply(grid, function(thresh) {
        .cluster_prevalence(meta, score_col, thresh, min_prevalence)
    })
    dplyr::bind_rows(per_threshold)
}

# DimPlot of a logical metadata column: FALSE in grey, TRUE in red.
.selection_dimplot <- function(seu, column, title) {
    DimPlot(seu, group.by = column, order = TRUE) +
        scale_color_manual(values = c('FALSE' = 'grey', 'TRUE' = 'red')) +
        ggtitle(title)
}

# Prevalence-vs-threshold line plot (one line per cluster) with dashed guides
# at the current threshold and at `min_prevalence`.
.prevalence_plot <- function(prev_df, thresh, min_prevalence, title) {
    # Both axes start at 0 and extend to the data maximum. The original code
    # also called xlim()/ylim(), but those scales were immediately replaced by
    # the scale_*_continuous() calls below, so they had no effect.
    ggplot(prev_df, aes(threshold, prevalence, color = seurat_clusters)) +
        geom_line() +
        geom_vline(xintercept = thresh, color = 'red', linetype = 'dashed') +
        geom_hline(yintercept = min_prevalence, color = 'black', linetype = 'dashed') +
        scale_x_continuous(expand = c(0, 0), limits = c(0, NA)) +
        scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
        theme_classic() + theme(legend.position = "none") +
        ylab('% of malignant cells in cluster') +
        ggtitle(title)
}

# PNG path(s) for the given threshold(s); shared by the plotting loop and the
# GIF step so that both always agree on the file names.
.threshold_png_path <- function(outdir, samplename, thresholds, min_prevalence) {
    file.path(outdir, paste0(samplename, '_infercnv_thresh_',
                             str_pad(thresholds * 100, 2, pad = '0'),
                             '_min_prev_', min_prevalence,
                             '.png'))
}

# Plot, for each threshold, which cells exceed it on the raw
# (`proportion_cnv_avg`) and scaled (`proportion_scaled_cnv_avg`) scores, before
# and after restricting to clusters whose share of selected cells is above
# `min_prevalence`.
#
# Args:
#   seu: Seurat object with a single `orig.ident` and metadata columns
#        `proportion_cnv_avg`, `proportion_scaled_cnv_avg`, `seurat_clusters`.
#   outdir: directory for the PNG (and GIF) files; created if missing.
#   thresholds: threshold value(s) to plot.
#   min_prevalence: minimum % of selected cells a cluster needs for its cells
#        to be kept in the "filtered" selection.
#   create_gif: if TRUE, combine the PNGs into
#        `<sample>_infercnv_thresholds.gif` and then delete the PNGs.
#   delay: seconds per GIF frame; only used when `create_gif` is TRUE.
#   return_seurat_object: if TRUE, return `seu` with the four selection columns
#        added. Requires a single threshold and `create_gif = FALSE`.
#
# Returns:
#   The annotated Seurat object when `return_seurat_object` is TRUE; otherwise
#   (invisibly) the result of file.remove() when `create_gif` is TRUE, or NULL.
filter_infercnv_threshold <- function(seu, outdir,
                                    thresholds=seq(0,1,0.01), 
                                    min_prevalence = 10, 
                                    create_gif = FALSE, delay=0.3,
                                    return_seurat_object = FALSE) {

    if (return_seurat_object) {
        stopifnot('To return seurat object, `thresholds` must be a scalar value' = length(thresholds) == 1)
        stopifnot('To return Seurat object, `create_gif` must be FALSE' = !create_gif)
    }

    samplename <- seu@meta.data$orig.ident %>% unique()
    stopifnot('Seurat object must only have a single `orig.ident`' = length(samplename) == 1)

    message_(paste('Computing thresholds for', samplename))

    # Prevalence curves over the full observed score range; these only feed the
    # line plots, so they are computed from the metadata without modifying `seu`.
    prevalence_raw <- .prevalence_sweep(seu@meta.data, 'proportion_cnv_avg',
                                        min_prevalence)
    prevalence_scaled <- .prevalence_sweep(seu@meta.data, 'proportion_scaled_cnv_avg',
                                           min_prevalence)

    if (!dir.exists(outdir)) {
        dir.create(outdir, recursive = TRUE)
    }

    message_(paste('Creating images for', samplename))
    pic_ct <- 0
    for (thresh in thresholds) {
        # Raw score: per-cell selection, then keep only sufficiently prevalent clusters
        seu$selected_infercnv_raw <- seu$proportion_cnv_avg > thresh
        prev_df_raw <- .cluster_prevalence(seu@meta.data, 'proportion_cnv_avg',
                                           thresh, min_prevalence)
        seu$selected_filtered_infercnv_raw <- seu$selected_infercnv_raw &
            (seu$seurat_clusters %in% prev_df_raw$seurat_clusters)

        # Scaled score: same procedure
        seu$selected_infercnv_scaled <- seu$proportion_scaled_cnv_avg > thresh
        prev_df_scaled <- .cluster_prevalence(seu@meta.data, 'proportion_scaled_cnv_avg',
                                              thresh, min_prevalence)
        seu$selected_filtered_infercnv_scaled <- seu$selected_infercnv_scaled &
            (seu$seurat_clusters %in% prev_df_scaled$seurat_clusters)

        suppressMessages(suppressWarnings({
            p1a <- .selection_dimplot(seu, 'selected_infercnv_raw',
                                      paste0(samplename,
                                             ' InferCNV raw\nThreshold = ',
                                             thresh))
            p2a <- .selection_dimplot(seu, 'selected_filtered_infercnv_raw',
                                      paste0(samplename,
                                             ' InferCNV raw filtered\nThreshold = ',
                                             thresh, '; FILTERED'))
            p3a <- .prevalence_plot(prevalence_raw, thresh, min_prevalence,
                                    paste0(samplename,
                                           ' Malignant prevalence per cluster (RAW FILTERED), threshold = ',
                                           thresh))

            p1b <- .selection_dimplot(seu, 'selected_infercnv_scaled',
                                      paste0(samplename,
                                             ' InferCNV scaled\nThreshold = ',
                                             thresh))
            p2b <- .selection_dimplot(seu, 'selected_filtered_infercnv_scaled',
                                      paste0(samplename,
                                             ' InferCNV scaled filtered\nThreshold = ',
                                             thresh, '; FILTERED'))
            # Title fixed: this panel shows the scaled score, not the raw one.
            p3b <- .prevalence_plot(prevalence_scaled, thresh, min_prevalence,
                                    paste0(samplename,
                                           ' Malignant prevalence per cluster (SCALED FILTERED), threshold = ',
                                           thresh))
        }))

        # Produce plot file: raw on the left, scaled on the right
        outfile <- .threshold_png_path(outdir, samplename, thresh, min_prevalence)
        # Wrap the call directly instead of piping into suppress*(): the piped
        # form only works when the pipe evaluates its left-hand side lazily.
        suppressMessages(suppressWarnings(
            ggsave(outfile, ((p1a + p1b) / (p2a + p2b) / (p3a + p3b)),
                   width = 12, height = 25, dpi = 300,
                   units = "in", device = 'png')
        ))

        # Progress message for every tenth image
        if (pic_ct %% 10 == 0) {
            message_(paste0('Produced plot for ', outfile))
        }

        # Will only run if `thresholds` is a single value and we're not creating a gif
        if (return_seurat_object) {
            message_(paste0('Returning Seurat object for ', 
                             samplename, ' threshold ', thresh, 
                             ' and min prevalence ', min_prevalence,'%.'))
            return(seu)
        }
        pic_ct <- pic_ct + 1
    }

    if (create_gif) {
        gif_path <- file.path(outdir, paste0(samplename, '_infercnv_thresholds.gif'))
        png_files <- .threshold_png_path(outdir, samplename, thresholds, min_prevalence)
        message_(paste('Creating GIF for sample', samplename))
        gifski::gifski(png_files, gif_path, delay=delay)
        message_(paste('Created GIF at', gif_path))
        # The individual frames are no longer needed once the GIF exists
        return(invisible(file.remove(png_files)))
    }

    invisible(NULL)
}


In [None]:
### As with Numbat, we create GIFs to determine the threshold for each sample
# This allows us to avoid false negatives

In [None]:
# GM22: load the InferCNV result, then apply the chosen threshold.
seu <- produce_seurat('GM22') # 22 24 29 34 35 36

# Threshold sweep used to build the GIF (kept for reference):
# filter_infercnv_threshold(seu[['seurat']], outdir=seu[['gif_dir']],
#                         thresholds= seq(0,1,0.01), #c(0, 0.3, 0.5, 0.9, 1), 
#                         min_prevalence=10,
#                         create_gif=T, 
#                         delay=0.20  # Only matters if `create_gif` is TRUE
#                         )

# `seu` is replaced by the Seurat object carrying the selection columns.
seu <- filter_infercnv_threshold(
    seu[['seurat']],
    outdir = seu[['gif_dir']],
    thresholds = 0.39,
    min_prevalence = 75,
    create_gif = FALSE,
    return_seurat_object = TRUE
)


In [None]:
# GM24: load the InferCNV result, then apply the chosen threshold.
seu <- produce_seurat('GM24') # 22 24 29 34 35 36

# Threshold sweep used to build the GIF (kept for reference):
# filter_infercnv_threshold(seu[['seurat']], outdir=seu[['gif_dir']],
#                         thresholds= seq(0,1,0.01), #c(0, 0.3, 0.5, 0.9, 1), 
#                         min_prevalence=10,
#                         create_gif=T, 
#                         delay=0.20  # Only matters if `create_gif` is TRUE
#                         )

# `seu` is replaced by the Seurat object carrying the selection columns.
seu <- filter_infercnv_threshold(
    seu[['seurat']],
    outdir = seu[['gif_dir']],
    thresholds = 0.1,
    min_prevalence = 50,
    create_gif = FALSE,
    return_seurat_object = TRUE
)

In [None]:
# GM29: load the InferCNV result, then apply the chosen threshold.
seu <- produce_seurat('GM29') # 22 24 29 34 35 36

# Threshold sweep used to build the GIF (kept for reference):
# filter_infercnv_threshold(seu[['seurat']], outdir=seu[['gif_dir']],
#                         thresholds= seq(0,1,0.01), #c(0, 0.3, 0.5, 0.9, 1), 
#                         min_prevalence=10,
#                         create_gif=T, 
#                         delay=0.20  # Only matters if `create_gif` is TRUE
#                         )

# `seu` is replaced by the Seurat object carrying the selection columns.
seu <- filter_infercnv_threshold(
    seu[['seurat']],
    outdir = seu[['gif_dir']],
    thresholds = 0.35, # 0.18
    min_prevalence = 85,
    create_gif = FALSE,
    return_seurat_object = TRUE
)

In [None]:
# GM34: load the InferCNV result, then apply the chosen threshold.
seu <- produce_seurat('GM34') # 22 24 29 34 35 36

# Threshold sweep used to build the GIF (kept for reference):
# filter_infercnv_threshold(seu[['seurat']], outdir=seu[['gif_dir']],
#                         thresholds= seq(0,1,0.01), #c(0, 0.3, 0.5, 0.9, 1), 
#                         min_prevalence=10,
#                         create_gif=T, 
#                         delay=0.10  # Only matters if `create_gif` is TRUE
#                         )

# `seu` is replaced by the Seurat object carrying the selection columns.
seu <- filter_infercnv_threshold(
    seu[['seurat']],
    outdir = seu[['gif_dir']],
    thresholds = 0.999,
    min_prevalence = 100,
    create_gif = FALSE,
    return_seurat_object = TRUE
)

In [None]:
# GM35: load the InferCNV result, then apply the chosen threshold.
seu <- produce_seurat('GM35') # 22 24 29 34 35 36

# Threshold sweep used to build the GIF (kept for reference):
# filter_infercnv_threshold(seu[['seurat']], outdir=seu[['gif_dir']],
#                         thresholds= seq(0,1,0.01), #c(0, 0.3, 0.5, 0.9, 1), 
#                         min_prevalence=10,
#                         create_gif=T, 
#                         delay=0.20  # Only matters if `create_gif` is TRUE
#                         )

# `seu` is replaced by the Seurat object carrying the selection columns.
seu <- filter_infercnv_threshold(
    seu[['seurat']],
    outdir = seu[['gif_dir']],
    thresholds = 0.35,
    min_prevalence = 60,
    create_gif = FALSE,
    return_seurat_object = TRUE
)

In [None]:
# GM36: load the InferCNV result, then apply the chosen threshold.
seu <- produce_seurat('GM36') # 22 24 29 34 35 36

# Threshold sweep used to build the GIF (kept for reference):
# filter_infercnv_threshold(seu[['seurat']], outdir=seu[['gif_dir']],
#                         thresholds= seq(0,1,0.01), #c(0, 0.3, 0.5, 0.9, 1), 
#                         min_prevalence=10,
#                         create_gif=T, 
#                         delay=0.3  # Only matters if `create_gif` is TRUE
#                         )

# `seu` is replaced by the Seurat object carrying the selection columns.
seu <- filter_infercnv_threshold(
    seu[['seurat']],
    outdir = seu[['gif_dir']],
    thresholds = 0.35,
    min_prevalence = 91,
    create_gif = FALSE,
    return_seurat_object = TRUE
)