# Analysis pipeline for genome-wide CRISPRko screens

This analysis pipeline for genome-wide CRISPRko screens first identifies positively- and negatively-selected sgRNAs and genes using MAGeCK. A user-selected percentage of the top-ranked genes is then submitted for enrichment analysis using Enrichr.

In [None]:
#%%appyter init
from appyter import magic
# magic.init is handed a callable that returns this notebook's globals().
magic.init(lambda _=globals: _())

In [None]:
import pandas as pd

import subprocess
from subprocess import Popen

import os
import traceback

# Display
import IPython
from IPython.display import display, Markdown, HTML

# For Enrichr
import requests, json


In [None]:
%%appyter hide_code_exec
# Declare the three sections of the input form: Data_Section, Mageck_Section
# and Enrichr_Section. The input fields declared in the following cells attach
# themselves to one of these through their section='...' argument.
{% do SectionField(
    name='Data_Section',
    title='Load your Data',
    subtitle='Upload genome-wide CRISPRko data',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Mageck_Section',
    title='Preferences for identifying positively- and negatively-selected sgRNAs and genes',
    subtitle='Using MAGeCK (Li et al. 2014)',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Enrichr_Section',
    title='Preferences for enrichment analysis of top positively- and negatively-selected genes',
    subtitle='Using Enrichr (Kuleshov et al. 2016)',
    img='enrichr.png'
) %}

In [None]:
%%appyter code_exec
# Data_Section inputs: the read counts file, its file format, and the column
# labels of the treatment and control samples.
{% set readcounts_filename = FileField(
    name='readcounts_filename', 
    label='Upload read counts file', 
    default='GSE158298_readcounts.txt',

    examples={'GSE158298_readcounts.txt': url_for('static', filename = 'GSE158298_readcounts.txt')}, 
    description='Upload read counts file. Acceptable file formats are provided in the next field.', 
    section='Data_Section')

%}

# Each choice maps a display label to the keyword-argument text that the
# loading cell pastes into its pd.read_csv call; the "excel" value makes the
# loading cell call pd.read_excel instead.
{% set file_format = ChoiceField(
  name='file_format',
  label='File format',
  description='Please select the file format for your read counts file',
  default='TSV (.tsv / .txt)',
  choices={
    'TSV (.tsv / .txt)': "sep='\\t',",
    'GZipped TSV (.tsv.gz / .txt.gz)': "sep='\\t', compression='gzip',",
    'CSV (.csv)': "sep=',',",
    'GZipped CSV (.csv.gz)': "compression='gzip',",
    'Excel Sheet 1 (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt)': "excel",
  },
  section='Data_Section',
) %}

{% do DescriptionField(name = 'filter_div0', 
                       text = '<hr>', 
                       section = 'Data_Section') %}


# The two label lists are comma-separated strings. They are later split on
# commas, stripped of surrounding whitespace, and paired up by position
# (first treatment with first control, and so on).
{% set treatment_names = StringField(
    name='treatment_names', 
    label='Column labels for treatment samples', 
    default='SARS2MOI001, SARS2MOI03', 
    description='Column labels for treatment samples, separated by commas', 
    section='Data_Section')
%}

{% set control_names = StringField(
    name='control_names', 
    label='Column labels for control samples', 
    default='PreInfection, PreInfection', 
    description='Column labels for control samples, separated by commas', 
    section='Data_Section')
%}



In [None]:
%%appyter code_exec

{% set negcontrol_filename = FileField(
    name='negcontrol_filename', 
    label='Optional: Upload list of control sgRNAs', 
    default='GSE158298_negativecontrol.txt',

    examples={'GSE158298_negativecontrol.txt': url_for('static', filename = 'GSE158298_negativecontrol.txt')}, 
    description='Upload control sgRNAs as a plain text (TXT) file with one control sgRNA label per line', 
    section='Mageck_Section')

%}

{% set norm_method = ChoiceField(
  name='norm_method',
  label='Normalization method',
  description='Normalization method for ranking sgRNAs and genes. Select the \'control\' option if you have provided a list of control sgRNAs',
  default="median",
  choices={
    'none':'none',
      'median':'median',
      'total':'total',
      'control (select this option if providing list of control sgRNAs)':'control'
  },
  section='Mageck_Section',
) %}


{% set paired_samples = BoolField(
    name='paired_samples', 
    label='Paired samples?', 
    default= "false",
    description='Check if samples are paired. Please ensure control and treatment sample indices are aligned.', 
    section='Mageck_Section',
) 
%}

{% set adjustment_method = ChoiceField(
  name='adjustment_method',
  label='sgRNA-level p-value adjustment',
  default='False discovery rate',
  choices={
    'False discovery rate' : 'fdr',
      'Holm\'s method' : 'holms',
      'Pounds\' method' : 'pounds'
  },
  section='Mageck_Section',
) %}

{% set pdf_report = BoolField(
    name='pdf_report', 
    label='PDF report for selected sgRNAs and genes?', 
    default= "false",
    description='Select YES to receive the pdf report generated by MAGeCK.', 
    section='Mageck_Section',
) 
%}

In [None]:
%%appyter code_exec

# Enrichr_Section input: the percentage (1 to 5, default 1) of top-ranked
# genes that is submitted for enrichment analysis.

{% set topk_percent = IntField(
    name='topk_percent', 
    label='Percentage of top-ranked genes to use for enrichment analysis', 
    min=1, 
    max=5, 
    default=1, 
    description='Choose the percentage of top-ranked genes to use for enrichment analysis', 
    section='Enrichr_Section')
%}

In [None]:
%%appyter code_exec

# Copy the form selections into plain Python variables used by the later cells.
readcounts_filename = {{readcounts_filename}}
negcontrol_filename = {{negcontrol_filename}}
# Split the comma-separated label strings and trim whitespace around each label.
control_names = [name.strip() for name in {{control_names}}.split(",")]
treatment_names = [name.strip() for name in {{treatment_names}}.split(",")]

# MAGeCK options.
paired_samples = {{paired_samples}}
norm_method = "{{norm_method}}"
adjustment_method = "{{adjustment_method}}"
pdf_report = {{pdf_report}}

# Percentage of top-ranked genes sent to Enrichr.
topk_percent = {{topk_percent}}

## Loading read counts file + Preprocessing

For MAGeCK, the read counts file must be a tab-separated file with an optional header line. 

The treatment and control labels submitted must correspond with the column names in the read counts file; that will be double-checked below. 

In [None]:
%%appyter code_eval

# Load the uploaded read counts with the reader matching the selected format:
# pd.read_excel for the "excel" choice, otherwise pd.read_csv with the
# keyword arguments carried by the chosen file format.
{% if file_format.value == "excel" %}
data = pd.read_excel(
    {{ readcounts_filename }},
)

{% else %}
data = pd.read_csv(
    {{ readcounts_filename }},
    {{ file_format }}
)
{% endif %}

# Write the table back out as tab-separated text without the index column;
# this normalised copy is the file later passed to MAGeCK.
data.to_csv("readcounts.txt", sep = "\t", index = False)
data.head()


In [None]:
# From here on, point at the tab-separated copy written by the loading cell
# rather than at the originally uploaded file.
readcounts_filename = "readcounts.txt"

In [None]:
# Verify that every control and treatment label names a column of the read
# counts table, and warn about each one that does not.
col_names = list(data.columns)

# Check both lists in full (they may differ in length) and report each distinct
# label only once; dict.fromkeys de-duplicates while keeping the input order.
for nm in dict.fromkeys(control_names + treatment_names):
    if nm not in col_names:
        print(f"{nm} is not a column in the read counts file!")
        

## Ranking sgRNAs and genes using MAGeCK

In [None]:
%%appyter markdown

Model-based Analysis of Genome-wide CRISPR/Cas9 Knockout (MAGeCK; Li et al. 2014) identifies positively- and negatively-selected genes using a modified robust rank aggregation method. 

In [None]:
def display_download_link(treatment, summary_type):
    """Build an HTML anchor that opens a result file in a new tab.

    The file name is "<treatment>.<summary_type>" and the link is relative to
    the current directory.

    Args:
        treatment: prefix of the result file (string).
        summary_type: remainder of the file name after the dot (string).

    Returns:
        An IPython HTML object wrapping the anchor element.
    """
    file_name = ".".join([treatment, summary_type])
    link_text = "Download: " + file_name
    anchor = "<a href=\"./{}\" target='_blank'>{}</a>".format(file_name, link_text)
    return HTML(anchor)

In [None]:
# Run `mageck test` once per treatment/control pair (paired by position) and
# link to the result files, which use the treatment label as output prefix.

# MAGeCK spells Holm's method "holm"; also accept the form's "holms" spelling.
mageck_adjust_method = {'holms': 'holm'}.get(adjustment_method, adjustment_method)

for treatment, control in zip(treatment_names, control_names):
    command = ['mageck', 'test', '-k', readcounts_filename, '-t', treatment, '-c', control,
               '--norm-method', norm_method, '--adjust-method', mageck_adjust_method,
               '-n', treatment]
    if paired_samples:
        command.append('--paired')
    if negcontrol_filename:
        command.extend(['--control-sgrna', negcontrol_filename])
    if pdf_report:
        command.append("--pdf-report")

    try:
        # text=True decodes the captured output so it can be printed on failure.
        subprocess.run(command, capture_output=True, check=True, text=True)
    except (OSError, subprocess.CalledProcessError) as err:
        # OSError: the executable could not be started.
        # CalledProcessError: MAGeCK exited with a non-zero status.
        traceback.print_exc()
        # Only CalledProcessError carries the captured stderr; show it so the
        # reason for the failure is visible in the notebook.
        captured_stderr = getattr(err, 'stderr', None)
        if captured_stderr:
            print(captured_stderr)
        print(f"Error: could not successfully run analysis for {treatment} and {control}")
        # Best effort: carry on with the remaining pairs.
        continue

    display(Markdown("Treatment: {0}\n Control: {1}".format(treatment, control)))

    display(display_download_link(treatment, 'sgrna_summary.txt'))
    display(display_download_link(treatment, 'gene_summary.txt'))

    if pdf_report:
        display(display_download_link(treatment, 'pdf'))

## Enrichment Analysis using Enrichr

In [None]:
%%appyter markdown

Enrichment analysis is a statistical procedure used to identify biological terms that are over-represented in a given gene set. These include signaling pathways, molecular functions, diseases, and a wide variety of other biological terms obtained by integrating prior knowledge of gene function from multiple resources. Enrichr (Kuleshov et al. 2016) is a web-based application that allows users to perform enrichment analysis using a large collection of gene-set libraries and various interactive approaches to display enrichment results.

Here, enrichment analysis will be performed on the top {{topk_percent}}% of positively- and negatively-selected genes using Enrichr (Kuleshov et al. 2016).

In [None]:
def get_enrichr_url(gene_list, description, timeout=30):
    """Upload a gene list to Enrichr and return the URL of its results page.

    Args:
        gene_list: iterable of gene identifier strings; they are sent as one
            newline-separated string.
        description: free-text label sent along with the list.
        timeout: seconds to wait for the Enrichr server before giving up, so
            an unresponsive server cannot stall the notebook indefinitely.

    Returns:
        The string "https://maayanlab.cloud/Enrichr/enrich?dataset=<shortId>",
        where <shortId> is taken from the JSON response.

    Raises:
        Exception: if Enrichr answers with a non-OK HTTP status.
        requests.exceptions.RequestException: if the request itself fails,
            including when the timeout expires.
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
    # (None, value) tuples make requests send these as plain multipart form
    # fields without a file name.
    payload = {
        'list': (None, '\n'.join(gene_list)),
        'description': (None, description),
    }

    response = requests.post(ENRICHR_URL, files=payload, timeout=timeout)
    if not response.ok:
        raise Exception('Error analyzing gene list')

    short_id = json.loads(response.text)['shortId']
    return "https://maayanlab.cloud/Enrichr/enrich?dataset={}".format(short_id)

In [None]:
# For each treatment, submit the top-ranked negatively- and positively-selected
# genes from its MAGeCK gene summary to Enrichr and show the result links.
for treatment in treatment_names:
    gene_fn = "{0}.gene_summary.txt".format(treatment)
    try:
        output_df = pd.read_csv(gene_fn, sep="\t")
    except FileNotFoundError:
        # The MAGeCK step is best-effort, so a summary may be missing; skip
        # this treatment instead of aborting the remaining ones.
        print(f"Skipping {treatment}: {gene_fn} was not found")
        continue

    # Integer arithmetic: the float product topk_percent / 100 * n can round
    # just below a whole number and then be truncated one too low.
    topk_genes = topk_percent * len(output_df) // 100
    if topk_genes == 0:
        print(f"Skipping {treatment}: top {topk_percent}% of {len(output_df)} genes is an empty list")
        continue

    # Sort explicitly by each rank column rather than relying on the row
    # order of the summary file.
    neg_genes_df = output_df.sort_values(by=['neg|rank'])[['id']].head(topk_genes)
    pos_genes_df = output_df.sort_values(by=['pos|rank'])[['id']].head(topk_genes)

    pos_url = get_enrichr_url(pos_genes_df['id'].tolist(), "Top {0}% up genes for {1}".format(topk_percent, treatment))
    neg_url = get_enrichr_url(neg_genes_df['id'].tolist(), "Top {0}% down genes for {1}".format(topk_percent, treatment))

    display(Markdown("**Top {0} ({3}%) up genes for {1}**: <{2}>".format(topk_genes, treatment, pos_url, topk_percent)))
    display(Markdown("**Top {0} ({3}%) down genes for {1}**: <{2}>".format(topk_genes, treatment, neg_url, topk_percent)))

