In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import pandas as pd

import subprocess
from subprocess import Popen

import os

# Display
import IPython
from IPython.display import display, Markdown, HTML

# For Enrichr
import requests, json


In [None]:
%%appyter hide_code_exec
{# Declare the two form sections referenced by the input fields' `section=` arguments:
   one for the data upload fields and one for the Enrichr settings. #}
{% do SectionField(
    name='Data_Section',
    title='Load your Data',
    subtitle='Upload data to rank genes'
    
) %}

{% do SectionField(
    name='Enrichr_Section',
    title='Enrichment Analysis',
    subtitle='Submit up and down genes for enrichment analysis using Enrichr'
    
) %}

In [None]:
%%appyter code_exec
{# Upload field for the read-count table; the bundled GSE158298 file is both the default and the example. #}
{% set temp_readcounts_filename = FileField(
    name='temp_readcounts_filename', 
    label='Upload read counts', 
    default='GSE158298_readcounts.txt',

    examples={'GSE158298_readcounts.txt': url_for('static', filename = 'GSE158298_readcounts.txt')}, 
    description='Upload data', 
    section='Data_Section')

%}


{# Format selector for the read-count upload. Each choice maps a display label to a
   keyword-argument fragment that the loading cell splices into its pd.read_csv(...) call;
   the value "excel" is a sentinel that makes the loading cell emit pd.read_excel instead.
   The gzipped-CSV entry sets no `sep`, so it relies on read_csv's default comma separator. #}
{% set file_format = ChoiceField(
  name='file_format',
  label='File Format',
  description='Please select your file format',
  default='TSV (.tsv / .txt)',
  choices={
    'TSV (.tsv / .txt)': "sep='\\t',",
    'GZipped TSV (.tsv.gz / .txt.gz)': "sep='\\t', compression='gzip',",
    'CSV (.csv)': "sep=',',",
    'GZipped CSV (.csv.gz)': "compression='gzip',",
    'Excel Sheet 1 (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt)': "excel",
  },
  section='Data_Section',
) %}


{# Upload field for the list of negative-control sgRNAs (one per line); the file name is later
   passed to `mageck test --control-sgrna` when it is non-empty. #}
{% set negcontrol_filename = FileField(
    name='negcontrol_filename', 
    label='Upload list of control sgRNAs (one per line)', 
    default='GSE158298_negativecontrol.txt',

    examples={'GSE158298_negativecontrol.txt': url_for('static', filename = 'GSE158298_negativecontrol.txt')}, 
    description='Upload negative control sgRNAs', 
    section='Data_Section')

%}

{# Comma-separated column labels of the control samples in the read-count table.
   The description now mirrors the treatment field instead of the unrelated
   'class column name of data' text it previously carried. #}
{% set temp_control_names = StringField(
    name='temp_control_names', 
    label='Column labels for control, separated by commas', 
    default='PreInfection, PreInfection', 
    description='Column labels for control', 
    section='Data_Section')
%}

{# Comma-separated column labels of the treatment samples in the read-count table. #}
{% set temp_treatment_names = StringField(
    name='temp_treatment_names', 
    label='Column labels for treatment, separated by commas', 
    default='SARS2MOI001, SARS2MOI03', 
    description='Column labels for treatment', 
    section='Data_Section')
%}

{# Checkbox controlling whether `--paired` is added to the `mageck test` command.
   NOTE(review): the default is the string "false" rather than a boolean literal;
   confirm BoolField parses it as False. #}
{% set paired_samples = BoolField(
    name='paired_samples', 
    label='Paired samples? (make sure control and treatment sample indices are aligned)', 
    default= "false",
    description='Check if samples are paired', 
    section='Data_Section',
) 
%}

In [None]:
%%appyter code_exec


{# Integer percentage (1 to 5, default 1) of top-ranked genes sent to Enrichr for each direction. #}
{% set topk_percent = IntField(
    name='topk_percent', 
    label='Percentage of top-ranked genes to use for enrichment analysis', 
    min=1, 
    max=5, 
    default=1, 
    description='Choose the percentage of top-ranked genes to use for enrichment analysis', 
    section='Enrichr_Section')
%}

In [None]:
%%appyter code_exec

# Bind the submitted form values to plain Python variables used by the cells below.
temp_readcounts_filename = {{temp_readcounts_filename}}
negcontrol_filename = {{negcontrol_filename}}
# The sample-label fields are comma-separated strings: split on commas and trim whitespace.
temp_control_names = [name.strip() for name in {{temp_control_names}}.split(",")]
temp_treatment_names = [name.strip() for name in {{temp_treatment_names}}.split(",")]
paired_samples = {{paired_samples}}
topk_percent = {{topk_percent}}

## Reformatting to meet MAGeCK test input file specifications

#### Read counts file:
"The read count file should list the names of the sgRNA, the gene it is targeting, followed by the read counts in each sample. Each item should be separated by the tab ('\t'). A header line is optional."

Convert to tab-separated TXT file

In [None]:
%%appyter code_eval

# Load the uploaded read-count table into `data`. The template emits pd.read_excel
# when the selected format value is "excel"; otherwise it emits pd.read_csv with the
# selected format's keyword arguments spliced in.
{% if file_format.value == "excel" %}
data = pd.read_excel(
    {{ temp_readcounts_filename }},
)
{% else %}
data = pd.read_csv(
    {{ temp_readcounts_filename }},
    {# temp_readcounts_filename,#}
    {{ file_format }}
)
{% endif %}
data.head()

In [None]:
# Re-write the loaded table as a tab-separated file without the pandas index;
# this file is what gets passed to `mageck test -k`.
readcounts_filename = "readcounts.txt"

data.to_csv(readcounts_filename, sep = "\t", index = False)

#### Treatment and control sample names
In the -t/--treatment-id, -c/--control-id parameters, you can use either sample label or sample index to specify samples. If sample label is used, the labels [must] match the sample labels in the first line of the count table. For example, "HL60.final,KBM7.final".

You can also use sample index to specify samples. The index of the sample is the order it appears in the sgRNA read count file, starting from 0. The index is used in the -t/--treatment-id, -c/--control-id parameters. 

In [None]:
#",".join(list(data.columns))
# Column labels of the loaded table; the sample labels entered in the form are
# checked against this list in the next cell.
col_names = list(data.columns)

col_names

In [None]:
# Validate the user-supplied sample labels against the columns of the read-count
# table, then build the comma-separated strings used for the MAGeCK -c / -t options.
print(temp_control_names, temp_treatment_names)

temp_c = [nm.strip() for nm in temp_control_names]
temp_t = [nm.strip() for nm in temp_treatment_names]

# Fail loudly on unknown labels. Previously an unknown label only printed "Error"
# and stopped the loop, so the notebook carried on with truncated (possibly empty)
# sample lists and failed later in a much less obvious way.
missing_c = [nm for nm in temp_c if nm not in col_names]
missing_t = [nm for nm in temp_t if nm not in col_names]
if missing_c or missing_t:
    raise ValueError(
        "Sample label(s) not found in the read counts table. "
        "Missing control labels: {0}; missing treatment labels: {1}; "
        "available columns: {2}".format(missing_c, missing_t, col_names)
    )

control_names = ",".join(temp_c)
treatment_names = ",".join(temp_t)

control_names, treatment_names

#### Isolate control sgRNAs?

# Ranking sgRNAs and genes

Ranking sgRNAs and genes from read counts using MAGeCK (Li et al. 2014)


In [None]:
# Run `mageck test` once per (treatment, control) pair.
#
# Treatments and controls are matched positionally with zip(), so the two label
# lists are assumed to have the same length; surplus entries on the longer side
# are ignored.
#
# Optional flags:
#   --paired          added when the "Paired samples" box was checked
#   --control-sgrna   added when a negative-control sgRNA file was supplied
# Each run writes its results under the prefix given by -n (the treatment label).

use_negcontrol = len(negcontrol_filename) != 0
is_paired = (paired_samples == True)

# Reproduces the numbering of the progress message ("Started1" .. "Started4")
# that identified which flag combination was used.
if use_negcontrol:
    run_label = 3 if is_paired else 4
else:
    run_label = 1 if is_paired else 2

for treatment, control in zip(treatment_names.split(","), control_names.split(",")):
    mageck_cmd = ['mageck', 'test', '-k', readcounts_filename, '-t', treatment, '-c', control]
    if is_paired:
        mageck_cmd.append('--paired')
    if use_negcontrol:
        mageck_cmd += ['--control-sgrna', negcontrol_filename]
    mageck_cmd += ['-n', treatment]

    print("Started{0}: treatment: {1}, control: {2}".format(run_label, treatment, control))
    return_code = subprocess.run(mageck_cmd).returncode
    print(return_code)

    # A failed run produces no gene summary file; stop here with a clear message
    # instead of failing later when that file is read.
    if return_code != 0:
        raise RuntimeError(
            "mageck test failed with exit code {0} for treatment: {1}, control: {2}".format(
                return_code, treatment, control
            )
        )
    print("Completed: treatment: {0}, control: {1}".format(treatment, control))

# Enrichment Analysis

Enrichment analysis using Enrichr (Kuleshov et al. 2016)


In [None]:
def get_enrichr_url(gene_list, description, timeout=60):
    """Submit a gene list to Enrichr and return the URL of its results page.

    Parameters
    ----------
    gene_list : iterable of str
        Gene identifiers; they are sent newline-separated.
    description : str
        Free-text description stored with the submitted list.
    timeout : float, optional
        Seconds to wait for the Enrichr server before giving up (default 60).

    Returns
    -------
    str
        Enrichr results URL built from the ``shortId`` in the server's response.

    Raises
    ------
    Exception
        If the server answers with a non-success status code.
    """
    # Use https for the upload as well: it matches the results URL below, and a
    # redirect from http would turn the POST into a GET and drop the payload.
    ENRICHR_URL = 'https://amp.pharm.mssm.edu/Enrichr/addList'
    genes_str = '\n'.join(gene_list)
    # (None, value) tuples make requests send plain multipart form fields.
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }

    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.post(ENRICHR_URL, files=payload, timeout=timeout)
    if not response.ok:
        raise Exception('Error analyzing gene list')

    short_id = response.json()['shortId']
    return "https://amp.pharm.mssm.edu/Enrichr/enrich?dataset={}".format(short_id)

In [None]:

# For every treatment, read the MAGeCK gene summary, take the top-ranked genes in
# each direction and submit them to Enrichr, then show the result links.
for treatment in treatment_names.split(","):
    gene_fn = "{0}.gene_summary.txt".format(treatment)
    output_df = pd.read_csv(gene_fn, sep = "\t")

    # Integer arithmetic avoids float rounding; always keep at least one gene
    # (when any exist) so a small table does not lead to an empty submission.
    topk_genes = int(topk_percent * len(output_df) // 100)
    topk_genes = min(len(output_df), max(1, topk_genes))

    # Rank explicitly in both directions instead of relying on the row order
    # of the summary file for the negatively selected ("down") genes.
    neg_genes = output_df.sort_values(by=['neg|rank'])['id'].head(topk_genes).tolist()
    pos_genes = output_df.sort_values(by=['pos|rank'])['id'].head(topk_genes).tolist()

    pos_url = get_enrichr_url(pos_genes, "Top {0}% up genes for {1}".format(topk_percent, treatment))
    neg_url = get_enrichr_url(neg_genes, "Top {0}% down genes for {1}".format(topk_percent, treatment))

    display(Markdown("**Top {0} ({3}%) up genes for {1}**: <{2}>".format(topk_genes, treatment, pos_url, topk_percent)))
    display(Markdown("**Top {0} ({3}%) down genes for {1}**: <{2}>".format(topk_genes, treatment, neg_url, topk_percent)))


