In [None]:
#%%appyter init
from appyter import magic
# Hand appyter a callable that returns this notebook's global namespace:
# `globals` is bound as the lambda's default argument and invoked when called.
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown
# GMT File Augmentation
<span style="font-size:18px;">This Appyter takes as input a GMT file; it then adds genes to gene sets based
on co-expression or co-occurrence matrices, and finally outputs an augmented GMT file for download.</span>

In [None]:
import json
import os
import time

import pandas as pd
import requests
from IPython.display import display, HTML
from tqdm import tqdm

In [None]:
%%appyter hide_code
{# Declares the 'settings' section; the input fields in this cell attach to it via section='settings'. #}
{% do SectionField(
    title='Input Settings',
    subtitle='Upload a GMT file, select a gene-gene correlation matrix, and choose the number of augmented genes to add to each gene set.',
    name='settings',
    img='settings.png',
) %}

{# File input for the GMT library to augment. Defaults to 'test.gmt', whose example entry points at
   Enrichr's VirusMINT library in text mode. #}
{% set gmt_file = FileField(
    name='gmt_file',
    label='GMT File (.gmt or .txt)',
    default='test.gmt',
    description='Upload a GMT file for augmentation. The GMT file format is a tab delimited file format that describes gene sets.',
    examples={
        'test.gmt': 'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=VirusMINT',
    },
    section='settings',
)%}

{# Choice of similarity resource. `choices` maps the label shown to the user to a short identifier;
   later cells compare augmentation_method.value against "proteomics" to pick the local-matrix
   code path, otherwise the Geneshot API path is used. #}
{% set augmentation_method = ChoiceField(
    name='augmentation_method',
    label='Resource for gene set augmentation',
    description='Select a method for augmenting gene sets. The options are described in depth here: https://maayanlab.cloud/geneshot/help.html',
    choices={
        'ARCHS4 co-expression':'coexpression',
        'Proteomics co-expression':'proteomics',
        'Enrichr co-occurrence':'enrichr',
        'Tagger co-occurrence':'tagger',
        'GeneRIF co-occurrence':'generif',
    },
    default= 'Proteomics co-expression',
    section='settings',
    
)%}

{# Integer input bounded to 50-200 (default 100); later cells use it as the upper slice bound
   when selecting the top-ranked genes. #}
{% set num_genes = IntField(
    name='num_genes',
    label='Number of augmented genes to add per set',
    description='Select the number of augmented genes to append to each gene set [50-200].',
    min= 50,
    max=200,
    default= 100,
    section='settings',
)%}

{# Boolean toggle (default True) controlling whether the output gene sets keep their original genes
   alongside the augmented ones. #}
{% set keep_original = BoolField(
    name='keep_original',
    label='Include original genes from each gene set?',
    description='Toggle whether each gene set should include the original genes or just the augmented genes.',
    default= True,
    section='settings',
)%}

In [None]:
%%appyter code_exec
{% if augmentation_method.value != "proteomics" %}
def geneshot(gene_list):
    """Return genes most similar to ``gene_list`` according to the Geneshot API.

    Posts the gene list and the selected similarity resource to the Geneshot
    ``associate`` endpoint, ranks the returned genes by their ``simScore``
    (highest first) and returns the top-ranked genes that are not already in
    ``gene_list``.

    Parameters
    ----------
    gene_list : list of str
        Gene identifiers of the gene set to augment.

    Returns
    -------
    list of str
        Up to ``num_genes`` new genes, ordered by decreasing score.

    Raises
    ------
    requests.HTTPError
        If the Geneshot API answers with an error status code.
    """
    GENESHOT_URL = 'https://maayanlab.cloud/geneshot/api/associate'
    payload = {
      "gene_list": gene_list,
      "similarity": "{{augmentation_method}}"
    }
    response = requests.post(GENESHOT_URL, json=payload)
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError further down.
    response.raise_for_status()

    data = response.json()
    scores = pd.DataFrame.from_dict({k:v['simScore'] for k,v in data['association'].items()},
                       orient = 'index',
                       columns = ['Score'])
    ranked = scores.sort_values(by=['Score'], ascending = False).dropna()
    # Remove the query genes *before* truncating, so that the requested number
    # of new genes is returned whenever enough candidates are available.
    novel = ranked.loc[~ranked.index.isin(gene_list)]
    augmented_genes = novel[0:{{num_genes}}].index.tolist()

    return augmented_genes

{% else %}
# Gene-gene proteomics correlation matrix, indexed by the first column of the TSV.
proteomics_mat = pd.read_csv('https://appyters.maayanlab.cloud/storage/GMT_Augmentation_Appyter/proteomics_correlation.tsv.gz',
                             sep = '\t',
                             index_col = 0)

def proteomics_augmentation(gene_list):
    """Return genes most correlated, on average, with ``gene_list``.

    Selects the rows of ``proteomics_mat`` whose index is in ``gene_list``,
    averages them column-wise and returns the highest-scoring column labels
    that are not themselves part of ``gene_list`` (mirroring ``geneshot``,
    which also excludes the query genes).

    Parameters
    ----------
    gene_list : list of str
        Gene identifiers of the gene set to augment.

    Returns
    -------
    list of str
        Up to ``num_genes`` new genes ordered by decreasing mean correlation,
        or an empty list when none of the query genes is in the matrix.
    """
    in_matrix = proteomics_mat.index.isin(gene_list)
    if not in_matrix.any():
        return []

    mean_corr = proteomics_mat.loc[in_matrix].mean(axis=0)
    # Query genes presumably correlate strongly with themselves; drop them so
    # only genuinely new genes are reported and no duplicates are introduced.
    mean_corr = mean_corr.loc[~mean_corr.index.isin(gene_list)]
    return mean_corr.sort_values(ascending=False)[0:{{num_genes}}].index.tolist()
    
{% endif %}

In [None]:
%%appyter code_exec
# Parse the GMT file into {term: [genes]}. Each GMT line is tab-delimited:
# term, description, then the genes. Lines without any gene are skipped.
with open({{gmt_file}}, 'r') as gmt_file:
    genesets = [line.strip() for line in gmt_file.read().split('\n')]
    # Split on single tabs so the term is extracted correctly whether or not
    # the description column is empty.
    genesetlibrary = {fields[0]:fields[2:]
                      for fields in (line.split("\t") for line in genesets)
                      if len(fields) > 2}

In [None]:
%%appyter markdown
<span style="font-size:18px;">We iterate through each gene set in the gene set library and send the genes for 
augmentation using the **{{augmentation_method}}** gene-gene similarity matrix. If none of the genes from the gene 
set are present in the matrix, they cannot be augmented and the gene set is omitted from the final output.</span>

In [None]:
%%appyter code_exec
augmented_genesetlibrary = {}
for term,genes in tqdm(genesetlibrary.items()):
    {% if augmentation_method.value != 'proteomics' %}
    augmented_list = geneshot(genes)
    time.sleep(0.5)
    {% else %}
    augmented_list = proteomics_augmentation(genes)
    {% endif %}
    if augmented_list != []:
        {% if keep_original %}
        augmented_genesetlibrary[term] = sorted(genes+augmented_list)
        {% else %}
        augmented_genesetlibrary[term] = sorted(augmented_list)
        {% endif %}
    else:
        print(f"Failed to augment {term}")

In [None]:
%%appyter code_exec
# Name the output after the input file: strip only the final extension so that
# names or paths containing additional dots are preserved.
filename = os.path.splitext({{gmt_file}})[0]+'_augmented.gmt'
with open(filename, 'w') as f:
    for term,genes in augmented_genesetlibrary.items():
        # GMT row: term, empty description column, then the genes.
        f.write('\t'.join([term, '', *genes]) + '\n')

In [None]:
%%appyter markdown
<span style="font-size:18px;">Below is a link to the augmented gene set library created using
**{{augmentation_method}}** with **{{num_genes}}** augmented genes per set{% if keep_original.value == True %}
in addition to the original genes{% endif %}.</span>

In [None]:
# Render a clickable link to the augmented GMT file, styled at 18px.
link_style = '<style>a{font-size:18px}</style>'
link_anchor = f'<a target="_blank" href="{filename}">{filename}</a>'
display(HTML(link_style + link_anchor))