In [None]:
#%%appyter init
from appyter import magic
# Give appyter a zero-argument callable that returns this notebook's global
# namespace (the lambda's default argument is `globals`, which it then calls).
# NOTE(review): presumably this is what lets the `%%appyter` cell magics below
# run rendered code in the notebook's namespace -- confirm against appyter docs.
magic.init(lambda _=globals: _())

# Dataset Pair Attribute Similarity
## Import modules

In [None]:
import gzip
import os

import fisher
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import FileLink

## Load Gene Set Libraries

In [None]:
%%appyter hide_code
# Declare the 'data' form section (titled 'Upload Data'); the description and
# file fields declared further down attach to it via `section='data'`.
{% do SectionField(
    name= 'data',
    title= 'Upload Data'
)%}

In [None]:
# code adapted from maayanlab_bioinformatics (https://github.com/MaayanLab/maayanlab-bioinformatics)
def load_gmt(file):
    """Parse GMT-formatted lines into a ``{term: [gene, ...]}`` dictionary.

    Each record is one tab-separated line: the term, a description (ignored),
    then zero or more gene symbols.

    Parameters
    ----------
    file : iterable of str
        Lines of a GMT file, e.g. an open text file or a list of strings.
        Surrounding whitespace, including the line terminator, is stripped
        from each line before it is split.

    Returns
    -------
    dict
        Maps each term to its list of genes, in file order. If a term occurs
        on several lines, the last occurrence wins.

    Notes
    -----
    Blank lines are skipped, a record without a description field yields an
    empty gene list, and empty fields produced by consecutive tabs are
    dropped instead of being stored as a gene named ``''``.
    """
    gmt = {}
    for line in file:
        record = line.strip()
        if not record:
            # Blank line (commonly the trailing newline at end of file).
            continue
        fields = record.split('\t')
        # fields[0] is the term and fields[1] the description; slicing keeps
        # this safe when the description (or every gene) is missing.
        gmt[fields[0]] = [gene for gene in fields[2:] if gene]
    return gmt

### Dataset 1

#### Import Gene Set Library

In [None]:
%%appyter code_eval
{% do DescriptionField(
    name= 'Description1',
    text= '<center>The examples below were sourced from <a href="https://maayanlab.cloud/Harmonizome/" target ="_blank">Harmonizome</a>.<center>',
    section='data',
)%}

{% set dataset1 = FileField(
    constraint= '.*\.(txt|gmt)(\.gz)?',
    name= 'Dataset 1',
    label= 'Gene Set Library 1',
    default= 'CCLE Cell Line Gene Expression Profiles.gmt.gz',
    examples= {
        'CCLE Cell Line Gene Expression Profiles.gmt.gz': 'https://maayanlab.cloud/static/hdfs/harmonizome/data/cclemrna/gene_set_library_up_crisp.gmt.gz'
    },
    section= 'data'
)%}

gmt1 = {{dataset1}}

if gmt1.split('.')[-1]=='gz':
    with gzip.open(gmt1, mode='rt', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()
else:
    with open(gmt1, mode='r', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()

ds1 = load_gmt(f)

### Dataset 2

#### Import Gene Set Library

In [None]:
%%appyter code_eval

{% set dataset2 = FileField(
    constraint= '.*\.(txt|gmt)(\.gz)?',
    name= 'Dataset 2',
    label= 'Gene Set Library 2',
    default= 'LINCS KinomeScan Kinase Inhibitor Targets.gmt.gz',
    examples= {
        'LINCS KinomeScan Kinase Inhibitor Targets.gmt.gz': 'https://maayanlab.cloud/static/hdfs/harmonizome/data/kinomescan/gene_set_library_crisp.gmt.gz'
    },
    section= 'data'
)%}

gmt2 = {{dataset2}}

if gmt2.split('.')[-1]=='gz':
    with gzip.open(gmt2, mode='rt', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()
else:
    with open(gmt2, mode='r', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()

ds2 = load_gmt(f)

## Calculate Dataset-Dataset Attribute Similarity
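The [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) measures the similarity of two sets. For each pair of gene sets, one from Dataset 1 and one from Dataset 2, we count the genes present in both gene sets and divide by the number of genes present in either gene set. Identical gene sets have a Jaccard index of 1, and gene sets with no overlap have a Jaccard index of 0. For the same pairs we also compute a two-tailed [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) p-value for the overlap, using all genes found in either dataset as the background.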

In [None]:
# Size of the background gene universe: the number of distinct genes that
# appear in any gene set of either library.
gene_universe = set()
for library in (ds1, ds2):
    for members in library.values():
        gene_universe.update(members)
genes = len(gene_universe)

In [None]:
# Convert every gene list to a set once, so the pairwise loop below does
# cheap set algebra instead of re-sorting the same lists for every pair.
ds1_sets = {term: set(members) for term, members in ds1.items()}
ds2_sets = {term: set(members) for term, members in ds2.items()}

# Collect results in float arrays and build the frames at the end. Filling an
# integer-initialised DataFrame with floats cell by cell is slow and triggers
# pandas' incompatible-dtype (silent upcast) deprecation on pandas >= 2.1.
jaccard_values = np.zeros((len(ds1_sets), len(ds2_sets)), dtype=float)
fisherp_values = np.zeros((len(ds1_sets), len(ds2_sets)), dtype=float)

for row, cset in enumerate(ds1_sets.values()):
    for col, kset in enumerate(ds2_sets.values()):
        # 2x2 contingency table over the gene universe of size `genes`:
        a = len(cset & kset)    # genes in both gene sets
        b = len(cset - kset)    # genes only in the Dataset 1 gene set
        c = len(kset - cset)    # genes only in the Dataset 2 gene set
        d = genes - a - b - c   # genes in neither gene set
        union_size = a + b + c
        # Two empty gene sets have an empty union; report no similarity
        # rather than dividing by zero.
        jaccard_values[row, col] = a / union_size if union_size else 0.0
        fisherp_values[row, col] = fisher.pvalue(a, b, c, d).two_tail

# Rows are Dataset 1 terms, columns are Dataset 2 terms.
jaccard = pd.DataFrame(jaccard_values, index=list(ds1_sets), columns=list(ds2_sets))
fisherp = pd.DataFrame(fisherp_values, index=list(ds1_sets), columns=list(ds2_sets))

display(jaccard)
display(fisherp)

## Analyze Results

In [None]:
# One row per (Dataset 1 term, Dataset 2 term) pair, most similar first.
jaccard_by_pair = jaccard.stack()
jtop = jaccard_by_pair.sort_values(ascending=False).to_frame(name='Jaccard')
jtop

In [None]:
# One row per (Dataset 1 term, Dataset 2 term) pair, smallest p-value first.
pvalue_by_pair = fisherp.stack()
ftop = pvalue_by_pair.sort_values().to_frame(name='p')
ftop

In [None]:
# Join both scores on the (Dataset 1 term, Dataset 2 term) index, rank by
# highest Jaccard index with ties broken by lowest p-value, and keep the
# first 1000 pairs.
scored_pairs = pd.concat([jtop, ftop], axis=1)
ranked_pairs = scored_pairs.sort_values(by=['Jaccard', 'p'], ascending=[False, True])
top = ranked_pairs.iloc[:1000].reset_index()
top.columns = ['Dataset 1 Attribute', 'Dataset 2 Attribute', 'Jaccard', 'p']
top

## Visualization

### Generating Histogram of Gene Set Lengths

In [None]:
# Number of genes in every gene set, keyed by term, for each library.
ds1len = {key: len(value) for key, value in ds1.items()}
ds2len = {key: len(value) for key, value in ds2.items()}

# Use the explicit Figure/Axes interface so `fig` really is the Figure;
# plt.hist returns a (counts, bin_edges, patches) tuple, not a figure.
fig, ax = plt.subplots()
ax.hist([list(ds1len.values()), list(ds2len.values())],
        color=['red', 'blue'],
        label=['Dataset 1', 'Dataset 2'])
ax.set_title('Gene Set Lengths')
ax.set_xlabel('Genes')
ax.set_ylabel('Gene Sets')
ax.legend()
plt.show()

### Generating Clustered Heatmap of Jaccard Indices

In [None]:
# Clustered heatmap of the Jaccard matrix (rows: Dataset 1 terms, columns:
# Dataset 2 terms), drawn with the 'seismic' colormap centred at 0.
sns.clustermap(jaccard, cmap='seismic', center=0)

### Generating Clustered Heatmap of Fisher Exact Test P-values

In [None]:
# Long-form table of the gene-set pairs whose two-tailed p-value is below 0.05.
significant_pairs = fisherp.stack().to_frame(name='p')
significant_pairs = significant_pairs[significant_pairs['p'] < 0.05].reset_index()
significant_pairs.columns = ['Dataset 1 Attribute', 'Dataset 2 Attribute', 'p']

# Pivot back to a Dataset 1 x Dataset 2 matrix. Pairs that were filtered out
# are filled with the largest significant p-value so they sit at the weak end
# of the colour scale. Each pair occurs once, so the aggregation only selects
# that value; the string 'max' avoids pandas' deprecation of builtin callables.
fishergraph = pd.DataFrame()
if not significant_pairs.empty:
    fishergraph = pd.crosstab(
        index=significant_pairs['Dataset 1 Attribute'],
        columns=significant_pairs['Dataset 2 Attribute'],
        values=significant_pairs['p'],
        aggfunc='max',
    ).fillna(significant_pairs['p'].max())

# Hierarchical clustering needs at least two rows and two columns.
if min(fishergraph.shape) >= 2:
    # The median of the column medians is both the colormap centre and its cap.
    midpoint = fishergraph.median().median()
    sns.clustermap(fishergraph, cmap='seismic_r', center=midpoint, vmax=midpoint)
else:
    print('Fewer than two gene sets per dataset reach p < 0.05; skipping the clustered heatmap.')

## Export

In [None]:
def _library_stem(path):
    """Return the file name of ``path`` without its directory or GMT extensions.

    Only a trailing ``.gz`` and then a trailing ``.gmt``/``.txt`` are removed,
    so dots elsewhere in the name (or in a directory) are left alone.
    """
    stem = os.path.basename(path)
    if stem.endswith('.gz'):
        stem = stem[:-len('.gz')]
    for extension in ('.gmt', '.txt'):
        if stem.endswith(extension):
            return stem[:-len(extension)]
    return stem


# gmt1/gmt2 are rebound from the uploaded file paths to their bare library
# names; `output` is the shared file-name prefix for the exported tables.
gmt1 = _library_stem(gmt1)
gmt2 = _library_stem(gmt2)
output = gmt1 + '_' + gmt2 + '_'

In [None]:
# File-name suffix -> table to export; each file is written tab-separated
# under the shared `output` prefix.
exports = {
    'jaccard.tsv': jaccard,
    'fisher_twotail.tsv': fisherp,
    'topassociations.tsv': top,
}

# Write every table first, then show the download links in the same order.
for suffix, table in exports.items():
    table.to_csv(output + suffix, sep='\t')
for suffix in exports:
    display(FileLink(output + suffix))