# Dataset Pair Attribute Similarity
## Import modules

In [None]:
#%%appyter init
from appyter import magic
# Hand appyter a callable that returns this notebook's globals(); presumably this is
# what enables the %%appyter cell magics used in later cells -- confirm against the appyter docs.
magic.init(lambda _=globals: _())

In [None]:
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import fisher
from IPython.display import FileLink

## Load Gene Set Libraries

In [None]:
%%appyter hide_code
{% do SectionField(
    name= 'data',
    title= 'Upload Data'
)%}
# Declares the 'data' form section; the upload fields in later cells attach to it via section='data'.

### Dataset 1

#### Import Gene Set Library

In [None]:
%%appyter code_eval
{% do DescriptionField(
    name= 'Description1',
    text= '<center>The examples below were sourced from <a href="https://maayanlab.cloud/Harmonizome/" target ="_blank">Harmonizome</a>.<center>',
    section='data',
)%}

{% set dataset1 = FileField(
    constraint= '.*.(txt|gmt)(.gz)?',
    name= 'Dataset 1',
    label= 'Gene Set Library 1',
    default= 'CCLE Cell Line Gene Expression Profiles.gmt.gz',
    examples= {
        'CCLE Cell Line Gene Expression Profiles.gmt.gz': 'https://maayanlab.cloud/static/hdfs/harmonizome/data/cclemrna/gene_set_library_up_crisp.gmt.gz'
    },
    section= 'data'
)%}

gmt1 = {{dataset1}}

if {{dataset1}}.split('.')[-1]=='gz':
    with gzip.open({{dataset1}}, mode='rt', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()
else:
    with open({{dataset1}}, mode='r', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()

In [None]:
# code adapted from maayanlab_bioinformatics (https://github.com/MaayanLab/maayanlab-bioinformatics)
# Each line is read as: <term> TAB <second column, ignored> TAB <gene> TAB <gene> ...
# NOTE: `f` is rebound when the Dataset 2 loading cell runs, so run this cell right
# after loading Dataset 1.
ds1 = {'gene set': {}, 'len': {}}
for line in f:
    # Blank lines would otherwise register a gene set named '' with no genes.
    if not line.strip():
        continue
    fields = line.split('\t')
    term = fields[0]
    # Drop empty fields (e.g. from trailing tabs) so '' is never counted as a shared gene.
    genes = [gene.strip() for gene in fields[2:] if gene.strip()]
    ds1['gene set'][term] = genes
    ds1['len'][term] = len(genes)

### Dataset 2

#### Import Gene Set Library

In [None]:
%%appyter code_eval

{% set dataset2 = FileField(
    constraint= '.*.(txt|gmt)(.gz)?',
    name= 'Dataset 2',
    label= 'Gene Set Library 2',
    default= 'LINCS KinomeScan Kinase Inhibitor Targets.gmt.gz',
    examples= {
        'LINCS KinomeScan Kinase Inhibitor Targets.gmt.gz': 'https://maayanlab.cloud/static/hdfs/harmonizome/data/kinomescan/gene_set_library_crisp.gmt.gz'
    },
    section= 'data'
)%}

gmt2 = {{dataset2}}

if {{dataset2}}.split('.')[-1]=='gz':
    with gzip.open({{dataset2}}, mode='rt', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()
else:
    with open({{dataset2}}, mode='r', encoding='utf-8', newline='\n') as f:
        f = f.read().splitlines()

In [None]:
# code adapted from maayanlab_bioinformatics (https://github.com/MaayanLab/maayanlab-bioinformatics)
# Each line is read as: <term> TAB <second column, ignored> TAB <gene> TAB <gene> ...
# `f` must still hold the Dataset 2 lines produced by the loading cell directly above.
ds2 = {'gene set': {}, 'len': {}}
for line in f:
    # Blank lines would otherwise register a gene set named '' with no genes.
    if not line.strip():
        continue
    # The lines come from splitlines(), so no newline handling is needed here.
    fields = line.split('\t')
    term = fields[0]
    # Drop empty fields (e.g. from trailing tabs) so '' is never counted as a shared gene.
    genes = [gene.strip() for gene in fields[2:] if gene.strip()]
    ds2['gene set'][term] = genes
    ds2['len'][term] = len(genes)

## Calculate Dataset-Dataset Attribute Similarity
The [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) measures the similarity of two sets. For each pair of gene sets, one from Dataset 1 and one from Dataset 2, we'll count the genes that appear in both gene sets and divide that by the number of genes that appear in at least one of them. Identical gene sets will have a Jaccard index of 1, and gene sets with no overlap will have a Jaccard index of 0. For the same pairs, a two-tailed Fisher's exact test p-value is also computed whenever both gene sets contain at least five genes.

In [None]:
# Pairwise similarity of every Dataset 1 gene set (rows) against every Dataset 2 gene set (columns).
# Gene lists are converted to sets once so each pair costs a single set intersection.
sets1 = {term: set(genes) for term, genes in ds1['gene set'].items()}
sets2 = {term: set(genes) for term, genes in ds2['gene set'].items()}

# Background for Fisher's exact test: every gene observed in either library.
# ASSUMPTION: the union of both libraries is an adequate gene universe; swap in a
# fixed background size here if your analysis calls for a different one.
n_background = len(set().union(*sets1.values(), *sets2.values()))

# Pairs in which either gene set has fewer unique genes than this are not tested.
min_set_size = 5

jaccard_values = np.zeros((len(sets1), len(sets2)), dtype=float)
# Untested pairs keep p = 1.0 (no evidence of association) rather than 0, which
# would read as maximally significant.
fisher_values = np.ones((len(sets1), len(sets2)), dtype=float)

for row, genes1 in enumerate(sets1.values()):
    for col, genes2 in enumerate(sets2.values()):
        a = len(genes1 & genes2)   # genes in both sets
        b = len(genes1) - a        # genes only in the Dataset 1 set
        c = len(genes2) - a        # genes only in the Dataset 2 set
        union_size = a + b + c
        # Two empty gene sets have no defined overlap; report 0 instead of dividing by zero.
        if union_size:
            jaccard_values[row, col] = a / union_size
        if len(genes1) >= min_set_size and len(genes2) >= min_set_size:
            d = n_background - union_size   # background genes in neither set
            fisher_values[row, col] = fisher.pvalue(a, b, c, d).two_tail

jaccard = pd.DataFrame(jaccard_values, index=list(sets1), columns=list(sets2))
fisherp = pd.DataFrame(fisher_values, index=list(sets1), columns=list(sets2))

display(jaccard)
display(fisherp)

## Visualization

### Generating Histogram of Gene Set Lengths

In [None]:
# Side-by-side histogram of gene set sizes for the two libraries.
# The explicit figure/axes pair makes `fig` a real Figure (plt.hist returns a
# (counts, bins, patches) tuple instead).
fig, ax = plt.subplots()
ax.hist(
    [list(ds1['len'].values()), list(ds2['len'].values())],
    color=['red', 'blue'],
    label=['Dataset 1', 'Dataset 2'],
)
ax.set_title('Gene Set Lengths')
ax.set_xlabel('Genes')
ax.set_ylabel('Gene Sets')
ax.legend()
plt.show()

### Generating Clustered Heatmap of Jaccard Indices

In [None]:
# Clustered heatmap of the Jaccard matrix (rows: Dataset 1 gene sets, columns: Dataset 2 gene sets).
# center=0 anchors the midpoint of the diverging 'seismic' colormap at a Jaccard index of 0.
sns.clustermap(jaccard, cmap='seismic', center=0)

### Generating Clustered Heatmap of Fisher Exact Test P-values

In [None]:
# Clustered heatmap of the Fisher p-value matrix (rows: Dataset 1 gene sets, columns: Dataset 2 gene sets).
# center=0 anchors the midpoint of the diverging 'seismic' colormap at p = 0.
sns.clustermap(fisherp, cmap='seismic', center=0)

## Analyze Results

In [None]:
# Flatten the matrix to one row per (Dataset 1, Dataset 2) pair and keep the 20 highest Jaccard indices.
jaccard_ranked = jaccard.stack().sort_values(ascending=False)
top = jaccard_ranked.to_frame().reset_index().iloc[:20]
top.columns = ['Dataset 1 Attribute', 'Dataset 2 Attribute', 'Jaccard']
top

In [None]:
# Rank pairs from most to least significant: the smallest p-values come first.
# Only pairs that were actually tested are ranked, i.e. both gene sets have at least
# 5 unique genes (the same cutoff the Fisher test loop applies); otherwise the
# placeholder values stored for untested pairs would pollute the ranking.
min_genes = 5
tested_rows = [term for term, genes in ds1['gene set'].items() if len(set(genes)) >= min_genes]
tested_cols = [term for term, genes in ds2['gene set'].items() if len(set(genes)) >= min_genes]

ftop = (
    fisherp.loc[tested_rows, tested_cols]
    .stack()
    .sort_values(ascending=True)
    .to_frame()
    .reset_index()
    .head(20)
)
ftop.columns = ['Dataset 1 Attribute', 'Dataset 2 Attribute', 'p']
ftop

## Export

In [None]:
# Keep only the text before the first '.' of each library file name, then build the
# shared prefix for the exported file names.
gmt1, gmt2 = (name.split('.')[0] for name in (gmt1, gmt2))
output = '{}_{}_'.format(gmt1, gmt2)

In [None]:
# Write both matrices as tab-separated files, then show download links for them.
jaccard_path = output + 'jaccard.tsv'
fisher_path = output + 'fisher_twotail.tsv'

for matrix, path in ((jaccard, jaccard_path), (fisherp, fisher_path)):
    matrix.to_csv(path, sep='\t')

for path in (jaccard_path, fisher_path):
    display(FileLink(path))