# DisGeNET Gene-Phenotype Associations Harmonizome Processing
The DisGeNET database was downloaded from DisGeNET. The gene-disease associations were then processed to map genes to up-to-date and approved symbols, and phenotypes to IDs. In total, 196,561 associations between 14,002 genes and 6,832 phenotypes were extracted.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.spatial.distance as dist
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
import scanpy as sc
import sqlite3
from bokeh.io import output_file, output_notebook, export_svg, save
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
import sys
sys.setrecursionlimit(100000)

## Load Data

In [None]:
# Directory prefix (with trailing slash) that is concatenated with the SQLite
# file name when the database is opened.
# NOTE(review): this name shadows the builtin `dir()` for the rest of the session.
dir = 'newdata/DisGeNETpheno/'

In [None]:
# Open the DisGeNET SQLite dump and create the cursor that runs the queries.
db = sqlite3.connect(f'{dir}disgenet_2020.db')
cur = db.cursor()

In [None]:
# Collect every association in the DisGeNET gene-disease network whose disease
# attribute is of type 'phenotype', keeping the internal node IDs and the score.
cur.execute(
    '''SELECT geneDiseaseNetwork.diseaseNID, geneDiseaseNetwork.geneNID, geneDiseaseNetwork.score FROM diseaseAttributes 
    JOIN geneDiseaseNetwork
    ON diseaseAttributes.diseaseNID = geneDiseaseNetwork.diseaseNID
    WHERE type = 'phenotype';'''
)

# The first item of each cursor description tuple is the column name.
columns = [column[0] for column in cur.description]

# Passing `columns=` at construction keeps the frame well-formed even when the
# query returns no rows (assigning `.columns` afterwards raises in that case).
disgenet = pd.DataFrame(cur.fetchall(), columns=columns)

display(disgenet)
print(len(disgenet['diseaseNID'].unique()), 'phenotypes,', len(disgenet['geneNID'].unique()), 'genes')

In [None]:
# Load the gene attribute table, indexed by the DisGeNET internal gene node ID.
cur.execute('SELECT geneNID,geneId,geneName,geneDescription FROM geneAttributes')

# The first item of each cursor description tuple is the column name.
columns = [column[0] for column in cur.description]

# Passing `columns=` at construction keeps the frame well-formed even when the
# query returns no rows (assigning `.columns` afterwards raises in that case).
disgenetgenes = pd.DataFrame(cur.fetchall(), columns=columns).set_index('geneNID')

# Store gene IDs as integers so they can be compared against integer keys later.
disgenetgenes['geneId'] = disgenetgenes['geneId'].astype('int64')

disgenetgenes

In [None]:
# Load ID and name of every disease entry of type 'phenotype', indexed by the
# DisGeNET internal disease node ID.
# The literal is single-quoted: in SQL, double quotes denote identifiers, and
# SQLite only treats "phenotype" as a string through a legacy fallback.
cur.execute("SELECT diseaseNID,diseaseId,diseaseName FROM diseaseAttributes WHERE type='phenotype'")

# The first item of each cursor description tuple is the column name.
columns = [column[0] for column in cur.description]

# Passing `columns=` at construction keeps the frame well-formed even when the
# query returns no rows (assigning `.columns` afterwards raises in that case).
disgenetphenotypes = pd.DataFrame(cur.fetchall(), columns=columns).set_index('diseaseNID')

disgenetphenotypes

In [None]:
# Read the header-less grouping table and label its four columns.
mapping_columns = ['NID', 'ID', 'Name', 'Group']
disgenetmap = pd.read_csv('disGeNET_mapping.csv', header=None)
disgenetmap.columns = mapping_columns

# Each group is represented by its member with the smallest NID.
NIDmap = disgenetmap.groupby('Group')['NID'].min().to_dict()
disgenetmap = disgenetmap.set_index('NID')

# Give every row the ID of its group's representative entry.
disgenetmap['GroupID'] = [
    disgenetmap.loc[NIDmap[group], 'ID'] for group in disgenetmap['Group']
]

# Keep only the group-level ID and name, renamed to match disgenetphenotypes.
disgenetmap = disgenetmap[['GroupID', 'Group']].rename_axis('diseaseNID', axis=0)
disgenetmap = disgenetmap.rename(columns={'GroupID': 'diseaseId', 'Group': 'diseaseName'})
disgenetmap

In [None]:
# Replace ID and name of every phenotype that has an entry in the grouping table.
# NOTE(review): the lookup key is str(NID), so this only matches when
# disgenetmap's index holds strings — confirm against the CSV contents.
for nid in disgenetphenotypes.index:
    key = str(nid)
    if key not in disgenetmap.index:
        continue
    for field in ('diseaseId', 'diseaseName'):
        disgenetphenotypes.loc[nid, field] = disgenetmap.loc[key, field]

# Title-case the names, then give all phenotypes sharing a name the ID of the
# one with the smallest NID.
disgenetphenotypes['diseaseName'] = disgenetphenotypes['diseaseName'].map(str.title)
by_name = disgenetphenotypes.reset_index().groupby('diseaseName')['diseaseNID']
diseaseNIDs = by_name.min().to_dict()
disgenetphenotypes['diseaseId'] = [
    disgenetphenotypes.loc[diseaseNIDs[name], 'diseaseId']
    for name in disgenetphenotypes['diseaseName']
]
disgenetphenotypes

## Pre-process Data

In [None]:
# Load the NCBI gene_info table and keep human (tax ID 9606) protein-coding
# genes only, indexed by Entrez GeneID.
geneinfo = pd.read_csv('tables/gene_info', sep='\t', header=0)
# Combine both conditions into one mask; chaining two boolean selections makes
# pandas reindex the second mask against the already-filtered frame and warn.
is_human_coding = (geneinfo['#tax_id'] == 9606) & (geneinfo['type_of_gene'] == 'protein-coding')
geneinfo = geneinfo.loc[is_human_coding, ['GeneID', 'Symbol', 'description']].set_index('GeneID')

# Drop every DisGeNET gene whose Entrez ID is absent from the filtered table.
dropgenes = disgenetgenes.index[~disgenetgenes['geneId'].isin(geneinfo.index)].tolist()

disgenetgenes = disgenetgenes.drop(dropgenes, axis=0)

In [None]:
# Keep only associations whose gene survived the filtering above. A boolean
# mask replaces the transpose/select/transpose round trip, which copied the
# whole frame twice and forced all columns to a common dtype.
known_gene = disgenet['geneNID'].isin(disgenetgenes.index)
genes = disgenet.loc[known_gene, 'geneNID'].unique().tolist()

disgenet = disgenet.loc[known_gene].copy()
disgenet['diseaseNID'] = disgenet['diseaseNID'].astype(int)

# Attach gene and phenotype labels by label-based lookup. `.loc` with an array
# of keys raises KeyError on an unknown NID, as the per-row lookup did.
gene_keys = disgenet['geneNID'].to_numpy()
disease_keys = disgenet['diseaseNID'].to_numpy()
disgenet['gene'] = disgenetgenes.loc[gene_keys, 'geneName'].to_numpy()
disgenet['geneID'] = disgenetgenes.loc[gene_keys, 'geneId'].to_numpy()
disgenet['disease'] = disgenetphenotypes.loc[disease_keys, 'diseaseName'].to_numpy()
disgenet['diseaseID'] = disgenetphenotypes.loc[disease_keys, 'diseaseId'].to_numpy()

# Collapse duplicate gene-phenotype pairs to the median of their scores.
disgenet = disgenet.groupby(['gene', 'geneID', 'disease', 'diseaseID'])['score'].median().reset_index()
display(disgenet)
print(disgenet['gene'].nunique(), 'genes,', disgenet['disease'].nunique(), 'phenotypes')

## SQL Data Processing

### Datasets

In [None]:
# Dataset row displayed as the cell's output; the trailing comment on each
# line names the field that value belongs to.
(
    136,  # id
    'DisGeNET Gene-Phenotype Associations',  # name
    'Gene-Phenotype Associations',  # name wo resource
    'gene-phenotype associations sourced from curated repositories, GWAS catalogues, animal models and the scientific literature',  # desc
    'gene-phenotype associations curated from genetic association studies',  # association
    'genes associated with the phenotype {0} in GWAS and other genetic association datasets from the DisGeNET Gene-Phenotype Associations dataset.',  # gene set desc
    'sets of genes associated with phenotypes in GWAS and other genetic association datasets from the DisGeNET Gene-Phenotype Associations dataset.',  # gene sets desc
    'phenotypes associated with {0} gene in GWAS and other genetic association datasets from the DisGeNET Gene-Phenotype Associations dataset.',  # attribute set desc
    0,  # signed
    0,  # continuous
    '2023-03-09',  # last_update
    'disgenetphenotype',  # directory
    0,  # num page views
    105,  # resource fk
    19,  # measurement fk
    1,  # dataset group fk
    15,  # attribute type fk
    3,  # attribute group fk
    'genetic association by data aggregation from genome-wide association and other genetic association studies',  # evidence type
    'curated experimental data',  # evidence group
    'high throughput, data driven',  # measurement bias
    'phenotypes',  # attribute type plural
)

### Genes To Add

In [None]:
# Build the Entrez-ID -> gene primary-key lookup from the production gene table
# and the locally added genes, then print a row for every gene in the dataset
# that has no key yet.
geneinfo = pd.read_csv('tables/gene_info', sep='\t').get(['GeneID', 'Symbol', 'Synonyms', 'description'])
genes = pd.read_csv('production/gene.csv')
localgenes = pd.read_csv('newgenes.csv', header=None).drop(columns=[5,6,7,8]).set_index(0)

genefkdict = genes.set_index('ncbi_entrez_gene_id')['id'].to_dict()
genefkdict.update(localgenes.reset_index().set_index(2)[0].to_dict())

ncbigeneslist = geneinfo['GeneID'].to_list()
# Membership is tested once per gene; a set makes each test O(1) instead of a
# scan over the whole gene_info list.
ncbigenes = set(ncbigeneslist)
newgenes = []
dropgenes = []

for geneid in disgenet['geneID'].unique():
    if geneid in genefkdict:
        continue
    if geneid in ncbigenes:
        newgenes.append(geneid)
    else:
        # Collected only; nothing in this cell removes these genes.
        dropgenes.append(geneid)

# id, symbol, ncbi_entrez_gene_id, name, ncbi_entrez_gene_url
geneinfo = geneinfo.set_index('GeneID')
next_gene_id = 57253

for geneid in newgenes:
    # Plain int keeps the printed tuple free of NumPy scalar wrappers
    # (NumPy >= 2 shows them as np.int64(...) inside a tuple).
    entrez_id = int(geneid)
    record = geneinfo.loc[entrez_id]
    print((next_gene_id, record['Symbol'], entrez_id, record['description'], 'https://ncbi.nlm.nih.gov/gene/'+str(entrez_id)), end=',\n')
    genefkdict[entrez_id] = next_gene_id
    next_gene_id += 1

### Attributes To Add

In [None]:
# Print one attribute row per distinct phenotype name and remember the primary
# key assigned to each phenotype ID.
id = 314069
medgenurl = 'https://www.ncbi.nlm.nih.gov/medgen/'
attributefkdict = {}
phenotypes = disgenet[['disease', 'diseaseID']].drop_duplicates('disease').reset_index(drop=True)

for phenotype_name, phenotype_id in zip(phenotypes['disease'], phenotypes['diseaseID']):
    row = (id, phenotype_name, phenotype_id, medgenurl+phenotype_id, 99)
    print(row, end=',\n')
    attributefkdict[phenotype_id] = id
    id += 1

### Gene Sets To Add

In [None]:
# Print one gene-set row per phenotype and remember the primary key assigned
# to each phenotype ID.
id = 133600000
disgeneturl = 'https://www.disgenet.org/browser/0/1/0/'
genesetfkdict = {}

#(id, name_from_dataset, id_from_dataset, url_from_dataset, dataset_fk, attribute_type_fk, attribute_fk)
for phenotype_name, phenotype_id in zip(phenotypes['disease'], phenotypes['diseaseID']):
    row = (id, phenotype_name, phenotype_id, disgeneturl+phenotype_id, 136, 15, attributefkdict[phenotype_id])
    print(row, end=',\n')
    genesetfkdict[phenotype_id] = id
    id += 1

### Associations To Add

In [None]:
# First primary key of the association rows written below.
id = 16000000

# Translate gene and phenotype IDs into their foreign keys; a missing key
# raises KeyError.
associations = pd.DataFrame({
    'gene_fk': disgenet['geneID'].apply(genefkdict.__getitem__),
    'gene_set_fk': disgenet['diseaseID'].apply(genesetfkdict.__getitem__),
    'score': disgenet['score'],
    'threshold': 1,
})
# Shift the row labels so they start at the first primary key.
associations.index = associations.index + id
display(associations)
associations.to_csv('harmonizome-update/disgenetphenotype.csv')
print(associations['gene_fk'].unique().size, 'genes,', associations['gene_set_fk'].unique().size, 'phenotypes')

## Download File Processing

In [None]:
# Directory prefix (with trailing slash) prepended to every download file
# written by the cells below.
output_path = 'newdata/DisGeNETpheno/downloads/'

In [None]:
# Keep only phenotypes associated with at least five genes.
genes_per_phenotype = disgenet.groupby('disease')['gene'].count()
phenotypes = genes_per_phenotype[genes_per_phenotype >= 5].index

# Filter with a boolean mask rather than a transpose/select/transpose round
# trip, which copied the frame twice and turned every column into object dtype.
# The stable sort reproduces the previous row order: grouped by phenotype,
# original order within each phenotype.
edgelist = (
    disgenet[disgenet['disease'].isin(phenotypes)]
    .sort_values('disease', kind='mergesort')
    .reset_index(drop=True)
    [['gene', 'geneID', 'disease', 'diseaseID', 'score']]
)
edgelist.columns = ['gene', 'geneID', 'phenotype', 'phenotypeID', 'score']
edgelist

### Binary Matrix

In [None]:
# Gene x phenotype indicator matrix: 1 where the pair occurs in the edge list,
# 0 elsewhere. The aggregation is named by string because passing the builtin
# `max` is deprecated in pandas.
binarymatrix = pd.crosstab(index=edgelist['gene'],
                columns=edgelist['phenotype'],
                values=1,
                aggfunc='max').fillna(0).astype(int)

binarymatrix = binarymatrix.rename_axis('Gene Symbol', axis='index').rename_axis('Phenotype',  axis='columns')

# Transposed copy (phenotypes as rows) used by the attribute-level outputs.
binarymatrixT = binarymatrix.T

binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene List

In [None]:
# Distinct (symbol, gene ID) pairs present in the edge list, written as a
# gzip-compressed, tab-separated file.
gene_columns = ['gene', 'geneID']
genes = edgelist[gene_columns].drop_duplicates().reset_index(drop=True)
genes.to_csv(f'{output_path}gene_list_terms.txt.gz', sep='\t', compression='gzip')
genes

### Attribute List

In [None]:
# Distinct (name, phenotype ID) pairs present in the edge list, written as a
# gzip-compressed, tab-separated file.
phenotype_columns = ['phenotype', 'phenotypeID']
phenotypes = edgelist[phenotype_columns].drop_duplicates().reset_index(drop=True)
phenotypes.to_csv(f'{output_path}attribute_list_entries.txt.gz', sep='\t', compression='gzip')
phenotypes

### Gene-Attribute Edge List

In [None]:
# Write the filtered gene-phenotype edge list as a gzip-compressed,
# tab-separated file, then show it as the cell's output.
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene Set Library

In [None]:
# One line per phenotype: the phenotype name followed by every gene whose
# entry in that column is 1, tab-separated.
arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for i, term in enumerate(binarymatrix.columns):
        members = binarymatrix.index[arr[:, i] == 1]
        f.write('\t'.join(map(str, [term, *members])) + '\n')

### Attribute Set Library

In [None]:
# One line per gene: the gene symbol followed by every phenotype whose entry
# in that column of the transposed matrix is 1, tab-separated.
arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    for i, term in enumerate(binarymatrixT.columns):
        members = binarymatrixT.index[arr[:, i] == 1]
        f.write('\t'.join(map(str, [term, *members])) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows of the binary matrix:
# condensed cosine distances -> square matrix -> 1 - distance.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# NOTE(review): the frame is built on binarymatrix.index itself, so clearing
# the axis names here may also clear the name on binarymatrix — confirm.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None

gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between phenotype rows of the transposed binary
# matrix: condensed cosine distances -> square matrix -> 1 - distance.
phenotype_vectors = binarymatrixT.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(phenotype_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# NOTE(review): the frame is built on binarymatrixT.index itself, so clearing
# the axis names here may also clear the name on binarymatrixT — confirm.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None

attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Cleaned Binary Matrix

In [None]:
# Gene x phenotype matrix holding the association score of each pair (0 where
# the pair does not occur). The aggregation is named by string because passing
# the builtin `max` is deprecated in pandas.
binarymatrixcleaned = pd.crosstab(index=edgelist['gene'],
                columns=edgelist['phenotype'],
                values=edgelist['score'].values,
                aggfunc='max').fillna(0)

# Columns are phenotypes, so the axis is labelled like the binary matrix above.
binarymatrixcleaned = binarymatrixcleaned.rename_axis('Gene Symbol', axis='index').rename_axis('Phenotype',  axis='columns')

binarymatrixcleaned

### Serializations

In [None]:
# Build the node and edge records shared by the RDF, JSON and TSV outputs.
# Columns are iterated directly instead of looking each row up with `.loc`,
# and the loop variables no longer shadow the builtin `id`.
nodes = {}
edges = []

# Gene nodes are keyed by their integer gene ID.
for gene_id, gene_label in zip(genes['geneID'], genes['gene']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": gene_label
        }
    }

# Phenotype nodes are keyed by their phenotype ID string.
for phenotype_id, phenotype_label in zip(phenotypes['phenotypeID'], phenotypes['phenotype']):
    nodes[phenotype_id] = {
        "type": "phenotype",
        "properties": {
            "id": phenotype_id,
            "label": phenotype_label
        }
    }

# One directed edge per row of the edge list, from phenotype to gene.
edge_columns = zip(
    edgelist['phenotypeID'],
    edgelist['geneID'],
    edgelist['phenotype'],
    edgelist['gene'],
    edgelist['score'],
)
for source, gene_id, phenotype_label, gene_label, score in edge_columns:
    target = int(gene_id)
    edges.append({
        "source": source,
        "relation": "has material basis in somatic mutation in",
        "target": target,
        "properties": {
            "id": source+":"+str(target),
            "source_label": phenotype_label,
            "target_label": gene_label,
            "directed":True,
            "score":str(score)
        }
    })

RDF

In [None]:
# Write every edge as a prefixed-name triple: phenotype, RO:0004004, gene.
# Prefix declarations follow Turtle syntax: the IRI is enclosed in <> and the
# directive ends with ' .'; without both, RDF parsers reject the header.
with open(output_path+'serializations/disgenetphenotype.rdf', 'w') as f:
    print('@prefix disease: <https://www.disgenet.org/browser/0/1/0/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('@prefix gene: <http://ncbi.nlm.nih.gov/gene/> .', file=f)
    print('', file=f)
    for edge in edges:
        print('disease:'+edge['source'], 'RO:0004004', 'gene:'+str(edge['target']), end=' .\n', file=f)

JSON

In [None]:
# Serialize the node and edge records to one indented JSON document.
# json.dump writes to the file and returns None, so its result is not kept.
with open(output_path+'serializations/disgenetphenotype.json', 'w') as f:
    json.dump(
        {
            "Version": "1",
            "nodes": nodes,
            "edges": edges
        },
        f,
        indent=4,
    )

TSV

In [None]:
# Tabular views of the serialization records: `nodes` is a dict keyed by node
# ID, `edges` is a list of dicts.
nodeframe = pd.DataFrame(nodes)
edgeframe = pd.DataFrame(edges)

In [None]:
# Reshape the node table to one row per node with namespace, id and label,
# and write it as a tab-separated file.
namespace_by_type = {'gene':'NCBI Entrez', 'phenotype':'MedGen'}

nodeframe = nodeframe.T
# Replace each properties dict by its label.
nodeframe['properties'] = [props['label'] for props in nodeframe['properties']]
nodeframe = nodeframe.reset_index()
nodeframe.columns = ['id','type','label']
nodeframe['namespace'] = [namespace_by_type[kind] for kind in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']]
nodeframe.to_csv(output_path+'serializations/disgenetphenotype_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Pull the score out of each properties dict into its own column, drop the
# dict column and write the edges as a tab-separated file.
edgeframe['score'] = [props['score'] for props in edgeframe['properties']]
edgeframe = edgeframe.drop(columns='properties')
edgeframe.to_csv(output_path+'serializations/disgenetphenotype_tsv/edges.tsv', sep='\t')
edgeframe

## Visualization

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene x phenotype binary matrix, drawn with the
# 'seismic' colormap centered at 0.
sns.clustermap(binarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix; rows and
# columns are clustered with the cosine metric.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0, metric='cosine')

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the phenotype-phenotype cosine similarity matrix; rows
# and columns are clustered with the cosine metric.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, metric='cosine')

### UMAP

In [None]:
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [None]:
# Map each phenotype to one space-separated string of its gene symbols,
# ordered by descending score within the phenotype.
ranked_edges = edgelist.sort_values(['phenotype', 'score'], ascending=[True, False])
genes_by_phenotype = ranked_edges.groupby('phenotype')['gene'].apply(list)
umap = {
    phenotype_name: ' '.join(gene_symbols)
    for phenotype_name, gene_symbols in genes_by_phenotype.items()
}

In [None]:
# Represent each phenotype's gene string as a TF-IDF vector, cluster the
# phenotypes with Leiden and compute a 2-D UMAP layout.
vec = TfidfVectorizer(max_df=0.5, min_df=10)
X = vec.fit_transform(umap.values())
adata = anndata.AnnData(X, dtype='float32')
adata.obs.index = umap.keys()

sc.pp.neighbors(adata, n_neighbors=50, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.3, spread=4.6)

# Order the phenotypes by cluster label and prefix the labels for display.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

# Collect coordinates, cluster and phenotype name of every point.
mapped_df = pd.DataFrame(adata.obsm['X_umap'])
mapped_df.columns = ['x', 'y']

mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Assign palette colors to clusters, cycling after 20 clusters; the palette is
# reordered so that even-indexed colors come first.
clusters = pd.unique(mapped_df['cluster']).tolist()
colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
color_mapper = {clusters[i]:colors[i%20] for i in range(len(clusters))}

mapped_df['color'] = mapped_df['cluster'].apply(lambda x: color_mapper[x])

xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Tooltip markup: every <div> is closed and the outer margin carries a unit.
hover_emb = HoverTool(name="df", tooltips="""
    <div style="margin: 10px">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
# The title names the dataset processed in this notebook.
title_emb = 'Gene Sets in DisGeNET Gene-Phenotype Associations Library'
plot_emb = figure(width=1000, height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
plot_emb.circle( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)


In [None]:
# Save the UMAP scatter plot as a standalone HTML page. `file` is not defined
# in Python 3; `output_file` (imported from bokeh.io in the first cell) sets
# the path that `save` writes to.
output_file(filename=output_path+'disgenetphenotype.html')
save(plot_emb)