# Format assertions for Neo4j ingestion
This notebook formats the filtered network for ingestion into the Neo4j database, which is necessary to run the KG UI. 

Also included is the option to plot histogram distributions of source edges, target edges, and edge significance in the unfiltered network. 

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

Set up assertions directory

In [None]:
# Directory that receives the node and edge CSV files written by this notebook.
assertions_dir = './kg_assertions_for_neo4j'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(assertions_dir, exist_ok=True)

Choose a network to format

In [None]:
# Name of the network variant to format; it is interpolated into the path of the filtered edge list.
network_type = 'node_weighted'

Upload the filtered edge list

In [None]:
# Location of the filtered edge list for the selected network type.
network_edge_file = f'./filtered_edge_list/large/{network_type}/edge_list_filtered.csv'
# Later cells read the 'source', 'target', 'relation', 'z-score' and 'p-value' columns of this table.
final_edge_list = pd.read_csv(network_edge_file)

## Add URI metadata to each TF node

We can use the code in `notebooks/serialization.ipynb` to find a gene ID associated with each gene symbol, and then associate each node with a URI that points to a web page for that gene. 

Print statements are added in `get_gene_meta` to identify genes that aren't found in the NCBI library.

In [None]:
def fetch_save_read(url, file, reader=pd.read_csv, sep='\t', **kwargs):
  ''' Download {url} with {reader}, cache it as a delimited text file at {file}, and return the cached copy.

  The download only happens when {file} does not exist yet. The cache is always
  written with DataFrame.to_csv, so it is always read back with pandas.read_csv
  (not {reader}); {**kwargs} are forwarded to that final read only.

  Parameters:
    url: location handed to {reader}.
    file: path of the local cache; missing parent directories are created.
    reader: callable(url, sep=..., index_col=None) returning a DataFrame.
    sep: delimiter used for the download, the cache file and the final read.
  Returns:
    DataFrame read from the cache file.
  '''
  if not os.path.exists(file):
    directory, basename = os.path.split(file)
    if directory:
      os.makedirs(directory, exist_ok=True)
    # Write under a temporary name first so that an interrupted download or write
    # never leaves a truncated cache file that later runs would silently reuse.
    # The original extension is kept so pandas infers the same compression.
    partial = os.path.join(directory, '.partial-' + basename)
    try:
      df = reader(url, sep=sep, index_col=None)
      df.to_csv(partial, sep=sep, index=False)
      os.replace(partial, file)
    finally:
      # Only still present when something above failed.
      if os.path.exists(partial):
        os.remove(partial)
  return pd.read_csv(file, sep=sep, **kwargs)

# NCBI gene_info table for the chosen organism: downloaded once, then served from the local cache file
organism = "Mammalia/Homo_sapiens"
file = '{}.gene_info.tsv'.format(organism)
url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{}.gene_info.gz'.format(organism)

ncbi_gene = fetch_save_read(url=url, file=file)

In [None]:
def maybe_split(record):
    ''' NCBI Stores Nulls as '-' and lists '|' delimited

    Returns the set of '|'-separated values in {record}. An empty string, '-',
    None, or a float NaN yields an empty set.
    '''
    # A missing cell arrives from pandas as float NaN (NaN != NaN), not as a string;
    # it has no .split(), so treat it, like None, as "no values".
    if record is None or (isinstance(record, float) and record != record):
        return set()
    if record in {'', '-'}:
        return set()
    return set(record.split('|'))

def supplement_dbXref_prefix_omitted(ids):
    ''' Yield every external ID as stored by NCBI (Foreign:ID) and, when it has a
    prefix, also the part after the first ':' since most datasets use the bare ID
    '''
    for xref in ids:
        yield xref
        _, colon, bare_id = xref.partition(':')
        if colon:
            # prefixed entry: additionally emit the ID without its prefix
            yield bare_id

In [None]:
# find all synonyms for each gene
def _all_synonyms(gene_info):
    ''' Union of every name-like field of one NCBI gene_info row
    '''
    xrefs = maybe_split(gene_info['dbXrefs'])
    return set().union(
        maybe_split(gene_info['Symbol']),
        maybe_split(gene_info['Symbol_from_nomenclature_authority']),
        maybe_split(str(gene_info['GeneID'])),
        maybe_split(gene_info['Synonyms']),
        maybe_split(gene_info['Other_designations']),
        maybe_split(gene_info['LocusTag']),
        # external IDs are added both with and without their database prefix
        supplement_dbXref_prefix_omitted(xrefs),
    )

ncbi_gene['All_synonyms'] = [_all_synonyms(row) for _, row in ncbi_gene.iterrows()]

# look up by synonym: one (synonym, GeneID) pair per distinct combination
synonym_pairs = {
    (alias, row['GeneID'])
    for _, row in ncbi_gene.iterrows()
    for alias in row['All_synonyms']
}
synonyms, gene_id = zip(*synonym_pairs)

# a synonym shared by several genes appears several times in this index
ncbi_lookup_syn = pd.Series(data=gene_id, index=synonyms)

# look up by symbol incl capitalized
symbol_rows = {
    (row['Symbol'], row['Symbol'].upper(), row['GeneID'])
    for _, row in ncbi_gene.iterrows()
}
symbols, cap, gene_id = zip(*symbol_rows)

# same GeneIDs, keyed by the symbol as stored and by its upper-cased form
ncbi_lookup_sym = pd.Series(data=gene_id, index=symbols)
ncbi_lookup_sym_cap = pd.Series(data=gene_id, index=cap)

# find duplicate synonyms
index_values = ncbi_lookup_syn.index.value_counts()
# Keep the synonym *labels*: isin() on the counts Series would test the synonyms
# against the integer counts and never match.
ambiguous = index_values[index_values > 1].index

# disambiguate: drop every synonym that points to more than one gene, except the
# entry where the synonym is the gene's own ID (prefix-stripped dbXrefs frequently
# collide with GeneIDs). GeneIDs are numeric, so compare them as strings.
is_own_gene_id = ncbi_lookup_syn.index.to_numpy() == ncbi_lookup_syn.astype(str).to_numpy()
is_unambiguous = ~ncbi_lookup_syn.index.isin(ambiguous)
ncbi_lookup_syn_disambiguated = ncbi_lookup_syn[is_own_gene_id | is_unambiguous]

# every remaining synonym must map to exactly one gene before it becomes a dict key
assert ncbi_lookup_syn_disambiguated.index.is_unique

sym_dict = ncbi_lookup_sym.to_dict()
syn_dict_cap = ncbi_lookup_sym_cap.to_dict()
syn_dict = ncbi_lookup_syn_disambiguated.to_dict()

In [None]:
def gene_lookup(gene):
    ''' Return the gene ID for {gene} as a string, trying the symbol table, then the
    upper-cased symbol table, then the synonym table. A gene found in none of
    them yields the string 'None'.
    '''
    for table in (sym_dict, syn_dict_cap):
        match = table.get(gene)
        if match:
            return str(match)

    # last resort; str() turns a miss (None) into 'None'
    return str(syn_dict.get(gene))

# metadata already built, keyed by gene label
all_genes = {}
# gene IDs that have already been handed to some label
gene_ids = set()

def get_gene_meta(gene):
    ''' Return {id, label, uri} for {gene}, where the URI is built from the gene ID.

    Results are cached per label. None is returned (after printing a message)
    when the gene ID is already used by another label or when no ID is found.
    '''
    if gene in all_genes:
        return all_genes[gene]

    gene_id = gene_lookup(gene)

    if gene_id in gene_ids:
        print(f"{gene} ID ({gene_id}) already found")
        return None

    # gene_lookup reports a miss as the string 'None'
    if gene_id is None or gene_id == 'None':
        print(f"{gene} not found")
        return None

    gene_ids.add(gene_id)
    meta = {
        "id": gene_id,
        "label": gene,
        "uri": "https://www.ncbi.nlm.nih.gov/gene/%s"%gene_id
    }
    all_genes[gene] = meta
    return meta

## Format data for UI ingestion
This produces a nodes list that is compatible with the KG UI ingestion script. Nodes require two columns, id and label. Note that we only have one node type, Transcription Factor, so we only require one node file. The NCBI gene ID is used as the id and the TF name (gene symbol) is used as the label.

The KG UI edges require three fields: source, relation, and target. We have two relation types, upregulation and downregulation, so we need to produce two edge files. For the edges, we include the z-score and p-value as metadata for each relationship. This enables us to filter the network edges in the website by significance. 

The file names for both the node and edge files are formatted to be compatible with the KG UI ingestion script and should not be changed.

11 TFs were found to be formatted differently in NCBI than in the network. These substitutions update the gene names to reflect NCBI formatting. 

Format the nodes. Each node has a gene ID (id), gene symbol (label), and URI (metadata).\
`[id,label,metadata]`

In [None]:
# Set the label used to describe the node type
node_name = "Transcription Factor"

# collect all unique source and target nodes (rows missing either endpoint are ignored)
endpoints = final_edge_list[['source', 'target']].dropna()
nodes = set(endpoints['source']) | set(endpoints['target'])

# Resolve labels in sorted order: set iteration order changes between runs, and
# get_gene_meta keeps only the first label it sees for a given gene ID.
metanodes = {node: get_gene_meta(node) for node in sorted(nodes, key=str)}

# Labels without metadata (None) would otherwise be written as blank node rows.
resolved_nodes = {node: meta for node, meta in metanodes.items() if meta is not None}
metanode_df = pd.DataFrame(resolved_nodes, index=['id', 'label', 'uri'])
metanode_df.T.to_csv(f"{assertions_dir}/{node_name}.nodes.csv", index=False)

Format the edges. The ID used to identify each node is the gene ID. Find these IDs and create the edges. We include the p-value and z-score as metadata in order to allow edge sorting. \
`[source, relation, target, z-score, p-value]` 

In [None]:
# convert source and target labels to IDs
def _label_to_id(label):
    ''' Return the gene ID for a node label, or the label itself when no gene metadata is available
    '''
    meta = get_gene_meta(label)
    return label if meta is None else meta['id']

# Both endpoints use the same fallback, so an unresolved target no longer raises
# a TypeError while an unresolved source is silently kept.
final_edge_list['sourceID'] = final_edge_list['source'].apply(_label_to_id)
final_edge_list['targetID'] = final_edge_list['target'].apply(_label_to_id)

# reorder the index to match ingestion format
new_order = ['sourceID','relation','targetID','z-score','p-value','source','target']
column_rename = {'sourceID':'source','targetID':'target','z-score':'z_score','p-value':'p_value','source':'source_label','target':'target_label'}

# rename relations to be more descriptive
relation_rename = {
    '+': 'upregulates',
    '-': 'downregulates'
}

# Build the output as a new frame instead of mutating a slice of final_edge_list in place.
reordered_df = (
    final_edge_list[new_order]
    .rename(columns=column_rename)
    .assign(relation=lambda edges: edges['relation'].replace(relation_rename))
    .sort_values(by=['z_score'], ascending=False)
)

# split the edge list based on relation type and save to two files
for relation in reordered_df['relation'].unique():
    filtered_df = reordered_df[reordered_df['relation'] == relation]
    file_name = f"{assertions_dir}/{node_name}.{relation}.{node_name}.edges.csv"
    filtered_df.to_csv(file_name, index=False)

## Histogram figures of source edge count, target edge count, and edge p-value distributions
By default, figures will be saved as PNGs. You can adjust the DPI or change the filetype to SVG below.

In [None]:
# Figure output settings; plot_histogram reads outputdir, outputtype, dpi and figsize.
outputdir = f'{assertions_dir}/summary_figures'
outputtype = 'png'
# NOTE(review): this re-assigns network_type, overriding any value chosen earlier in the notebook.
network_type = 'node_weighted'
# NOTE(review): unlike the filtered edge list path, this one has no 'large/' component - confirm it is intended.
path_to_network = f'./filtered_edge_list/{network_type}/p_sorted_edge_stats.csv'
dpi=300
figsize=(8,6)

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(outputdir, exist_ok=True)

# The file is read as tab-delimited despite its .csv extension; only the three plotted columns are loaded.
network = pd.read_csv(path_to_network, delimiter = '\t', usecols=['source', 'target', 'p-value'])

In [None]:
def plot_histogram(items, logbins, xlabel, ylabel, fig_name, ylog=False):
    ''' Draw a black histogram of {items} over the bin edges {logbins} on a log x-axis
    (and a log y-axis when {ylog} is set), save it as
    {outputdir}/{fig_name}.{outputtype} and show it.

    Figure size, output directory, file type and DPI come from the notebook-level
    settings figsize, outputdir, outputtype and dpi.
    '''
    _, axis = plt.subplots(figsize=figsize)
    axis.hist(items, bins=logbins, color='black')

    axis.set_xscale('log')
    if ylog:
        axis.set_yscale('log')

    for set_label, text in ((axis.set_xlabel, xlabel), (axis.set_ylabel, ylabel)):
        set_label(text, fontsize=14)

    for which in ('x', 'y'):
        axis.tick_params(axis=which, labelsize=12)

    target = f'{outputdir}/{fig_name}.{outputtype}'
    plt.savefig(target, dpi=dpi)
    plt.show()

Source edges per TF

In [None]:
# number of rows in which each TF appears in the 'source' column
sources = network['source'].value_counts()
# 100 log-spaced bin edges from 10^0 to 10^4
logbins = np.logspace(0,4,100)
plot_histogram(
    items=sources,
    logbins=logbins,
    xlabel="Sources per transcription factor",
    ylabel="Nodes",
    fig_name="source_histo",
)

Target edges per TF

In [None]:
# number of rows in which each TF appears in the 'target' column
targets = network['target'].value_counts() 
# 100 log-spaced bin edges from 10^0 to 10^3.1
logbins = np.logspace(0, 3.1,100)
plot_histogram(
    items=targets,
    logbins=logbins,
    xlabel="Targets per transcription factor",
    ylabel="Nodes",
    fig_name="target_histo",
)

Edge significance by p-value

In [None]:
# one p-value per edge of the loaded network
pvalue = network['p-value']
# 100 log-spaced bin edges from 10^-17 to 10^0
logbins = np.logspace(-17,0,100)
plot_histogram(
    items=pvalue,
    logbins=logbins,
    xlabel="Edge p-value",
    ylabel="Edges",
    fig_name="pvalue_histo",
    ylog=True,
)