# BP Jaccard Similarity

This program requires the following files:

    Gene-Disease Associations, All GO IDs.csv
    GO-BP ID Ancestor Functions.ipynb
    go.obo

The program outputs the following file:

    BP Jaccard Similarity.csv
    
The program takes about 4 hours to produce this file and may seem unresponsive while it runs.

## Define the default filenames

In [1]:
# Use os module to access files in other directories.
from os import path

# Default input file: the gene-disease associations table with all GO IDs.
# The relative path is converted to an absolute path by path.abspath.
gene_disease_associations_path = path.abspath(
    '../Gene-Disease Associations/Gene-Disease Associations, All GO IDs.csv')

## Define a function to open gene-disease association files

In [2]:
# Import the pandas module for opening files.
import pandas

def get_disease_bp_terms(filename):
    '''Load the disease-to-BP-term table from a gene-disease .csv file.

    Only the 'DB ID', 'Disease' and 'GO-BP ID' columns are read. The
    first row of the file is used as the header and every value is
    read as a string.

    Parameters:
    filename (str): Path of the gene-disease associations file.

    Returns:
    pandas.DataFrame: The selected columns of the file.
    '''
    # The only columns needed to compute BP Jaccard scores.
    wanted_columns = ['DB ID', 'Disease', 'GO-BP ID']

    return pandas.read_csv(
        filename, usecols=wanted_columns, header=0, dtype=str)

## Open the gene-disease associations file to access the BP terms

In [3]:
# Load the gene-disease associations (all GO IDs) from the default path.
disease_bp_terms = get_disease_bp_terms(gene_disease_associations_path)

### Display the contents of the gene-disease associations file

In [4]:
# Notebook display only: shows the loaded gene-disease table.
# This line has no effect on later cells and may be deleted.
disease_bp_terms

Unnamed: 0,DB ID,Disease,GO-BP ID
0,114500,Colorectal cancer with chromosomal instability...,GO:0009895 | GO:0006417 | GO:0050918 | GO:0001...
1,114480,"Breast cancer, somatic | {Breast cancer, prote...",GO:0009895 | GO:0006417 | GO:0070201 | GO:0030...
2,125853,"Diabetes mellitus, noninsulin-dependent, late ...",GO:0006417 | GO:0070201 | GO:0071704 | GO:0072...
3,611162,"{Malaria, resistance to} | {Malaria, protectio...",GO:0009895 | GO:0006417 | GO:0070201 | GO:0030...
4,167000,"Ovarian cancer, somatic",GO:0009895 | GO:0006417 | GO:0070201 | GO:0071...
...,...,...,...
5410,235550,Hepatic venoocclusive disease with immunodefic...,GO:0008150 | GO:0006357 | GO:0044403 | GO:0016...
5411,615544,?Periventricular nodular heterotopia 6,GO:0008150 | GO:0007275 | GO:0032501 | GO:0048...
5412,234050,"Trichothiodystrophy 4, nonphotosensitive",GO:0008150 | GO:0009987 | GO:0051301 | GO:0007049
5413,614700,"Immunodeficiency, common variable, 8, with aut...",GO:0033036 | GO:0008150 | GO:0051179 | GO:0008104


## Create BP Jaccard score file without GO ID ancestors

In [5]:
# Start the BP Jaccard score table from the two identifying columns,
# 'DB ID' and 'Disease'; the 'GO-BP ID' column is left out.
bp_jaccard_score = disease_bp_terms[['DB ID', 'Disease']]

### Display the BP Jaccard score file

In [6]:
# Notebook display only: shows the 'DB ID' and 'Disease' columns.
# This line has no effect on later cells and may be deleted.
bp_jaccard_score

Unnamed: 0,DB ID,Disease
0,114500,Colorectal cancer with chromosomal instability...
1,114480,"Breast cancer, somatic | {Breast cancer, prote..."
2,125853,"Diabetes mellitus, noninsulin-dependent, late ..."
3,611162,"{Malaria, resistance to} | {Malaria, protectio..."
4,167000,"Ovarian cancer, somatic"
...,...,...
5410,235550,Hepatic venoocclusive disease with immunodefic...
5411,615544,?Periventricular nodular heterotopia 6
5412,234050,"Trichothiodystrophy 4, nonphotosensitive"
5413,614700,"Immunodeficiency, common variable, 8, with aut..."


## Create a square matrix to store BP Jaccard scores

In [7]:
# Append the transposed table below the original rows. The result has
# one integer-labelled column per disease next to 'DB ID' and 'Disease',
# which provides the square region where the pairwise scores are stored.
bp_jaccard_score = pandas.concat(
    [bp_jaccard_score, bp_jaccard_score.T])

### Display BP Jaccard score file

In [8]:
# Notebook display only: shows the still-empty square score matrix.
# This line has no effect on later cells and may be deleted.
bp_jaccard_score

Unnamed: 0,DB ID,Disease,0,1,2,3,4,5,6,7,...,5405,5406,5407,5408,5409,5410,5411,5412,5413,5414
0,114500,Colorectal cancer with chromosomal instability...,,,,,,,,,...,,,,,,,,,,
1,114480,"Breast cancer, somatic | {Breast cancer, prote...",,,,,,,,,...,,,,,,,,,,
2,125853,"Diabetes mellitus, noninsulin-dependent, late ...",,,,,,,,,...,,,,,,,,,,
3,611162,"{Malaria, resistance to} | {Malaria, protectio...",,,,,,,,,...,,,,,,,,,,
4,167000,"Ovarian cancer, somatic",,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5412,234050,"Trichothiodystrophy 4, nonphotosensitive",,,,,,,,,...,,,,,,,,,,
5413,614700,"Immunodeficiency, common variable, 8, with aut...",,,,,,,,,...,,,,,,,,,,
5414,618425,Neurodevelopmental disorder with impaired spee...,,,,,,,,,...,,,,,,,,,,
DB ID,,,114500,114480,125853,611162,167000,114550,211980,601626,...,609432,140000,228600,617175,176305,235550,615544,234050,614700,618425


## Get the BP terms associated with each disease, keyed by the disease's row index

In [9]:
# Map every row index of the gene-disease table to the set of GO-BP IDs
# listed in that row. Each row stores its IDs as one string in which the
# IDs are separated by ' | ', so the string is split and turned into a set.
bp_term_dict = {
    row_index: set(terms.split(' | '))
    for row_index, terms in disease_bp_terms['GO-BP ID'].items()
}

### Display the dictionary with row index keys and BP term set values

In [10]:
# Notebook display only: shows bp_term_dict as a table with one row per
# dictionary key. This line may be deleted.
pandas.DataFrame.from_dict(bp_term_dict, orient = 'index')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2374,2375,2376,2377,2378,2379,2380,2381,2382,2383
0,GO:0045861,GO:0009798,GO:0034622,GO:0031032,GO:0061921,GO:2000147,GO:0032026,GO:0033233,GO:0032418,GO:0043149,...,GO:0061900,GO:0043244,GO:1900372,GO:0033623,GO:0051235,GO:0006302,GO:0010466,GO:0051656,GO:0010810,GO:0060260
1,GO:0045861,GO:0030214,GO:0034622,GO:2000147,GO:0016444,GO:0071824,GO:0030155,GO:1902253,GO:1900407,GO:0070374,...,,,,,,,,,,
2,GO:0034622,GO:2000147,GO:0046475,GO:0002819,GO:0071824,GO:0042330,GO:0030155,GO:0034375,GO:0060070,GO:0010962,...,,,,,,,,,,
3,GO:0045861,GO:2000332,GO:2000147,GO:0034114,GO:0002819,GO:0099024,GO:0030155,GO:1900407,GO:0050858,GO:0070374,...,,,,,,,,,,
4,GO:0045861,GO:0009798,GO:0034622,GO:2000147,GO:0033233,GO:0030155,GO:1902253,GO:1900407,GO:0060070,GO:0010962,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5410,GO:0044419,GO:0006357,GO:0044403,GO:0008150,GO:0016032,,,,,,...,,,,,,,,,,
5411,GO:0032502,GO:0032501,GO:0048856,GO:0007275,GO:0008150,,,,,,...,,,,,,,,,,
5412,GO:0007049,GO:0051301,GO:0008150,GO:0009987,,,,,,,...,,,,,,,,,,
5413,GO:0008104,GO:0008150,GO:0033036,GO:0051179,,,,,,,...,,,,,,,,,,


## Define a function that takes two row indexes and outputs the BP Jaccard similarity without including GO ID ancestors

Where the **BP Jaccard score** is defined as 

$$J_{BP}(x,y)= \dfrac{|remove\_ancestors(X\cap Y)|}{|remove\_ancestors(X\cup Y) \cup remove\_ancestors(X\cap Y)|}$$

where $X$ and $Y$ are the sets of biological processes associated with the diseases $x$ and $y$, respectively, and the function $remove\_ancestors$ removes redundant ancestor (parent) biological process terms from a set.

### Import the function that will remove GO ID ancestors

In [11]:
# Run the companion Jupyter Notebook so that its helper functions,
# including remove_ancestors, are defined before they are used below.
%run "GO-BP ID Ancestor Functions.ipynb"

D:\Documents\Research\Paper\Camera Ready\Programs\Gene-Disease Associations\go.obo: fmt(1.2) rel(2020-06-01) 47,233 GO Terms
Create an acyclic-directed graph using the GO.obo file.
Define remove_ancestors: Take a list of BP IDs and remove redundant ID ancestors. 
Define get_shared_bps_no_ancestors: Return the set of BP terms that two diseases share after removing redundant BP term ancestors.
Define count_elements: Count the number of GO IDs left (assumes that entries with zero elements are empty or null).
Define get_all_ancestors: Take a string of GO IDs and return a list containing the GO IDs and their parents.
Define count_elements: Count the number of GO IDs left (assumes that entries with zero elements are empty or null).


In [12]:
def jaccard_similarity_no_ancestors(row1, row2):
    '''Return the BP Jaccard similarity between two diseases.

    With X and Y the BP term sets of the two diseases and R the
    remove_ancestors function, the score is

        |R(X & Y)| / |R(X | Y) | R(X & Y)|

    The BP term sets are read from the module-level bp_term_dict.

    Parameters:
    row1 (int): The row index number of disease 1.
    row2 (int): The row index number of disease 2.

    Returns:
    float: The similarity score. If no BP terms are left in the
    denominator, 0.0 is returned instead of raising ZeroDivisionError,
    so that one degenerate pair cannot abort a run over all pairs.
    '''
    terms1 = bp_term_dict[row1]
    terms2 = bp_term_dict[row2]

    # Remove ID ancestors after finding the shared BP terms, not before.
    shared_no_ancestors = remove_ancestors(terms1.intersection(terms2))

    # Remove ID ancestors from all BP terms of both diseases.
    combined_no_ancestors = remove_ancestors(terms1.union(terms2))

    # The denominator also includes the shared terms kept above.
    denominator = combined_no_ancestors.union(shared_no_ancestors)

    # Guard against an empty denominator.
    if len(denominator) == 0:
        return 0.0

    return len(shared_no_ancestors) / len(denominator)

## Define a function that stores the BP Jaccard similarity for every disease combination

In [13]:
def store_jaccard_similarity(jaccard_file, get_jaccard_similarity):
    '''Fill a score matrix with the similarity of every disease pair.

    The data frame is modified in place. Every diagonal cell is set to
    1 and every cell above the diagonal receives the value returned by
    get_jaccard_similarity. Cells below the diagonal are not written.

    Parameters:
    jaccard_file: Pandas data frame storing a square matrix of every
    disease combination, addressed by row and column index numbers.
    get_jaccard_similarity: Function that takes two row index numbers
    and returns the BP Jaccard similarity of those diseases.
    '''
    # bp_term_dict has one key per disease, so its length is the number
    # of rows (and columns) to fill.
    disease_count = len(bp_term_dict)

    # First pass: write 1 into every cell of the matrix diagonal.
    for index in range(disease_count):
        jaccard_file.at[index, index] = 1

    # Second pass: visit every row of the square matrix.
    for first in range(disease_count):

        # Print the row number every 50 rows to show progress.
        if first % 50 == 0:
            print(first, end=', ')

        # Only columns strictly after the row index are visited, so
        # each pair of different diseases is scored exactly once.
        for second in range(first + 1, disease_count):
            score = get_jaccard_similarity(first, second)
            jaccard_file.at[first, second] = score

## Store the BP Jaccard similarity without GO ID ancestors for every disease combination

This takes about 4 hours.

In [14]:
# Fill the square matrix with BP Jaccard scores computed after removing
# GO ID ancestors. The notebook states that this takes about 4 hours.
store_jaccard_similarity(
    bp_jaccard_score, jaccard_similarity_no_ancestors)

0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 1950, 2000, 2050, 2100, 2150, 2200, 2250, 2300, 2350, 2400, 2450, 2500, 2550, 2600, 2650, 2700, 2750, 2800, 2850, 2900, 2950, 3000, 3050, 3100, 3150, 3200, 3250, 3300, 3350, 3400, 3450, 3500, 3550, 3600, 3650, 3700, 3750, 3800, 3850, 3900, 3950, 4000, 4050, 4100, 4150, 4200, 4250, 4300, 4350, 4400, 4450, 4500, 4550, 4600, 4650, 4700, 4750, 4800, 4850, 4900, 4950, 5000, 5050, 5100, 5150, 5200, 5250, 5300, 5350, 5400, 

## Save the BP Jaccard similarity file as a .csv file (no GO ID ancestors)

In [15]:
# Name of the output .csv file.
filename = 'BP Jaccard Similarity.csv'

# Keep index = True so that the row labels are written to the file
# instead of being dropped.
bp_jaccard_score.to_csv(filename, index = True)

# Note: Do not save square matrices as Excel sheets using the 'to_excel'
# function; the original author reports that this can take hours.
# Save matrices as .csv files and then manually convert them to Excel.