# Match Data for Max Gene Uniqueness

This program requires the following files:

    Max Gene Uniqueness.xlsx
    BP-Weight.csv
    BP Jaccard Similarity.csv
    Data Matching Functions.ipynb

The program outputs the following file:

    Match Data for Max Gene Uniqueness.xlsx

The program takes about 2 minutes to produce this file and may seem unresponsive while it runs.

## Define the filenames

In [1]:
# Use pandas module for opening files.
import pandas 
# Use numpy module for specifying numpy.nan values.
import numpy
# Use os module to access files in other directories. 
from os import path

# Spreadsheet listing the disease pairs with their max gene uniqueness.
max_gene_uniqueness_path = path.abspath(
    '../Max Gene Uniqueness/Max Gene Uniqueness.xlsx')

# Matrix files ("wo" in the variable names stands for "without").
bp_weights_wo_ancestors_path = path.abspath('../BP-Weight/BP-Weight.csv')
bp_jaccard_wo_ancestors_path = path.abspath(
    '../BP Jaccard Similarity/BP Jaccard Similarity.csv')

# Keep every matrix label next to the file it describes so the two
# lists built below can never drift out of step with each other.
matrix_sources = [
    ('BP-Weight without Ancestors', bp_weights_wo_ancestors_path),
    ('BP Jaccard without Ancestors', bp_jaccard_wo_ancestors_path),
]

# Parallel lists: names[i] is the label for the matrix stored at paths[i].
paths = [source[1] for source in matrix_sources]
names = [source[0] for source in matrix_sources]

## Run the helper notebook that defines the functions for opening files and extracting data from matrices

In [2]:
# Execute the helper notebook in this kernel so that the functions it
# defines (listed in the output below) are available to later cells.
%run "Data Matching Functions.ipynb"

Define open_max_excel_file: Open the file with the max disease pairs. 
Define open_file: Open .csv matrix file.
Define find_values_using_database_ids: Return table corresponding to specified DB ID pairs. 
Define get_values_from_matrices: Return values extracted from matrices.


## Open the max gene uniqueness file

In [3]:
# Open the max gene uniqueness file (using 'No Duplicates' sheet).
# NOTE(review): the sheet is presumably selected inside
# open_max_excel_file, which is defined in the helper notebook and is
# not visible here -- confirm there.
max_gene_uniqueness = open_max_excel_file(max_gene_uniqueness_path)

Opening Excel file.


### Display the contents of the max gene uniqueness file

In [4]:
# For visualization only: this line just displays the table and may be
# deleted without affecting later cells.
max_gene_uniqueness

Unnamed: 0,DB ID 1,Disease 1,DB ID 2,Disease 2,Max Gene Uniqueness
0,114500,Colorectal cancer with chromosomal instability...,114550,"Hepatocellular cancer, somatic | Hepatoblastom...",4.818368146359217
1,114480,"Breast cancer, somatic | {Breast cancer, prote...",211980,"Adenocarcinoma of lung, response to tyrosine k...",3.853211163039713
3815,242600,"Iminoglycinuria, digenic",138500,Hyperglycinuria,2.938025771639991
442,202400,"Afibrinogenemia, congenital | Hypofibrinogenem...",616004,"Dysfibrinogenemia, congenital | Hypodysfibrino...",2.938025771639991
2192,226650,"Epidermolysis bullosa, generalized atrophic be...",226700,"Epidermolysis bullosa, junctional, Herlitz type",2.933706549333157
...,...,...,...,...,...
2752,618356,Neurodevelopmental disorder with central and p...,125800,"Diabetes insipidus, nephrogenic",0
2753,125800,"Diabetes insipidus, nephrogenic",615516,"Mental retardation, autosomal recessive 38",0
2755,617899,"Leukodystrophy, hypomyelinating, 14",214500,Chediak-Higashi syndrome,0
2756,214500,Chediak-Higashi syndrome,121850,Corneal fleck dystrophy,0


## Store the values obtained from the matrices

This takes about 2 minutes.

In [5]:
# Combine the measurements of the matrices specified by file paths
# using the DB IDs from the max gene uniqueness file. Each entry of
# `names` labels the output column for the matrix at the same position
# in `paths`.
values = get_values_from_matrices(max_gene_uniqueness, paths, names)

BP-Weight without Ancestors 	 D:\Documents\Research\Paper\Camera Ready\Programs\BP-Weight\BP-Weight.csv
Opening .csv file.
Dropping unused rows and columns.
Replacing labels with corresponding DB IDs.
Converting values to type float.
BP Jaccard without Ancestors 	 D:\Documents\Research\Paper\Camera Ready\Programs\BP Jaccard Similarity\BP Jaccard Similarity.csv
Opening .csv file.
Dropping unused rows and columns.
Replacing labels with corresponding DB IDs.
Converting values to type float.


### Display the contents of the values table

In [6]:
# For visualization only: this line just displays the table and may be
# deleted without affecting later cells.
values

Unnamed: 0,DB ID 1,Disease 1,DB ID 2,Disease 2,Max Gene Uniqueness,BP-Weight without Ancestors,BP Jaccard without Ancestors
0,114500,Colorectal cancer with chromosomal instability...,114550,"Hepatocellular cancer, somatic | Hepatoblastom...",4.818368146359217,281.160848,0.430556
1,114480,"Breast cancer, somatic | {Breast cancer, prote...",211980,"Adenocarcinoma of lung, response to tyrosine k...",3.853211163039713,195.344487,0.339879
3815,242600,"Iminoglycinuria, digenic",138500,Hyperglycinuria,2.938025771639991,6.441373,1.000000
442,202400,"Afibrinogenemia, congenital | Hypofibrinogenem...",616004,"Dysfibrinogenemia, congenital | Hypodysfibrino...",2.938025771639991,27.881802,1.000000
2192,226650,"Epidermolysis bullosa, generalized atrophic be...",226700,"Epidermolysis bullosa, junctional, Herlitz type",2.933706549333157,14.422983,0.680000
...,...,...,...,...,...,...,...
2752,618356,Neurodevelopmental disorder with central and p...,125800,"Diabetes insipidus, nephrogenic",0,1.804430,0.160000
2753,125800,"Diabetes insipidus, nephrogenic",615516,"Mental retardation, autosomal recessive 38",0,1.471082,0.176471
2755,617899,"Leukodystrophy, hypomyelinating, 14",214500,Chediak-Higashi syndrome,0,0.492402,0.105263
2756,214500,Chediak-Higashi syndrome,121850,Corneal fleck dystrophy,0,1.799744,0.136364


## Sort the values table by max gene uniqueness, then by BP-weight (both descending)

In [7]:
# Columns that drive the ordering: primary key first, tie-breaker second.
sort_columns = ['Max Gene Uniqueness', 'BP-Weight without Ancestors']

# The gene uniqueness column is cast to float so that it is ordered
# numerically rather than as text.
uniqueness_column = sort_columns[0]
values[uniqueness_column] = values[uniqueness_column].astype(float)

# Reorder the table in place, largest values first for both columns.
values.sort_values(by=sort_columns, ascending=False, inplace=True)

### Display the table after sorting

In [8]:
# For visualization only: this line just displays the sorted table and
# may be deleted without affecting later cells.
values

Unnamed: 0,DB ID 1,Disease 1,DB ID 2,Disease 2,Max Gene Uniqueness,BP-Weight without Ancestors,BP Jaccard without Ancestors
0,114500,Colorectal cancer with chromosomal instability...,114550,"Hepatocellular cancer, somatic | Hepatoblastom...",4.818368,281.160848,0.430556
1,114480,"Breast cancer, somatic | {Breast cancer, prote...",211980,"Adenocarcinoma of lung, response to tyrosine k...",3.853211,195.344487,0.339879
442,202400,"Afibrinogenemia, congenital | Hypofibrinogenem...",616004,"Dysfibrinogenemia, congenital | Hypodysfibrino...",2.938026,27.881802,1.000000
3815,242600,"Iminoglycinuria, digenic",138500,Hyperglycinuria,2.938026,6.441373,1.000000
2192,226650,"Epidermolysis bullosa, generalized atrophic be...",226700,"Epidermolysis bullosa, junctional, Herlitz type",2.933707,14.422983,0.680000
...,...,...,...,...,...,...,...
3172,613161,Beta-ureidopropionase deficiency,617271,Nephronophthisis 20,0.000000,0.000000,0.142857
3752,616814,Preimplantation embryonic lethality,610992,?Phosphoserine aminotransferase deficiency,0.000000,0.000000,0.142857
3505,614756,"Cerebellar ataxia, nonprogressive, with mental...",618827,Myopia 27,0.000000,0.000000,0.090909
2854,248000,"Macrocephaly/megalencephaly syndrome, autosoma...",245349,Lacticacidemia due to PDX1 deficiency,0.000000,0.000000,0.142857


## Save combined measurements as an Excel file

In [9]:
# Name of the workbook written to the current working directory.
filename = 'Match Data for Max Gene Uniqueness.xlsx'

# Writing .xlsx is slower than writing .csv, but the table is small
# enough that the difference does not matter here.
# The writer is used as a context manager so the workbook is saved and
# closed when the block exits.
with pandas.ExcelWriter(filename) as spreadsheet:
    # index=False leaves the DataFrame index out of the sheet.
    values.to_excel(excel_writer=spreadsheet,
                    sheet_name='No BP Ancestors',
                    index=False)