# Match Data for Max BP-Weight

This program requires the following files:

    Max BP-Weight.xlsx
    Gene Uniqueness.csv
    BP Jaccard Similarity.csv

The program outputs the following file:

    Match Data for Max BP-Weight.xlsx

The program takes about 2 minutes to produce this file and may seem unresponsive while it runs.

## Define the filenames

In [1]:
# Use pandas module for opening files.
import pandas 
# Use numpy module for specifying numpy.nan values.
import numpy
# Use os module to access files in other directories. 
from os import path

# Location of the max BP-weight workbook.
max_bp_weight_path = path.abspath('../Max BP-Weight/Max BP-Weight.xlsx')

# Locations of the matrix files ("wo" in a variable name means "without").
gene_uniqueness_path = path.abspath('../Gene Uniqueness/Gene Uniqueness.csv')
bp_jaccard_wo_ancestors_path = path.abspath(
    '../BP Jaccard Similarity/BP Jaccard Similarity.csv')

# Keep every matrix label next to its path so the two lists built below
# cannot drift out of step: names[i] always describes paths[i].
matrix_files = [
    ('Gene Uniqueness', gene_uniqueness_path),
    ('BP Jaccard without Ancestors', bp_jaccard_wo_ancestors_path),
]
names = [label for label, _ in matrix_files]
paths = [file_path for _, file_path in matrix_files]

## Run the helper notebook that defines the functions for opening files and extracting data from matrices

In [2]:
# Execute the helper notebook in this kernel so that the functions it
# defines (open_max_excel_file, open_file,
# find_values_using_database_ids, get_values_from_matrices) become
# available to the cells below.
%run "Data Matching Functions.ipynb"

Define open_max_excel_file: Open the file with the max disease pairs. 
Define open_file: Open .csv matrix file.
Define find_values_using_database_ids: Return table corresponding to specified DB ID pairs. 
Define get_values_from_matrices: Return values extracted from matrices.


## Open the max BP-weight file

In [3]:
# Open the max BP-weight file (using 'No Duplicates' sheet).
# open_max_excel_file comes from the "Data Matching Functions.ipynb"
# notebook executed with %run earlier in this notebook.
max_bp_weight = open_max_excel_file(max_bp_weight_path)

Opening Excel file.


### Display the contents of the max BP-weight file

In [4]:
# For visualization only: shows the max BP-weight table that was just
# opened. This line may be deleted; later cells do not depend on it.
max_bp_weight

Unnamed: 0,DB ID 1,Disease 1,DB ID 2,Disease 2,Max BP-Weight
0,114500.0,Colorectal cancer with chromosomal instability...,114480.0,"Breast cancer, somatic | {Breast cancer, prote...",304.279769056724
5,114550.0,"Hepatocellular cancer, somatic | Hepatoblastom...",114500.0,Colorectal cancer with chromosomal instability...,281.1608477056669
4,167000.0,"Ovarian cancer, somatic",114500.0,Colorectal cancer with chromosomal instability...,274.5626762412183
2,125853.0,"Diabetes mellitus, noninsulin-dependent, late ...",114500.0,Colorectal cancer with chromosomal instability...,211.5965500965389
6,211980.0,"Adenocarcinoma of lung, response to tyrosine k...",114500.0,Colorectal cancer with chromosomal instability...,206.8739154714535
...,...,...,...,...,...
5347,301035.0,"Hypothyroidism, congenital, nongoitrous, 9",114500.0,Colorectal cancer with chromosomal instability...,0.8938632739490935
5379,615355.0,Noonan syndrome 8,114500.0,Colorectal cancer with chromosomal instability...,0.868245730838466
5411,615544.0,?Periventricular nodular heterotopia 6,114500.0,Colorectal cancer with chromosomal instability...,0.7104465819539965
5413,614700.0,"Immunodeficiency, common variable, 8, with aut...",114500.0,Colorectal cancer with chromosomal instability...,0.5778523012099661


## Store the values obtained from the matrices

This takes about 2 minutes.

In [5]:
# Combine the measurements of the matrices specified by file paths
# using the DB IDs from the BP-weight file. In the resulting table each
# matrix appears as one extra column labelled with its entry in names.
values = get_values_from_matrices(max_bp_weight, paths, names)

Gene Uniqueness 	 D:\Documents\Research\Paper\Camera Ready\Programs\Gene Uniqueness\Gene Uniqueness.csv
Opening .csv file.
Dropping unused rows and columns.
Replacing labels with corresponding DB IDs.
Converting values to type float.
BP Jaccard without Ancestors 	 D:\Documents\Research\Paper\Camera Ready\Programs\BP Jaccard Similarity\BP Jaccard Similarity.csv
Opening .csv file.
Dropping unused rows and columns.
Replacing labels with corresponding DB IDs.
Converting values to type float.


### Display the contents of the values file

In [6]:
# For visualization only: shows the combined table before sorting.
# This line may be deleted; later cells do not depend on it.
values

Unnamed: 0,DB ID 1,Disease 1,DB ID 2,Disease 2,Max BP-Weight,Gene Uniqueness,BP Jaccard without Ancestors
0,114500.0,Colorectal cancer with chromosomal instability...,114480.0,"Breast cancer, somatic | {Breast cancer, prote...",304.279769056724,2.873541,0.387429
5,114550.0,"Hepatocellular cancer, somatic | Hepatoblastom...",114500.0,Colorectal cancer with chromosomal instability...,281.1608477056669,4.818368,0.430556
4,167000.0,"Ovarian cancer, somatic",114500.0,Colorectal cancer with chromosomal instability...,274.5626762412183,2.884662,0.407754
2,125853.0,"Diabetes mellitus, noninsulin-dependent, late ...",114500.0,Colorectal cancer with chromosomal instability...,211.5965500965389,0.000000,0.245000
6,211980.0,"Adenocarcinoma of lung, response to tyrosine k...",114500.0,Colorectal cancer with chromosomal instability...,206.8739154714535,0.951003,0.258938
...,...,...,...,...,...,...,...
5347,301035.0,"Hypothyroidism, congenital, nongoitrous, 9",114500.0,Colorectal cancer with chromosomal instability...,0.8938632739490935,0.000000,0.001572
5379,615355.0,Noonan syndrome 8,114500.0,Colorectal cancer with chromosomal instability...,0.868245730838466,0.000000,0.001575
5411,615544.0,?Periventricular nodular heterotopia 6,114500.0,Colorectal cancer with chromosomal instability...,0.7104465819539965,0.000000,0.001572
5413,614700.0,"Immunodeficiency, common variable, 8, with aut...",114500.0,Colorectal cancer with chromosomal instability...,0.5778523012099661,0.000000,0.001572


## Sort the values file by BP-weight and by gene uniqueness

In [7]:
# The Max BP-Weight column arrives as strings; cast it to float so the
# sort below compares numbers rather than text.
bp_weight_as_float = values['Max BP-Weight'].astype(float)
values['Max BP-Weight'] = bp_weight_as_float

# Sort in descending order: BP-weight is the primary key and gene
# uniqueness breaks ties between equal BP-weights.
values.sort_values(
    by=['Max BP-Weight', 'Gene Uniqueness'],
    ascending=False,
    inplace=True,
)

### Display the table after sorting

In [8]:
# For visualization only: shows the combined table after sorting.
# This line may be deleted; later cells do not depend on it.
values

Unnamed: 0,DB ID 1,Disease 1,DB ID 2,Disease 2,Max BP-Weight,Gene Uniqueness,BP Jaccard without Ancestors
0,114500.0,Colorectal cancer with chromosomal instability...,114480.0,"Breast cancer, somatic | {Breast cancer, prote...",304.279769,2.873541,0.387429
5,114550.0,"Hepatocellular cancer, somatic | Hepatoblastom...",114500.0,Colorectal cancer with chromosomal instability...,281.160848,4.818368,0.430556
4,167000.0,"Ovarian cancer, somatic",114500.0,Colorectal cancer with chromosomal instability...,274.562676,2.884662,0.407754
2,125853.0,"Diabetes mellitus, noninsulin-dependent, late ...",114500.0,Colorectal cancer with chromosomal instability...,211.596550,0.000000,0.245000
6,211980.0,"Adenocarcinoma of lung, response to tyrosine k...",114500.0,Colorectal cancer with chromosomal instability...,206.873915,0.951003,0.258938
...,...,...,...,...,...,...,...
5347,301035.0,"Hypothyroidism, congenital, nongoitrous, 9",114500.0,Colorectal cancer with chromosomal instability...,0.893863,0.000000,0.001572
5379,615355.0,Noonan syndrome 8,114500.0,Colorectal cancer with chromosomal instability...,0.868246,0.000000,0.001575
5411,615544.0,?Periventricular nodular heterotopia 6,114500.0,Colorectal cancer with chromosomal instability...,0.710447,0.000000,0.001572
5413,614700.0,"Immunodeficiency, common variable, 8, with aut...",114500.0,Colorectal cancer with chromosomal instability...,0.577852,0.000000,0.001572


## Save combined measurements as an Excel file

In [9]:
# Name of the output workbook; the path is relative, so the file is
# written to the current working directory.
filename = 'Match Data for Max BP-Weight.xlsx'

# Writing Excel is much slower than writing .csv, which is acceptable
# here because the table is very small.
with pandas.ExcelWriter(filename) as excel_writer:
    # index=False keeps the DataFrame index out of the sheet.
    values.to_excel(excel_writer, index=False,
                    sheet_name='No BP Ancestors')