# Max Value Finder

This notebook contains reusable functions that find the maximum-value index pairs of half-full matrices. Such matrices include:

    Gene Uniqueness.csv
    BP-Weight.csv
    BP Jaccard Similarity.csv

## Define a function to open .csv matrix files

In [1]:
# Import pandas module for opening files.
import pandas

def open_file(filename):
    '''Read a .csv matrix file into a Pandas dataframe.

    Parameters:
    filename (str): Path of the .csv file to read.

    Returns:
    Pandas dataframe whose cells are all read as strings and whose
    row index is taken from the first column of the file.
    '''
    # Report progress before reading.
    print("Opening file.")

    # Keep every cell as text and use column 0 as the row labels.
    return pandas.read_csv(filename, index_col = 0, dtype = str)

## Define a function that gets the submatrix containing numeric values (ignoring disease names and IDs)

In [2]:
def get_values(matrix):
    '''Extract the square block of numeric values from a matrix file.

    The returned submatrix holds only the numeric cells, meaning that
    columns such as 'Disease' or 'DB ID' are left out.

    Parameters:
    matrix: Pandas dataframe containing all the values. Its rows and
    columns must carry the string labels '0', '1', ... and the last
    column of the dataframe must be the last numeric label, because
    that label is used as the end of the slice.

    Returns:
    Pandas dataframe of type float whose row and column labels are
    integers.
    '''
    # The label of the final column, for example '4999'.
    last_label = matrix.columns[-1]

    # Labels start at 0, so a last label of 4999 means 5000 per side.
    side = int(last_label) + 1
    print(f'Getting square matrix of size {side}x{side} as type float.')

    # Slice by label on both axes, then turn the text into floats.
    square = matrix.loc['0':last_label, '0':last_label]
    values = square.astype(float)

    # Use integer labels so that lookups such as values.at[0, 0] work;
    # the string labels read from the file would not match them.
    values.columns = [int(label) for label in values.columns]
    values.index = [int(label) for label in values.index]

    return values

## Define a function that gets the location of max values by scanning a half-full matrix vertically and horizontally

In [3]:
# Import numpy module for dealing with NaN values
# and replacing matrix diagonals.
import numpy

def _idxmax_or_nan(frame, axis):
    '''Return the label of the maximum along an axis, using NaN for
    every scanned column (axis 0) or row (axis 1) that has no values.

    Calling idxmax directly on a row/column that is entirely NaN is
    deprecated in recent pandas releases and raises in newer ones, so
    those rows/columns are excluded before calling idxmax and put back
    afterwards as NaN.
    '''
    # True for every column (axis 0) or row (axis 1) holding a value.
    has_data = frame.notna().any(axis = axis)

    # Nothing to scan at all: every answer is NaN.
    if not has_data.any():
        return pandas.Series(numpy.nan, index = has_data.index)

    # Nothing to exclude: idxmax can be used as is.
    if has_data.all():
        return frame.idxmax(axis = axis)

    # Scan only the populated rows/columns, then restore the full set
    # of labels; the excluded ones are filled with NaN by reindex.
    keep = has_data.to_numpy()
    populated = frame.loc[:, keep] if axis == 0 else frame.loc[keep, :]
    return populated.idxmax(axis = axis).reindex(has_data.index)


def find_max_values(values):
    '''Return the row and column indexes of the cells containing the
    maximum values in the matrix.

    Note that the diagonal of `values` is overwritten with NaN in
    place, so the dataframe passed in is modified.

    Parameters:
    values: Pandas dataframe containing all the values. The dataframe
    must be a matrix and be half full.

    Returns:
    column_row_max: Pandas series with the columns as keys and, as
    values, the row where the max value of each column is (NaN if the
    column has no values).
    row_column_max: Pandas series with the rows as keys and, as
    values, the column where the max value of each row is (NaN if the
    row has no values).
    '''
    # Replace all the values in the diagonal using NaN values, or
    # the maximum values will all be located along the diagonal.
    # The cells are set through .iat because the array returned by
    # values.values is not guaranteed to be a writable view.
    print('Replacing matrix diagonal with NaN values.')
    for position in range(min(values.shape)):
        values.iat[position, position] = numpy.nan

    # Vertically scan every column and return row where max val is.
    # The columns are the keys, and the rows are the values.
    print('Finding max values vertically.')
    column_row_max = _idxmax_or_nan(values, axis = 0)

    # Horizontally scan every row and return column where max val is.
    # The rows are the keys, and the columns are the values.
    print('Finding max values horizontally.')
    row_column_max = _idxmax_or_nan(values, axis = 1)

    # Return row and column indexes of cells containing maximum vals.
    return column_row_max, row_column_max

## Define a function that compares the vertically-found and horizontally-found max values, and returns the bigger of the two

In [4]:
def compare_max_values(column_row_max, row_column_max, values):
    '''For every disease index, pick the bigger of the max value found
    vertically and the max value found horizontally, together with
    the index pair it belongs to. Both are looked at because the
    matrix is half full, so each half has to be compared.

    Parameters:
    column_row_max: Mapping (for example a Pandas series) keyed by the
    integers 0, 1, 2, ... with columns as keys and rows as values.
    The (row, column) coordinates point at a max val; NaN means the
    column has no max.
    row_column_max: Same kind of mapping with rows as keys and columns
    as values. The (row, column) coordinates point at a max val; NaN
    means the row has no max.
    values: Pandas data frame containing all the values in a square
    matrix.

    Returns:
    Dictionary keyed by disease index. Each entry is the tuple
    (disease index, partner index, max value), or three NaN values
    when neither a row nor a column max exists for that disease.
    '''
    print('Comparing vertical and horizontal values.')

    # Results, one entry per disease index.
    best = {}

    for disease in range(len(column_row_max)):

        # Row of the vertical max and column of the horizontal max.
        row = column_row_max[disease]
        col = row_column_max[disease]

        # An index of NaN means no max was found in that direction.
        has_row = not numpy.isnan(row)
        has_col = not numpy.isnan(col)

        # Nothing found in either direction: everything is NaN.
        if not (has_row or has_col):
            best[disease] = (numpy.nan, numpy.nan, numpy.nan)
            continue

        if has_row and has_col:

            # Both directions are defined, so read both candidates.
            row, col = int(row), int(col)
            vertical = values.at[row, disease]
            horizontal = values.at[disease, col]

            # Keep the vertical one only if it is strictly bigger.
            if vertical > horizontal:
                partner, score = row, vertical
            else:
                partner, score = col, horizontal

        elif has_col:

            # Only the horizontal max exists.
            partner = int(col)
            score = values.at[disease, partner]

        else:

            # Only the vertical max exists.
            partner = int(row)
            score = values.at[partner, disease]

        best[disease] = (disease, partner, score)

    return best

## Define a function that turns a dictionary into a Pandas dataframe

In [5]:
def dictionary_to_dataframe(max_values, sort = True):
    '''Build a Pandas dataframe out of a dictionary of max values,
    optionally sorted by the 'Value' column in descending order.

    Parameters:
    max_values (dict): Dictionary whose values are tuples with three
    elements (Index 1, Index 2, and Value). The dictionary keys become
    the row index of the dataframe.
    sort (bool): Whether or not to sort the index pairs based on
    value.

    Returns:
    Pandas dataframe with the columns 'Index 1', 'Index 2' and 'Value'.
    '''
    # One row per dictionary entry, one column per tuple element.
    table = pandas.DataFrame.from_dict(
        max_values,
        orient = 'index',
        columns = ['Index 1', 'Index 2', 'Value'])

    # Leave the rows in dictionary order when sorting is not wanted.
    if not sort:
        return table

    # Biggest values first.
    return table.sort_values(by = 'Value', ascending = False)

## Define a function that gets the max value index pairs and max values from a square matrix that is half-full

In [6]:
def get_max_values(filename, sort = True):
    '''Run the whole pipeline on a matrix file and return the max
    value index pairs of a square matrix that is half-full.

    Parameters:
    filename (str): The filename of the matrix with the values.
    sort (bool): Whether or not to sort the index pairs based on
    value.

    Returns:
    Pandas dataframe produced by dictionary_to_dataframe, holding the
    index pairs and their max values.
    '''
    # Load the file and keep only the numeric square submatrix.
    values = get_values(open_file(filename))

    # Locate the max of every column (vertical scan) and of every
    # row (horizontal scan).
    vertical, horizontal = find_max_values(values)

    # For every index keep the bigger of the two candidates.
    pairs = compare_max_values(vertical, horizontal, values)

    # Hand the result back as a dataframe (sorted by value if asked).
    return dictionary_to_dataframe(pairs, sort)

## Define a function that retrieves specified columns from another file

In [7]:
def get_columns(filename, columns = ['DB ID', 'Disease']):
    '''Open a .csv file containing row data and return only the
    requested columns.

    Parameters:
    filename (str): The .csv filename to work with.
    columns (list): The columns containing the data. The default list
    is only read, never modified.

    Returns:
    Pandas dataframe with the requested columns, every cell read as a
    string. Rows in which all of the requested columns are empty are
    removed; the remaining rows keep their original row index.
    '''
    file_columns = pandas.read_csv(
        filename,          # Pass filename.
        dtype = str,       # Treat table as type string.
        header = 0,        # Use row 0 as header.
        usecols = columns) # Pass column list.

    # Remove rows which are completely empty:
    # this should only apply to the last few rows of the file.
    # how = 'all' is needed for that; the default would also remove
    # rows in which just one of the columns is missing.
    file_columns = file_columns.dropna(how = 'all')

    # Return the dataframe holding the requested columns.
    return file_columns

## Define a function that replaces index columns with the specified columns coming from another file with matching row indexes

In [8]:
def replace_index_rows(frame, diseases_path, columns = ['DB ID', 'Disease']):
    '''Swap the 'Index 1' and 'Index 2' columns of a max value table
    for descriptive columns read from another file.

    For every name in `columns` two new columns are added, '<name> 1'
    and '<name> 2'. In the result all the '<name> 1' columns come
    first, then all the '<name> 2' columns, then the remaining columns
    of the table. 'Index 1' and 'Index 2' themselves are removed.

    NOTE: the '<name> 1' columns are matched against the row index of
    `frame`, not against the contents of its 'Index 1' column; the
    '<name> 2' columns are matched against the 'Index 2' column.

    Parameters:
    frame: Pandas dataframe obtained using get_max_values function.
    It is not modified; a copy is changed and returned.
    diseases_path (str): Path to file containing disease data.
    columns (list): The column names that will be used to replace
    the index columns.
    '''
    # Disease data whose row index lines up with the matrix indexes.
    lookup = get_columns(diseases_path, columns)

    # Work on a copy so that the caller's dataframe stays untouched.
    result = frame.copy()

    # First side of each pair: insert at positions 0, 1, ... The
    # inserted data is lined up with the current row index, example:
    # result.insert(0, 'DB ID 1', lookup['DB ID']),
    # result.insert(1, 'Disease 1', lookup['Disease']).
    for position, name in enumerate(columns):
        result.insert(position, name + ' 1', lookup[name])

    # Remember the row index, because the next step replaces it.
    original_index = result.index

    # Use the 'Index 2' column as index so that the second side of
    # each pair lines up with it (the column itself is consumed).
    result = result.set_index(['Index 2'])

    # Second side of each pair: continue right after the columns
    # inserted above, example:
    # result.insert(2, 'DB ID 2', lookup['DB ID']),
    # result.insert(3, 'Disease 2', lookup['Disease']).
    for position, name in enumerate(columns, start = len(columns)):
        result.insert(position, name + ' 2', lookup[name])

    # Put the original row index back.
    result.index = original_index

    # 'Index 1' is no longer needed once the names are in place.
    return result.drop(['Index 1'], axis = 1)