In [None]:
# default_exp diffquant_utils

# Utility Functions general

In [None]:
#export
def get_condpairname(condpair):
    """Build the label of a condition pair in the form ``<first>_VS_<second>``.

    :param condpair: indexable holding the two condition names at positions 0 and 1
    :return: the joined condition-pair name
    """
    first_cond, second_cond = condpair[0], condpair[1]
    return "_VS_".join([f"{first_cond}", f"{second_cond}"])

In [None]:
#export
def get_middle_elem(sorted_list):
    """Return the median of an already sorted sequence.

    For an odd number of elements the central element is returned as-is; for an
    even number the mean of the two central elements is returned.

    :param sorted_list: non-empty, sorted, indexable sequence of numbers
    :return: the middle element (odd length) or the mean of the two middle elements
    """
    count = len(sorted_list)
    if count == 1:
        return sorted_list[0]
    upper_idx, is_odd = divmod(count, 2)
    if is_odd:
        return sorted_list[upper_idx]
    lower_idx = upper_idx - 1
    return 0.5 * (sorted_list[upper_idx] + sorted_list[lower_idx])

In [None]:
#export
import numpy as np
def get_nonna_array(array_w_nas):
    """Strip the NaN entries from every row of a 2D numeric array.

    :param array_w_nas: rectangular 2D array-like of numbers, possibly containing NaN
    :return: when all rows keep the same number of values, a regular 2D array;
        otherwise a 1D object array whose elements are the per-row arrays of
        non-NaN values
    """
    values = np.asarray(array_w_nas)
    nan_mask = np.isnan(values)
    rows = [row[~row_mask] for row, row_mask in zip(values, nan_mask)]

    # Equal-length rows (or no rows at all) can be stacked into a regular array.
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)

    # Rows of differing length: np.array(rows) raises on recent NumPy versions
    # (ragged nested sequences), so the object array is filled explicitly.
    ragged = np.empty(len(rows), dtype=object)
    for idx, row in enumerate(rows):
        ragged[idx] = row
    return ragged

In [None]:
#export
import numpy as np
def get_non_nas_from_pd_df(df):
    """Map every index label of ``df`` to the non-NaN values of its row.

    :param df: numeric dataframe, one row per entry
    :return: dict of ``index label -> numpy array of the row's non-NaN values``
    """
    row_labels = df.index.values
    row_values = df.values
    non_na_by_label = {}
    for label, row in zip(row_labels, row_values):
        keep = np.logical_not(np.isnan(row))
        non_na_by_label[label] = row[keep]
    return non_na_by_label

In [None]:
#export
def invert_dictionary(my_map):
    """Invert a mapping, collecting all keys that share a value.

    :param my_map: mapping with hashable values (anything exposing ``.items()``)
    :return: dict of ``value -> list of keys`` that mapped to this value, in
        iteration order of ``my_map``
    """
    inv_map = {}
    # .items() replaces the Python-2-only .iteritems(), which dicts no longer have
    for key, value in my_map.items():
        inv_map.setdefault(value, []).append(key)
    return inv_map


# Input Parsers
The AlphaQuant pipeline is run using a generic wide-table input format, as specified in the documentation. The following parsers convert long format tables, as provided e.g. by Spectronaut or DIA-NN, into this generic format. The configuration for the parsers is set by a yaml file.

## Convert long format to wide format

### Parse .yaml file
The relevant parameters for reading and reformatting the long table are stored in the "longtable_config.yaml" file. The functions below read and reformat this config info.

In [None]:
#export
import yaml

def get_relevant_columns(protein_cols, ion_cols, sample_ID, quant_ID, filter_dict):
    """Collect every column name that is referenced by a parser configuration.

    :param list protein_cols: columns that identify the protein
    :param list ion_cols: columns that identify the ion
    :param sample_ID: column holding the sample name
    :param quant_ID: column holding the quantity
    :param dict filter_dict: filter configurations; the 'param' entry of each is used
    :return: list of the distinct column names (order not guaranteed)
    """
    filter_params = [filtconf.get('param') for filtconf in filter_dict.values()]
    all_cols = protein_cols + ion_cols + [sample_ID] + [quant_ID] + filter_params
    # going through a set removes possible redundancies
    return list(set(all_cols))


def retrieve_configuration(config_yaml, input_type):
    """Collect the relevant parameters for a given type of input file (e.g. DIA-NN type).

    :param config_yaml: path to the yaml configuration file
    :param input_type: top-level key of the yaml file that selects the configuration
    :return: the tuple produced by ``get_config_columns`` for the selected configuration
    """
    # the context manager closes the file handle, also when parsing fails
    with open(config_yaml, 'r') as stream:
        config_all = yaml.safe_load(stream)
    config_dict = config_all.get(input_type)
    return get_config_columns(config_dict)


def get_config_columns(config_dict):
    """Unpack a single input-type configuration into its individual settings.

    :param dict config_dict: configuration of one input type
    :return: tuple ``(relevant_cols, protein_cols, ion_cols, sample_ID, quant_ID, filter_dict)``;
        ``filter_dict`` falls back to an empty dict when no "filters" key is present
    """
    protein_cols, ion_cols, sample_ID, quant_ID = (
        config_dict.get(key) for key in ("protein_cols", "ion_cols", "sample_ID", "quant_ID")
    )
    filter_dict = config_dict.get("filters", {})
    relevant_cols = get_relevant_columns(protein_cols, ion_cols, sample_ID, quant_ID, filter_dict)
    return (relevant_cols, protein_cols, ion_cols, sample_ID, quant_ID, filter_dict)

def load_config(config_yaml):
    """Parse the yaml configuration file and return its content.

    :param config_yaml: path to the yaml configuration file
    :return: the object produced by ``yaml.safe_load`` for the file
    """
    # the context manager closes the file handle, also when parsing fails
    with open(config_yaml, 'r') as stream:
        return yaml.safe_load(stream)

def get_type2relevant_cols(config_all):
    """Map every configured input type to the columns its configuration refers to.

    :param dict config_all: complete configuration, keyed by input type
    :return: dict of ``input type -> list of relevant column names``
    """
    return {
        input_type: get_config_columns(config_all.get(input_type))[0]
        for input_type in config_all.keys()
    }


### Filter and reformat

In [None]:
#export

def filter_input(filter_dict, input):
    """Apply the configured row filters to a dataframe, one after the other.

    Every filter configuration provides a 'param' (column name), a 'comparator'
    (one of ">", ">=", "<", "<=", "==", "!=") and a 'value' to compare against.

    :param dict filter_dict: filter name -> filter configuration
    :param input: dataframe to be filtered
    :return: dataframe that only contains the rows passing all filters
    :raises TypeError: if a comparator is not one of the supported ones
    """
    import operator  # function-scope import: no new module-level dependency

    comparisons = {
        ">": operator.gt,
        ">=": operator.ge,
        "<": operator.lt,
        "<=": operator.le,
        "==": operator.eq,
        "!=": operator.ne,
    }

    for filtname, filterconf in filter_dict.items():
        param = filterconf.get('param')
        comparator = filterconf.get('comparator')
        value = filterconf.get('value')

        if comparator not in comparisons:
            raise TypeError(f"cannot identify the filter comparator of {filtname} given in the longtable config yaml!")

        if comparator == "==":
            # equality is checked on the column as it is, without any type conversion
            input = input[input[param] == value]
            continue

        # best effort: work on a float column when the content allows it
        try:
            input = input.astype({f"{param}": "float"})
        except (ValueError, TypeError):
            # non-numeric column: keep it as it is and compare with the type of 'value'
            pass

        column = input[param].astype(type(value))
        input = input[comparisons[comparator](column, value)]

    return input

In [None]:
#export
def merge_protein_and_ion_cols(input_df, protein_cols, ion_cols):
    input_df['protein'] = input_df.loc[:, protein_cols].astype('string').sum(axis=1)
    input_df['ion'] = input_df.loc[:, ion_cols].astype('string').sum(axis=1)
    return input_df

In [None]:
#export

def reformat_longtable_according_to_config(input_file, input_type, results_folder, config_file = "longtable_config.yaml", sep = "\t",decimal = "."):
    """Reshape a long format proteomics results table (e.g. Spectronaut or DIA-NN) to a wide format table.

    The wide table is additionally written to "<input_file>.aq_reformat.<ion level>.tsv".

    :param file input_file: long format proteomic results table
    :param string input_type: the configuration key stored in the config file (e.g. "diann_precursor")
    :param results_folder: not used inside this function
    :param config_file: yaml file holding the parser configurations
    :param string sep: column separator of the input file
    :param string decimal: decimal character of the input file
    :return: wide format dataframe, indexed by ion, with a 'protein' column and one column per sample
    """
    configuration = retrieve_configuration(config_file, input_type)
    relevant_cols, protein_cols, ion_cols, sample_ID, quant_ID, filters = configuration

    # load only the configured columns, then filter and annotate the long table
    longtable = pd.read_csv(input_file, sep = sep, decimal=decimal, usecols= relevant_cols)
    longtable = longtable.drop_duplicates()
    longtable = filter_input(filters, longtable)
    longtable = merge_protein_and_ion_cols(longtable, protein_cols, ion_cols)
    longtable = longtable.astype({f'{quant_ID}': 'float'})

    # one row per protein/ion, one column per sample; missing quantities become 0
    widetable = pd.pivot_table(longtable, index = ['protein', 'ion'], columns = sample_ID, values = quant_ID, fill_value=0)

    # when values are small, rescale by a constant factor to prevent rounding
    # errors in the subsequent aq analyses (judged on the first sample column only)
    first_sample_median = widetable.iloc[:,0].replace(0, np.nan).median()
    if first_sample_median <100:
        widetable = widetable *10000

    widetable = widetable.reset_index().set_index("ion")

    if "fragion" in input_file:
        ion_level = "fragion"
    else:
        ion_level = "precursor"
    # NOTE(review): index=False means the 'ion' index is not written to the file - confirm this is intended
    widetable.to_csv(f"{input_file}.aq_reformat.{ion_level}.tsv", index = False, sep = "\t")

    return widetable

In [None]:
#export
def read_wideformat_table(peptides_tsv, config_dict):
    """Read a tab-separated wide format table and bring it into the generic input format.

    :param peptides_tsv: path to the tab-separated wide format table
    :param dict config_dict: configuration of the input type; "protein_cols" and
        "ion_cols" are read, "filters" and "quant_prefix" are optional
    :return: dataframe indexed by ion; when a "quant_prefix" is configured, only
        'protein' and the quantity columns (prefix stripped from their names) are kept
    """
    input_df = pd.read_csv(peptides_tsv,sep="\t")
    # a configuration without (or with an empty) "filters" entry means: no filtering
    filter_dict = config_dict.get("filters") or {}
    protein_cols = config_dict.get("protein_cols")
    ion_cols = config_dict.get("ion_cols")
    input_df = filter_input(filter_dict, input_df)
    input_df = merge_protein_and_ion_cols(input_df,protein_cols, ion_cols)
    input_df = input_df.set_index("ion")
    # the former display(input_df) call was removed: the name only exists inside
    # IPython sessions and raises a NameError in the exported module

    if 'quant_prefix' in config_dict.keys():
        quant_prefix = config_dict.get('quant_prefix')
        quant_cols = [col for col in input_df.columns if col.startswith(quant_prefix)]
        input_df = input_df[['protein'] + quant_cols]
        input_df = input_df.rename(columns = lambda x : x.replace(quant_prefix, ""))

    return input_df

In [None]:
#export
def read_mq_peptides_table(peptides_tsv, pepheader = "Sequence", protheader = "Leading razor protein"):
    """Read a tab-separated MaxQuant peptides table into the generic input format.

    Rows flagged with "+" in the "Reverse" or "Potential contaminant" column are
    dropped. Only the 'protein' column and the "Intensity <sample>" columns are
    kept; the "Intensity " prefix is stripped from the column names.

    :param peptides_tsv: path to the tab-separated peptides table
    :param pepheader: column that is renamed to "ion" (None: no renaming)
    :param protheader: column that is renamed to "protein" (None: no renaming)
    :return: dataframe indexed by ion
    """
    intensity_prefix = "Intensity "

    table = pd.read_csv(peptides_tsv, sep="\t")
    table = table[table["Reverse"] != "+"]
    table = table[table["Potential contaminant"] != "+"]

    if pepheader is not None:
        table = table.rename(columns={pepheader: "ion"})
    if protheader is not None:
        table = table.rename(columns={protheader: "protein"})

    table = table.set_index("ion")
    intensity_cols = [col for col in table.columns if col.startswith(intensity_prefix)]
    table = table[['protein'] + intensity_cols]
    return table.rename(columns=lambda col: col.replace(intensity_prefix, ""))

## Check for already processed files

In [None]:
#export
import os
def check_for_processed_runs_in_results_folder(results_folder):
    """List the runs in the results folder for which all output files are present.

    A run counts as processed when, next to its "<name>.results.tsv" file, the
    files "<name>.normed.tsv" and "<name>.results.ions.tsv" exist in the folder.

    :param results_folder: folder that is searched (not recursively)
    :return: list of the names of the completely processed runs
    """
    folder_files = os.listdir(results_folder)
    processed_runs = []
    for fname in folder_files:
        if "results.tsv" not in fname:
            continue
        run_name = fname.replace(".results.tsv", "")
        has_normed = f"{run_name}.normed.tsv" in folder_files
        has_ion_results = f"{run_name}.results.ions.tsv" in folder_files
        if has_normed and has_ion_results:
            processed_runs.append(run_name)
    return processed_runs



## Wrapper functions

In [None]:
#export
import pandas as pd
import os
import pkg_resources
import pathlib

def import_data(input_file, results_folder, verbose=True, dashboard=False):
    """
    Function to import peptide level data. Depending on available columns in the provided file,
    the function identifies the type of input used (e.g. Spectronaut, MaxQuant, DIA-NN), reformats if necessary
    and returns a generic wide-format dataframe
    :param file input_file: quantified peptide/ion -level data
    :param file results_folder: the folder where the AlphaQuant outputs are stored
    :param bool verbose: print progress information while detecting the input type
    :param bool dashboard: not used inside this function
    :return: wide-format dataframe of the imported data
    :raises TypeError: if the file extension is not one of .tsv, .csv, .txt or if
        the columns of the file match none of the configured input types
    """
    # the yaml config is looked up in the parent directory of the folder containing this module
    config_file = os.path.join(pathlib.Path(__file__).parent.absolute(), "..", "longtable_config.yaml")
    config_dict = load_config(config_file)
    type2relevant_columns = get_type2relevant_cols(config_dict)

    # the column separator is derived from the file extension
    separators = {'.csv': ',', '.tsv': '\t', '.txt': '\t'}
    file_ext = os.path.splitext(input_file)[-1]
    if file_ext not in separators:
        raise TypeError(f"neither of the file extensions (.tsv, .csv, .txt) detected for file {input_file}! Your filename has to end with one of these extensions. Please modify your file name accordingly.")
    sep = separators[file_ext]

    # files that were already reformatted are tab-separated and are read as they are
    if "aq_reformat" in input_file:
        return pd.read_csv(input_file, sep = "\t")

    # only the header is needed to identify the input type
    uploaded_data_columns = set(pd.read_csv(input_file, sep=sep, nrows=1).columns)

    for input_type, relevant_columns in type2relevant_columns.items():
        relevant_columns = [x for x in relevant_columns if x] #filter None values
        if verbose:
            print(f"recols\t {relevant_columns}")
        if not set(relevant_columns).issubset(uploaded_data_columns):
            continue

        # the first input type whose columns are all present in the file is used
        config_dict_type = config_dict.get(input_type)
        table_format = config_dict_type.get("format")
        if verbose:
            print(f"{input_type} headers in format {table_format} detected. Importing and re-formating.")
        if table_format == "longtable":
            return reformat_longtable_according_to_config(input_file, input_type = input_type, results_folder=results_folder, sep = sep, config_file=config_file)
        if table_format == "widetable":
            return read_wideformat_table(input_file, config_dict_type)
        raise Exception("format: not specified in longtable_config.yaml")

    # if none of the configured input types matches, raise an error
    raise TypeError(f'Input data format for {input_file} not known.')


In [None]:
#export
import pandas as pd

def get_samplenames(data):
    """Return the sample names of the AQ input dataframe, i.e. all columns except 'protein'.

    :param data: AQ input dataframe with a 'protein' column
    :return: list of the remaining column names, in their original order
    :raises ValueError: if there is no 'protein' column
    """
    columns = list(data.columns)
    protein_pos = columns.index('protein')
    return columns[:protein_pos] + columns[protein_pos + 1:]


In [None]:
#export

import pandas as pd

def load_samplemap(samplemap_file):
    """Read the samplemap table; the column separator is derived from the file extension.

    :param samplemap_file: path to a .csv (comma separated) or .tsv/.txt (tab separated) file
    :return: dataframe with the content of the samplemap file
    :raises TypeError: if the file extension is not one of .tsv, .csv, .txt
    """
    separators = {'.csv': ',', '.tsv': '\t', '.txt': '\t'}
    file_ext = os.path.splitext(samplemap_file)[-1]
    if file_ext not in separators:
        raise TypeError(f"neither of the file extensions (.tsv, .csv, .txt) detected for file {samplemap_file}! Your filename has to end with one of these extensions. Please modify your file name accordingly.")

    return pd.read_csv(samplemap_file, sep = separators[file_ext])

In [None]:
#export
def prepare_loaded_tables(data_df, samplemap_df):
    """
    Integrates information from the peptide/ion data and the samplemap, selects the relevant columns and log2 transforms intensities.

    The passed dataframes are left untouched; intensities of 0 are treated as missing (NaN).

    :param data_df: wide format dataframe with a 'protein' column and one intensity column per sample
    :param samplemap_df: dataframe with the columns "sample" and "condition"
    :return: tuple of (dataframe with 'protein' and the log2 transformed sample columns,
        samplemap restricted to samples that have a condition and are present in data_df)
    """
    # remove rows that have no condition entry (empty string or missing value)
    condition = samplemap_df["condition"]
    samplemap_df = samplemap_df[condition.notna() & (condition != "")]
    # remove samples that are not in the dataframe
    samplemap_df = samplemap_df[samplemap_df["sample"].isin(data_df.columns)]

    samples = samplemap_df["sample"].to_list()
    headers = ['protein'] + samples

    # transform a copy, each sample column exactly once, so that the caller's
    # dataframe is not modified and repeated calls do not log-transform twice
    unique_samples = list(dict.fromkeys(samples))
    prepared_df = data_df[['protein'] + unique_samples].copy()
    for sample in unique_samples:
        prepared_df[sample] = np.log2(prepared_df[sample].replace(0, np.nan))
    return prepared_df[headers], samplemap_df