In [None]:
# Directory holding the quant-reader test tables; the paths below are derived from it.
TEST_FILE_DIR = "../../test_data/unit_tests/quant_reader_tables"

# Names of the two output tables written by the table-loading test:
# one for the default call, one for the call with enforce_largefile_processing=True.
FILE_DEFAULT = "default_out.tsv"
FILE_DASK_PROC = "dask_proc_out.tsv"

# Scratch directory that the table-loading test creates and removes again.
OUTDIR = TEST_FILE_DIR + "/outdir"

# Input table used by the table-loading and comparison tests
# (presumably a large Spectronaut fragment-ion export, judging by the file name).
SPECTRONAUT_FILE = TEST_FILE_DIR + "/spectronaut.frgions.large.tsv"


In [None]:

import alphabase.tools.data_downloader as ab_data_downloader

# this downloads the folder quant_reader_tables into the specified path
downloader = ab_data_downloader.DataShareDownloader(
    "https://datashare.biochem.mpg.de/s/8OQTYwtKkpVwOys",
    output_dir="../../test_data/unit_tests/",
)
downloader.download()

In [None]:

import os
import pandas as pd
import shutil
import alphabase.quantification.quant_reader.config_dict_loader as config_dict_loader
import alphabase.quantification.quant_reader.longformat_reader as longformat_reader



def test_table_loadings(input_file, outdir, file_default, file_dask_proc):
    """Check that default and enforced large-file reformatting produce the same table.

    The input file is reformatted twice with the same configuration, once with
    the default settings and once with ``enforce_largefile_processing=True``.
    Both outputs are read back and compared.

    Args:
        input_file: path of the quant table to reformat.
        outdir: scratch directory; created if missing and removed at the end.
        file_default: path the default run writes to.
        file_dask_proc: path the enforced large-file run writes to.

    Raises:
        AssertionError: if the two written tables differ.
    """
    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(outdir, exist_ok=True)

    # Only the configuration dict is needed; the detected type and separator are unused here.
    _, config_dict_for_type, _ = config_dict_loader.get_input_type_and_config_dict(input_file)

    try:
        longformat_reader.reformat_and_write_longtable_according_to_config(
            input_file,
            outfile_name=file_default,
            config_dict_for_type=config_dict_for_type,
            chunksize=10_000,
            use_alphaquant_format=True,
        )

        longformat_reader.reformat_and_write_longtable_according_to_config(
            input_file,
            outfile_name=file_dask_proc,
            config_dict_for_type=config_dict_for_type,
            enforce_largefile_processing=True,
            chunksize=10_000,
            use_alphaquant_format=True,
        )

        df_default = pd.read_csv(file_default, sep="\t")
        df_dask_proc = pd.read_csv(file_dask_proc, sep="\t")
        display(df_default)
        display(df_dask_proc)

        # Bring the columns into the same order before comparing the frames.
        df_dask_proc = df_dask_proc[df_default.columns]

        assert df_default.equals(df_dask_proc)

        assert set(df_default["quant_id"]) == set(df_dask_proc["quant_id"])
    finally:
        # Clean up even when an assertion or a reformatting step fails, so that
        # a failed run does not leave stale files behind for the next one.
        for written_file in (file_default, file_dask_proc):
            if os.path.exists(written_file):
                os.remove(written_file)
        shutil.rmtree(outdir, ignore_errors=True)


# Run the default vs. enforced large-file comparison on the Spectronaut test table.
test_table_loadings(
    input_file=SPECTRONAUT_FILE,
    outdir=OUTDIR,
    file_default=FILE_DEFAULT,
    file_dask_proc=FILE_DASK_PROC,
)


In [None]:
%reload_ext autoreload
%autoreload 2
import alphabase.quantification.quant_reader.quant_reader_manager as quant_reader_manager
import os
import shutil

# Example tables inside TEST_FILE_DIR that the smoke test below imports one by one.
INPUT_FILES = [
    os.path.join(TEST_FILE_DIR, table_name)
    for table_name in (
        "diann.tsv",
        "diann.parquet",
        "spectronaut.tsv",
        "spectronaut_frgion.tsv",
        "mq_peptides.txt",
        "diann_test_input_mDIA.tsv",
        "fragpipe.tsv",
    )
]


def perform_table_loading(input_file):
    """Smoke test: import one quant table and display it.

    Nothing is asserted; the check passes as long as no exception is raised.
    """
    # Only the file path is handed over, so recognising the input format is
    # left to import_data (the original note lists MQ, Spectronaut and DIA-NN
    # as configured formats).
    display(quant_reader_manager.import_data(input_file))


# Import every example table in turn; reaching the print means the import did not raise.
for table_path in INPUT_FILES:
    perform_table_loading(table_path)
    print('loading ran through')




In [None]:

#hide
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def compare_generic_table_with_original(preprocessed_input_df, original_spectronaut_file, config_yaml,input_typename_config, sep = "\t"):
    """Check that the preprocessed table carries the same quantities as the original file.

    Builds an id -> quantity map from the original file and from the
    preprocessed frame, requires both maps to have exactly the same ids, plots
    the quantities against each other and requires a correlation above 0.999.

    Args:
        preprocessed_input_df: preprocessed table (see get_id2quant_processed
            for the columns that are read from it).
        original_spectronaut_file: path of the original input table.
        config_yaml: path of the quant-reader configuration YAML.
        input_typename_config: key of the input type inside the YAML.
        sep: column separator of the original file.

    Raises:
        AssertionError: if the id sets differ or the correlation is too low.
    """
    # Forward `sep`; previously it was accepted here but never passed on.
    id2quant_orig, id2quant_preproc = get_processed_original_id2quant_maps(
        preprocessed_input_df, original_spectronaut_file, config_yaml, input_typename_config, sep=sep
    )
    keys_orig = set(id2quant_orig)
    keys_preproc = set(id2quant_preproc)
    # Report sizes instead of dumping the complete id sets into the output.
    print(f"number of ids in original table: {len(keys_orig)}")
    print(f"number of ids in preprocessed table: {len(keys_preproc)}")

    # Both directions are checked so that a failure says which side is incomplete.
    only_in_orig = keys_orig - keys_preproc
    only_in_preproc = keys_preproc - keys_orig
    assert not only_in_orig, (
        f"{len(only_in_orig)} ids of the original table are missing after preprocessing, "
        f"e.g. {sorted(only_in_orig)[:5]}"
    )
    assert not only_in_preproc, (
        f"{len(only_in_preproc)} preprocessed ids are not part of the original table, "
        f"e.g. {sorted(only_in_preproc)[:5]}"
    )

    # Use one common id order for both vectors so the values are paired correctly.
    shared_ids = list(id2quant_preproc)
    quantvec_orig = np.array([id2quant_orig[x] for x in shared_ids])
    quantvec_preproc = np.array([id2quant_preproc[x] for x in shared_ids])

    plt.scatter(quantvec_orig, quantvec_preproc)
    plt.xlabel("original quant")
    plt.ylabel("preprocessed quant")
    plt.show()

    corrcoeff = np.corrcoef(quantvec_orig, quantvec_preproc)[0][1]
    print(f"correlation between both processings: {corrcoeff}")
    assert corrcoeff > 0.999



def get_processed_original_id2quant_maps(preprocessed_input_df, original_spectronaut_file, config_yaml,input_typename_config, sep = "\t"):
    """Build the id -> quantity maps of the original file and of the preprocessed table.

    The id and quantity columns of the original file are looked up in the
    configuration YAML under ``input_typename_config``.

    Args:
        preprocessed_input_df: preprocessed table, passed to get_id2quant_processed.
        original_spectronaut_file: path of the original input table.
        config_yaml: path of the quant-reader configuration YAML.
        input_typename_config: key of the input type inside the YAML.
        sep: column separator of the original file.

    Returns:
        Tuple ``(id2quant_orig, id2quant_preproc)``.

    Raises:
        KeyError: if ``input_typename_config`` is not a key of the YAML.
    """
    # Context manager so the config file handle is closed again.
    with open(config_yaml, 'r') as config_handle:
        config_all = yaml.safe_load(config_handle)

    config_dict = config_all.get(input_typename_config)
    if config_dict is None:
        raise KeyError(f"input type '{input_typename_config}' not found in {config_yaml}")

    # An id is made of the ion columns plus the sample column.
    id_cols = config_dict.get("ion_cols") + [config_dict.get("sample_ID")]
    # "quant_ID" is a mapping; its values are the quantity column names.
    quant_col = list(config_dict.get("quant_ID").values())

    id2quant_orig = get_id2quant_original(original_spectronaut_file, id_cols, quant_col, sep)
    id2quant_preproc = get_id2quant_processed(preprocessed_input_df)

    return id2quant_orig, id2quant_preproc


def get_id2quant_original(original_spectronaut_file, id_cols, quant_col, sep):
    """Map concatenated id strings to rounded quantities read from the original table.

    Args:
        original_spectronaut_file: input handed to ``pd.read_csv``.
        id_cols: column names whose string values are concatenated, in this
            order and without separator, to form the id.
        quant_col: list of quantity column names; all are loaded, only the
            first one is used for the values.
        sep: column separator of the file.

    Returns:
        dict mapping each id to its quantity rounded to 3 decimals. If an id
        occurs in several rows, the last row wins.

    Raises:
        ValueError: if an id column contains a missing value.
    """
    orig_df = pd.read_csv(original_spectronaut_file, sep=sep, usecols=id_cols + quant_col)

    # Concatenate column by column instead of a Python-level apply over every
    # row, which is very slow on large tables.
    compare_ids = pd.Series("", index=orig_df.index, dtype="string")
    for col in id_cols:
        compare_ids = compare_ids + orig_df[col].astype("string")

    # A missing id part makes the whole id missing; fail loudly rather than
    # collapsing all such rows into one dictionary key.
    if compare_ids.isna().any():
        raise ValueError(f"missing values in id columns {id_cols}; cannot build comparison ids")

    return {key: round(value, 3) for key, value in zip(compare_ids, orig_df[quant_col[0]])}


def get_id2quant_processed(preprocessed_input_df):
    """Map concatenated id strings to quantities taken from the preprocessed table.

    Every column whose name ends with ".raw" is treated as a quantity column.
    The table is brought into long format, rows containing missing values are
    dropped, and the id is the concatenation of the "MOD" value, the "CHARGE"
    value and the quantity column name, with every "MOD_" and "CHARGE_"
    occurrence removed from the concatenated string.

    Returns:
        dict mapping each id to its quantity.
    """
    raw_columns = [col for col in preprocessed_input_df.columns if col.endswith(".raw")]

    long_df = preprocessed_input_df.melt(id_vars = ["MOD", "CHARGE"], value_vars = raw_columns)
    long_df = long_df.dropna()

    # Concatenate the three id parts as strings, without separator.
    mod_part, charge_part, sample_part = (
        long_df[part].astype('string') for part in ("MOD", "CHARGE", "variable")
    )
    compare_ids = mod_part + charge_part + sample_part

    # Remove the column-name prefixes from the concatenated id.
    compare_ids = compare_ids.str.replace("MOD_", "")
    compare_ids = compare_ids.str.replace("CHARGE_", "")

    return dict(zip(compare_ids, long_df["value"]))


def load_and_format_input_data(input_file, input_type_to_use):
    """Import a quant table and prepare it for the comparison with the original file.

    The table is imported with the given input type, indexed by "quant_id",
    its "CHARGE" column is cast to string, and every 0 is replaced by NaN.

    Args:
        input_file: path of the table to import.
        input_type_to_use: input type name handed to ``import_data``.

    Returns:
        The formatted DataFrame.
    """
    input_df = quant_reader_manager.import_data(input_file, input_type_to_use=input_type_to_use)
    input_df = input_df.set_index('quant_id')
    input_df["CHARGE"] = input_df["CHARGE"].astype('string')
    # Zeros become NaN so that get_id2quant_processed drops them via dropna().
    return input_df.replace(0, np.nan)



# Import the Spectronaut table through the quant reader, show it, and check
# its quantities against the values read directly from the original file.
input_df = load_and_format_input_data(
    input_file=SPECTRONAUT_FILE,
    input_type_to_use="spectronaut_precursor_v2",
)
display(input_df)
compare_generic_table_with_original(
    preprocessed_input_df=input_df,
    original_spectronaut_file=SPECTRONAUT_FILE,
    config_yaml="../../alphabase/constants/const_files/quant_reader_config.yaml",
    input_typename_config="spectronaut_precursor_v2",
)



In [None]:
import glob
import os

# Remove every file in TEST_FILE_DIR whose name ends with "aq_reformat.tsv"
# and report each deletion.
reformat_pattern = os.path.join(TEST_FILE_DIR, "*aq_reformat.tsv")
for stale_file in glob.glob(reformat_pattern):
    os.remove(stale_file)
    print(f"Deleted: {stale_file}")
