# Quicktests on Mixed Species Dataset Multi-Condition

This notebook runs AlphaQuant on a mixed-species dataset that has been subsetted to around 60 proteins. We run AlphaQuant in 'multicond_median_analysis' mode, which means that a median reference is created against which the other conditions are compared. We then check that the comparisons against this median condition neither violate the FDR nor are less sensitive than expected. We additionally check that the ratios are as expected.

In [None]:
#download test files for benchmarking

import alphaquant.benchm.testfile_handling
# Target folder for the test data and the YAML file with the quicktest download links.
test_folder = "../../test_data/"
links_yaml_quicktest_files = "../../alphaquant/config/download_links_for_testfiles_quicktest.yaml"

# One downloader is constructed per required subfolder. The name is rebound on
# every iteration, so afterwards it refers to the "databases" downloader.
# NOTE(review): no explicit download method is called here -- presumably the
# constructor itself fetches missing files; confirm against TestFileDownloader.
for subfolder_of_interest in ("mixed_species", "databases"):
    testfieldownloader = alphaquant.benchm.testfile_handling.TestFileDownloader(
        test_folder=test_folder,
        links_yaml=links_yaml_quicktest_files,
        subfolder_of_interest=subfolder_of_interest,
    )

In [None]:
import os

# Switch the working directory to the mixed-species quicktest folder; the file
# names used in the following cells are relative and resolve against it.
# NOTE: the path itself is relative, so running this cell a second time in the
# same kernel resolves it from the already-changed working directory.
os.chdir("../../test_data/quicktests/mixed_species")

In [None]:
# All paths below are relative to the current working directory.
# Input table handed to run_pipeline as `input_file`.
INPUT_FILE = "20210210_154121_S209-S-1-240min_Report_quicktest_shortened.tsv.zip"
# Sample map handed to run_pipeline as `samplemap_file`.
SAMPLEMAP = "samplemap.tsv"
# Output folder handed to run_pipeline as `results_dir`; result tables are read back from it later.
RESULTS_DIR = "results_multicond"
# Handed to run_pipeline as `peptides_to_exclude_file`.
# Presumably lists peptides shared between the species -- confirm against the file contents.
SHARED_PEPTIDES_BETWEEN_SPECIES_FILE = "../../databases/intersecting_peptides_human_yeast_cael_ecoli.tsv"

In [None]:
import alphaquant.run_pipeline as run_pipeline

# Run AlphaQuant with the median-reference multi-condition analysis switched on.
# Outputs are written to RESULTS_DIR.
run_pipeline.run_pipeline(
    input_file=INPUT_FILE,
    samplemap_file=SAMPLEMAP,
    results_dir=RESULTS_DIR,
    runtime_plots=True,
    minrep_either=2,
    take_median_ion=True,
    multicond_median_analysis=True,
    annotation_columns=["PG.Genes", "PG.Organisms"],
    input_type_to_use="spectronaut_fragion_ms1_protein",
    peptides_to_exclude_file=SHARED_PEPTIDES_BETWEEN_SPECIES_FILE,
)

In [None]:
import pandas as pd
# Result tables of the two comparisons against the median reference.
results_table_s1 = f"{RESULTS_DIR}/S1_VS_median_reference.results.tsv"
results_table_s2 = f"{RESULTS_DIR}/S2_VS_median_reference.results.tsv"

results_df_s1 = pd.read_csv(results_table_s1, sep="\t")
results_df_s2 = pd.read_csv(results_table_s2, sep="\t")

# Both tables are reduced to the same four columns, and the method-specific
# columns get the "_alphaquant" suffix.
columns_for_benchmark = ["protein", "PG.Organisms", "log2fc", "fdr"]
alphaquant_column_names = {
    "PG.Organisms": "organism_alphaquant",
    "log2fc": "log2fc_alphaquant",
    "fdr": "fdr_alphaquant",
}

results_df_reformat_s1 = results_df_s1[columns_for_benchmark].rename(columns=alphaquant_column_names)
results_df_reformat_s2 = results_df_s2[columns_for_benchmark].rename(columns=alphaquant_column_names)

In [None]:
import numpy as np
import alphaquant.benchm.sensitivity as aq_benchm_sensitivity

# --- S1 vs. median reference ---
# Build the per-species classification table with "Homo sapiens" as the decoy organism.
classification_benchmark_s1 = aq_benchm_sensitivity.RatioClassificationTableGenerator(results_df_reformat_s1, decoy_organism="Homo sapiens", method_suffixes=["_alphaquant"])
# NaN entries are replaced by 0 so that every cell holds a number.
classification_df_s1 = classification_benchmark_s1.per_species_results_df.replace(np.nan, 0)
display(classification_df_s1)

aq_benchm_sensitivity.plot_sighits_barplot(classification_df_s1, suffixes=["_alphaquant"], decoy_organism="Homo sapiens")


# --- S2 vs. median reference ---
# This must be fed the S2 table; passing the S1 table here would leave the S2
# comparison unchecked and merely repeat the S1 benchmark.
classification_benchmark_s2 = aq_benchm_sensitivity.RatioClassificationTableGenerator(results_df_reformat_s2, decoy_organism="Homo sapiens", method_suffixes=["_alphaquant"])
classification_df_s2 = classification_benchmark_s2.per_species_results_df.replace(np.nan, 0)
display(classification_df_s2)

aq_benchm_sensitivity.plot_sighits_barplot(classification_df_s2, suffixes=["_alphaquant"], decoy_organism="Homo sapiens")



In [None]:
def assert_fdr_is_not_violated(classification_df, suffix = "_alphaquant",  decoy_organism="Homo sapiens"):
    idx_of_decoy_organism = classification_df[classification_df.index == decoy_organism].index
    num_allowed_hits = classification_df.loc[idx_of_decoy_organism, f"allowed_decoy_hits{suffix}"].values[0]
    num_allowed_hits_w_tolerance = int(1.1*num_allowed_hits)
    num_actual_hits =  classification_df.loc[idx_of_decoy_organism,f"hits{suffix}"].values[0]

    assert num_actual_hits < num_allowed_hits_w_tolerance , "more false postives than expected"


def assert_sensitivity_is_as_expected(classification_df):
    """Assert that three organisms each reach a minimum number of hits.

    Looks up the ``hits_alphaquant`` column of ``classification_df`` (indexed
    by organism name) and compares it against a fixed lower bound per organism.

    Raises:
        AssertionError: if any of the organisms falls below its lower bound.
    """
    minimum_hits_per_organism = (
        ("Caenorhabditis elegans", 19),
        ("Escherichia coli (strain K12)", 14),
        ("Saccharomyces cerevisiae (strain ATCC 204508 / S288c)", 13),
    )
    for organism, minimum_hits in minimum_hits_per_organism:
        assert classification_df.loc[organism, "hits_alphaquant"] >= minimum_hits


# Apply both checks to each comparison against the median reference, S1 first.
for classification_df_to_check in (classification_df_s1, classification_df_s2):
    assert_fdr_is_not_violated(classification_df_to_check)
    assert_sensitivity_is_as_expected(classification_df_to_check)


In [None]:
import matplotlib.pyplot as plt
medianref_table = f"{RESULTS_DIR}/medianref_protein_alphaquant.tsv"
df_medianref = pd.read_csv(medianref_table, sep="\t")

# Row-wise difference between the "S1" and "S2" columns of the median-reference table.
df_medianref["fcdiff"] = df_medianref["S1"] - df_medianref["S2"]

# Histogram of the differences, with the expected log2 fold changes drawn as dashed red lines.
fig, ax = plt.subplots()
df_medianref.hist(column="fcdiff", bins=20, ax=ax)

expected_log2fcs = np.array([-2.2, -1.2, 0, 2])
for expected_fc in expected_log2fcs:
    ax.axvline(x=expected_fc, color='r', linestyle='--')

fcdiff_values = df_medianref["fcdiff"].dropna().values

# A difference is within tolerance when it is closer than 0.4 to at least one
# expected fold change.
within_tolerance_list = [
    np.any(np.abs(expected_log2fcs - observed_fc) < 0.4)
    for observed_fc in fcdiff_values
]

fraction_within_tolerance = sum(within_tolerance_list) / len(within_tolerance_list)

# More than 95% of the non-missing differences have to be within tolerance.
assert fraction_within_tolerance > 0.95

