In [None]:
# Enable IPython's autoreload extension in mode 2 for this kernel session.
%reload_ext autoreload

%autoreload 2

In [None]:
# Directory that holds every file of the diaPASEF / DIA-NN system-test data set.
DATA_DIR = "../../test_data/system_tests/diaPASEF_diann"

# Report file that is fed to directLFQ and used as the organism mapping file.
INPUT_FILE = f"{DATA_DIR}/report.tsv.top3added.tsv"

# directLFQ protein-intensity tables, one per input type.
# The names follow "<INPUT_FILE>.<input_type>.protein_intensities.tsv"
# (presumably the naming used by lfq_manager.run_lfq -- confirm against the library).
DIRECTLFQ_FILE_PRECURSORS_MS1_MS2 = f"{INPUT_FILE}.diann_precursor_ms1_and_ms2.protein_intensities.tsv"
DIRECTLFQ_FILE_PRECURSORS = f"{INPUT_FILE}.diann_precursors.protein_intensities.tsv"
DIRECTLFQ_FILE_FRAGIONS_TOP3 = f"{INPUT_FILE}.diann_fragion_isotopes_topn.protein_intensities.tsv"
# Previously this was a copy of the precursor MS1+MS2 path, so the "fragions"
# benchmark silently re-evaluated the MS1+MS2 results. It now points at the
# output of the "diann_fragion_isotopes" input type.
# NOTE(review): if this file is missing from the downloaded test data, set
# run_directlfq = True once to generate it.
DIRECTLFQ_FILE_FRAGIONS = f"{INPUT_FILE}.diann_fragion_isotopes.protein_intensities.tsv"

# Target paths for exported figures.
OUTPUT_FILE_PRECURSORS_MS1_MS2 = f"{DATA_DIR}/precursors_ms1_ms2.pdf"
OUTPUT_FILE_PRECURSORS = f"{DATA_DIR}/precursors.pdf"
OUTPUT_FILE_PRECURSORS_FRAGION_TOP3 = f"{DATA_DIR}/precursors_fragion_top3.pdf"
OUTPUT_FILE_FRAGION = f"{DATA_DIR}/fragion.pdf"

# Protein-level reference quantifications used for comparison.
diann_protein_intensities = f"{DATA_DIR}/report.tsv.diann_protein.aq_reformat.tsv"
iq_protein_intensities = f"{DATA_DIR}/report_iq_results.tsv"

# Sample-to-condition mapping file.
samplemap = f"{DATA_DIR}/samplemap.tsv"

# Switches for the optional (re-)computation cells; both are off by default,
# in which case the notebook relies on files already present in DATA_DIR.
run_directlfq = False
run_protein_reformating = False

In [None]:
import alphabase.tools.data_downloader as tdd

# Fetch the shared test data into the system-tests directory.
tdd.DataShareDownloader(
    output_dir="../../test_data/system_tests/",
    url="https://datashare.biochem.mpg.de/s/9UoHMjTCJ8TmmlA",
).download()

In [None]:
# Optional step, executed only when `run_protein_reformating` is switched on:
# import the report with the "diann_protein" input type.
# NOTE(review): `diann_protein_intensities` is derived from "report.tsv" rather
# than from INPUT_FILE -- confirm that this step produces the file read later.
if run_protein_reformating:
    import directlfq.utils as lfq_utils

    lfq_utils.import_data(
        input_type_to_use="diann_protein",
        input_file=INPUT_FILE,
    )

In [None]:
# Optional step, executed only when `run_directlfq` is switched on:
# run directLFQ on the same report once per input type.
if run_directlfq:
    import directlfq.lfq_manager as lfqmgr

    input_types_to_use = [
        "diann_fragion_isotopes_topn",
        "diann_precursor_ms1_and_ms2",
        "diann_precursors",
        "diann_fragion_isotopes",
    ]
    for input_type in input_types_to_use:
        lfqmgr.run_lfq(
            input_type_to_use=input_type,
            input_file=INPUT_FILE,
            min_nonan=1,
        )


In [None]:
import directlfq.benchmarking as lfqbenchmark
import directlfq.utils as lfqutils


# Sample names of the two conditions that are compared ("45ng" vs. "15ng").
samples_used = lfqutils.get_samples_used_from_samplemap_file(
    samplemap, cond1="45ng", cond2="15ng"
)


def _load_results_table(input_file, input_name):
    """Build a ResultsTableDirectLFQ for `input_file` using the two sample groups in `samples_used`."""
    return lfqbenchmark.ResultsTableDirectLFQ(
        input_file=input_file,
        input_name=input_name,
        samples_c1=samples_used[0],
        samples_c2=samples_used[1],
    )


# directLFQ results, one table per input type.
restable_directlfq_ms1_and_ms2 = _load_results_table(DIRECTLFQ_FILE_PRECURSORS_MS1_MS2, "directLFQ")
restable_directlfq_precursors = _load_results_table(DIRECTLFQ_FILE_PRECURSORS, "directLFQ")
restable_directlfq_fragions = _load_results_table(DIRECTLFQ_FILE_FRAGIONS, "directLFQ")
restable_directlfq_fragions_top3 = _load_results_table(DIRECTLFQ_FILE_FRAGIONS_TOP3, "directLFQ")

# Reference quantification, read through the same results-table class.
restable_diann = _load_results_table(diann_protein_intensities, "DIANN")

# `restables` and `restable_names` are parallel lists: entry i of one belongs to entry i of the other.
restables = [
    restable_diann,
    restable_directlfq_ms1_and_ms2,
    restable_directlfq_precursors,
    restable_directlfq_fragions,
    restable_directlfq_fragions_top3,
]
restable_names = [
    "MaxLFQ DIANN",
    "directLFQ (precursors, MS1+MS2)",
    "directLFQ (precursors)",
    "directLFQ (fragions)",
    "directLFQ (fragions top3)",
]

In [None]:
# One annotator, built from the input report, is applied to every results table in turn.
organism_annotator_diann_based = lfqbenchmark.OrganismAnnotatorDIANN(
    mapping_file=INPUT_FILE,
)
for restable in restables:
    organism_annotator_diann_based.annotate_table_with_organism(restable)


In [None]:
import seaborn as sns
import directlfq.visualizations as lfq_viz

class MultiOrganismMultiMethodBoxPlotAdapted(lfq_viz.MultiOrganismMultiMethodBoxPlot):
    """Variant of the library box plot that overrides only `plot_boxplot`."""

    def plot_boxplot(self):
        """Draw one box per (method, organism) pair of log2 fold changes on `self.ax`."""
        # NOTE(review): the palette length follows `_fcs_to_expect`, while the hue
        # levels come from `_organisms_to_plot`; the two are presumably parallel
        # lists of equal length -- confirm.
        palette = sns.color_palette(
            self._colorlist_hex,
            n_colors=len(self._fcs_to_expect),
        )
        sns.boxplot(
            data=self._method_ratio_results_table,
            x="method",
            y="log2fc",
            hue="organism",
            hue_order=self._organisms_to_plot,
            palette=palette,
            linewidth=0.8,
            ax=self.ax,
        )


In [None]:
import numpy as np
class MultiOrganismIntensityFCPlotter():
    """Scatter plot of log2 fold change against log2 mean intensity, coloured by organism.

    All work happens in the constructor: the points are drawn on the given axes,
    the title is set and, if requested, horizontal lines mark the expected fold changes.
    """

    def __init__(self, ax, resultstable_w_ratios, organisms_to_plot = None, fcs_to_expect = None, title = ""):
        """Draw the plot on `ax`.

        Args:
            ax: matplotlib axes to draw on.
            resultstable_w_ratios: object exposing `formated_dataframe`,
                `organism_column`, `log2fc_column` and `mean_intensity_column`.
            organisms_to_plot: organisms to show; if None, every distinct value of
                the organism column (converted to str) is used, sorted.
            fcs_to_expect: optional sequence of y-values; a horizontal line is drawn
                at each, using the palette colour with the same index.
            title: axes title. If empty, a title listing per-organism statistics is
                generated; otherwise those statistics are printed and `title` is used.
        """
        print('init MultiOrganismIntensityFCPlotter')
        self.ax = ax
        self._color_list_hex = ['#ffd479', '#325e7a', '#bad566']
        self._resultstable_w_ratios = resultstable_w_ratios
        self._organism_column = resultstable_w_ratios.organism_column
        self._log2fc_column = resultstable_w_ratios.log2fc_column
        self._mean_intensity_column = resultstable_w_ratios.mean_intensity_column

        self._organisms_to_plot = self._get_organisms_to_plot(organisms_to_plot)
        self._fcs_to_expect = fcs_to_expect

        self._title = self._get_title(title)
        self._scatter_per_organism()
        self._add_expected_lines()

    def _get_organisms_to_plot(self, organisms_to_plot):
        """Return the given organisms, or all organisms found in the table when None."""
        if organisms_to_plot is not None:
            return organisms_to_plot
        organism_values = self._resultstable_w_ratios.formated_dataframe[self._organism_column]
        return sorted(set(organism_values.astype('str')))

    def _get_title(self, title):
        """Return `title` (printing the statistics) or, if it is empty, a generated statistics title."""
        if title != "":
            self._print_infos_about_data()
            return title
        return self._generate_title()

    def _print_infos_about_data(self):
        """Print one statistics line per plotted organism."""
        for organism in self._organisms_to_plot:
            subtable_organism = self._get_organism_subtable(organism)
            print(self._get_stats_of_organism(organism, subtable_organism))

    def _generate_title(self):
        """Concatenate the statistics lines of all plotted organisms into one title string."""
        # The accumulator has to start as an empty string; without it the
        # first `+=` raised UnboundLocalError whenever no title was passed.
        title = ""
        for organism in self._organisms_to_plot:
            subtable_organism = self._get_organism_subtable(organism)
            title += self._get_stats_of_organism(organism, subtable_organism)
        return title

    def _scatter_per_organism(self):
        """Draw the scatter plot (x: log2 mean intensity, y: log2 fold change) and set the title."""
        # Work on a copy so the log transform does not alter the results table itself.
        complete_table = self._resultstable_w_ratios.formated_dataframe.copy()
        complete_table[self._mean_intensity_column] = np.log2(complete_table[self._mean_intensity_column])
        complete_table = self._remove_omitted_organisms_from_table(complete_table)
        color_palette = sns.color_palette(self._color_list_hex, n_colors=len(self._organisms_to_plot))
        # NOTE(review): `size` is seaborn's semantic size variable, not the marker
        # area (`s`) -- confirm that a constant 0.2 here is intended.
        sns.scatterplot(
            data=complete_table,
            x=self._mean_intensity_column,
            y=self._log2fc_column,
            hue=self._organism_column,
            alpha=0.15,
            ax=self.ax,
            hue_order=self._organisms_to_plot,
            palette=color_palette,
            size=0.2,
        )
        self.ax.set_title(self._title)

    def _remove_omitted_organisms_from_table(self, complete_table):
        """Keep only the rows whose organism is one of the organisms to plot."""
        # Use the configured organism column (not a hard-coded "organism") so this
        # filter agrees with `_get_organism_subtable` and the scatter plot hue.
        row_w_permitted_organism = complete_table[self._organism_column].isin(self._organisms_to_plot)
        return complete_table[row_w_permitted_organism]

    def _add_expected_lines(self):
        """Draw one horizontal line per expected fold change, if any were given."""
        if self._fcs_to_expect is None:
            return
        num_colors = len(self._color_list_hex)
        for idx, fc in enumerate(self._fcs_to_expect):
            # Cycle through the colours so more than `num_colors` lines do not raise IndexError.
            color = self._color_list_hex[idx % num_colors]
            self.ax.axhline(fc, color = color)

    def _get_organism_subtable(self, organism):
        """Return the rows of the results table that belong to `organism`."""
        complete_table = self._resultstable_w_ratios.formated_dataframe
        return complete_table[complete_table[self._organism_column] == organism]

    def _get_stats_of_organism(self, organism, subtable_organism):
        """Return a one-line summary (count, median and standard deviation) of the finite log2 fold changes."""
        fcs = subtable_organism[self._log2fc_column].to_numpy()
        fcs = fcs[np.isfinite(fcs)]
        num_ratios = fcs.size
        if num_ratios == 0:
            # No finite ratios: report nan without triggering numpy's empty-slice warnings.
            median_fc = std_fc = float("nan")
        else:
            median_fc = np.median(fcs)
            std_fc = np.std(fcs)
        return f"{organism} num:{num_ratios} median_FC:{median_fc:.2} STD:{std_fc:.2}\n"

In [None]:
# Show the formatted results table of the precursor MS1+MS2 run for a visual sanity check.
display(restable_directlfq_ms1_and_ms2.formated_dataframe)

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import directlfq.benchmarking
import directlfq.visualizations




# Formatted results tables keyed by method name; this mapping is merged for the box plot.
methodname2formatted_df = {
    'directLFQ_precursor_ms1_and_ms2': restable_directlfq_ms1_and_ms2.formated_dataframe,
    'directLFQ_precursor': restable_directlfq_precursors.formated_dataframe,
    'directLFQ_fragions': restable_directlfq_fragions.formated_dataframe,
    'directLFQ_fragions_top3': restable_directlfq_fragions_top3.formated_dataframe,
    'MaxLFQ\nDIANN': restable_diann.formated_dataframe,
}

num_results = len(restables)
a4_dims = (11.7, 8.27)
a4_width_no_margin = 10.5

# One panel for the combined box plot plus one scatter panel per results table.
fig, axes = plt.subplots(1, num_results + 1, figsize=(4 * num_results, 4.363))

# Parallel lists: entry i of `fcs_to_expect` is the expected log2 fold change of organism i.
organisms_to_plot = ["YEAST", "HUMAN"]
fcs_to_expect = [np.log2(3), 0]

merged_table = directlfq.benchmarking.ResultsTableMerger(
    method_name2results_df=methodname2formatted_df
).merged_table

# Panel 0: box plot across all methods.
MultiOrganismMultiMethodBoxPlotAdapted(
    method_ratio_results_table=merged_table,
    ax=axes[0],
    organisms_to_plot=organisms_to_plot,
    fcs_to_expect=fcs_to_expect,
)

# Panels 1..n: one intensity-vs-fold-change scatter per results table.
# NOTE(review): this uses the plotter from directlfq.benchmarking, not the class of
# the same name defined earlier in this notebook -- confirm which one is intended.
for idx, restable in enumerate(restables):
    name = restable_names[idx]
    print(name)
    directlfq.benchmarking.MultiOrganismIntensityFCPlotter(
        ax=axes[idx + 1],
        resultstable_w_ratios=restable,
        organisms_to_plot=organisms_to_plot,
        fcs_to_expect=fcs_to_expect,
        title=name,
    )

# Per-panel legends are removed; a single figure-level legend is added at the end.
for ax in axes:
    ax.get_legend().remove()

# Only the left-most panel keeps its y-axis label.
for ax in axes[1:]:
    ax.set_ylabel("")

# NOTE(review): every x-label is first set to "mean intensity" and then cleared
# again on all panels, so no panel ends up with an x-label -- confirm intent.
for ax in axes:
    ax.set_xlabel("mean intensity")
for idx, ax in enumerate(axes):
    ax.set_xlabel("")
    ax.set_ylim([-1, 2.5])

# Rotate the method names under the box plot so they do not overlap.
for tick in axes[0].get_xticklabels():
    tick.set_rotation(90)

# The line artists of the first scatter panel serve as handles for the shared legend.
lines = axes[1].get_lines()
fig.legend(
    lines,
    ["$S. cerevisiae$", "$H. sapiens$"],
    bbox_to_anchor=[0.85, 0.25],
    loc='center',
    ncol=1,
    title=None,
    frameon=False,
    labelspacing=0,
    handlelength=0.5,
    handletextpad=0.5,
)



In [None]:
import numpy as np
import directlfq.test_utils as testutils

# Expected log2 fold change per organism and the accompanying CI95 value,
# both handed to the ratio checker.
organism2expectedfc = {"YEAST": np.log2(3), "HUMAN": 0}
organism2CI95 = {"YEAST": 0.93, "HUMAN": 0.3}

# Run the ratio check once for every method's formatted results table.
for method, formatted_df in methodname2formatted_df.items():
    print(f"Checking {method}")
    testutils.RatioChecker(
        formatted_df=formatted_df,
        organism2expectedfc=organism2expectedfc,
        organism2CI95=organism2CI95,
    )


