In [None]:
#| default_exp protein_intensity_estimation

In [None]:
%reload_ext autoreload

%autoreload 2

In [None]:
#| export
import pandas as pd
import numpy as np
import directlfq.normalization as lfqnorm
import multiprocess
import itertools

def estimate_protein_intensities(normed_df, min_nonan, num_samples_quadratic, num_cores):
    """Derive protein pseudo-intensities from between-sample normalized data.

    Args:
        normed_df: log2 ion intensities; the first index level holds the
            protein identifier (it is read via ``get_level_values(0)``).
        min_nonan: minimum number of non-NaN ion values a sample needs in
            order to receive a protein value.
        num_samples_quadratic: forwarded to the per-protein normalization.
        num_cores: ``None`` or a value > 1 selects multiprocessing, a value
            <= 1 selects sequential processing.

    Returns:
        Tuple ``(protein_df, ion_df)``, both on linear (non-log) scale with
        missing values written as 0.
    """
    proteins = list(normed_df.index.get_level_values(0).unique())
    print(f"{len(proteins)} prots total")

    profiles_and_shifted_ions = get_list_of_tuple_w_protein_profiles_and_shifted_peptides(
        proteins, normed_df, num_samples_quadratic, min_nonan, num_cores
    )
    protein_df = get_protein_dataframe_from_list_of_protein_profiles(
        allprots=proteins,
        list_of_tuple_w_protein_profiles_and_shifted_peptides=profiles_and_shifted_ions,
        normed_df=normed_df,
    )
    ion_df = get_ion_intensity_dataframe_from_list_of_shifted_peptides(profiles_and_shifted_ions, proteins)

    return protein_df, ion_df


def get_list_of_tuple_w_protein_profiles_and_shifted_peptides(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores):
    """Dispatch the per-protein calculation to sequential or parallel execution.

    Sequential processing is used only when ``num_cores`` is given and <= 1;
    ``None`` or any larger value goes through the multiprocessing path.
    """
    run_sequentially = num_cores is not None and num_cores <= 1
    if run_sequentially:
        return get_list_with_sequential_processing(allprots, normed_df, num_samples_quadratic, min_nonan)
    return get_list_with_multiprocessing(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores)

def get_list_with_sequential_processing(allprots, normed_df, num_samples_quadratic, min_nonan):
    """Run the per-protein calculation one protein after another in this process.

    Returns a list with one ``(protein_profile, shifted_peptides)`` tuple per protein.
    """
    input_specs = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(
        normed_df, allprots, num_samples_quadratic, min_nonan
    )
    return [calculate_peptide_and_protein_intensities(*spec) for spec in input_specs]
    
def get_list_with_multiprocessing(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores):
    """Run the per-protein calculation in a pool of worker processes.

    Args:
        allprots: protein identifiers, in the order the results are returned.
        normed_df: log2 ion intensity table indexed by protein on level 0.
        num_samples_quadratic: forwarded to the per-protein normalization.
        min_nonan: minimum number of non-NaN ion values per sample.
        num_cores: number of worker processes; ``None`` lets the pool helper decide.

    Returns:
        List with one ``(protein_profile, shifted_peptides)`` tuple per protein.
    """
    # The context manager tears the pool down even when preparing the input or
    # a worker raises, so no worker processes are left behind. starmap blocks
    # until every result is collected, so nothing is lost on exit.
    with get_configured_multiprocessing_pool(num_cores) as pool:
        input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan)
        list_of_tuple_w_protein_profiles_and_shifted_peptides = pool.starmap(calculate_peptide_and_protein_intensities, input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan)
    return list_of_tuple_w_protein_profiles_and_shifted_peptides


def get_configured_multiprocessing_pool(num_cores):
    """Create a ``multiprocess.Pool`` and report how many processes it uses.

    When ``num_cores`` is ``None`` the CPU count is used, capped at 60.
    """
    multiprocess.freeze_support()
    if num_cores is None:
        # windows upper thread limit
        num_cores = min(multiprocess.cpu_count(), 60)
    pool = multiprocess.Pool(num_cores)
    print(f"using {pool._processes} processes")
    return pool


def get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan):
    """Build the argument tuples for ``calculate_peptide_and_protein_intensities``.

    Returns a lazy ``zip`` yielding ``(idx, protein_ion_df, num_samples_quadratic, min_nonan)``
    for every protein in ``allprots``.
    """
    per_protein_dfs = get_normed_dfs(normed_df, allprots)
    # count() is unbounded; zip stops once the per-protein frames are exhausted
    return zip(
        itertools.count(),
        per_protein_dfs,
        itertools.repeat(num_samples_quadratic),
        itertools.repeat(min_nonan),
    )




def get_normed_dfs(normed_df, allprots):
    """Extract one ion-level DataFrame per protein, in the order of ``allprots``.

    Frames with more than one row are shortened to at most 100 rows by
    ``ProtvalCutter`` and then cleaned of orphan ions by ``OrphanIonRemover``.
    """
    def _ion_frame_for(protein):
        # DataFrame definition to avoid pandas Series objects
        ion_frame = pd.DataFrame(normed_df.loc[protein])
        if len(ion_frame.index) <= 1:
            return ion_frame
        ion_frame = ProtvalCutter(ion_frame, maximum_df_length=100).get_dataframe()
        return OrphanIonRemover(ion_frame).orphan_removed_df

    return [_ion_frame_for(protein) for protein in allprots]


def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots):
    """Combine the shifted ion tables of all proteins into one linear-scale table.

    The second element of every result tuple is labelled with its protein,
    the tables are concatenated, converted from log2 via ``2**x`` and NaN
    is replaced by 0.
    """
    shifted_ion_tables = [result[1] for result in list_of_tuple_w_protein_profiles_and_shifted_peptides]
    labelled_ion_tables = add_protein_names_to_ion_ints(shifted_ion_tables, allprots)
    linear_ion_df = 2**pd.concat(labelled_ion_tables)
    return linear_ion_df.replace(np.nan, 0)

def add_protein_names_to_ion_ints(ion_ints, allprots):
    """Label the i-th ion table with the i-th protein name and return the labelled tables."""
    return [
        add_protein_name_to_ion_df(ion_table, allprots[position])
        for position, ion_table in enumerate(ion_ints)
    ]

def add_protein_name_to_ion_df(ion_df, protein):
    """Return a copy of ``ion_df`` indexed by ``("protein", "ion")``.

    Args:
        ion_df: ion-level table whose index is named ``"ion"``.
        protein: protein identifier written into the new ``"protein"`` level.

    Returns:
        A new DataFrame; ``ion_df`` itself is left untouched.
    """
    # assign() works on a copy, so the caller's frame (which may be a slice of
    # the full input table) does not silently gain a "protein" column.
    return ion_df.assign(protein=protein).reset_index().set_index(["protein", "ion"])


def get_protein_dataframe_from_list_of_protein_profiles(allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df):
    """Assemble the linear-scale protein intensity table.

    Proteins whose profile is ``None`` are left out. The remaining log2
    profiles are converted via ``2**x``, NaN is replaced by 0 and the protein
    name is returned as a regular ``"protein"`` column. Sample columns are
    taken from ``normed_df``.
    """
    profiles = [result[0] for result in list_of_tuple_w_protein_profiles_and_shifted_peptides]

    kept_proteins = []
    kept_profiles = []
    for position, protein in enumerate(allprots):
        profile = profiles[position]
        if profile is not None:
            kept_proteins.append(protein)
            kept_profiles.append(profile)

    protein_index = pd.Index(data=kept_proteins, name="protein")
    log2_protein_df = pd.DataFrame(kept_profiles, index=protein_index, columns=normed_df.columns)
    linear_protein_df = (2**log2_protein_df).replace(np.nan, 0)
    return linear_protein_df.reset_index()


def calculate_peptide_and_protein_intensities(idx, peptide_intensity_df, num_samples_quadratic, min_nonan):
    """Shift the ions of one protein onto each other and derive its protein profile.

    Args:
        idx: running protein number, used only for the progress message.
        peptide_intensity_df: log2 ion intensities of a single protein.
        num_samples_quadratic: forwarded to ``NormalizationManagerProtein``.
        min_nonan: minimum number of non-NaN ion values per sample.

    Returns:
        Tuple ``(protein_profile, shifted_peptides)``.
    """
    if idx % 100 == 0:
        print(f"prot {idx}")

    # total linear intensity of the unshifted input, used later for rescaling
    total_linear_intensity = np.nansum(2**peptide_intensity_df)

    has_at_least_two_samples = peptide_intensity_df.shape[1] >= 2
    if has_at_least_two_samples:
        shifted_peptides = lfqnorm.NormalizationManagerProtein(peptide_intensity_df, num_samples_quadratic = num_samples_quadratic).complete_dataframe
    else:
        shifted_peptides = peptide_intensity_df

    protein_profile = get_protein_profile_from_shifted_peptides(shifted_peptides, total_linear_intensity, min_nonan)
    return protein_profile, shifted_peptides


In [None]:
#| export
def get_protein_profile_from_shifted_peptides(normalized_peptide_profile_df, summed_pepints, min_nonan):
    """Compute the log2 protein profile and rescale it to the summed ion intensity.

    Returns ``None`` when no sample received a value; otherwise a numpy array
    whose linear sum equals ``summed_pepints``.
    """
    per_sample_log2 = np.array(get_list_with_protein_value_for_each_sample(normalized_peptide_profile_df, min_nonan))
    linear_total = np.nansum(2**per_sample_log2)
    if linear_total == 0:
        # this means all elements in the per-sample vector are nans
        return None
    log2_scaling = np.log2(summed_pepints/linear_total)
    return per_sample_log2+log2_scaling

def get_list_with_protein_value_for_each_sample(normalized_peptide_profile_df, min_nonan):
    """Return one protein value per sample: the median of the available ion values.

    Args:
        normalized_peptide_profile_df: shifted ion intensities, one column per sample.
        min_nonan: minimum number of non-NaN ion values a sample needs.

    Returns:
        List with one entry per column: the NaN-ignoring median, or NaN when
        the sample has fewer than ``min_nonan`` values or no values at all.
    """
    intens_vec = []
    for sample in normalized_peptide_profile_df.columns:
        ion_values = normalized_peptide_profile_df.loc[:, sample].to_numpy()
        # count in numpy instead of iterating the boolean mask in Python
        num_valid = np.count_nonzero(~np.isnan(ion_values))
        # the explicit > 0 check keeps nanmedian away from all-NaN columns
        # (which would only emit a RuntimeWarning and return NaN anyway)
        if num_valid > 0 and num_valid >= min_nonan:
            intens_vec.append(np.nanmedian(ion_values))
        else:
            intens_vec.append(np.nan)
    return intens_vec

In [None]:
#| export
import pandas as pd
from numba import njit

class ProtvalCutter():
    """Limits an ion table to the ``maximum_df_length`` rows with the fewest NaNs.

    Tables that are not longer than the limit are passed through unchanged
    and no sorting is performed for them (``_sorted_idx`` stays ``None``).
    """
    def __init__(self, protvals_df, maximum_df_length = 100):
        self._protvals_df = protvals_df
        self._maximum_df_length = maximum_df_length
        self._dataframe_too_long = None
        self._sorted_idx = None
        self._check_if_df_too_long_and_sort_index_if_so()

    def _check_if_df_too_long_and_sort_index_if_so(self):
        """Flag over-long tables and compute the NaN-sorted index only for those."""
        num_rows = len(self._protvals_df.index)
        self._dataframe_too_long = num_rows > self._maximum_df_length
        if not self._dataframe_too_long:
            return
        self._determine_nansorted_df_index()

    def _determine_nansorted_df_index(self):
        """Store the index labels ordered by ascending number of NaNs per row."""
        def num_nas_for_label(label):
            return self._get_num_nas_in_row(self._protvals_df.loc[label].to_numpy())

        self._sorted_idx = sorted(self._protvals_df.index, key=num_nas_for_label)

    @staticmethod
    @njit
    def _get_num_nas_in_row(row):
        # number of NaN entries in one row
        return sum(np.isnan(row))

    def get_dataframe(self):
        """Return the shortened table if the input was too long, else the input itself."""
        return self._get_shortened_dataframe() if self._dataframe_too_long else self._protvals_df

    def _get_shortened_dataframe(self):
        labels_to_keep = self._sorted_idx[:self._maximum_df_length]
        return self._protvals_df.loc[labels_to_keep]


In [None]:
#| export
import numpy as np
import pandas as pd

class OrphanIonRemover():
    """Removes ions that do not have any overlap with any of the other ions.

    Every row of ``protvals_df`` is classified with ``IonCheckedForOrphan``;
    ``OrphanIonsForDeletionSelector`` then decides which rows are dropped.
    The result is available as ``orphan_removed_df``.
    """
    def __init__(self, protvals_df : pd.DataFrame):
        self._protvals_df = protvals_df

        # intermediate results, filled by the _define_* steps below
        self._provals_is_not_na_df = None
        self._count_of_nonans_per_position = None
        self._orphan_ions = []
        self._non_orphan_ions = []

        self.orphan_removed_df = None

        # the steps depend on each other and have to run in this order
        self._define_protvals_is_not_na_df()
        self._define_count_of_nonans_per_position()
        self._define_orphan_ions_and_non_orphan_ions()
        self._define_orphan_removed_df()

    def _define_protvals_is_not_na_df(self):
        """Boolean table marking where a value is present."""
        self._provals_is_not_na_df = self._protvals_df.notna()

    def _define_count_of_nonans_per_position(self):
        """Per column: how many ions carry a value."""
        self._count_of_nonans_per_position = self._provals_is_not_na_df.sum(axis=0)

    def _define_orphan_ions_and_non_orphan_ions(self):
        presence_df = self._provals_is_not_na_df
        for ion in presence_df.index:
            presence_of_ion = presence_df.loc[ion].to_numpy()
            checked_ion = IonCheckedForOrphan(ion, self._count_of_nonans_per_position, presence_of_ion)
            self._append_to_orphan_or_non_orphan_list(checked_ion)

    def _append_to_orphan_or_non_orphan_list(self, orphan_checked_ion):
        target_list = self._orphan_ions if orphan_checked_ion.is_orphan else self._non_orphan_ions
        target_list.append(orphan_checked_ion)

    def _define_orphan_removed_df(self):
        selector = OrphanIonsForDeletionSelector(self._orphan_ions, self._non_orphan_ions)
        self.orphan_removed_df = self._protvals_df.drop(selector.ion_accessions_for_deletion, axis='index')



class OrphanIonsForDeletionSelector():
    """Decides which orphan ions are removed from a protein's ion table.

    * If non-orphan ions exist, every orphan ion is selected for deletion.
    * If there are only orphan ions, the one with the highest ``num_nonans``
      is kept and all others are selected.
    * Otherwise (no ions, or a single orphan and nothing else) nothing is
      selected.

    The accessions of the selected ions are available as
    ``ion_accessions_for_deletion`` (always a list).
    """
    def __init__(self, orphan_ions : list, non_orphan_ions : list):
        self._orphan_ions = orphan_ions
        self._non_orphan_ions = non_orphan_ions

        # Default to "delete nothing". Leaving this as None made
        # DataFrame.drop(None, axis='index') raise for a lone orphan ion.
        self.ion_accessions_for_deletion = []

        self._define_orphan_ions_for_deletion()

    def _define_orphan_ions_for_deletion(self):
        if len(self._non_orphan_ions)>0:
            self.ion_accessions_for_deletion = self._get_accessions_of_list_of_ions(self._orphan_ions)
        elif len(self._orphan_ions)>1:
            # only orphans: keep the best-covered one, drop the rest
            self._sort_list_of_ions_by_num_nonans_descending(self._orphan_ions)
            orphan_ions_to_delete = self._orphan_ions[1:]
            self.ion_accessions_for_deletion = self._get_accessions_of_list_of_ions(orphan_ions_to_delete)

    def _get_accessions_of_list_of_ions(self, ions_checked_for_orphan : list):
        return [ion_checked_for_orphan.ion_accession for ion_checked_for_orphan in ions_checked_for_orphan]

    def _sort_list_of_ions_by_num_nonans_descending(self, ions : list):
        # sorts the given list in place
        ions.sort(key=lambda x: x.num_nonans, reverse=True)
    




class IonCheckedForOrphan():
    """Classifies a single ion as orphan or non-orphan.

    ``is_orphan`` is set when, among the positions where this ion has a
    value, the highest number of ions with a value is exactly 1.
    ``num_nonans`` is the sum of those per-position counts.

    Args:
        ion_accession: label of the ion.
        count_of_nonans_per_position: per position, how many ions have a value.
        is_nonan_per_position_for_ion: boolean mask of the positions where
            this ion has a value.
    """
    def __init__(self, ion_accession, count_of_nonans_per_position : np.array, is_nonan_per_position_for_ion : np.array):
        self.ion_accession = ion_accession

        self._count_of_nonans_per_position = count_of_nonans_per_position
        self._is_nonan_per_position_for_ion = is_nonan_per_position_for_ion
        self._count_of_nonans_per_position_for_ion = None

        self.is_orphan = None
        self.num_nonans = None

        # the counts restricted to this ion must be available before the two checks
        self._define_count_of_nonans_per_position_for_ion()
        self._check_if_is_orphan()
        self._define_num_nonans()

    def _define_count_of_nonans_per_position_for_ion(self):
        positions_with_value = self._is_nonan_per_position_for_ion
        self._count_of_nonans_per_position_for_ion = self._count_of_nonans_per_position[positions_with_value]

    def _check_if_is_orphan(self):
        highest_count = np.max(self._count_of_nonans_per_position_for_ion)
        self.is_orphan = highest_count == 1

    def _define_num_nonans(self):
        self.num_nonans = np.sum(self._count_of_nonans_per_position_for_ion)

### Unit Tests

#### Classes for testcase generation

In [None]:
import numpy as np
import pandas as pd

from  numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

class ProteinProfileGenerator():
    """Builds a log2 ion intensity table from simulated peptide profiles.

    The result, ``protein_profile_dataframe``, has one row per peptide
    profile, is indexed by ``("protein", "ion")`` (the ion name is the
    position of the profile as a string) and has zeros converted to NaN
    before the log2 transform.
    """
    def __init__(self, peptide_profiles):
        self._peptide_profiles = peptide_profiles

        self.protein_profile_dataframe = None
        self._generate_protein_profile_dataframe()

    def _generate_protein_profile_dataframe(self):
        profiles = self._peptide_profiles
        intensity_rows = [profile.peptide_profile_vector for profile in profiles]
        protein_level = [profile.protein_name for profile in profiles]
        ion_level = [str(position) for position in range(len(profiles))]

        linear_df = pd.DataFrame(intensity_rows, index=[protein_level, ion_level])
        linear_df.index.names = ['protein', 'ion']
        # zeros stand for missing values and must not reach log2
        self.protein_profile_dataframe = np.log2(linear_df.replace(0, np.nan))



class PeptideProfile():
    """Simulated intensity profile of one peptide across ``num_samples`` samples.

    Every instance starts from the same fixed-seed random template, multiplies
    it by ``systematic_peptide_shift``, optionally applies Poisson noise and
    finally sets a fraction of randomly chosen samples to zero.

    Args:
        protein_name: name of the protein the peptide belongs to.
        fraction_zeros_in_profile: fraction of samples set to zero (rounded down).
        systematic_peptide_shift: factor applied to the whole template.
        add_noise: if true, replace each value by a Poisson draw around it.
        num_samples: length of the profile.
        min_intensity: lower bound (inclusive) of the template intensities.
        max_intensity: upper bound (exclusive) of the template intensities.
        seed: optional seed for the noise and the zero placement. ``None``
            (default) draws from numpy's global random state as before.

    Attributes:
        protein_name: as passed in.
        peptide_profile_vector: numpy array with the simulated intensities.
    """
    def __init__(self, protein_name, fraction_zeros_in_profile, systematic_peptide_shift, add_noise, num_samples = 20, min_intensity = 1e6, max_intensity = 1e10, seed = None):
        self._fraction_zeros_in_profile = fraction_zeros_in_profile
        self._systematic_peptide_shift = systematic_peptide_shift
        self._add_noise = add_noise
        self._min_intensity = min_intensity
        self._max_intensity = max_intensity
        self._num_samples = num_samples

        # np.random (the module) and RandomState expose the same poisson/choice API
        self._rng = np.random if seed is None else RandomState(seed)

        self.protein_name = protein_name
        self.peptide_profile_vector = []
        self._define_peptide_profile_vector()

    def _define_peptide_profile_vector(self):
        self.peptide_profile_vector = self._get_single_peptide_profile_template()
        self._scale_profile_vector()
        if self._add_noise:
            self._apply_poisson_noise_to_profilevector()
        self._add_zeros_to_profilevector()

    def _get_single_peptide_profile_template(self):
        # fixed seed: all peptides share the same underlying profile shape
        rs = RandomState(MT19937(SeedSequence(42312)))
        # int64 is requested explicitly because the default upper bound of 1e10
        # does not fit a 32-bit integer, which is the platform default on some
        # systems (e.g. Windows with NumPy < 2)
        return rs.randint(low=self._min_intensity, high=self._max_intensity, size=self._num_samples, dtype=np.int64)

    def _scale_profile_vector(self):
        self.peptide_profile_vector = self.peptide_profile_vector*self._systematic_peptide_shift

    def _apply_poisson_noise_to_profilevector(self):
        self.peptide_profile_vector = self._rng.poisson(lam=self.peptide_profile_vector, size=len(self.peptide_profile_vector))

    def _add_zeros_to_profilevector(self):
        num_elements_to_set_zero = int(self._num_samples*self._fraction_zeros_in_profile)
        idxs_to_set_zero = self._rng.choice(self._num_samples, size=num_elements_to_set_zero, replace=False)
        self.peptide_profile_vector[idxs_to_set_zero] = 0
        

# Unit Tests

In [None]:
import pandas as pd
import numpy as np
#test df cutting

def test_sorting_by_num_nans():
    """An over-long table must get an index sorted by ascending NaN count."""
    one_value = np.array([9, np.nan, np.nan, np.nan])
    two_values = np.array([5, 6, np.nan, np.nan])
    three_values = np.array([1, 2, 3, np.nan])

    ion_df = pd.DataFrame([one_value, two_values, three_values], index=[['P', 'P', 'P'], ['A', 'B', 'C']])
    nansorted_df = ion_df.loc[ProtvalCutter(ion_df, maximum_df_length=2)._sorted_idx]

    # sparsest row ends up last, most complete row first
    assert np.allclose(nansorted_df.iloc[2].to_numpy(), one_value, equal_nan=True)
    assert np.allclose(nansorted_df.iloc[0].to_numpy(), three_values, equal_nan=True)
    

def test_cutting_of_df():
    """With a limit of two rows only the two most complete ions may remain."""
    rows = [
        np.array([9, np.nan, np.nan, np.nan]),
        np.array([5, 6, np.nan, np.nan]),
        np.array([1, 2, 3, np.nan]),
    ]
    ion_df = pd.DataFrame(rows, index=[['A', 'B', 'C']])

    shortened_df = ProtvalCutter(ion_df, maximum_df_length=2).get_dataframe()
    kept_ions = [label[0] for label in shortened_df.index]
    print(kept_ions)
    assert kept_ions == ['C', 'B']





In [None]:


# Run the ProtvalCutter tests defined in the cell above.
test_sorting_by_num_nans()
test_cutting_of_df()

['C', 'B']


In [None]:
import pandas as pd
import numpy as np
#test orphan ion removal

def test_orphan_detection():
    """Check OrphanIonRemover on tables with one, three, only and no orphan ions."""
    expected_without_orphans = create_compare_df_no_orphans()
    expected_for_only_orphans = create_compare_df_only_orphans()

    cleaned_one_orphan = OrphanIonRemover(create_test_df_one_orphan()).orphan_removed_df
    cleaned_three_orphans = OrphanIonRemover(create_test_df_three_orphans()).orphan_removed_df
    cleaned_only_orphans = OrphanIonRemover(create_test_df_only_orphans()).orphan_removed_df
    # a table without orphans has to pass through unchanged
    cleaned_without_orphans = OrphanIonRemover(expected_without_orphans).orphan_removed_df

    display(cleaned_without_orphans)

    assert cleaned_one_orphan.equals(expected_without_orphans)
    assert cleaned_three_orphans.equals(expected_without_orphans)
    assert cleaned_only_orphans.equals(expected_for_only_orphans)
    assert cleaned_without_orphans.equals(expected_without_orphans)

    print('test_orphan_detection passed')

def create_test_df_one_orphan():
    """Ions A and B share samples; ion C is measured only where no other ion is."""
    nan = np.nan
    values_per_ion = {
        'A': [5, 6, nan, nan, nan, nan, nan],
        'B': [1, 2, 3, nan, nan, nan, nan],
        'C': [nan, nan, nan, 9, nan, nan, nan],
    }
    ions = list(values_per_ion)
    rows = [np.array(values_per_ion[ion]) for ion in ions]
    return pd.DataFrame(rows, index=[['P'] * len(ions), ions])

def create_test_df_three_orphans():
    """Ions A and B share samples; ions C, D and E each occupy samples of their own."""
    nan = np.nan
    values_per_ion = {
        'A': [5, 6, nan, nan, nan, nan, nan],
        'B': [1, 2, 3, nan, nan, nan, nan],
        'C': [nan, nan, nan, 9, nan, nan, nan],
        'D': [nan, nan, nan, nan, 10, nan, nan],
        'E': [nan, nan, nan, nan, nan, 11, 12],
    }
    ions = list(values_per_ion)
    rows = [np.array(values_per_ion[ion]) for ion in ions]
    return pd.DataFrame(rows, index=[['P'] * len(ions), ions])


def create_compare_df_no_orphans():
    """Expected result once all orphan ions are removed: only ions A and B remain."""
    nan = np.nan
    values_per_ion = {
        'A': [5, 6, nan, nan, nan, nan, nan],
        'B': [1, 2, 3, nan, nan, nan, nan],
    }
    ions = list(values_per_ion)
    rows = [np.array(values_per_ion[ion]) for ion in ions]
    return pd.DataFrame(rows, index=[['P'] * len(ions), ions])

def create_test_df_only_orphans():
    """Three ions (C, D, E) that never share a sample with each other."""
    nan = np.nan
    values_per_ion = {
        'C': [nan, nan, nan, 9, nan, nan, nan],
        'D': [nan, nan, nan, nan, 10, nan, nan],
        'E': [nan, nan, nan, nan, nan, 11, 12],
    }
    ions = list(values_per_ion)
    rows = [np.array(values_per_ion[ion]) for ion in ions]
    return pd.DataFrame(rows, index=[['P'] * len(ions), ions])

def create_compare_df_only_orphans():
    """Expected result for the only-orphans table: just ion E, which has two values."""
    nan = np.nan
    values_of_ion_e = np.array([nan, nan, nan, nan, nan, 11, 12])
    return pd.DataFrame([values_of_ion_e], index=[['P'], ['E']])
    


In [None]:

# Run the orphan ion removal test defined in the cell above.
test_orphan_detection()

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6
P,A,5.0,6.0,,,,,
P,B,1.0,2.0,3.0,,,,


test_orphan_detection passed


In [None]:
def test_that_profiles_without_noise_are_shifted_exactly_on_top_of_each_other():
    """Noise-free peptides that differ only by a constant factor must coincide after normalization."""
    zero_fraction_and_shift = [(0.1, 3000), (0.9, 3), (0.1, 0.1), (0.9, 100)]
    peptides = [
        PeptideProfile(protein_name="protA", fraction_zeros_in_profile=zero_fraction, systematic_peptide_shift=shift, add_noise=False)
        for zero_fraction, shift in zero_fraction_and_shift
    ]

    ion_df = ProteinProfileGenerator(peptides).protein_profile_dataframe
    display(ion_df)
    shifted_ion_df = lfqnorm.normalize_ion_profiles(ion_df)
    display(shifted_ion_df)

    # NOTE: the zero positions are random, so which ions have a value in sample 11 varies between runs
    values_in_sample = shifted_ion_df.iloc[:, 11].dropna().to_numpy()
    display(values_in_sample)
    assert np.allclose(values_in_sample, values_in_sample[0])
   

In [None]:
 
# Run the noise-free normalization test defined in the cell above.
test_that_profiles_without_noise_are_shifted_exactly_on_top_of_each_other()


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,,44.474241,41.489085,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,,43.106703,44.630975,43.38065,43.407413
protA,1,,,31.523301,,,,,,,33.659723,,,,,,,,,,
protA,2,29.73029,29.601566,26.61641,28.343632,28.633033,27.465729,,28.229946,,28.752833,28.388665,25.330674,29.574689,29.681352,29.819668,29.829015,28.234029,29.758301,28.507975,28.534739
protA,3,,,,,,,,,,,,,,,,39.794799,,,38.473759,


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,,44.474241,41.489085,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,,43.106703,44.630975,43.38065,43.407413
protA,1,,,41.489085,,,,,,,43.625508,,,,,,,,,,
protA,2,44.602965,44.474241,41.489085,43.216307,43.505708,42.338404,,43.102621,,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,44.630975,43.38065,43.407413
protA,3,,,,,,,,,,,,,,,,44.70169,,,43.38065,


array([40.20334853, 40.20334853])

In [None]:
def test_that_profiles_with_noise_are_close():
    """Complete peptide profiles with Poisson noise must end up close together after normalization."""
    systematic_shifts = [3000, 3, 0.1, 100]
    peptides = [
        PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0, systematic_peptide_shift=shift, add_noise=True)
        for shift in systematic_shifts
    ]

    ion_df = ProteinProfileGenerator(peptides).protein_profile_dataframe
    display(ion_df)

    shifted_ion_df = lfqnorm.normalize_ion_profiles(ion_df)
    display(shifted_ion_df)

    # compare all ions within one sample against the first ion
    values_in_sample = shifted_ion_df.iloc[:, 9].dropna().to_numpy()
    assert np.allclose(values_in_sample, values_in_sample[0], rtol=0.01, atol=0.01)



In [None]:

# Run the noisy-profile normalization test defined in the cell above.
test_that_profiles_with_noise_are_close()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,44.602965,44.474241,41.489084,43.216306,43.505708,42.338405,43.454212,43.102621,41.897534,43.625507,43.261339,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,44.630975,43.38065,43.407413
protA,1,34.637164,34.508471,31.523309,33.250541,33.539932,32.372641,33.488428,33.136851,31.931758,33.65973,33.295568,30.237636,34.481571,34.588258,34.72655,34.735909,33.140916,34.665196,33.41489,33.44164
protA,2,29.730298,29.601666,26.616563,28.343701,28.633109,27.465632,28.581503,28.229946,27.024903,28.752767,28.388581,25.330811,29.574709,29.681298,29.819694,29.829052,28.234126,29.758298,28.507966,28.534798
protA,3,39.696074,39.56735,36.58219,38.309413,38.598815,37.431515,38.547322,38.195731,36.990643,38.718618,38.354448,35.29646,39.54047,39.647135,39.785452,39.794799,38.199812,39.724086,38.473757,38.500524


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,44.602965,44.474241,41.489084,43.216306,43.505708,42.338405,43.454212,43.102621,41.897534,43.625507,43.261339,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,44.630975,43.38065,43.407413
protA,1,44.602938,44.474245,41.489083,43.216316,43.505706,42.338415,43.454202,43.102625,41.897532,43.625504,43.261342,40.20341,44.447345,44.554032,44.692324,44.701683,43.10669,44.63097,43.380664,43.407414
protA,2,44.602943,44.474311,41.489208,43.216346,43.505754,42.338277,43.454148,43.102591,41.897548,43.625412,43.261226,40.203456,44.447354,44.553943,44.692339,44.701697,43.106771,44.630943,43.380611,43.407443
protA,3,44.602964,44.474241,41.489081,43.216304,43.505705,42.338406,43.454212,43.102622,41.897534,43.625509,43.261339,40.203351,44.447361,44.554026,44.692343,44.70169,43.106703,44.630977,43.380647,43.407415


In [None]:
import directlfq.protein_intensity_estimation as intensity_estimation

def test_that_protein_intensities_are_retained():
    """The summed protein intensities must equal the summed raw peptide intensities."""
    zero_fraction_and_shift = [(0.1, 3000), (0, 3), (0, 0.1), (0, 100)]
    peptide_profiles = [
        PeptideProfile(protein_name="protA", fraction_zeros_in_profile=zero_fraction, systematic_peptide_shift=shift, add_noise=True)
        for zero_fraction, shift in zero_fraction_and_shift
    ]
    summed_intensity_protein = sum(np.nansum(profile.peptide_profile_vector) for profile in peptide_profiles)

    ion_df = ProteinProfileGenerator(peptide_profiles).protein_profile_dataframe
    protein_df_normed, _ = intensity_estimation.estimate_protein_intensities(ion_df, min_nonan=1, num_samples_quadratic=100, num_cores=1)
    display(protein_df_normed)

    # first row, without the leading "protein" column
    lfq_intensities = protein_df_normed.iloc[0,1:].to_numpy()
    display(lfq_intensities)
    summed_lfq_intensities = np.sum(lfq_intensities)
    assert np.allclose(summed_lfq_intensities, summed_intensity_protein)



In [None]:

# Run the intensity retention test defined in the cell above.
test_that_protein_intensities_are_retained()

1 prots total
prot 0


Unnamed: 0,protein,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,protA,25685480000000.0,23492990000000.0,2966971000000.0,9823451000000.0,12005510000000.0,5345491000000.0,11584530000000.0,9079027000000.0,3937967000000.0,...,10134860000000.0,1216952000000.0,23059380000000.0,24828780000000.0,27327120000000.0,27504750000000.0,9104758000000.0,26189130000000.0,11008680000000.0,11214790000000.0


array([25685480068798.477, 23492985729224.617, 2966971322442.3076,
       9823451227982.002, 12005507094436.135, 5345491410602.842,
       11584530276439.023, 9079026570610.525, 3937966565364.505,
       13045002854709.543, 10134862066156.398, 1216952380115.5583,
       23059382371708.06, 24828783469561.055, 27327116905724.043,
       27504745639008.09, 9104757652175.703, 26189128807181.58,
       11008676725692.262, 11214793770630.262], dtype=object)

In [None]:
import directlfq.protein_intensity_estimation as intensity_estimation

def run_with_multiple_proteins():
    """Smoke run of the protein intensity estimation on four simulated proteins."""
    # (protein, fraction of zeros, systematic shift) for each simulated peptide
    peptide_specs = [
        ("protA", 0.1, 3000),
        ("protA", 0, 3),
        ("protA", 0, 0.1),
        ("protB", 0, 100),
        ("protC", 0, 100),
        ("protD", 0, 100),
        ("protD", 0, 100),
        ("protD", 0, 100),
    ]
    peptide_profiles = [
        PeptideProfile(protein_name=protein, fraction_zeros_in_profile=zero_fraction, systematic_peptide_shift=shift, add_noise=True)
        for protein, zero_fraction, shift in peptide_specs
    ]

    ion_df = ProteinProfileGenerator(peptide_profiles).protein_profile_dataframe
    protein_df_normed, _ = intensity_estimation.estimate_protein_intensities(ion_df, min_nonan=1, num_samples_quadratic=100, num_cores=1)
    display(protein_df_normed)
    


In [None]:
# Execute the multi-protein smoke run defined in the cell above.
run_with_multiple_proteins()

4 prots total
prot 0


Unnamed: 0,protein,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,protA,22992370000000.0,21029730000000.0,2655975000000.0,8793380000000.0,10746810000000.0,4785012000000.0,10369900000000.0,8127093000000.0,3525069000000.0,...,9072232000000.0,1089350000000.0,20641590000000.0,22225750000000.0,24461850000000.0,24621190000000.0,8150122000000.0,23443130000000.0,9854382000000.0,10038830000000.0
1,protB,890653300000.0,814627500000.0,102881800000.0,340630700000.0,416296300000.0,185357100000.0,401696900000.0,314818500000.0,136550000000.0,...,351430900000.0,42198250000.0,799590900000.0,860949400000.0,947578000000.0,953735400000.0,315711000000.0,908115300000.0,381730000000.0,388877200000.0
2,protC,890653500000.0,814626100000.0,102881800000.0,340630200000.0,416295000000.0,185356400000.0,401698100000.0,314819100000.0,136550300000.0,...,351429900000.0,42198250000.0,799591800000.0,860948900000.0,947576000000.0,953737100000.0,315710600000.0,908116000000.0,381730700000.0,388876700000.0
3,protD,2671958000000.0,2443884000000.0,308644900000.0,1021891000000.0,1248887000000.0,556070200000.0,1205096000000.0,944456500000.0,409653500000.0,...,1054292000000.0,126594200000.0,2398780000000.0,2582853000000.0,2842733000000.0,2861210000000.0,947131300000.0,2724346000000.0,1145187000000.0,1166633000000.0


## Learning tests

In [None]:
import pandas as pd
import numpy as np

def test_that_dataframe_is_generated_as_expected():
    vals1 = np.array([1, 2, 3,4 ])
    vals2 = np.array([5, 6, 7, 8])
    vals3 = np.array([9, 10, 11, 12])
    df = pd.DataFrame([vals1, vals2, vals3],index=['A', 'A', 'A'])
    display(df)
    assert df.iloc[2, 2] == 11
    assert df.iloc[1, 2] == 7




In [None]:

# Run the DataFrame construction learning test defined in the cell above.
test_that_dataframe_is_generated_as_expected()

Unnamed: 0,0,1,2,3
A,1,2,3,4
A,5,6,7,8
A,9,10,11,12


In [None]:
def test_retrieval_of_numpy_arrays_from_dataframe():
    vals1 = np.array([1, 2, 3,4 ])
    vals2 = np.array([5, 6, 7, 8])
    vals3 = np.array([9, 10, 11, 12])
    df = pd.DataFrame([vals1, vals2, vals3],index=[['A', 'B', 'C'], ['a', 'b', 'a']])
    display(df)
    assert np.allclose(vals2, df.loc['B'])
    assert np.allclose([2, 6, 10], df.loc[:,1])


In [None]:

# Run the array retrieval learning test defined in the cell above.
test_retrieval_of_numpy_arrays_from_dataframe()

Unnamed: 0,Unnamed: 1,0,1,2,3
A,a,1,2,3,4
B,b,5,6,7,8
C,a,9,10,11,12


In [None]:
def test_setting_numpy_seed():
    from numpy.random import MT19937
    from numpy.random import RandomState, SeedSequence

    rs = RandomState(MT19937(SeedSequence(42)))
    res = rs.randint(10,size=20)
    display(res)


In [None]:

# Run the seeding learning test defined in the cell above.
test_setting_numpy_seed()

array([2, 6, 8, 8, 3, 3, 3, 3, 4, 7, 2, 7, 5, 4, 0, 8, 1, 3, 7, 1])