In [1]:
# default_exp protein_intensity_estimation

In [2]:
%reload_ext autoreload

%autoreload 2

In [3]:
#export
import pandas as pd
import numpy as np
import directlfq.normalization as lfqnorm

def estimate_protein_intensities(normed_df, min_nonan):
    "derives protein pseudointensities from between-sample normalized data"
    prot_ints = []
    ion_ints = []

    count_prots = 1
    allprots = normed_df.index.get_level_values(0).unique()

    for protein in allprots:
        if(count_prots%100 ==0):
            print(f"prot {count_prots} of {len(allprots)}")
        count_prots+=1

        protvals = pd.DataFrame(normed_df.loc[protein]).copy()#DataFrame definition to avoid pandas Series objects
        
        summed_pepint = np.nansum(2**protvals)

        if(protvals.shape[1]<2):
            normed_protvals = protvals
        else:
            normed_protvals = lfqnorm.normalize_ion_profiles(protvals)
        
        ion_ints.extend(normed_protvals)
        scaled_vec = get_protein_profile_from_shifted_peptides(normed_protvals, summed_pepint, min_nonan)
        prot_ints.append(scaled_vec)
        

    protein_df = 2**pd.DataFrame(prot_ints, index = allprots, columns = normed_df.columns)
    protein_df = protein_df.replace(np.nan, 0)
    
    ion_df = 2**pd.concat(ion_ints)
    ion_df = ion_df.replace(np.nan, 0)
    return protein_df, ion_df

In [4]:
#export
def get_protein_profile_from_shifted_peptides(normalized_peptide_profile_df, summed_pepints, min_nonan):
    intens_vec = []
    for sample in normalized_peptide_profile_df.columns:
        reps = normalized_peptide_profile_df.loc[:,sample].to_numpy()
        nonan_elems = sum(~np.isnan(reps))
        if(nonan_elems>=min_nonan):
            intens_vec.append(np.nanmedian(reps))
        else:
            intens_vec.append(np.nan)
    
    intens_vec = np.array(intens_vec)
    intens_conversion_factor = summed_pepints/np.nansum(2**intens_vec)
    scaled_vec = intens_vec+np.log2(intens_conversion_factor)
    return scaled_vec


In [74]:
#export
import pandas as pd

class ProtvalCutter():
    def __init__(self, protvals_df, maximum_df_length = 100):
        self._protvals_df = protvals_df
        self._maximum_df_length = maximum_df_length
        self._dataframe_too_long = None
        self._sorted_idx = None
        self._check_if_df_too_long_and_sort_index_if_so()


    def _check_if_df_too_long_and_sort_index_if_so(self):
        self._dataframe_too_long =len(self._protvals_df.index)>self._maximum_df_length
        if self._dataframe_too_long:
            self._determine_nansorted_df_index()

    def _determine_nansorted_df_index(self):
        idxs = self._protvals_df.index
        self._sorted_idx =  sorted(idxs, key= lambda idx : self.get_num_nas_in_row(self._protvals_df.loc[idx]))
        
    @staticmethod
    def get_num_nas_in_row(row):
        return sum(np.isnan(row.to_numpy()))

    def get_shortened_dataframe_if_too_long(self):
        shortened_index = self._sorted_idx[:self._maximum_df_length+1]
        return self._protvals_df.loc[shortened_index]

    def remove_rejected_idxs



In [75]:
import pandas as pd
import numpy as np

def test_sorting_by_num_nans():
    vals1 = np.array([9, np.nan, np.nan, np.nan])
    vals2 = np.array([5, 6, np.nan, np.nan])
    vals3 = np.array([1, 2, 3,np.nan ])

    df = pd.DataFrame([vals1, vals2, vals3],index=[['P', 'P', 'P'],['A', 'B', 'C']])
    df_sorted = sort_datframe_by_num_nans_ascending(df)
    display(df)
    display(df_sorted)
    assert np.allclose(df_sorted.iloc[2].to_numpy(), vals1,equal_nan=True)
    assert np.allclose(df_sorted.iloc[0].to_numpy(), vals3,equal_nan=True)
    



test_sorting_by_num_nans()

Unnamed: 0,Unnamed: 1,0,1,2,3
P,A,9.0,,,
P,B,5.0,6.0,,
P,C,1.0,2.0,3.0,


Unnamed: 0,Unnamed: 1,0,1,2,3
P,C,1.0,2.0,3.0,
P,B,5.0,6.0,,
P,A,9.0,,,


ValueError: Can only compare identically-labeled DataFrame objects

### Unit Tests

#### Classes for testcase generation

In [6]:
import numpy as np
import pandas as pd

from  numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

class ProteinProfileGenerator():
    def __init__(self, peptide_profiles):
        self._peptide_profiles = peptide_profiles
        
        self.protein_profile_dataframe = None
        self._generate_protein_profile_dataframe()

    def _generate_protein_profile_dataframe(self):
        collected_profiles = [x.peptide_profile_vector for x in self._peptide_profiles]
        protnames_for_index = [x.protein_name for x in self._peptide_profiles]
        pepnames_for_index = [f'{idx}' for idx in range(len(self._peptide_profiles))]
        self.protein_profile_dataframe = pd.DataFrame(collected_profiles,index=[protnames_for_index, pepnames_for_index])
        self.protein_profile_dataframe = np.log2(self.protein_profile_dataframe.replace(0, np.nan))



class PeptideProfile():
    def __init__(self, protein_name, fraction_zeros_in_profile, systematic_peptide_shift, add_noise, num_samples = 20, min_intensity = 1e6, max_intensity = 1e10):


        self._fraction_zeros_in_profile = fraction_zeros_in_profile
        self._systematic_peptide_shift = systematic_peptide_shift
        self._add_noise = add_noise
        self._min_intensity = min_intensity
        self._max_intensity = max_intensity
        self._num_samples = num_samples

        self.protein_name = protein_name
        self.peptide_profile_vector = []
        self._define_peptide_profile_vector()

    def _define_peptide_profile_vector(self):
        self.peptide_profile_vector = self._get_single_peptide_profile_template()
        self._scale_profile_vector()
        if self._add_noise:
            self._apply_poisson_noise_to_profilevector()
        self._add_zeros_to_profilevector()

    def _get_single_peptide_profile_template(self):
        rs = RandomState(MT19937(SeedSequence(42312)))
        return rs.randint(low=self._min_intensity, high=self._max_intensity,size=self._num_samples)

    def _scale_profile_vector(self):
        self.peptide_profile_vector = self.peptide_profile_vector*self._systematic_peptide_shift

    def _apply_poisson_noise_to_profilevector(self):
        self.peptide_profile_vector = np.random.poisson(lam=self.peptide_profile_vector, size=len(self.peptide_profile_vector))

    def _add_zeros_to_profilevector(self):
        num_elements_to_set_zero = int(self._num_samples*self._fraction_zeros_in_profile)
        idxs_to_set_zero = np.random.choice(self._num_samples,size=num_elements_to_set_zero, replace=False)
        self.peptide_profile_vector[idxs_to_set_zero] = 0
        


#### Tests

In [None]:
def test_that_profiles_without_noise_are_shifted_exactly_on_top_of_each_other():
    peptide1= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0.1, systematic_peptide_shift=3000, add_noise=False)
    peptide2= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0.9, systematic_peptide_shift=3, add_noise=False)
    peptide3= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0.1, systematic_peptide_shift=0.1, add_noise=False)
    peptide4= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0.9, systematic_peptide_shift=100, add_noise=False)
    protein_df = ProteinProfileGenerator([peptide1, peptide2, peptide3, peptide4]).protein_profile_dataframe
    display(protein_df)
    normed_ion_profile = lfqnorm.normalize_ion_profiles(protein_df)
    display(normed_ion_profile)
    column_from_shifted = normed_ion_profile.iloc[:,11].dropna().to_numpy()
    display(column_from_shifted)
    assert np.allclose(column_from_shifted, column_from_shifted[0])
    
test_that_profiles_without_noise_are_shifted_exactly_on_top_of_each_other()



Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protA,0,44.602965,44.474241,,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,,43.38065,43.407413
protA,1,34.637181,,,,,,,33.136837,,,,,,,,,,,,
protA,2,29.73029,29.601566,26.61641,28.343632,28.633033,27.465729,28.581537,28.229946,27.024859,,28.388665,25.330674,29.574689,,29.819668,29.829015,28.234029,29.758301,28.507975,28.534739
protA,3,39.696074,39.56735,,,,,,,,,,,,,,,,,,


Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protA,0,44.602965,44.474241,,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,,43.38065,43.407413
protA,1,44.602965,,,,,,,43.102621,,,,,,,,,,,,
protA,2,44.602965,44.474241,41.489085,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,,43.26134,40.203349,44.447364,,44.692343,44.70169,43.106703,44.630975,43.38065,43.407413
protA,3,44.602965,44.474241,,,,,,,,,,,,,,,,,,


array([40.20334853, 40.20334853])

In [None]:
def test_that_profiles_with_noise_are_close():
    peptide1= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0, systematic_peptide_shift=3000, add_noise=True)
    peptide2= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0, systematic_peptide_shift=3, add_noise=True)
    peptide3= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0, systematic_peptide_shift=0.1, add_noise=True)
    peptide4= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)

    protein_df = ProteinProfileGenerator([peptide1, peptide2, peptide3, peptide4]).protein_profile_dataframe
    
    normed_ion_profile = lfqnorm.normalize_ion_profiles(protein_df)
    display(normed_ion_profile)
    column_from_shifted = normed_ion_profile.iloc[:,9].dropna().to_numpy()

    assert np.allclose(column_from_shifted, column_from_shifted[0],rtol=0.01, atol=0.01)


test_that_profiles_with_noise_are_close()

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protA,0,44.602965,44.47424,41.489087,43.216307,43.505707,42.338405,43.454211,43.102621,41.897534,43.625508,43.261339,40.20335,44.447364,44.554028,44.692343,44.70169,43.106703,44.630975,43.38065,43.407413
protA,1,44.602976,44.474239,41.489118,43.216318,43.50569,42.338418,43.454209,43.102632,41.897513,43.625511,43.261338,40.203304,44.447354,44.554026,44.692347,44.701691,43.106699,44.630979,43.380646,43.40744
protA,2,44.60291,44.474247,41.488871,43.216364,43.5058,42.338546,43.454158,43.102638,41.897646,43.62558,43.261252,40.203183,44.44746,44.554141,44.69233,44.701679,43.106673,44.630996,43.380641,43.407325
protA,3,44.602964,44.474242,41.489088,43.216305,43.505705,42.338405,43.454209,43.102622,41.897536,43.625511,43.261338,40.203343,44.447365,44.554026,44.692343,44.70169,43.106704,44.630976,43.380648,43.407413


In [None]:
def test_that_protein_intensities_are_retained():
    peptide1= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0.1, systematic_peptide_shift=3000, add_noise=True)
    peptide2= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0, systematic_peptide_shift=3, add_noise=True)
    peptide3= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0, systematic_peptide_shift=0.1, add_noise=True)
    peptide4= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)
    
    peptide_profiles = [peptide1, peptide2, peptide3, peptide4]
    summed_intensity_protein = sum([np.nansum(x.peptide_profile_vector) for x in peptide_profiles])
    
    protein_df = ProteinProfileGenerator([peptide1, peptide2, peptide3, peptide4]).protein_profile_dataframe
    
    protein_df_normed, _ = estimate_protein_intensities(protein_df)
    display(protein_df_normed)
    summed_lfq_intensities = np.sum(protein_df_normed.iloc[0].to_numpy())
    assert np.allclose(summed_lfq_intensities, summed_intensity_protein)

test_that_protein_intensities_are_retained()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protA,24717870000000.0,22607950000000.0,2855226000000.0,9453352000000.0,11553210000000.0,5144123000000.0,11148150000000.0,8737009000000.0,3789607000000.0,12553570000000.0,9753070000000.0,1171102000000.0,22190650000000.0,23893500000000.0,26297640000000.0,26468550000000.0,8761665000000.0,25202460000000.0,10593960000000.0,10792210000000.0


In [None]:
def run_with_multiple_proteins():
    peptide1= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0.1, systematic_peptide_shift=3000, add_noise=True)
    peptide2= PeptideProfile(protein_name="protA",fraction_zeros_in_profile=0, systematic_peptide_shift=3, add_noise=True)
    peptide3= PeptideProfile(protein_name="protA", fraction_zeros_in_profile=0, systematic_peptide_shift=0.1, add_noise=True)
    peptide4= PeptideProfile(protein_name="protB",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)
    peptide5= PeptideProfile(protein_name="protC",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)
    peptide6= PeptideProfile(protein_name="protD",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)
    peptide7= PeptideProfile(protein_name="protD",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)
    peptide8= PeptideProfile(protein_name="protD",fraction_zeros_in_profile=0, systematic_peptide_shift=100, add_noise=True)

    peptide_profiles = [peptide1, peptide2, peptide3, peptide4, peptide5, peptide6, peptide7, peptide8]
    protein_df = ProteinProfileGenerator(peptide_profiles).protein_profile_dataframe
    protein_df_normed, _ = estimate_protein_intensities(protein_df)
    display(protein_df_normed)
    
run_with_multiple_proteins()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protA,23481330000000.0,21477070000000.0,2712404000000.0,8980489000000.0,10975350000000.0,4886806000000.0,10590460000000.0,8299937000000.0,3600148000000.0,11925650000000.0,9265232000000.0,1112573000000.0,21080710000000.0,22698420000000.0,24982340000000.0,25144610000000.0,8323479000000.0,23941870000000.0,10064030000000.0,10252350000000.0
protB,890653500000.0,814627000000.0,102881500000.0,340630300000.0,416296500000.0,185357500000.0,401697500000.0,314818300000.0,136549900000.0,452339000000.0,351429700000.0,42198280000.0,799591300000.0,860948200000.0,947577600000.0,953736100000.0,315710500000.0,908115400000.0,381730400000.0,388877400000.0
protC,890654700000.0,814627400000.0,102881500000.0,340630700000.0,416295200000.0,185356900000.0,401698700000.0,314818400000.0,136550100000.0,452340800000.0,351430400000.0,42197920000.0,799590700000.0,860950000000.0,947577100000.0,953737100000.0,315710200000.0,908115600000.0,381729500000.0,388876900000.0
protD,2671964000000.0,2443883000000.0,308644400000.0,1021891000000.0,1248885000000.0,556071300000.0,1205095000000.0,944455100000.0,409650100000.0,1357021000000.0,1054293000000.0,126593500000.0,2398778000000.0,2582847000000.0,2842728000000.0,2861211000000.0,947134400000.0,2724348000000.0,1145187000000.0,1166630000000.0


## Learning tests

In [None]:
import pandas as pd
import numpy as np

def test_that_dataframe_is_generated_as_expected():
    vals1 = np.array([1, 2, 3,4 ])
    vals2 = np.array([5, 6, 7, 8])
    vals3 = np.array([9, 10, 11, 12])
    df = pd.DataFrame([vals1, vals2, vals3],index=['A', 'A', 'A'])
    display(df)
    assert df.iloc[2, 2] == 11
    assert df.iloc[1, 2] == 7



test_that_dataframe_is_generated_as_expected()

Unnamed: 0,0,1,2,3
A,1,2,3,4
A,5,6,7,8
A,9,10,11,12


In [None]:
def test_retrieval_of_numpy_arrays_from_dataframe():
    vals1 = np.array([1, 2, 3,4 ])
    vals2 = np.array([5, 6, 7, 8])
    vals3 = np.array([9, 10, 11, 12])
    df = pd.DataFrame([vals1, vals2, vals3],index=[['A', 'B', 'C'], ['a', 'b', 'a']])
    display(df)
    assert np.allclose(vals2, df.loc['B'])
    assert np.allclose([2, 6, 10], df.loc[:,1])

test_retrieval_of_numpy_arrays_from_dataframe()

Unnamed: 0,Unnamed: 1,0,1,2,3
A,a,1,2,3,4
B,b,5,6,7,8
C,a,9,10,11,12


In [None]:
def test_setting_numpy_seed():
    from numpy.random import MT19937
    from numpy.random import RandomState, SeedSequence

    rs = RandomState(MT19937(SeedSequence(42)))
    res = rs.randint(10,size=20)
    display(res)

test_setting_numpy_seed()

array([2, 6, 8, 8, 3, 3, 3, 3, 4, 7, 2, 7, 5, 4, 0, 8, 1, 3, 7, 1])