In [None]:
#| default_exp protein_intensity_estimation

In [None]:
%reload_ext autoreload

%autoreload 2

# Protein Intensity Estimation

This notebook tests the protein LFQ intensity estimation step. It is, in principle, a wrapper around the functionality of the normalization.py module, which is used to shift precursor or transition intensity traces on top of each other.

# Unit Tests

#### Classes for testcase generation

In [None]:
import pandas as pd
import numpy as np
import directlfq.protein_intensity_estimation as lfq_protint
#test df cutting

def test_sorting_by_num_nans():
    """Check the row order given by ``ProtvalCutter._sorted_idx``.

    Three ions of protein 'P' are created with three, two and one missing
    values respectively. After reindexing the frame with the cutter's
    ``_sorted_idx``, the test expects the ion with a single NaN in the first
    row and the ion with three NaNs in the last row.
    """
    three_nans = np.array([9, np.nan, np.nan, np.nan])
    two_nans = np.array([5, 6, np.nan, np.nan])
    one_nan = np.array([1, 2, 3, np.nan])

    # Two index levels: protein label (always 'P') and ion label.
    ion_index = [['P', 'P', 'P'], ['A', 'B', 'C']]
    ion_df = pd.DataFrame([three_nans, two_nans, one_nan], index=ion_index)

    cutter = lfq_protint.ProtvalCutter(ion_df, maximum_df_length=2)
    reordered = ion_df.loc[cutter._sorted_idx]

    last_row = reordered.iloc[2].to_numpy()
    assert np.allclose(last_row, three_nans, equal_nan=True)
    first_row = reordered.iloc[0].to_numpy()
    assert np.allclose(first_row, one_nan, equal_nan=True)
    

def test_cutting_of_df():
    """Check which ions remain after ``ProtvalCutter`` truncates the frame.

    With ``maximum_df_length=2`` and three ions carrying three ('A'), two ('B')
    and one ('C') NaN values, the frame returned by ``get_dataframe()`` is
    expected to list the ions 'C' and 'B', in that order.
    """
    three_nans = np.array([9, np.nan, np.nan, np.nan])
    two_nans = np.array([5, 6, np.nan, np.nan])
    one_nan = np.array([1, 2, 3, np.nan])

    ion_df = pd.DataFrame([three_nans, two_nans, one_nan], index=[['A', 'B', 'C']])
    cutter = lfq_protint.ProtvalCutter(ion_df, maximum_df_length=2)

    # Index entries are indexable; position 0 holds the ion label.
    kept_ions = []
    for index_entry in cutter.get_dataframe().index:
        kept_ions.append(index_entry[0])
    print(kept_ions)
    assert kept_ions == ['C', 'B']


# Execute the two ProtvalCutter tests defined above; a failing assert raises in this cell.
test_sorting_by_num_nans()
test_cutting_of_df()


['C', 'B']


In [None]:
import directlfq.normalization as lfq_norm
import directlfq.test_utils as lfq_testutils

def test_that_profiles_without_noise_are_shifted_exactly_on_top_of_each_other():
    """Noise-free ion profiles of one protein must coincide after normalization.

    Four ``PeptideProfile`` objects for "protA" are generated without noise but
    with different systematic shifts and fractions of zeros. After
    ``lfq_norm.normalize_ion_profiles`` all non-NaN values in the column at
    position 11 are asserted to be equal (within ``np.allclose`` defaults).
    """
    # (fraction_zeros_in_profile, systematic_peptide_shift) for each simulated peptide
    peptide_settings = [(0.1, 3000), (0.9, 3), (0.1, 0.1), (0.9, 100)]
    peptides = [
        lfq_testutils.PeptideProfile(
            protein_name="protA",
            fraction_zeros_in_profile=zero_fraction,
            systematic_peptide_shift=shift,
            add_noise=False,
        )
        for zero_fraction, shift in peptide_settings
    ]

    protein_df = lfq_testutils.ProteinProfileGenerator(peptides).protein_profile_dataframe
    display(protein_df)

    normed_ion_profile = lfq_norm.normalize_ion_profiles(protein_df)
    display(normed_ion_profile)

    # Compare every remaining value in the probed column against the first one.
    column_from_shifted = normed_ion_profile.iloc[:, 11].dropna().to_numpy()
    display(column_from_shifted)
    reference_value = column_from_shifted[0]
    assert np.allclose(column_from_shifted, reference_value)

def test_that_profiles_with_noise_are_close():
    """Noisy ion profiles of one protein must nearly coincide after normalization.

    Four fully populated (``fraction_zeros_in_profile=0``) ``PeptideProfile``
    objects for "protA" are generated with noise and different systematic
    shifts. After ``lfq_norm.normalize_ion_profiles`` the non-NaN values in the
    column at position 9 are asserted to agree within ``rtol=0.01`` and
    ``atol=0.01``.
    """
    systematic_shifts = [3000, 3, 0.1, 100]
    peptides = [
        lfq_testutils.PeptideProfile(
            protein_name="protA",
            fraction_zeros_in_profile=0,
            systematic_peptide_shift=shift,
            add_noise=True,
        )
        for shift in systematic_shifts
    ]

    protein_df = lfq_testutils.ProteinProfileGenerator(peptides).protein_profile_dataframe
    display(protein_df)

    normed_ion_profile = lfq_norm.normalize_ion_profiles(protein_df)
    display(normed_ion_profile)

    # Compare every remaining value in the probed column against the first one.
    column_from_shifted = normed_ion_profile.iloc[:, 9].dropna().to_numpy()
    reference_value = column_from_shifted[0]
    assert np.allclose(column_from_shifted, reference_value, rtol=0.01, atol=0.01)

# Execute both normalization tests defined above; each one also displays its input and normalized frames.
test_that_profiles_without_noise_are_shifted_exactly_on_top_of_each_other()
test_that_profiles_with_noise_are_close()


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,,44.474241,41.489085,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,44.630975,43.38065,
protA,1,,,,,,,,,,,,,34.481579,,,,,34.665191,,
protA,2,29.73029,29.601566,26.61641,28.343632,28.633033,,28.581537,28.229946,27.024859,28.752833,28.388665,25.330674,29.574689,,29.819668,29.829015,28.234029,29.758301,28.507975,28.534739
protA,3,,,,,38.598817,,,,,,,,39.540473,,,,,,,


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,,44.474241,41.489085,43.216307,43.505708,42.338404,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,44.554027,44.692343,44.70169,43.106703,44.630975,43.38065,
protA,1,,,,,,,,,,,,,44.447364,,,,,44.630975,,
protA,2,44.602965,44.474241,41.489085,43.216307,43.505708,,43.454211,43.102621,41.897534,43.625508,43.26134,40.203349,44.447364,,44.692343,44.70169,43.106703,44.630975,43.38065,43.407413
protA,3,,,,,43.505708,,,,,,,,44.447364,,,,,,,


array([40.20334853, 40.20334853])

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,44.602965,44.47424,41.489085,43.216306,43.505708,42.338405,43.454211,43.102621,41.897533,43.625508,43.26134,40.203346,44.447364,44.554027,44.692343,44.70169,43.106703,44.630976,43.38065,43.407413
protA,1,34.637191,34.508449,31.523327,33.250533,33.539906,32.372622,33.48842,33.136837,31.93174,33.659734,33.295572,30.237537,34.481578,34.588245,34.726571,34.735917,33.140931,34.665203,33.414868,33.441634
protA,2,29.730299,29.601596,26.616364,28.343634,28.633164,27.465805,28.581529,28.229941,27.024871,28.752936,28.388685,25.330857,29.574649,29.681455,29.819767,29.828994,28.234112,29.75829,28.508008,28.534752
protA,3,39.696074,39.56735,36.582194,38.30942,38.598821,37.431511,38.547322,38.195732,36.990644,38.718614,38.354449,35.296454,39.540472,39.647136,39.78545,39.794799,38.19981,39.724082,38.473762,38.500524


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
protein,ion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
protA,0,44.602965,44.47424,41.489085,43.216306,43.505708,42.338405,43.454211,43.102621,41.897533,43.625508,43.26134,40.203346,44.447364,44.554027,44.692343,44.70169,43.106703,44.630976,43.38065,43.407413
protA,1,44.602971,44.47423,41.489107,43.216313,43.505686,42.338402,43.4542,43.102617,41.897521,43.625514,43.261352,40.203317,44.447358,44.554025,44.692351,44.701697,43.106711,44.630983,43.380648,43.407414
protA,2,44.602957,44.474255,41.489023,43.216293,43.505823,42.338464,43.454188,43.1026,41.897529,43.625595,43.261344,40.203515,44.447308,44.554114,44.692426,44.701652,43.106771,44.630949,43.380667,43.407411
protA,3,44.602965,44.47424,41.489085,43.21631,43.505712,42.338402,43.454213,43.102623,41.897535,43.625505,43.26134,40.203344,44.447363,44.554027,44.692341,44.70169,43.106701,44.630973,43.380653,43.407414


In [None]:
import directlfq.protein_intensity_estimation as intensity_estimation
import directlfq.test_utils as lfq_testutils

def test_that_protein_intensities_are_retained():
    """The estimated protein intensities must sum to the total input intensity.

    Four noisy ``PeptideProfile`` objects for "protA" are generated. The
    NaN-ignoring sum over all their ``peptide_profile_vector`` values is
    compared (``np.allclose``) with the sum over row 0 of the frame returned by
    ``estimate_protein_intensities``, skipping that row's first column.
    """
    # (fraction_zeros_in_profile, systematic_peptide_shift) for each simulated peptide
    peptide_settings = [(0.1, 3000), (0, 3), (0, 0.1), (0, 100)]
    peptide_profiles = [
        lfq_testutils.PeptideProfile(
            protein_name="protA",
            fraction_zeros_in_profile=zero_fraction,
            systematic_peptide_shift=shift,
            add_noise=True,
        )
        for zero_fraction, shift in peptide_settings
    ]

    # Total input intensity, accumulated peptide by peptide.
    summed_intensity_protein = 0
    for profile in peptide_profiles:
        summed_intensity_protein = summed_intensity_protein + np.nansum(profile.peptide_profile_vector)

    protein_df = lfq_testutils.ProteinProfileGenerator(peptide_profiles).protein_profile_dataframe
    protein_df_normed, _unused = intensity_estimation.estimate_protein_intensities(
        protein_df, min_nonan=1, num_samples_quadratic=100, num_cores=1
    )
    display(protein_df_normed)

    # Row 0 without its first column (the displayed output shows a 'protein' column there).
    lfq_intensities = protein_df_normed.iloc[0, 1:].to_numpy()
    display(lfq_intensities)
    summed_lfq_intensities = np.sum(lfq_intensities)
    assert np.allclose(summed_lfq_intensities, summed_intensity_protein)

# Execute the intensity-retention test defined above; a failing assert raises in this cell.
test_that_protein_intensities_are_retained()



2023-12-08 12:22:20,257 - directlfq.protein_intensity_estimation - INFO - 1 prots total
2023-12-08 12:22:20,258 - directlfq.protein_intensity_estimation - INFO - prot 0


Unnamed: 0,protein,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,protA,25349800000000.0,23185890000000.0,2928181000000.0,9695020000000.0,11848610000000.0,5275621000000.0,11433130000000.0,8960380000000.0,3886496000000.0,...,10002410000000.0,1201040000000.0,22757990000000.0,24504320000000.0,26969940000000.0,27145260000000.0,8985770000000.0,25846820000000.0,10864790000000.0,11068230000000.0


array([25349795642833.24, 23185890220888.97, 2928180586448.17,
       9695020357645.807, 11848609977072.47, 5275621107950.519,
       11433132923005.893, 8960379724734.42, 3886495984197.8735,
       12874540472742.57, 10002405236736.053, 1201039877802.1846,
       22757993614330.746, 24504324707272.664, 26969944908322.53,
       27145258441058.31, 8985769914714.105, 25846821137849.547,
       10864793434866.52, 11068226808633.043], dtype=object)

In [None]:
import directlfq.protein_intensity_estimation as intensity_estimation
import directlfq.test_utils as lfq_testutils

def run_with_multiple_proteins():
    """Run the protein intensity estimation on a frame holding four proteins.

    Eight noisy ``PeptideProfile`` objects are spread over protA (3), protB (1),
    protC (1) and protD (3). The frame returned by
    ``estimate_protein_intensities`` is displayed and asserted to contain
    exactly four rows.
    """
    # (protein_name, fraction_zeros_in_profile, systematic_peptide_shift)
    peptide_settings = [
        ("protA", 0.1, 3000),
        ("protA", 0, 3),
        ("protA", 0, 0.1),
        ("protB", 0, 100),
        ("protC", 0, 100),
        ("protD", 0, 100),
        ("protD", 0, 100),
        ("protD", 0, 100),
    ]
    peptide_profiles = [
        lfq_testutils.PeptideProfile(
            protein_name=protein,
            fraction_zeros_in_profile=zero_fraction,
            systematic_peptide_shift=shift,
            add_noise=True,
        )
        for protein, zero_fraction, shift in peptide_settings
    ]

    protein_df = lfq_testutils.ProteinProfileGenerator(peptide_profiles).protein_profile_dataframe
    protein_df_normed, _unused = intensity_estimation.estimate_protein_intensities(
        protein_df, min_nonan=1, num_samples_quadratic=100, num_cores=1
    )
    display(protein_df_normed)

    num_rows = len(protein_df_normed.index)
    assert num_rows == 4

# Execute the multi-protein run defined above; it asserts that the result has four rows.
run_with_multiple_proteins()



2023-12-08 12:26:12,022 - directlfq.protein_intensity_estimation - INFO - 4 prots total
2023-12-08 12:26:12,024 - directlfq.protein_intensity_estimation - INFO - prot 0


Unnamed: 0,protein,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,protA,25313840000000.0,23152860000000.0,2924049000000.0,9681216000000.0,11831730000000.0,5268128000000.0,11416860000000.0,8947613000000.0,3880908000000.0,...,9988645000000.0,1199318000000.0,22725590000000.0,24469560000000.0,26931630000000.0,27106750000000.0,8972944000000.0,25809990000000.0,10849310000000.0,11052530000000.0
1,protB,890653200000.0,814627100000.0,102881400000.0,340630800000.0,416296200000.0,185356300000.0,401697300000.0,314819000000.0,136550800000.0,...,351431100000.0,42197810000.0,799591800000.0,860947200000.0,947576800000.0,953736300000.0,315710500000.0,908116900000.0,381728200000.0,388877100000.0
2,protC,890654400000.0,814628500000.0,102881900000.0,340629600000.0,416295500000.0,185356000000.0,401697900000.0,314819500000.0,136550600000.0,...,351430700000.0,42197770000.0,799593600000.0,860948600000.0,947575900000.0,953734400000.0,315711200000.0,908115300000.0,381729700000.0,388876000000.0
3,protD,2671963000000.0,2443884000000.0,308645100000.0,1021892000000.0,1248886000000.0,556071100000.0,1205096000000.0,944458600000.0,409651700000.0,...,1054292000000.0,126594300000.0,2398774000000.0,2582846000000.0,2842730000000.0,2861209000000.0,947132000000.0,2724343000000.0,1145188000000.0,1166632000000.0


## Learning tests

In [None]:
import pandas as pd
import numpy as np

def test_that_dataframe_is_generated_as_expected():
    vals1 = np.array([1, 2, 3,4 ])
    vals2 = np.array([5, 6, 7, 8])
    vals3 = np.array([9, 10, 11, 12])
    df = pd.DataFrame([vals1, vals2, vals3],index=['A', 'A', 'A'])
    display(df)
    assert df.iloc[2, 2] == 11
    assert df.iloc[1, 2] == 7




In [None]:

# Execute the learning test defined in the preceding cell; it displays the frame it builds.
test_that_dataframe_is_generated_as_expected()

Unnamed: 0,0,1,2,3
A,1,2,3,4
A,5,6,7,8
A,9,10,11,12


In [None]:
def test_retrieval_of_numpy_arrays_from_dataframe():
    vals1 = np.array([1, 2, 3,4 ])
    vals2 = np.array([5, 6, 7, 8])
    vals3 = np.array([9, 10, 11, 12])
    df = pd.DataFrame([vals1, vals2, vals3],index=[['A', 'B', 'C'], ['a', 'b', 'a']])
    display(df)
    assert np.allclose(vals2, df.loc['B'])
    assert np.allclose([2, 6, 10], df.loc[:,1])


In [None]:

# Execute the learning test defined in the preceding cell; it displays the frame it builds.
test_retrieval_of_numpy_arrays_from_dataframe()

Unnamed: 0,Unnamed: 1,0,1,2,3
A,a,1,2,3,4
B,b,5,6,7,8
C,a,9,10,11,12


In [None]:
def test_setting_numpy_seed():
    from numpy.random import MT19937
    from numpy.random import RandomState, SeedSequence

    rs = RandomState(MT19937(SeedSequence(42)))
    res = rs.randint(10,size=20)
    display(res)


In [None]:

# Execute the seeding learning test defined in the preceding cell; it displays the drawn integers.
test_setting_numpy_seed()

array([2, 6, 8, 8, 3, 3, 3, 3, 4, 7, 2, 7, 5, 4, 0, 8, 1, 3, 7, 1])