In [None]:
# default_exp trace_shifting

In [None]:
#export
import pandas as pd
import numpy as np
import directlfq.normalization as lfqnorm

def estimate_protein_intensities(normed_df):
    """Derive protein pseudointensities from between-sample normalized data.

    Parameters
    ----------
    normed_df : pandas.DataFrame
        One row per ion, one column per sample. The first index level is used
        as the protein identifier. Values are treated as log2 intensities
        (they are exponentiated with ``2**`` before summing and on output).

    Returns
    -------
    protein_df : pandas.DataFrame
        One row per protein (order of first appearance in ``normed_df``), one
        column per sample, on linear scale; missing values are replaced by 0.
    ion_df : pandas.DataFrame
        The per-ion values after the within-protein step, on linear scale,
        missing values replaced by 0. Rows are grouped protein by protein; if
        the rows of ``normed_df`` are already grouped by protein, the row order
        is the same as in ``normed_df``.
    """
    protein_labels = normed_df.index.get_level_values(0)
    allprots = protein_labels.unique()

    # The loop below collects ion rows protein by protein. When the rows of a
    # protein are not contiguous in normed_df, that order differs from
    # normed_df.index, and labelling the collected rows with normed_df.index
    # would attach ions to the wrong rows. Compute the grouped order once
    # (stable sort keeps the within-protein order) and label with it instead.
    grouped_row_order = np.argsort(allprots.get_indexer(protein_labels), kind="stable")
    ion_index = normed_df.index[grouped_row_order]

    prot_ints = []
    ion_ints = []

    for count_prots, protein in enumerate(allprots):
        if count_prots % 100 == 0:
            print(f"prot {count_prots} of {len(allprots)}")

        # DataFrame wrapper so that a Series result is still turned into a 2-D array
        protvals = pd.DataFrame(normed_df.loc[protein]).to_numpy().copy()

        # total linear-scale intensity of this protein; the profile is rescaled to it below
        summed_pepint = np.nansum(2**protvals)

        # NOTE(review): shape[1] is the number of columns of the per-protein
        # block; confirm this is the axis the guard is meant to check.
        if protvals.shape[1] < 2:
            normed_protvals = protvals
        else:
            normed_protvals = lfqnorm.normalize_withincond(protvals)

        ion_ints.extend(normed_protvals)
        scaled_vec = get_protein_profile_from_shifted_peptides(normed_protvals.T, summed_pepint)
        prot_ints.append(scaled_vec)

    # back to linear scale; NaN (no estimate) is reported as 0
    protein_df = 2**pd.DataFrame(prot_ints, index=allprots, columns=normed_df.columns)
    protein_df = protein_df.replace(np.nan, 0)
    ion_df = 2**pd.DataFrame(ion_ints, index=ion_index, columns=normed_df.columns)
    ion_df = ion_df.replace(np.nan, 0)
    return protein_df, ion_df

In [None]:
#export
def get_protein_profile_from_shifted_peptides(sample2reps, summed_pepints):
    """Collapse per-sample replicate values into one rescaled profile.

    Parameters
    ----------
    sample2reps : indexable of array-likes
        ``sample2reps[i]`` holds the (log2) values available for sample ``i``;
        NaN marks a missing value.
    summed_pepints : float
        Linear-scale total the returned profile is rescaled to.

    Returns
    -------
    numpy.ndarray
        One value per sample: the NaN-ignoring median of that sample's values,
        shifted by a common offset so that ``np.nansum(2**result)`` equals
        ``summed_pepints``. Samples without any non-NaN value stay NaN.
    """
    def _median_or_nan(values):
        values = np.array(values)
        # only take the median when at least one value is present
        if (~np.isnan(values)).any():
            return np.nanmedian(values)
        return np.nan

    log2_profile = np.array(
        [_median_or_nan(sample2reps[idx]) for idx in range(len(sample2reps))]
    )

    # common log2 offset that brings the linear-scale sum of the profile to summed_pepints
    linear_total = np.nansum(2**log2_profile)
    log2_offset = np.log2(summed_pepints/linear_total)
    return log2_profile + log2_offset


In [None]:
import directlfq.diffquant_utils as aqutils
import directlfq.normalization as aqnorm


# Demo: import a MaxQuant peptide table and run the within-condition normalization on it.
# NOTE(review): absolute, machine-specific path - point it at a local copy of the test data.
PEPTIDES_FILE = "/Users/constantin/workspace/directlfq/test_data/system_tests/sensitivity_tests/MaxQuant/peptides.txt"
# Annotation columns of the imported table; every other column holds one sample's intensities.
ANNOTATION_COLUMNS = ["index", "protein", "ion"]

pepdata = aqutils.import_data(PEPTIDES_FILE)

display(pepdata)
display(pepdata.dtypes)

# Passing all columns made this cell fail with
# "TypeError: unsupported operand type(s) for -: 'int' and 'str'", presumably
# because the string-typed "protein"/"ion" columns entered the arithmetic.
# Hand over the sample columns only (original column order is kept).
samples = pepdata.columns[~pepdata.columns.isin(ANNOTATION_COLUMNS)]
aqnorm.normalize_within_cond(pepdata, samples)


using input type maxquant_peptides


Unnamed: 0,index,protein,ion,1,4,2,7,11,6,8,5,9,3,10
0,0,P55011,SEQ_AAAAAAAAAAAAAAAGAGAGAK_MOD_1595.838_,3973200,2747800,0,3774500,1110200,0,4030100,1273900,2515200,5745000,8049600
1,1,O60341,SEQ_AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPA...,0,1731100,2402300,0,0,0,1334800,0,0,0,1831800
2,2,Q86U42,SEQ_AAAAAAAAAAGAAGGR_MOD_1197.6214_,0,26826000,45828000,31969000,0,32364000,35691000,25037000,24136000,58159000,41647000
3,3,Q9Y4H2,SEQ_AAAAAAAAVPSAGPAGPAPTSAAGR_MOD_2031.0498_,0,0,3872100,0,0,0,0,0,0,4038700,4099500
4,4,O75822,SEQ_AAAAAAAGDSDSWDADAFSVEDPVRK_MOD_2592.1728_,1749100,1599500,1834100,3328700,3060600,2834400,3847000,2746000,1982000,2309300,5061000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58234,58234,Q15648,SEQ_YYVSPSDLLDDK_MOD_1413.6664_,0,1339900,2086300,1754700,2409100,1988700,1962100,0,1313500,2423900,2992400
58235,58235,P68104,SEQ_YYVTIIDAPGHR_MOD_1403.7197_,525160000,683960000,475580000,724540000,748780000,953850000,445740000,1115200000,603590000,564970000,747350000
58236,58236,P07942,SEQ_YYYAVYDMVVR_MOD_1440.6748_,887370,2532000,3790700,3673500,3895400,3129000,4508300,2835400,4224500,4201300,6655000
58237,58237,Q8N183,SEQ_YYYIPQYK_MOD_1136.5542_,1591300,2829700,2604400,4362900,3046700,3632200,3881100,3202500,2613000,2056900,3520800


index       int64
protein    object
ion        object
1           int64
4           int64
2           int64
7           int64
11          int64
6           int64
8           int64
5           int64
9           int64
3           int64
10          int64
dtype: object

TypeError: unsupported operand type(s) for -: 'int' and 'str'

In [None]:

def test_protein_profile():
    """Check that the protein profile keeps the total intensity of its precursors.

    A toy table with three precursors of protein 'X', each measured in samples
    'A' and 'B', is pushed through ``run_protein_normalization`` and
    ``protein_profile``; the summed profile must match the summed pre-LFQ
    intensities. The check is run on two identically built tables.

    NOTE(review): neither ``run_protein_normalization`` nor ``protein_profile``
    is defined in the visible part of this notebook, and the recorded output of
    this cell ends with ``NameError: name 'protein_profile' is not defined``.
    The function has to be defined or imported before this test can pass.
    """
    def _toy_precursor_table():
        # column order matters for the displayed table: precursor, shortname, protein, int_sum
        return pd.DataFrame({
            'precursor': ['Prec_1', 'Prec_1', 'Prec_2', 'Prec_2', 'Prec_3', 'Prec_3'],
            'shortname': ['A', 'B', 'A', 'B', 'A', 'B'],
            'protein': ['X', 'X', 'X', 'X', 'X', 'X'],
            'int_sum': [0.6, 0.8, 0.6, 1.2, 1.6, 1.2],
        })

    def _profile_of(table):
        # fresh argument list on every call, exactly as in a literal call
        return protein_profile(table, ['A','B'], 'int_sum', 'X')

    # --- first table: show inputs, intermediate result and the profile ---
    first_table = _toy_precursor_table()
    display(first_table)

    normalization_result = run_protein_normalization(first_table)
    protein_df = normalization_result[0]
    display(protein_df)

    profile, pre_lfq, file_ids, protein = _profile_of(first_table)
    print(f"profile {profile}")
    print(f"pre LFQ {pre_lfq}")
    print(f"file ids {file_ids}")
    print(f"protein {protein}")

    # total intensity should be preserved
    assert np.allclose(profile.sum(), pre_lfq.sum())

    # --- second, identically built table: silent re-check ---
    second_table = _toy_precursor_table()

    # the result of this first call is discarded; only the second call is checked
    _profile_of(second_table)

    profile, pre_lfq, file_ids, protein = _profile_of(second_table)

    assert np.allclose(profile.sum(), pre_lfq.sum())

test_protein_profile()

Unnamed: 0,precursor,shortname,protein,int_sum
0,Prec_1,A,X,0.6
1,Prec_1,B,X,0.8
2,Prec_2,A,X,0.6
3,Prec_2,B,X,1.2
4,Prec_3,A,X,1.6
5,Prec_3,B,X,1.2


prot 0 of 1


shortname,A_A_LFQ,B_A_LFQ
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
X,2.6,2.6


NameError: name 'protein_profile' is not defined