In [1]:
#---#| default_exp psm_frag_reader.maxquant_frag_reader

# MaxQuant Fragment Reader

Read PSMs and their fragments from MaxQuant msms.txt.

> Legacy: we observed that training an MS2 model on intensity values from msms.txt is not a good idea, because these intensities are not the same as those in the RAW files. Extract intensities directly from the RAW data whenever possible.

In [2]:
from peptdeep.psm_frag_reader.maxquant_frag_reader import *

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
#|hide
import io
import pandas as pd
import numpy as np

In [4]:
#| hide
mq_str = '''Raw file	Scan number	Scan index	Sequence	Length	Missed cleavages	Modifications	Modified sequence	Phospho (STY) Probabilities	Phospho (STY) Score Diffs	Acetyl (Protein_N-term)	Phospho (STY)	Proteins	Gene Names	Protein Names	Charge	Fragmentation	Mass analyzer	Type	Scan event number	Isotope index	m/z	Mass	Mass Error [ppm]	Simple Mass Error [ppm]	Retention time	PEP	Score	Delta score	Score diff	Localization prob	Combinatorics	PIF	Fraction of total spectrum	Base peak fraction	Precursor Full ScanNumber	Precursor Intensity	Precursor Apex Fraction	Precursor Apex Offset	Precursor Apex Offset Time	Diagnostic peak Phospho (STY) Y	Matches	Intensities	Mass Deviations [Da]	Mass Deviations [ppm]	Masses	Number of Matches	Intensity coverage	Peak coverage	Neutral loss level	ETD identification type	Reverse	All scores	All sequences	All modified sequences	id	Protein group IDs	Peptide ID	Mod. peptide ID	Evidence ID	Phospho (STY) site IDs
200123_SAX_SPAC3_1	17556	14082	AAAADILPVLLK	12	0	Unmodified	_AAAADILPVLLK_			0	0	O13864	kap95	Importin subunit beta-1	2	HCD	FTMS	MULTI-MSMS	7	0	597.87646	1193.7384	0.17929	0.9109951	69.506	0.00016307	99.539	92.608	NaN	NaN	1	0	0	0	-1	0	0	0	0		y1;y2;y3;y5;y6;y7;y8;y9;y10;y11;y7(2+);y8(2+);y9(2+);y10(2+);b2;b3;b4;b5;b5-H2O;b6;b6-H2O;b7;b10	1691;1737.9;2746.5;24815.1;26944.9;16823.3;45272.1;30384.9;18895.4;1119.1;604.5;957.2;966.6;2035.4;15718.7;24674.9;6395.5;7923;1537.9;4618.8;887.6;2099.3;860.1	-0.0001719036;0.0005485709;-0.0004704565;-5.074493E-05;-3.217469E-05;-0.0006849912;-0.0001701818;-0.0001047339;0.0009983118;0.004176553;0.0008543315;0.001996746;0.001480154;-0.0003792124;5.262618E-05;8.755656E-05;0.0001835221;0.0004847084;0.002249124;0.000259138;-0.001699591;0.001376341;1.408576E-05	-1.168514;2.108296;-1.260327;-0.08911967;-0.04714336;-0.8610058;-0.1868903;-0.1066933;0.9483612;3.716772;2.14501;4.380748;3.01261;-0.719787;0.3678058;0.4089163;0.6435859;1.211219;5.885142;0.5048801;-3.43173;2.197401;0.01505603	147.112976074219;260.196319580078;373.281402587891;569.402160644531;682.486206054688;795.570922851563;910.597351074219;981.634399414063;1052.67041015625;1123.70434570313;398.287902832031;455.800231933594;491.319305419922;526.839721679688;143.081451416016;214.118530273438;285.155548095703;400.182189941406;382.169860839844;513.266479492188;495.257873535156;626.349426269531;935.556030273438	23	0.7687168	0.3026316	None	Unknown		99.53857;6.931045;5.725131	AAAADILPVLLK;TLWHRLKLK;HIRTLSARIK	_AAAADILPVLLK_;_TLWHRLKLK_;_HIRTLSARIK_	0	204	0	0	0	
200123_SAX_SPAC3_1	10089	7082	AAARPTVSIYNK	12	1	Acetyl (Protein_N-term)	_(ac)AAARPTVSIYNK_			1	0	Q9P784	rpl4b	60S ribosomal protein L4-B	2	HCD	FTMS	MULTI-MSMS	1	0	666.86715	1331.7197	-0.51651	2.4762469	48.013	0.00068485	73.082	56.42	NaN	NaN	1	0	0	0	-1	0	0	0	0		y1;y2;y3;y4;y5;y8;y1-NH3;y2-NH3;y3-NH3;b2;b4;b7;b8;b8-H2O;b9;b9-H2O;b10;b11	164590.5;494033.8;411033.4;49476.1;87580.2;144666.2;23841.3;130096.5;83541.1;32851.9;97649.7;149771.2;286369.7;168714.4;1080879;100917.2;927992.6;748799.6	0.0001790485;0.0002110847;-0.0005136858;-0.001807504;-0.006095012;0.0005297848;-0.0001859709;0.0003373015;0.0005249706;0.0005159942;-0.001426426;-0.001423944;0.003513171;0.0002991892;-0.0009556375;-0.007416335;0.001034187;0.00577125	1.217085;0.8082718;-1.210896;-3.364019;-9.762309;0.5749135;-1.429595;1.381653;1.289246;2.787778;-3.460254;-2.007249;4.411161;0.3843544;-1.05071;-8.318828;0.9642072;4.863622	147.11262512207;261.155520533145;424.219573841893;537.304931640625;624.341247558594;921.503479003906;130.086441040039;244.128845214844;407.191986083984;185.091552734375;412.231719970703;709.400573730469;796.42766502565;778.420314321062;909.51619781447;891.512093825909;1072.57753652828;1186.61572691238	18	0.1805881	0.2168675	None	Unknown		73.082;16.6623;7.865682	AAARPTVSIYNK;QGLLGTPERYAK;NRSLFTLQPEK	_(ac)AAARPTVSIYNK_;_QGLLGTPERYAK_;_NRSLFTLQPEK_	107	2604	15	16	131	
200115_SPAC1_3	3236	1204	AAAGPSNSSSGTSTPR	16	0	Phospho (STY)	_AAAGPSNSSSGTST(ph)PR_	AAAGPSNS(0.006)S(0.018)S(0.012)GT(0.037)S(0.166)T(0.761)PR	AAAGPS(-50.91)NS(-21.04)S(-16.23)S(-17.92)GT(-13.13)S(-6.61)T(6.61)PR	0	1	O74883	rpc37	DNA-directed RNA polymerase III subunit rpc5	2	HCD	FTMS	MULTI-MSMS	5	0	764.32539	1526.6362	0.040372	2.5009131	27.058	3.09E-07	85.563	85.563	6.6132	0.7607	7	0	0	0	-1	0	0	0	0		y1;y2;y3;y4;y5;y6;y7;y8;y9;y10;y11;y12;y13;y14;y15(2+);b2;b3;b4;b5;b6;b6-H2O;b7;b7-H2O;b7-NH3;b8	311.7;19064.6;1579.2;2432.6;1493.8;3766.8;4935.3;8933.7;10552.5;5357.6;2967.5;36079.4;27868.7;24033;2864.2;2566.3;2115;4843.2;1061.5;1544.9;775.2;1551.7;378.6;1400;2564	-0.001180265;-0.0001376866;-0.001495543;-0.000752534;-0.0009866576;3.028873E-05;-0.001571673;-0.0003049813;-0.0001979581;-0.0006054719;-0.003855382;-0.001239357;-0.001152213;-0.002655332;-0.003468621;-0.0002525496;-6.503133E-05;9.742274E-05;-0.004145561;-0.003275599;-0.004807082;-0.002462409;-0.0008810994;0.0026047;-0.002904702	-6.739747;-0.5058812;-3.300056;-1.393018;-1.538608;0.04337576;-2.001313;-0.3496084;-0.206339;-0.5640553;-3.322283;-0.9855627;-0.8765184;-1.916415;-4.759291;-1.765072;-0.3037163;0.3593079;-11.25908;-7.19551;-10.99467;-4.325554;-1.598343;4.71662;-4.425858	175.120132446289;272.171853719647;453.187220458853;540.218505859375;641.266418457031;698.286865234375;785.320495605469;872.351257324219;959.383178710938;1073.42651367188;1160.46179199219;1257.51193981881;1314.5333163981;1385.57193330488;728.810302734375;143.081756591797;214.118682861328;271.139984130859;368.196990966797;455.228149414063;437.219116210938;569.270263671875;551.258117675781;552.238647460938;656.302734375	25	0.5122163	0.2066116	None	Unknown		85.56325	AAAGPSNSSSGTSTPR	_AAAGPSNSSSGTST(ph)PR_	48	1130	7	7	56	3562;3563;3564;9844;9845
200116_SPAC2_4	12669	10068	MDSVSNVSVNEQGK	14	0	Acetyl (Protein_N-term),2 Phospho (STY)	_(ac)MDS(ph)VSNVS(ph)VNEQGK_	MDS(0.995)VS(0.005)NVS(1)VNEQGK	MDS(23.35)VS(-23.35)NVS(33.01)VNEQGK	1	2	O60113	SPBC15C4.04c	Uncharacterized amino-acid permease C15C4.04c	2	HCD	FTMS	MULTI-MSMS	2	0	856.31773	1710.6209	0.080964	3.4697316	59.682	0.00014247	62.739	61.368	33.011	0.9995	3	0	0	0	-1	0	0	0	0		y2;y3;y4;y5;y6;y7;y9;y10;y5-H2O;y7*;y7-NH3;y9*;y10*;y10-NH3;b2;b5;b7;b7-H2O;b3*;b4*;b5*;b5-H2O;b6*;b7*	2599.4;1474.7;1583.7;3379;965;7366.5;2061.5;5733.5;401.4;2281.2;787.3;1643.1;4011;491.2;1048;500.2;5472.9;641.4;3808.4;6472.9;1034.5;904.9;775.4;1457.6	9.736198E-05;0.0001574173;-0.003392065;0.001205305;-0.005576092;0.001632824;0.0003032891;-0.002946621;-0.006063483;-0.0008970362;-0.01182114;0.004487296;-0.001570232;-0.009076364;0.001234439;-0.01120099;-0.001971442;-0.01070507;-0.0004951972;-0.001295148;-0.004636611;0.0008509484;-0.008095883;-0.01268001	0.4769509;0.4738735;-7.354248;2.095172;-8.26881;1.940734;0.2876261;-2.581378;-10.88062;-1.206717;-16.27463;4.691493;-1.504755;-8.8421;4.046292;-17.01786;-2.262665;-12.54564;-1.323695;-2.737165;-8.276608;1.569463;-12.00719;-16.39672	204.134170532227;332.192687988281;461.238830566406;575.277160644531;674.352355957031;841.343505859375;1054.45617675781;1141.49145507813;557.273864746094;743.369140625;726.353515625;956.47509765625;1043.51318359375;1026.494140625;305.078948974609;658.190185546875;871.292297363281;853.290466308594;374.102142333984;473.171356201172;560.206726074219;542.190673828125;674.253112792969;773.326110839844	24	0.482179	0.3076923	Once	Unknown		62.73941;1.371648;0.8473398	MDSVSNVSVNEQGK;KTNRYYNDELR;DSQECILTETEAR	_(ac)MDS(ph)VSNVS(ph)VNEQGK_;_KT(ph)NRY(ph)Y(ph)NDELR_;_DS(ph)QECILT(ph)ETEAR_	114258	812	12946	14424	109552	2514;2515;2516
200116_SPAC2_4	14769	12037	ELQTSPIVSPTTSPK	15	0	3 Phospho (STY)	_ELQTS(ph)PIVS(ph)PTTS(ph)PK_	ELQT(0.111)S(0.889)PIVS(0.916)PT(0.103)T(0.093)S(0.888)PK	ELQT(-9.09)S(9.09)PIVS(13.02)PT(-11.33)T(-12.18)S(11.33)PK	0	3	Q9UUJ6	ned1	Nuclear elongation and deformation protein 1	2	HCD	FTMS	MULTI-MSMS	8	1	912.8771	1823.7396	0.069888	3.8401561	67.673	8.03E-07	97.271	96.717	13.022	0.91604	20	0	0	0	-1	0	0	0	0		y2;y3;y5;y6;y7;y8;y10;y3*;y7*;y10*;y12*;y12-H2O;y10(2+);a2;b2;b2-H2O;b3;b3-H2O;b4;b4-H2O;b5*;b5-H2O	1051;1613.3;816.6;2865.2;5194.7;2482.5;6341.7;412.7;1335.7;1653.1;1492.5;1954.2;435.7;845.1;596.3;367.1;1818.9;1685.7;953.5;841.8;1028.8;1682.9	0.0002695607;0.0005662452;-0.006646387;0.0007683442;-0.001605259;0.00540729;-0.001560841;8.106261E-05;0.009048474;0.003963074;-0.02187429;-0.02075101;0.001452572;0.001079468;-0.0005421648;0.00135958;0.0003723827;-0.0005792665;-0.003462815;-0.002735998;-0.00305622;-0.003916317	1.104009;1.377178;-10.83769;1.081701;-1.829747;5.538137;-1.315482;0.2588314;11.61066;3.640742;-16.12427;-15.50208;2.446395;5.017562;-2.229897;6.0393;1.003207;-1.640133;-7.332687;-6.023344;-5.646443;-7.484528	244.165298461914;411.163360595703;613.265930175781;710.311279296875;877.31201171875;976.373413085938;1186.51720904948;313.186950683594;779.324462890625;1088.53479003906;1356.60666469299;1338.59497672386;593.760009765625;215.137939453125;243.134475708008;225.122009277344;371.192138671875;353.182525634766;472.24365234375;454.232360839844;541.264709472656;523.255004882813	22	0.3637594	0.247191	Once	Unknown		97.27094;0.5537065	ELQTSPIVSPTTSPK;EERVENDWFETYK	_ELQTS(ph)PIVS(ph)PTTS(ph)PK_;_EERVENDWFET(ph)YK_	41087	3091	4421	4973	38407	8662;8663;8664;11068;11069;11070
'''
# Keep an untouched pandas view of the embedded msms.txt text; it is the
# reference that the reader output is compared against further down.
raw_df = pd.read_table(io.StringIO(mq_str))

# Import the very same text through the MaxQuant PSM/fragment reader.
mq_reader = psm_w_frag_reader_provider.get_reader('maxquant')
# NOTE(review): presumably the minimal MaxQuant score a PSM needs to be kept — confirm.
mq_reader._score_thres = 60
mq_reader.import_file(io.StringIO(mq_str))

# The reader must expose the per-PSM slice boundaries into the fragment table.
assert {'frag_start_idx', 'frag_stop_idx'}.issubset(mq_reader.psm_df.columns)

# First PSM is unmodified, second one carries an N-terminal acetylation.
_mods = mq_reader.psm_df.mods.values
_mod_sites = mq_reader.psm_df.mod_sites.values
assert (_mods[0], _mod_sites[0]) == ('', '')
assert _mods[1] in ('Acetyl@Protein_N-term', 'Acetyl@Protein N-term')
assert _mod_sites[1] == '0'
seq = 'AAAGPSNSSSGTSTPR'

# Pull the matched fragment labels and their intensities for `seq` straight
# from the raw msms.txt table (both are ';'-separated strings).
_msms_row = raw_df.loc[raw_df['Sequence'] == seq].iloc[0]
_labels = _msms_row['Matches'].split(';')
_values = _msms_row['Intensities'].split(';')

# Keep plain b/y matches only: anything containing '-' (e.g. '-H2O', '-NH3')
# or starting with another letter is dropped, label and intensity together.
_kept = [
    (label, float(_values[pos]))
    for pos, label in enumerate(_labels)
    if '-' not in label and label[0] in 'by'
]
frag_types = [label for label, _value in _kept]
frag_intens = np.array([value for _label, value in _kept])

# Scale so that the most intense kept fragment equals 1.
frag_intens = frag_intens / frag_intens.max()

df = pd.DataFrame({'frag': frag_types, 'inten': frag_intens})

def query(frag_type, intens, nAA):
    """Look up one annotated fragment in the intensity matrix of a single PSM.

    Parameters
    ----------
    frag_type : str
        Fragment label as written in the msms.txt ``Matches`` column:
        ``b`` or ``y``, the ion number, an optional ``*`` and an optional
        charge suffix such as ``(2+)``; e.g. ``'y3'``, ``'b7*'``, ``'y15(2+)'``.
    intens : 2-D array
        Intensities of one peptide. Row ``i`` holds ``b{i+1}`` and
        ``y{nAA-i-1}``; columns are read in the order
        b_z1, b_z2, y_z1, y_z2 and, for ``*`` labels,
        b_modloss_z1, b_modloss_z2, y_modloss_z1, y_modloss_z2.
    nAA : int
        Number of amino acids of the peptide.

    Returns
    -------
    The element of ``intens`` addressed by ``frag_type``.

    Raises
    ------
    ValueError
        If ``frag_type`` is not a supported label (neutral losses such as
        ``'b5-H2O'``, other ion series, charges other than 1 or 2) or if the
        ion number is outside ``1 .. nAA-1``.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # A strict parse replaces the former "last char is not a digit => strip
    # four characters" heuristic, which silently mis-read labels such as
    # 'b5-H2O' (taken as b5 2+) or 'y3(3+)' (taken as y3 2+).
    parsed = re.fullmatch(r'([by])(\d+)(\*?)(?:\((\d+)\+\))?', frag_type)
    if parsed is None:
        raise ValueError(f'Unsupported fragment annotation: {frag_type!r}')
    ion, number, modloss, charge = parsed.groups()

    number = int(number)
    charge = 1 if charge is None else int(charge)
    if charge not in (1, 2):
        raise ValueError(f'Unsupported fragment charge in {frag_type!r}')
    # Guard against negative indices silently wrapping around to another row.
    if not 0 < number < nAA:
        raise ValueError(
            f'Fragment number of {frag_type!r} is out of range for nAA={nAA}'
        )

    # y ions are counted from the C-terminus, b ions from the N-terminus.
    row = nAA - number - 1 if ion == 'y' else number - 1
    col = (4 if modloss else 0) + (2 if ion == 'y' else 0) + (charge - 1)
    return intens[row, col]

# Slice out the fragment-intensity rows that the reader assigned to `seq`.
_psm_mask = mq_reader.psm_df.sequence == seq
start, end = mq_reader.psm_df.loc[
    _psm_mask, ['frag_start_idx', 'frag_stop_idx']
].values[0]
intens = mq_reader.fragment_intensity_df.values[start:end]

# Each b/y match taken from the raw table has to show up in the reader's
# matrix with the same relative intensity.
for frag_type, frag_inten in zip(frag_types, frag_intens):
    _observed = query(frag_type, intens, len(seq))
    assert abs(_observed - frag_inten) < 1e-5, (
        # Only evaluated when the assertion fails: render both tables so the
        # mismatch can be inspected in the notebook.
        display(mq_reader.fragment_intensity_df.iloc[start:end]),
        display(df),
        frag_type, frag_inten
    )


In [5]:
#| hide
# Every reported phospho site (1-based, ';'-separated) must point at S, T or Y.
for seq, phos_sites in zip(mq_reader.psm_df['sequence'], mq_reader.psm_df['phos_sites']):
    # PSMs without phospho sites carry an empty string; nothing to verify.
    _sites = phos_sites.split(';') if len(phos_sites) > 0 else []
    for site in _sites:
        _residue = seq[int(site) - 1]
        assert _residue in 'STY'
mq_reader.psm_df

Unnamed: 0,sequence,charge,rt,scan_num,raw_name,precursor_mz,score,proteins,genes,decoy,phos_probs,phos_sites,spec_idx,mods,mod_sites,nAA,rt_norm,frag_start_idx,frag_stop_idx
0,AAAADILPVLLK,2,69.506,17556,200123_SAX_SPAC3_1,597.87646,99.539,O13864,kap95,0,,,17555,,,12,1.0,0,11
1,AAARPTVSIYNK,2,48.013,10089,200123_SAX_SPAC3_1,666.86715,73.082,Q9P784,rpl4b,0,,,10088,Acetyl@Protein_N-term,0,12,0.690775,11,22
2,AAAGPSNSSSGTSTPR,2,27.058,3236,200115_SPAC1_3,764.32539,85.563,O74883,rpc37,0,0.761,14,3235,Phospho@T,14,16,1.0,22,37
3,MDSVSNVSVNEQGK,2,59.682,12669,200116_SPAC2_4,856.31773,62.739,O60113,SPBC15C4.04c,0,0.995;1,3;8,12668,Acetyl@Protein_N-term;Phospho@S;Phospho@S,0;3;8,14,0.881917,37,50
4,ELQTSPIVSPTTSPK,2,67.673,14769,200116_SPAC2_4,912.8771,97.271,Q9UUJ6,ned1,0,0.889;0.916;0.888,5;9;13,14768,Phospho@S;Phospho@S;Phospho@S,5;9;13,15,1.0,50,64


In [6]:
#| hide
# Show only the fragment rows where a singly charged modloss ion was matched.
mq_reader.fragment_intensity_df.loc[
    lambda frag_df: (frag_df.y_modloss_z1 > 0) | (frag_df.b_modloss_z1 > 0)
]

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
39,0.0,0.0,0.0,0.0,0.516989,0.0,0.0,0.0
40,0.0,0.0,0.778321,0.0,0.878694,0.0,0.544492,0.0
41,0.067902,0.0,0.279848,0.0,0.140433,0.0,0.22305,0.0
42,0.0,0.0,0.0,0.0,0.10526,0.0,0.0,0.0
43,0.742944,0.0,1.0,0.0,0.197869,0.0,0.309672,0.0
52,0.286816,0.0,0.0,0.0,0.0,0.0,0.235347,0.0
54,0.0,0.0,1.0,0.068704,0.162228,0.0,0.260671,0.0
57,0.0,0.0,0.819134,0.0,0.0,0.0,0.210622,0.0
61,0.0,0.0,0.254396,0.0,0.0,0.0,0.065077,0.0
