In [None]:
#---#| default_exp psm_reader.maxquant_reader

# MaxQuant PSM reader

In [None]:
from alphabase.psm_reader.maxquant_reader import *

### Column and modification mapping from alphabase to MaxQuant

In [None]:
# Display the 'column_mapping' entry of the 'maxquant' section of psm_reader_yaml:
# keys are alphabase column names, values are the MaxQuant column header(s).
# NOTE(review): list values look like alternative header spellings — confirm in the reader.
psm_reader_yaml['maxquant']['column_mapping']

{'sequence': 'Sequence',
 'charge': 'Charge',
 'rt': 'Retention time',
 'ccs': 'CCS',
 'mobility': ['Mobility', 'IonMobility', 'K0', '1/K0'],
 'scan_num': ['Scan number', 'MS/MS scan number', 'Scan index'],
 'raw_name': 'Raw file',
 'precursor_mz': 'm/z',
 'score': 'Score',
 'proteins': 'Proteins',
 'genes': ['Gene Names', 'Gene names'],
 'decoy': 'Reverse'}

In [None]:
# Display the 'modification_mapping' entry of the 'maxquant' section of psm_reader_yaml:
# each alphabase modification name (e.g. 'Oxidation@M') maps to a list of
# modification strings as they are written in MaxQuant modified sequences.
psm_reader_yaml['maxquant']['modification_mapping']

{'Acetyl@Protein N-term': ['_(Acetyl (Protein N-term))',
  '_(ac)',
  '_(UniMod:1)'],
 'Carbamidomethyl@C': ['C(Carbamidomethyl (C))', 'C(UniMod:4)'],
 'Oxidation@M': ['M(Oxidation (M))', 'M(ox)', 'M(UniMod:35)'],
 'Phospho@S': ['S(Phospho (S))',
  'S(Phospho (ST))',
  'S(Phospho (STY))',
  'S(ph)',
  'S(UniMod:21)',
  'pS'],
 'Phospho@T': ['T(Phospho (T))',
  'T(Phospho (ST))',
  'T(Phospho (STY))',
  'T(ph)',
  'T(UniMod:21)',
  'pT'],
 'Phospho@Y': ['Y(Phospho (Y))',
  'Y(Phospho (STY))',
  'Y(ph)',
  'Y(UniMod:21)',
  'pY'],
 'Deamidated@N': ['N(Deamidation (NQ))', 'N(de)'],
 'Deamidated@Q': ['Q(Deamidation (NQ))', 'Q(de)'],
 'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']}

In [None]:
#| hide
# Table-driven checks for parse_mod_seq.
# Each case is (expected, modified_sequence, keyword_arguments), where expected is the
# (sequence, mods, mod_sites) tuple that parse_mod_seq must return for that input.
# Keyword arguments are passed exactly as listed; omitted ones use the function defaults.
_parse_mod_seq_cases = [
    # --- '(...)' modifications and 'pS'/'pT' prefix notation, '_' as terminal marker ---
    (('HAESVMTMGLK', 'M(ox);M(ox);_(x@Cterm)', '6;8;-1'),
     "_HAESVM(ox)TM(ox)GLK_(x@Cterm)", {}),
    (('HAESVMTMGLK', 'M(ox);M(ox);pS;pT', '6;8;4;7'),
     "_HAEpSVM(ox)pTM(ox)GLK_", {}),
    (('HAESVHTGLK', 'pS;pT', '4;7'),
     "_HAEpSVHpTGLK_", {}),
    (('HAESVHTGLK', 'pS;pT', '4;7'),
     "HAEpSVHpTGLK", {'underscore_for_ncterm': False}),
    (('HAEMVHTGLK', 'M(Oxidation (M))', '4'),
     "_HAEM(Oxidation (M))VHTGLK_", {}),
    (('ACLDYPVTSVLPPASLMK', 'C(Cys-Cys);M(Oxidation (M));C(Carbamidomethyl (C))', '2;17;2'),
     "_AC(Cys-Cys)LDYPVTSVLPPASLM(Oxidation (M))K_", {}),
    # --- effect of fixed_C57 on cysteines (annotated and unannotated) ---
    (('VSHGSSPSLLEALSSDFLACK', '_(Acetyl (N-term));C(Carbamidomethyl (C))', '0;20'),
     "_(Acetyl (N-term))VSHGSSPSLLEALSSDFLAC(Carbamidomethyl (C))K_", {'fixed_C57': False}),
    (('VSHGSSPSLLEALSSDFLACK', '_(Acetyl (N-term));C(Carbamidomethyl (C));C(Carbamidomethyl (C))', '0;20;20'),
     "_(Acetyl (N-term))VSHGSSPSLLEALSSDFLAC(Carbamidomethyl (C))K_", {'fixed_C57': True}),
    (('EKPLLEKSHCIC', 'E(Glu->pyro-Glu);C(Carbamidomethyl (C));C(Carbamidomethyl (C))', '1;10;12'),
     "_E(Glu->pyro-Glu)KPLLEKSHCIC_", {'fixed_C57': True}),
    # --- same inputs written with '[...]' modifications (mod_sep='[]') ---
    (('HAEMVHTGLK', 'M[Oxidation (M)]', '4'),
     "_HAEM[Oxidation (M)]VHTGLK_", {'mod_sep': '[]'}),
    (('ACLDYPVTSVLPPASLMK', 'C[Cys-Cys];M[Oxidation (M)];C[Carbamidomethyl (C)]', '2;17;2'),
     "_AC[Cys-Cys]LDYPVTSVLPPASLM[Oxidation (M)]K_", {'mod_sep': '[]'}),
    (('VSHGSSPSLLEALSSDFLACK', '_[Acetyl (N-term)];C[Carbamidomethyl (C)]', '0;20'),
     "_[Acetyl (N-term)]VSHGSSPSLLEALSSDFLAC[Carbamidomethyl (C)]K_", {'fixed_C57': False, 'mod_sep': '[]'}),
    (('VSHGSSPSLLEALSSDFLACK', '_[Acetyl (N-term)];C[Carbamidomethyl (C)];C[Carbamidomethyl (C)]', '0;20;20'),
     "_[Acetyl (N-term)]VSHGSSPSLLEALSSDFLAC[Carbamidomethyl (C)]K_", {'fixed_C57': True, 'mod_sep': '[]'}),
    (('EKPLLEKSHCIC', 'E[Glu->pyro-Glu];C[Carbamidomethyl (C)];C[Carbamidomethyl (C)]', '1;10;12'),
     "_E[Glu->pyro-Glu]KPLLEKSHCIC_", {'fixed_C57': True, 'mod_sep': '[]'}),
    # --- UniMod-style annotations without '_' terminal markers ---
    (('HAEMVHTGLK', 'M(UniMod:35)', '4'),
     "HAEM(UniMod:35)VHTGLK", {'underscore_for_ncterm': False}),
    (('VSHGSSPSLLEALSSDFLACK', 'C(UniMod:4);C(Carbamidomethyl (C))', '20;20'),
     "VSHGSSPSLLEALSSDFLAC(UniMod:4)K", {'fixed_C57': True, 'underscore_for_ncterm': False}),
    (('VSHGSSPSLLEALSSDFLACK', 'C(UniMod:4)', '20'),
     "VSHGSSPSLLEALSSDFLAC(UniMod:4)K", {'fixed_C57': False, 'underscore_for_ncterm': False}),
    (('AAAAAAGAGPEMVR', '(UniMod:1);M(UniMod:35)', '0;12'),
     '(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR', {'underscore_for_ncterm': False}),
]

for _expected, _mod_seq, _kwargs in _parse_mod_seq_cases:
    _actual = parse_mod_seq(_mod_seq, **_kwargs)
    # Report the failing input and the actual value; a bare assert would show neither.
    assert _expected == _actual, (
        f"parse_mod_seq({_mod_seq!r}, **{_kwargs!r}) returned {_actual!r}, expected {_expected!r}"
    )

# Remove the temporaries so this cell leaves the notebook namespace as it found it.
del _parse_mod_seq_cases, _expected, _mod_seq, _kwargs, _actual

### Testing

In [None]:
import io
import numpy as np

In [None]:
mq_tsv = io.StringIO('''Raw file	Scan number	Scan index	Sequence	Length	Missed cleavages	Modifications	Modified sequence	Oxidation (M) Probabilities	Oxidation (M) Score diffs	Acetyl (Protein N-term)	Oxidation (M)	Proteins	Charge	Fragmentation	Mass analyzer	Type	Scan event number	Isotope index	m/z	Mass	Mass error [ppm]	Mass error [Da]	Simple mass error [ppm]	Retention time	PEP	Score	Delta score	Score diff	Localization prob	Combinatorics	PIF	Fraction of total spectrum	Base peak fraction	Precursor full scan number	Precursor Intensity	Precursor apex fraction	Precursor apex offset	Precursor apex offset time	Matches	Intensities	Mass deviations [Da]	Mass deviations [ppm]	Masses	Number of matches	Intensity coverage	Peak coverage	Neutral loss level	ETD identification type	Reverse	All scores	All sequences	All modified sequences	Reporter PIF	Reporter fraction	id	Protein group IDs	Peptide ID	Mod. peptide ID	Evidence ID	Oxidation (M) site IDs
20190402_QX1_SeVW_MA_HeLa_500ng_LC11	81358	73979	AAAAAAAAAPAAAATAPTTAATTAATAAQ	29	0	Unmodified	_(Acetyl (Protein N-term))AAAAAAAAM(Oxidation (M))PAAAATAPTTAATTAATAAQ_			0	0	sp|P37108|SRP14_HUMAN	3	HCD	FTMS	MULTI-MSMS	13	1	790.07495	2367.203	0.35311	0.00027898	-0.061634807	70.261	0.012774	41.423	36.666	NaN	NaN	1	0	0	0	81345	10653955	0.0338597821787898	-11	0.139877319335938	y1;y2;y3;y4;y11;y1-NH3;y2-NH3;a2;b2;b3;b4;b5;b6;b7;b8;b9;b11;b12;b6(2+);b8(2+);b13(2+);b18(2+)	2000000;2000000;300000;400000;200000;1000000;400000;300000;600000;1000000;2000000;3000000;3000000;3000000;3000000;2000000;600000;500000;1000000;2000000;300000;200000	5.2861228709844E-06;-6.86980268369553E-05;-0.00238178789771837;0.000624715964988809;-0.0145624692099773;-0.000143471782706683;-0.000609501446461991;-0.000524972720768346;0.00010190530804266;5.8620815195809E-05;0.000229901232955854;-0.000108750048696038;-0.000229593152369034;0.00183148682538103;0.00276641182404092;0.000193118923334623;0.00200988580445483;0.000102216846016745;5.86208151389656E-05;0.000229901232955854;-0.00104559184393338;0.00525030008475369	0.0359413365445091;-0.314964433555295;-8.23711898839045;1.60102421155213;-14.8975999917227;-1.10320467763838;-3.03102462870716;-4.56152475051625;0.712219104095465;0.273777366204575;0.806231096969562;-0.305312183824154;-0.537399178230218;3.67572664689217;4.85930954169285;0.301587577451224;2.48616190909398;0.116225745519871;0.273777365939099;0.806231096969562;-2.19774169175011;7.53961026980589	147.076413378177;218.113601150127;289.153028027798;390.197699998035;977.50437775671;130.050013034583;201.087592852046;115.087114392821;143.081402136892;214.118559209185;285.155501716567;356.192954155649;427.230188786552;498.265241494374;569.301420357176;640.341107437877;808.429168310795;879.468189767554;214.118559209185;285.155501716567;475.757386711244;696.362265007215	22	0.262893575628735	0.0826446280991736	None	Unknown		41.4230894199432;4.75668724862449;3.9515580701967	
AAAAAAAAAPAAAATAPTTAATTAATAAQ;FHRGPPDKDDMVSVTQILQGK;PVTLWITVTHMQADEVSVWR	_AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_FHRGPPDKDDMVSVTQILQGK_;_PVTLWITVTHMQADEVSVWR_			0	1443	0	0	0	
20190402_QX1_SeVW_MA_HeLa_500ng_LC11	81391	74010	AAAAAAAAAAPAAAATAPTTAATTAATAAQ	29	0	Unmodified	_AAAAAAAAAPAAAATAPTTAATTAATAAQ_			0	0	sp|P37108|SRP14_HUMAN	2	HCD	FTMS	MULTI-MSMS	14	0	1184.6088	2367.203	0.037108	4.3959E-05	1.7026696	70.287	7.1474E-09	118.21	100.52	NaN	NaN	1	0	0	0	81377	9347701	0.166790347889974	-10	0.12664794921875	y1;y2;y3;y4;y5;y9;y12;y13;y14;y20;y13-H2O;y20-H2O;y1-NH3;y20-NH3;b3;b4;b5;b6;b7;b8;b9;b11;b12;b13;b14;b15;b16;b19;b15-H2O;b16-H2O	500000;600000;200000;400000;200000;100000;200000;1000000;200000;300000;200000;100000;100000;70000;300000;900000;2000000;3000000;5000000;8000000;6000000;600000;800000;600000;200000;300000;200000;300000;300000;1000000	-0.000194444760495571;0.000149986878682284;0.000774202587820128;-0.0002445094036716;0.000374520568641401;-0.00694293246522193;-0.0109837291331587;-0.0037745820627606;-0.000945546471939451;0.00152326440706929;0.00506054832726477;0.00996886361417637;6.25847393393997E-05;-0.024881067836759;-3.11821549132674E-05;-0.000183099230639527;0.000161332473453513;0.000265434980121881;0.000747070697229901;0.000975534518261156;0.00101513939785036;0.00651913000274362;0.0058584595163893;0.00579536744021425;0.00131097834105276;-0.0131378531671089;0.00472955218901916;-0.00161006322559842;-0.00201443239325272;0.0227149399370319	-1.32206444236914;0.687655553213019;2.6775131607882;-0.626628140021726;0.811995006209331;-8.6203492854282;-10.1838066275079;-3.21078702288986;-0.758483069159249;0.881072738747222;4.37168212373889;5.82682888353564;0.481236695337485;-14.5343986203644;-0.145630261806375;-0.642102166533079;0.452935954800214;0.621293379181583;1.49934012872483;1.71355878380837;1.58531240493271;8.06399202403175;6.6614096214532;6.09718023739784;1.28333378040908;-11.7030234519348;3.96235146626144;-1.07856912288932;-1.82370619437775;19.3220953109188	
147.07661310906;218.113382465221;289.149872037312;390.198569223404;461.235063981231;805.411965958065;1078.54847749073;1175.59403219566;1246.62831694787;1728.87474561429;1157.57463237897;1710.85573532879;130.049806978061;1711.87460084504;214.118649012155;285.155914717031;356.192684073126;427.22969375842;498.266325910503;569.303211234482;640.340285417402;808.424659066597;879.462433524883;950.49961040476;1021.54120858166;1122.60333588727;1193.62258226971;1492.77704268533;1104.58164778019;1175.59403219566	30	0.474003002083763	0.167630057803468	None	Unknown		118.209976573419;17.6937689289157;17.2534171481793	AAAAAAAAAPAAAATAPTTAATTAATAAQ;SELKQEAMQSEQLQSVLYLK;VGSSVPSKASELVVMGDHDAARR	_AAAAAAAAAPAAAATAPTTAATTAATAAQ_;_SELKQEAM(Oxidation (M))QSEQLQSVLYLK_;_VGSSVPSKASELVVMGDHDAARR_			1	1443	0	0	1	
20190402_QX1_SeVW_MA_HeLa_500ng_LC11	107307	98306	AAAAAAAGDSDSWDADAFSVEDPVRK	26	1	Acetyl (Protein N-term)	_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_			1	0	sp|O75822|EIF3J_HUMAN	3	HCD	FTMS	MULTI-MSMS	10	2	879.06841	2634.1834	-0.93926	-0.00082567	-3.2012471	90.978	2.1945E-12	148.95	141.24	NaN	NaN	1	0	0	0	107297	10193939	0.267970762043589	-8	0.10211181640625	y1;y2;y4;y5;y6;y7;y8;y9;y10;y11;y12;y13;y14;y15;y17;y18;y19;y20;y21;y23;y21-H2O;y1-NH3;y19-NH3;y14(2+);y16(2+);y22(2+);a2;b2;b3;b4;b5;b6;b7	300000;200000;3000000;600000;1000000;500000;2000000;1000000;1000000;1000000;90000;1000000;400000;900000;1000000;400000;3000000;2000000;1000000;400000;100000;200000;200000;80000;100000;200000;200000;2000000;5000000;5000000;5000000;2000000;300000	1.34859050149316E-07;-6.05140996867704E-06;2.27812602133781E-05;0.00128986659160546;-0.00934536073077652;0.000941953783126337;-0.00160424237344614;-0.00239257341399934;-0.00111053968612396;-0.00331340710044969;0.00330702864630439;0.000963683996815234;0.00596290290945944;-0.00662057038289277;-0.0117122701335575;0.00777853472800416;0.0021841542961738;0.000144322111736983;-0.00087403893667215;0.0197121595674616;-0.021204007680808;-0.000308954599830713;-0.026636719419912;-0.0137790992353075;0.00596067266928912;-0.0077053835773313;9.11402199221811E-06;-0.000142539300128419;-0.000251999832926231;1.90791054137662E-05;-0.00236430185879044;-9.54583337602344E-05;-0.000556959493223985	
0.000916705048437201;-0.0199575598103408;0.0456231928690862;2.09952637717462;-12.5708704058425;1.11808305811426;-1.72590731777249;-2.22239181008062;-0.967696370445928;-2.62418809422166;2.47964286628144;0.665205752892023;3.64753748704453;-3.84510115530963;-6.08782672045773;3.81508105974837;1.04209904973991;0.0666012719936656;-0.390545453668809;8.28224925531311;-9.55133250134922;-2.37499239179248;-12.8127653858411;-16.846761946123;6.48662354975264;-6.67117082062383;0.0580151981289049;-0.770098855873447;-0.983876895688683;0.0583162347158579;-5.93738717724506;-0.203431522818505;-1.03087538746314	147.112804035741;303.21392125011;499.33507018564;614.360746132308;743.413974455831;842.472101057517;929.506675663573;1076.57587791081;1147.61170966489;1262.6408555643;1333.67134891635;1448.700635293;1634.77494902759;1721.81956091078;1923.88362405243;2038.89107627957;2095.9181343836;2166.95728800359;2237.99542015244;2380.04906152953;2220.00518543488;130.0865640237;2078.92040615582;817.907873297785;918.917619246831;1155.02717356753;157.097144992378;185.0922112678;256.129434516133;327.166277224995;398.205774393759;469.240619338034;540.278194626993	33	0.574496146107112	0.14410480349345	None	Unknown		148.951235201399;7.71201258444522;7.36039532447559	AAAAAAAGDSDSWDADAFSVEDPVRK;PSRQESELMWQWVDQRSDGER;HTLTSFWNFKAGCEEKCYSNR	_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSVEDPVRK_;_PSRQESELM(Oxidation (M))WQWVDQRSDGER_;_HTLTSFWNFKAGCEEKCYSNR_			2	625	1	1	2	'''
)

# Read the in-memory MaxQuant table (mq_tsv) with the reader registered as 'maxquant'.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.import_file(mq_tsv)

# The fixture is expected to yield exactly three PSM rows.
assert len(mq_reader.psm_df) == 3

# Row 1 of psm_df: the modified sequence carries an N-terminal acetylation and an
# oxidised M, which must be translated to alphabase names with sites '0' and '9'.
assert mq_reader.psm_df.mods.values[1] == 'Acetyl@Protein N-term;Oxidation@M'
assert mq_reader.psm_df.mod_sites.values[1] == '0;9'

# Row 2 of psm_df: unmodified peptide, so both mods and mod_sites are empty strings.
assert mq_reader.psm_df.mods.values[2] == ''
assert mq_reader.psm_df.mod_sites.values[2] == ''

# The reader's mapping for Phospho@S is expected to contain the '(...)' / 'pS' entries
# followed by '[...]' counterparts of the parenthesised entries.
# np.array_equal compares shape and content, so a list of the wrong length fails the
# assert with a plain False instead of relying on elementwise '==' broadcasting.
assert np.array_equal(
    mq_reader.modification_mapping['Phospho@S'],
    [
        'S(Phospho (S))',
        'S(Phospho (ST))',
        'S(Phospho (STY))',
        'S(ph)',
        'S(UniMod:21)',
        'pS',
        'S[Phospho (S)]',
        'S[Phospho (ST)]',
        'S[Phospho (STY)]',
        'S[ph]',
        'S[UniMod:21]',
    ],
)

# Last expression of the cell: render the parsed PSM table as the cell output.
mq_reader.psm_df

Unnamed: 0,sequence,charge,rt,scan_num,raw_name,precursor_mz,score,proteins,decoy,spec_idx,mods,mod_sites,nAA,rt_norm
0,AAAAAAAGDSDSWDADAFSVEDPVRK,3,90.978,107307,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,879.06841,148.95,sp|O75822|EIF3J_HUMAN,0,107306,Acetyl@Protein N-term,0,26,1.0
1,AAAAAAAAAPAAAATAPTTAATTAATAAQ,3,70.261,81358,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,790.07495,41.423,sp|P37108|SRP14_HUMAN,0,81357,Acetyl@Protein N-term;Oxidation@M,0;9,29,0.772286
2,AAAAAAAAAAPAAAATAPTTAATTAATAAQ,2,70.287,81391,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,1184.6088,118.21,sp|P37108|SRP14_HUMAN,0,81390,,,30,0.772571
