In [1]:
pwd

'/tf/workspace/notebooks'

In [2]:
# Base name shared by the input spectra file and the derived output file.
# Defined once so the two paths below cannot drift apart when the sample changes.
SAMPLE_NAME = "Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39"
# Input spectra file, read with pyteomics' MGF reader further down.
MGF_FILE = "../datasets/" + SAMPLE_NAME + ".mgf"
# Output location for the processed DataFrame.
PARQUET_FILE = "../dumps/" + SAMPLE_NAME + ".parquet"
# Character used to right-pad sequence strings to a common length.
PADDING_CHARACTER = '_'
# Value used to fill the numeric list columns up to a common length.
PADDING_VALUE = 0

In [3]:
# Stats file path: everything before the last '.' of the parquet path,
# with '_stats.json' appended.
STATS_FILE = PARQUET_FILE.rpartition('.')[0] + '_stats.json'
STATS_FILE

'../dumps/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_stats.json'

In [4]:
from pyteomics import parser, mgf
import pandas as pd
import numpy as np
import json

In [5]:
# Read every entry of the MGF file into a list. The reader is used as a
# context manager so its file handle is closed as soon as the entries have
# been materialised instead of staying open for the life of the kernel.
with mgf.read(MGF_FILE) as mgf_iterator:
    print(type(mgf_iterator))
    mgf_entries = list(mgf_iterator)
print("Length:", len(mgf_entries))
print("Example:")
print()
print(mgf_entries[0])

<class 'pyteomics.mgf.IndexedMGF'>
Length: 30296
Example:

{'params': {'title': 'controllerType=0 controllerNumber=1 scan=1482', 'scans': '1482', 'rtinseconds': 348.313842, 'pepmass': (461.811859130859, None), 'charge': [3]}, 'm/z array': array([ 167.0551453,  224.9559021,  285.8164368,  286.3955383,
        298.1065979,  325.8096619,  341.0180054,  343.8196106,
        355.8206787,  359.0282593,  397.7958984,  415.8049011,
        429.0879822,  734.7467041,  965.3400879, 1172.9144287]), 'intensity array': array([ 3050.9699707 ,   405.12173462,   475.8833313 ,   398.45022583,
         389.35595703,  2266.70996094,  1942.8548584 ,   606.1138916 ,
         689.03265381, 20964.71484375,  5848.22558594,  5498.08740234,
        5830.62695312,   459.71224976,   442.84848022,   535.77941895]), 'charge array': masked_array(data=[--, --, --, --, --, --, --, --, --, --, --, --, --, --,
                   --, --],
             mask=[ True,  True,  True,  True,  True,  True,  True,  True,
        

In [18]:
from pyteomics import mzid

In [14]:
def extract_features_from_mgf_entry(entry):
    """Pull the sequence and the two peak arrays out of one MGF entry.

    Args:
        entry: mapping for a single spectrum; read at ``['params']['seq']``,
            ``['m/z array']`` and ``['intensity array']``.

    Returns:
        dict with keys ``"seq"``, ``"mz"`` and ``"intensities"``. A value is
        ``None`` when the corresponding key is absent from ``entry``.
    """
    def _lookup(*key_path):
        # Walk the nested keys; only a missing key maps to None, any other
        # error is left to propagate.
        node = entry
        try:
            for key in key_path:
                node = node[key]
        except KeyError:
            return None
        return node

    return {
        "seq": _lookup('params', 'seq'),
        "mz": _lookup('m/z array'),
        "intensities": _lookup('intensity array'),
    }

In [15]:
# One feature dict per MGF entry, one DataFrame row per dict.
feature_rows = list(map(extract_features_from_mgf_entry, mgf_entries))
mgf_df = pd.DataFrame(feature_rows)
mgf_df

Unnamed: 0,seq,mz,intensities
0,,"[167.0551453, 224.9559021, 285.8164368, 286.39...","[3050.9699707031, 405.1217346191, 475.88333129..."
1,,"[161.0452271, 167.0554504, 250.7462463, 251.62...","[401.9281921387, 1717.8507080078, 401.93594360..."
2,,"[108.639679, 109.028656, 113.2680435, 127.0392...","[435.8223571777, 832.341003418, 439.3319091797..."
3,,"[107.4489059, 111.4401627, 113.8058624, 127.03...","[483.9910583496, 474.264465332, 469.7008361816..."
4,,"[101.3885803, 101.4111481, 102.8573303, 109.02...","[471.0510253906, 433.3070983887, 443.325103759..."
...,...,...,...
30291,,"[110.3639221, 118.4531784, 179.4602509, 197.79...","[422.1659545898, 370.2566223145, 452.190948486..."
30292,,"[104.2466736, 112.1446457, 116.2739944, 152.17...","[430.9320373535, 380.4105834961, 441.941101074..."
30293,,"[183.4539185, 204.9055481, 524.3153687]","[447.3238220215, 421.7596130371, 439.0889587402]"
30294,,"[154.9004974, 197.8742371, 241.8902588, 401.34...","[396.438079834, 410.5616149902, 399.2908630371..."


In [None]:
# Drop non-AA characters: keep only the uppercase letters A-Z.
# regex=True is passed explicitly because pandas >= 2.0 defaults to
# regex=False, which would search for the literal text "[^A-Z]" and leave
# the sequences unchanged.
mgf_df['seq'] = mgf_df['seq'].str.replace(r"[^A-Z]", '', regex=True)

In [None]:
# Container for the dataset statistics collected in the following cells.
stats = {}

In [None]:
# Give every DataFrame column its own (initially empty) stats sub-dict.
stats.update((col, {}) for col in mgf_df.columns)

In [None]:
# Record, per column, the largest per-element length reported by `.str.len()`.
for col in mgf_df.columns:
    element_lengths = mgf_df[col].str.len()
    stats[col]['max_length'] = element_lengths.max()
stats

In [None]:
# Global minimum / maximum over all values of each numeric list column.
# The per-row arrays are concatenated once and reduced with numpy: this
# tolerates rows whose array is empty (built-in min()/max() raise on an empty
# sequence) and rows whose array is missing (dropped by dropna), and avoids a
# Python-level call per row.
for col in ['mz', 'intensities']:
    all_values = np.concatenate(mgf_df[col].dropna().tolist())
    stats[col]['min_value'] = all_values.min()
    stats[col]['max_value'] = all_values.max()
stats

In [None]:
# Right-pad every sequence with PADDING_CHARACTER up to the recorded maximum
# length, then check that all entries ended up with the same length.
for col in ['seq']:
    target_width = stats[col]['max_length']
    padded = mgf_df[col].str.pad(width=target_width, side='right', fillchar=PADDING_CHARACTER)
    mgf_df[col] = padded
    padded_lengths = mgf_df[col].str.len()
    assert padded_lengths.min() == padded_lengths.max()

In [None]:
# Make the numeric list columns rectangular: expand each one into a frame,
# fill the gaps with PADDING_VALUE, and store the rows back as plain lists.
for col in ['mz', 'intensities']:
    rectangular = pd.DataFrame.from_records(mgf_df[col]).fillna(PADDING_VALUE)
    mgf_df[col] = rectangular.values.tolist()
    row_lengths = mgf_df[col].str.len()
    # every row must now have the same number of entries
    assert row_lengths.min() == row_lengths.max()

In [None]:
# Display the frame after the padding steps above.
mgf_df

In [None]:
# Alphabet: the padding character followed by pyteomics' standard amino acids.
alphabet = list(PADDING_CHARACTER) + list(parser.std_amino_acids)
stats.update(alphabet=alphabet, padding_character=PADDING_CHARACTER)
stats

In [None]:
# Build the two lookup tables between alphabet symbols and their positions
# in stats['alphabet'], and store both in stats.
alphabet_symbols = stats['alphabet']
aa_to_idx = dict(zip(alphabet_symbols, range(len(alphabet_symbols))))
idx_to_aa = dict(zip(aa_to_idx.values(), aa_to_idx.keys()))
stats.update(aa_to_idx=aa_to_idx, idx_to_aa=idx_to_aa)
stats

In [None]:
# Display the collected stats dict.
stats

In [None]:
def denumpyfy_dict(d):
    """Recursively convert numpy values into built-in Python equivalents.

    Intended to make a nested structure serialisable with ``json``.

    Args:
        d: a dict, list, tuple, numpy array, numpy scalar, or a built-in
            ``None`` / ``bool`` / ``int`` / ``float`` / ``str``; containers
            may be nested arbitrarily.

    Returns:
        The same structure with numpy scalars replaced by their Python
        counterparts and numpy arrays / tuples replaced by lists. Built-in
        scalars are returned unchanged, so calling the function on its own
        output is a no-op.

    Raises:
        NotImplementedError: if a value of an unsupported type is found.
    """
    if isinstance(d, dict):
        # numpy scalar keys are converted as well; all other keys are kept.
        return {
            (k.item() if isinstance(k, np.generic) else k): denumpyfy_dict(v)
            for k, v in d.items()
        }
    if isinstance(d, (list, tuple)):
        return [denumpyfy_dict(v) for v in d]
    if isinstance(d, np.ndarray):
        return d.tolist()
    # Must come before the built-in check: np.float64 subclasses float and
    # np.str_ subclasses str, and both should end up as plain Python objects.
    if isinstance(d, np.generic):
        return d.item()
    if d is None or isinstance(d, (bool, int, float, str)):
        return d
    raise NotImplementedError(type(d))

In [None]:
# Replace stats with the copy returned by denumpyfy_dict; the result is
# written out with json.dump in a later cell.
stats = denumpyfy_dict(stats)
stats

In [None]:
# Serialise stats as indented JSON and write it to STATS_FILE.
with open(STATS_FILE, mode='w') as stats_out:
    stats_out.write(json.dumps(stats, indent=4))

In [None]:
# Write the processed frame to PARQUET_FILE using the pyarrow engine.
mgf_df.to_parquet(PARQUET_FILE, engine='pyarrow')

In [None]:
# Reload the frame from PARQUET_FILE, replacing the in-memory mgf_df.
# NOTE(review): list-valued columns may come back as numpy arrays rather than
# Python lists after the parquet round trip — confirm if downstream code cares.
mgf_df = pd.read_parquet(PARQUET_FILE)
mgf_df

In [None]:
# Reload the statistics written earlier.
with open(STATS_FILE, 'r') as file:
    stats = json.load(file)
# JSON object keys are always strings, so the integer keys of idx_to_aa come
# back as '0', '1', ...; convert them back so the reloaded mapping matches the
# one that was built in memory (idx_to_aa[0] rather than idx_to_aa['0']).
if 'idx_to_aa' in stats:
    stats['idx_to_aa'] = {int(idx): aa for idx, aa in stats['idx_to_aa'].items()}
stats

In [None]:
# merge_mz_and_intensities
# Pair each row's mz and intensity values as the two columns of one 2-D array.
# np.column_stack builds that array directly instead of constructing a
# throw-away DataFrame per row, and raises on a length mismatch between the
# two inputs rather than silently truncating to the shorter one like zip().
mgf_df['mz_intensities'] = mgf_df.apply(
    lambda row: np.column_stack((row['mz'], row['intensities'])),
    axis=1,
)
mgf_df

In [None]:
# Shape of the merged array in every row.
mgf_df['mz_intensities'].apply(np.shape)

In [None]:
# Drop the two source columns, keeping the merged mz_intensities column.
mgf_df = mgf_df.drop(['mz', 'intensities'], axis=1)
mgf_df