In [1]:
import pandas as pd
import re

In [2]:
#Parse the dataset
# Load the input table into a DataFrame. The processing cell further down reads
# the columns 'Entry', 'Entry Name', 'Sequence', 'Cross-link', 'Disulfide bond',
# 'Lipidation', 'Glycosylation' and 'Modified residue' from it.
df = pd.read_csv('./first_sequences_only.csv')

In [3]:
#Parse crosslinks
# Ordered (label, substrings) rules. A note receives every label for which it
# contains at least one of the substrings (matching is case-sensitive), so one
# entry can yield several records, e.g. glycyl_lysine_isopeptide + sumoylation.
_CROSSLINK_RULES = (
    ('glycyl_lysine_isopeptide', ('Glycyl lysine isopeptide',)),
    ('sumoylation', ('SUMO',)),
    ('ubiquitination', ('ubiquitin',)),
    ('tryptophyl_tyrosyl_methioninium', ('Tryptophyl-tyrosyl-methioninium',)),
    ('cyclization', ('5-imidazolinone',)),
    ('transglutamination', ('Isoglutamyl lysine isopeptide',)),
    ('cysteine_tyrosine', ('(S-cysteinyl)-tyrosine',)),
    ('histidine_tyrosine', ('His-Tyr',)),
    ('pyrroloquinoline_quinone', ('Pyrroloquinoline quinone',)),
    ('cysteine_histidine', ("2'-(S-cysteinyl)-histidine",)),
    ('tryptophylquinone', ('tryptophylquinone',)),
)

def parse_crosslinks(crosslink_string):
    """Split a cross-link annotation string into classified records.

    The input is expected to look like
    ``CROSSLNK <position>; /note="<text>"; ...`` repeated once per cross-link
    (presumably UniProt feature text -- confirm against the data source).

    Args:
        crosslink_string: Raw cell value. Anything that is not a ``str``
            (e.g. NaN for an empty cell) yields an empty list.

    Returns:
        A list of ``"<position>;<type>;<note>"`` strings. An entry produces
        one string per matching rule in ``_CROSSLINK_RULES`` (in rule order),
        or a single ``unclassified`` string when no rule matches.
    """
    if not isinstance(crosslink_string, str):
        return []
    crosslinks = []
    for raw_entry in crosslink_string.split("CROSSLNK"):
        raw_entry = raw_entry.strip()
        if not raw_entry:
            continue
        fields = [field.strip() for field in raw_entry.split(';')]
        position = fields[0]
        # Locate the note by its '/note=' prefix instead of assuming it is the
        # second field: an entry without any note no longer raises IndexError,
        # and a non-note field is no longer mis-sliced into a bogus note.
        note_field = next((f for f in fields[1:] if f.startswith('/note=')), '/note=')
        # Fields were split on ';', so a note containing ';' is cut at the
        # first one and may have lost its closing quote; strip('"') copes
        # with both the complete and the cut form.
        note = note_field[len('/note='):].strip('"')
        labels = [
            label
            for label, needles in _CROSSLINK_RULES
            if any(needle in note for needle in needles)
        ]
        if not labels:
            labels = ['unclassified']
        crosslinks.extend(f"{position};{label};{note}" for label in labels)
    return crosslinks

#Parse disulfides
def parse_disulfides(disulfide_string):
    """Return the position field of every DISULFID entry in the input.

    The string is cut at each "DISULFID" keyword; for every non-blank piece
    the text before its first ';' (whitespace-trimmed) is kept. A value that
    is not a ``str`` produces an empty list.
    """
    if not isinstance(disulfide_string, str):
        return []
    positions = []
    for chunk in disulfide_string.split("DISULFID"):
        if not chunk.strip():
            # Nothing between two keywords (or before the first one).
            continue
        position, _, _ = chunk.partition(';')
        positions.append(position.strip())
    return positions

#Parse lipidation
# Ordered (label, substrings) rules. A note receives every label for which it
# contains at least one of the substrings (case-sensitive), so e.g. a
# "GPI-anchor amidated glycine" note yields both amidated_glycine and gpi_anchor.
_LIPIDATION_RULES = (
    ('palmitoylation', ('S-palmitoyl',)),
    ('n_myristoylation', ('N-myristoyl',)),
    ('s_diacylglycerol', ('S-diacylglycerol',)),
    ('amidated_glycine', ('amidated glycine',)),
    ('phosphatidylserine_amidated_glycine', ('Phosphatidylserine amidated glycine',)),
    ('n_palmitoylation', ('N-palmitoyl',)),
    ('prenylation', (
        'S-farnesyl cysteine',
        'S-geranylgeranyl cysteine',
        'S-12-hydroxyfarnesyl cysteine',
    )),
    ('gpi_anchor', ('GPI-anchor',)),
)

def parse_lipidations(lipidation_string):
    """Split a lipidation annotation string into classified records.

    The input is expected to look like
    ``LIPID <position>; /note="<text>"; ...`` repeated once per site
    (presumably UniProt feature text -- confirm against the data source).

    Args:
        lipidation_string: Raw cell value. Anything that is not a ``str``
            (e.g. NaN for an empty cell) yields an empty list.

    Returns:
        A list of ``"<position>;<type>;<note>"`` strings. An entry produces
        one string per matching rule in ``_LIPIDATION_RULES`` (in rule order),
        or a single ``unclassified`` string when no rule matches.
    """
    if not isinstance(lipidation_string, str):
        return []
    lipidations = []
    for raw_entry in lipidation_string.split("LIPID"):
        raw_entry = raw_entry.strip()
        if not raw_entry:
            continue
        fields = [field.strip() for field in raw_entry.split(';')]
        position = fields[0]
        # Locate the note by its '/note=' prefix instead of assuming it is the
        # second field: an entry without any note no longer raises IndexError,
        # and a non-note field is no longer mis-sliced into a bogus note.
        note_field = next((f for f in fields[1:] if f.startswith('/note=')), '/note=')
        # Fields were split on ';', so a note containing ';' is cut at the
        # first one and may have lost its closing quote; strip('"') copes
        # with both the complete and the cut form.
        note = note_field[len('/note='):].strip('"')
        labels = [
            label
            for label, needles in _LIPIDATION_RULES
            if any(needle in note for needle in needles)
        ]
        if not labels:
            labels = ['unclassified']
        lipidations.extend(f"{position};{label};{note}" for label in labels)
    return lipidations

#Parse glycosylation
# Ordered (label, substrings) rules; a note receives every label for which it
# contains at least one of the substrings (case-sensitive).
_GLYCOSYLATION_RULES = (
    ('n_linked', ('N-linked',)),
    ('o_linked', ('O-linked',)),
)

def parse_glycosylations(glycosylation_string):
    """Split a glycosylation annotation string into classified records.

    The input is expected to look like
    ``CARBOHYD <position>; /note="<text>"; ...`` repeated once per site
    (presumably UniProt feature text -- confirm against the data source).

    Args:
        glycosylation_string: Raw cell value. Anything that is not a ``str``
            (e.g. NaN for an empty cell) yields an empty list.

    Returns:
        A list of ``"<position>;<type>;<note>"`` strings. An entry produces
        one string per matching rule in ``_GLYCOSYLATION_RULES`` (in rule
        order), or a single ``unclassified`` string when no rule matches.
    """
    if not isinstance(glycosylation_string, str):
        return []
    glycosylations = []
    for raw_entry in glycosylation_string.split("CARBOHYD"):
        raw_entry = raw_entry.strip()
        if not raw_entry:
            continue
        fields = [field.strip() for field in raw_entry.split(';')]
        position = fields[0]
        # Locate the note by its '/note=' prefix instead of assuming it is the
        # second field: an entry without any note no longer raises IndexError,
        # and a non-note field is no longer mis-sliced into a bogus note.
        note_field = next((f for f in fields[1:] if f.startswith('/note=')), '/note=')
        # Fields were split on ';', so a note containing ';' is cut at the
        # first one and may have lost its closing quote; strip('"') copes
        # with both the complete and the cut form.
        note = note_field[len('/note='):].strip('"')
        labels = [
            label
            for label, needles in _GLYCOSYLATION_RULES
            if any(needle in note for needle in needles)
        ]
        if not labels:
            labels = ['unclassified']
        glycosylations.extend(f"{position};{label};{note}" for label in labels)
    return glycosylations

#Parse modified residues
# Ordered (label, compiled pattern) rules, compiled once instead of on every
# entry. A note receives every label whose pattern is found anywhere in it, so
# one entry can yield several records. Matching is case-sensitive except for
# the hydroxylation rule. Patterns built with re.escape are plain substrings
# that contain parentheses.
_MODIFIED_RESIDUE_RULES = tuple(
    (label, re.compile(pattern, flags))
    for label, pattern, flags in (
        ('phosphorylation', 'Phospho|Tele-phosphohistidine|4-aspartylphosphate|Diphosphoserine', 0),
        ('acetylation', 'N-acetyl|N6-acetyl|N2-acetyl', 0),
        ('amidation', ' amide|1-amide', 0),
        ('pyrrolidone_carboxylic_acid', 'Pyrrolidone carboxylic acid', 0),
        ('hydroxylation', 'hydroxyproline|hydroxylysine|hydroxyphenylalanine|hydroxyarginine|hydroxyasparagine|hydroxyaspartate|hydroxytryptophan|hydroxyisoleucine|hydroxyhistidine|hydroxyleucine|hydroxyvaline|hydroxyglutamate|hydroxy-3-methylproline|hydroxy-D-valine|hydroxy-D-asparagine', re.IGNORECASE),
        ('sulfation', 'Sulfo', 0),
        ('flavin_binding', 'FAD|FMN', 0),
        ('cysteine_oxidation_and_nitrosylation', 'Cysteine sulfinic acid|Cysteine sulfenic acid|S-nitrosocysteine', 0),
        ('methylation', 'methyl ester|-methyl|-dimethyl|-trimethyl|Methylhistidine|Dimethylated arginine|dimethylarginine', 0),
        ('carboxylation', 'N6-carboxylysine|4-carboxyglutamate', 0),
        ('pyridoxal_phosphate', re.escape('N6-(pyridoxal phosphate)lysine'), 0),
        ('pyruvic_acid', 'Pyruvic acid', 0),
        ('lipoylation', '-lipo', 0),
        ('phosphopantetheine', "pantetheine 4'-phosphoryl", 0),
        ('2_(s_cysteinyl)pyruvic_acid_o_phosphothioketal', re.escape('2-(S-cysteinyl)pyruvic acid O-phosphothioketal'), 0),
        ('succinylation', 'succinyl', 0),
        ('dipyrrolylmethanemethyl', 'dipyrrolylmethanemethyl', 0),
        ('ampylation', '-AMP', 0),
        ('umpylation', '-UMP', 0),
        ('glycine_radical', 'Glycine radical', 0),
        ('quinone', 'quinone', 0),
        ('dehydrogenation', 'dehydro', 0),
        ('retinal_addition', 'retinylidene', 0),
        ('persulfidation', 'persulfide', 0),
        ('oxoalanine', 'oxoalanine', 0),
        ('1_thioglycine', '1-thioglycine', 0),
        ('phosphoribosyl_dephospho_coenzyme_a', 'phosphoribosyl dephospho-coenzyme A', 0),
        ('adp_ribosylation', 'ADP', 0),
        ('deamidation', 'Deamidated', 0),
        ('hypusine', 'Hypusine', 0),
        ('diphthamide', 'Diphthamide', 0),
        ('glycyl_adenylate', 'Glycyl adenylate', 0),
        ('hydroxyisobutyrylation', 'hydroxyisobutyryl', 0),
        ('murein_peptidoglycan', 'peptidoglycan', 0),
        ('allysine', 'Allysine', 0),
        ('lactoylation', 'lactoyl', 0),
    )
)

def parse_modified_residues(modified_residues_string):
    """Split a modified-residue annotation string into classified records.

    The input is expected to look like
    ``MOD_RES <position>; /note="<text>"; ...`` repeated once per residue
    (presumably UniProt feature text -- confirm against the data source).

    Args:
        modified_residues_string: Raw cell value. Anything that is not a
            ``str`` (e.g. NaN for an empty cell) yields an empty list.

    Returns:
        A list of ``"<position>;<type>;<note>"`` strings. An entry produces
        one string per matching rule in ``_MODIFIED_RESIDUE_RULES`` (in rule
        order), or a single ``unclassified`` string when no rule matches.
    """
    if not isinstance(modified_residues_string, str):
        return []
    modified_residues = []
    for raw_entry in modified_residues_string.split("MOD_RES"):
        raw_entry = raw_entry.strip()
        if not raw_entry:
            continue
        fields = [field.strip() for field in raw_entry.split(';')]
        position = fields[0]
        # Locate the note by its '/note=' prefix instead of assuming it is the
        # second field: an entry without any note no longer raises IndexError,
        # and a non-note field is no longer mis-sliced into a bogus note.
        note_field = next((f for f in fields[1:] if f.startswith('/note=')), '/note=')
        # Fields were split on ';', so a note such as "Phosphoserine; by PKA"
        # is cut at the first ';' and has lost its closing quote; strip('"')
        # copes with both the complete and the cut form.
        note = note_field[len('/note='):].strip('"')
        labels = [
            label
            for label, pattern in _MODIFIED_RESIDUE_RULES
            if pattern.search(note)
        ]
        if not labels:
            labels = ['unclassified']
        modified_residues.extend(f"{position};{label};{note}" for label in labels)
    return modified_residues

#Parse returned ptm lists
def process_ptms(ptm_dict, ptm_list, entry, entry_name, sequence, sequence_len):
    """Group ``"<position>;<type>;<name>"`` strings into per-type record lists.

    Args:
        ptm_dict: Dict mapping a PTM type to a list of record dicts. It is
            updated in place; missing types are added on first use.
        ptm_list: Iterable of ``"<position>;<type>;<name>"`` strings.
        entry: Stored under the ``'entry'`` key of every record.
        entry_name: Stored under the ``'entry_name'`` key of every record.
        sequence: Stored under the ``'sequence'`` key of every record.
        sequence_len: Stored under the ``'sequence_len'`` key of every record.

    Returns:
        None. Results are accumulated in ``ptm_dict``.

    Raises:
        IndexError: If an item has fewer than three ';'-separated fields.
    """
    for ptm_item in ptm_list:
        # Only the first two ';' are delimiters. Limiting the split keeps a
        # name that itself contains ';' intact instead of truncating it.
        parts = ptm_item.split(';', 2)
        ptm_position = parts[0]
        ptm_type = parts[1]
        ptm_name = parts[2]
        ptm_dict.setdefault(ptm_type, []).append({
            'entry': entry,
            'entry_name': entry_name,
            'sequence': sequence,
            'sequence_len': sequence_len,
            'position': ptm_position,
            'name': ptm_name
        })

In [5]:
import os  # used only by this cell, to create the output folders

# One accumulator per annotation column: PTM type -> list of record dicts.
crosslink_dict = {}
lipidation_dict = {}
glycosylation_dict = {}
modres_dict = {}
# Disulfide bonds have a single type, so its list is created up front.
disulfidebond_dict = {'disulfide_bond':[]}
for index, row in df.iterrows():
    entry = row['Entry']
    entry_name = row['Entry Name']
    sequence = row['Sequence']
    sequence_len = len(sequence)

    crosslinks = parse_crosslinks(row['Cross-link'])
    process_ptms(crosslink_dict, crosslinks, entry, entry_name, sequence, sequence_len)

    # parse_disulfides returns bare positions (no type/name), so the records
    # are built here rather than through process_ptms.
    disulfide_bonds = parse_disulfides(row['Disulfide bond'])
    for position in disulfide_bonds:
        disulfidebond_dict['disulfide_bond'].append({
            'entry': entry,
            'entry_name': entry_name,
            'sequence': sequence,
            'sequence_len': sequence_len,
            'position': position,
            'name': 'disulfide_bond'
        })

    lipidations = parse_lipidations(row['Lipidation'])
    process_ptms(lipidation_dict, lipidations, entry, entry_name, sequence, sequence_len)

    glycosylations = parse_glycosylations(row['Glycosylation'])
    process_ptms(glycosylation_dict, glycosylations, entry, entry_name, sequence, sequence_len)

    modified_residues = parse_modified_residues(row['Modified residue'])
    process_ptms(modres_dict, modified_residues, entry, entry_name, sequence, sequence_len)

# Write one CSV per PTM type, grouped into one sub-folder per annotation
# column. The folder order matches the order the files were originally saved in.
for subfolder, ptm_groups in (
    ('crosslinks', crosslink_dict),
    ('disulfide_bonds', disulfidebond_dict),
    ('lipidations', lipidation_dict),
    ('glycosylations', glycosylation_dict),
    ('modified_residues', modres_dict),
):
    output_dir = f"./ptm_processed_v1/{subfolder}"
    # to_csv does not create missing directories; make sure the target exists
    # so a fresh checkout does not fail after the whole table has been parsed.
    os.makedirs(output_dir, exist_ok=True)
    for ptm_type, data_list in ptm_groups.items():
        filepath = f"{output_dir}/{ptm_type}.csv"
        df_type = pd.DataFrame(data_list)
        df_type.to_csv(filepath, index=False)
        print(f"Saved {len(data_list)} rows to {filepath}")

Saved 58308 rows to ./ptm_processed_v1/crosslinks/glycyl_lysine_isopeptide.csv
Saved 35024 rows to ./ptm_processed_v1/crosslinks/sumoylation.csv
Saved 19803 rows to ./ptm_processed_v1/crosslinks/ubiquitination.csv
Saved 5753 rows to ./ptm_processed_v1/crosslinks/transglutamination.csv
Saved 1320 rows to ./ptm_processed_v1/crosslinks/unclassified.csv
Saved 4689 rows to ./ptm_processed_v1/crosslinks/cysteine_tyrosine.csv
Saved 4518 rows to ./ptm_processed_v1/crosslinks/histidine_tyrosine.csv
Saved 11226 rows to ./ptm_processed_v1/crosslinks/cyclization.csv
Saved 824 rows to ./ptm_processed_v1/crosslinks/tryptophylquinone.csv
Saved 18651 rows to ./ptm_processed_v1/crosslinks/tryptophyl_tyrosyl_methioninium.csv
Saved 1482 rows to ./ptm_processed_v1/crosslinks/cysteine_histidine.csv
Saved 1031 rows to ./ptm_processed_v1/crosslinks/pyrroloquinoline_quinone.csv
Saved 3937149 rows to ./ptm_processed_v1/disulfide_bonds/disulfide_bond.csv
Saved 3644 rows to ./ptm_processed_v1/lipidations/prenyla