# Process MEROPS Data

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import blosum as bl

# constant
# SAVE gates every to_csv call in the processing cells: files are written only
# when this is exactly True (the cells test `SAVE is True`).
# NOTE: the protease/peptide cell re-reads ./Data/Substrate_search_processed.csv,
# which the quote-stripping cell writes only when SAVE is True; with SAVE off
# that read relies on a file left over from an earlier run.
SAVE = True

## Process Data

In [2]:
# Load the raw tab-separated substrate table and strip leading/trailing single
# quotes from every string cell; non-string cells are left untouched.
def _strip_single_quotes(cell):
    """Return *cell* without surrounding single quotes; non-strings pass through."""
    if not isinstance(cell, str):
        return cell
    return str(cell).strip("'")


data = pd.read_csv(
    "./Data/Substrate_search.txt", sep='\t', header=None, encoding='utf-8'
).map(_strip_single_quotes)

# Write the cleaned table (no header, no index) only when saving is enabled.
if SAVE is True:
    data.to_csv(
        "./Data/Substrate_search_processed.csv",
        sep='\t',
        index=False,
        header=False,
        encoding="utf-8",
    )
data.head(5)

  data = pd.read_csv("./Data/Substrate_search.txt", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,CLE0166975,A01.001,ac-Phe-Tyr(I2),ac-Phe+Tyr(I2),-,-,Ac,Phe,TyI,-,...,,,pepsin A,,,,,,synthetic,
1,CLE0166506,A01.001,alcohol dehydrogenase,peptide-Ala107+Val-peptide,Arg,Thr,Ile,Ala,Val,Asn,...,107.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
2,CLE0166510,A01.001,alcohol dehydrogenase,peptide-Ala119+Ile-peptide,Thr,Thr,Thr,Ala,Ile,Leu,...,119.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
3,CLE0166517,A01.001,alcohol dehydrogenase,peptide-Ala178+Tyr-peptide,Gly,Val,Thr,Ala,Tyr,Thr,...,178.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
4,CLE0166523,A01.001,alcohol dehydrogenase,peptide-Ala218+Cys-peptide,Pro,Ser,Leu,Ala,Cys,Ala,...,218.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,


In [3]:
# Build two lookup tables between lower-cased amino-acid abbreviations:
# three-letter -> one-letter and one-letter -> three-letter.
amino_table = pd.read_csv("./Data/amino_table.csv", sep="\t", header=None)
amino_table.columns = ["chinese", "english", "one_abbr", "three_abbr"]

# Lower-case each pair once, in file order, so later rows still override
# earlier ones if an abbreviation is repeated.
_abbr_pairs = [
    (three.lower(), one.lower())
    for three, one in zip(amino_table["three_abbr"], amino_table["one_abbr"])
]
amino_three2one = {three: one for three, one in _abbr_pairs}
amino_one2three = {one: three for three, one in _abbr_pairs}

In [4]:
# Build the protease/peptide table: keep the protease ID and the eight peptide
# residue columns, drop incomplete rows, normalise the residue tokens, keep only
# proteases listed in the human protease file, and keep only rows whose residues
# are all known three-letter codes or "-".
# NOTE(review): the recorded cell output echoes this read_csv line, which looks
# like the tail of a pandas warning (presumably mixed dtypes) - confirm, and
# consider passing dtype= explicitly if so.
data = pd.read_csv("./Data/Substrate_search_processed.csv", sep='\t', header=None, encoding='utf-8')

# Column 1 is the protease ID; columns 4-11 are the peptide residues.
protease_peptide = pd.concat((data[[1]], data.iloc[:, 4:12]), axis=1)
protease_peptide = protease_peptide.dropna()  # drop real NaN values

# Some missing values are the literal string "NAN", which dropna() cannot see,
# so find those rows with a vectorized exact-match test and drop them by label.
_has_nan_string = protease_peptide.isin(["NAN"]).any(axis=1)
nan_row_ids = set(protease_peptide.index[_has_nan_string.to_numpy()].tolist())
protease_peptide = protease_peptide.drop(list(nan_row_ids))

protease_peptide.columns = ["protease"] + list(range(8))  # residue positions 0..7


def _normalize_residue(token):
    """Lower-case a residue token and collapse anything containing '-' to '-'."""
    token = token.lower()
    return "-" if "-" in token else token


# One pass over the residue columns instead of two separate map() sweeps.
protease_peptide.iloc[:, 1:] = protease_peptide.iloc[:, 1:].map(_normalize_residue)

# filter proteases in human body
human_protease = pd.read_csv("./Data/human_protease.txt", sep="\t")
human_protease = set(human_protease["MEROPS ID"].tolist())
# NOTE(review): not used in this cell; kept in case a later cell reads it.
human_animos = set(bl.BLOSUM(62).keys())
protease_peptide = protease_peptide[protease_peptide["protease"].isin(human_protease)]

# filter peptides: every residue must be a known three-letter code or "-".
# The allowed-token set is built once (it was rebuilt per row before), and the
# residues are already lower-case, so no second lower() is needed.
_valid_tokens = set(amino_three2one) | {"-"}
_row_is_valid = protease_peptide.iloc[:, 1:].isin(_valid_tokens).all(axis=1)
valid_row_ids = protease_peptide.index[_row_is_valid.to_numpy()].tolist()
protease_peptide = protease_peptide.loc[valid_row_ids]

if SAVE is True:
    protease_peptide.to_csv("./Data/Protease_Peptides.csv", sep='\t', header=True, index=False)  # save the data
protease_peptide.head(5)

  data = pd.read_csv("./Data/Substrate_search_processed.csv", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,protease,0,1,2,3,4,5,6,7
998,A01.003,phe,gly,asp,leu,ser,val,thr,tyr
999,A01.003,leu,gly,glu,phe,leu,arg,thr,his
1000,A01.003,phe,thr,ser,asp,tyr,ser,lys,tyr
1001,A01.003,val,gln,trp,leu,met,asn,thr,-
1002,A01.003,gln,gly,thr,phe,thr,ser,asp,tyr
