In [6]:
# import libraries
import numpy as np
import pandas as pd
import pickle as pkl

import torch
import torch.nn as nn
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from TorchCRF import CRF
import tqdm
import blosum as bl

# Switch for writing intermediate tables to disk: the to_csv call in the cleaning cell runs only when this is True.
SAVE = True

In [7]:
# Load the raw substrate table and remove the single quotes wrapped around its string fields.
def _strip_single_quotes(cell):
    """Return ``cell`` without leading/trailing single quotes; non-strings pass through unchanged."""
    if not isinstance(cell, str):
        return cell
    return str(cell).strip("'")


data = pd.read_csv(
    "./Data/Substrate_search.txt",
    sep='\t',
    header=None,
    encoding='utf-8',
).map(_strip_single_quotes)

# Persist the cleaned table only when saving is switched on.
if SAVE is True:
    data.to_csv(
        "./Data/Substrate_search_processed.csv",
        sep='\t',
        index=False,
        header=False,
        encoding="utf-8",
    )

data.head(5)

  data = pd.read_csv("./Data/Substrate_search.txt", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,CLE0166975,A01.001,ac-Phe-Tyr(I2),ac-Phe+Tyr(I2),-,-,Ac,Phe,TyI,-,...,,,pepsin A,,,,,,synthetic,
1,CLE0166506,A01.001,alcohol dehydrogenase,peptide-Ala107+Val-peptide,Arg,Thr,Ile,Ala,Val,Asn,...,107.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
2,CLE0166510,A01.001,alcohol dehydrogenase,peptide-Ala119+Ile-peptide,Thr,Thr,Thr,Ala,Ile,Leu,...,119.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
3,CLE0166517,A01.001,alcohol dehydrogenase,peptide-Ala178+Tyr-peptide,Gly,Val,Thr,Ala,Tyr,Thr,...,178.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
4,CLE0166523,A01.001,alcohol dehydrogenase,peptide-Ala218+Cys-peptide,Pro,Ser,Leu,Ala,Cys,Ala,...,218.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,


In [8]:
# Build lookup tables between the three-letter and one-letter amino-acid abbreviations.
# Both sides are lower-cased before being stored.
amino_table = pd.read_csv("./Data/amino_table.csv", sep="\t", header=None)
amino_table.columns = ["chinese", "english", "one_abbr", "three_abbr"]

abbr_pairs = [
    (three_raw.lower(), one_raw.lower())
    for three_raw, one_raw in zip(amino_table["three_abbr"], amino_table["one_abbr"])
]
# Later rows override earlier ones on duplicate keys, exactly as sequential assignment would.
amino_three2one = dict(abbr_pairs)
amino_one2three = {one_abbr: three_abbr for three_abbr, one_abbr in abbr_pairs}

In [9]:
# Keep only the protease identifier and the eight peptide-position columns.
data = pd.read_csv("./Data/Substrate_search_processed.csv", sep='\t', header=None, encoding='utf-8')

# Column 1 becomes "protease"; columns 4..11 become peptide positions 0..7. Rows with real NaN are dropped here.
protease_peptide = pd.concat([data[[1]], data.iloc[:, 4:12]], axis=1).dropna()

# Some rows carry the literal string "NAN", which dropna() does not treat as missing,
# so their index labels are collected and dropped by hand.
nan_row_ids = {
    label
    for label, record in protease_peptide.iterrows()
    if "NAN" in record.tolist()
}
protease_peptide = protease_peptide.drop(list(nan_row_ids))

protease_peptide.columns = ["protease", *range(8)]  # rename columns


def _normalize_residue(label):
    """Lower-case a residue label and collapse any label containing '-' (e.g. "/-/") to "-"."""
    lowered = label.lower()
    return "-" if "-" in lowered else lowered


protease_peptide.iloc[:, 1:] = protease_peptide.iloc[:, 1:].map(_normalize_residue)

# Keep only rows whose protease ID appears in the human protease list.
human_protease = set(pd.read_csv("./Data/human_protease.txt", sep="\t")["MEROPS ID"].tolist())
human_animos = set(bl.BLOSUM(62).keys())
is_human = protease_peptide["protease"].isin(human_protease)
protease_peptide = protease_peptide[is_human]

# # Keep only peptides made of amino acids listed in the abbreviation table (disabled)
# valid_row_ids = []
# for row_i, row in protease_peptide.iterrows():
#     if len(set(row.iloc[1:].map(lambda x: x.lower()).tolist()) - (amino_three2one.keys() | {"-"})) == 0:
#         valid_row_ids.append(row_i)
# protease_peptide = protease_peptide.loc[valid_row_ids]

# if SAVE is True:
#     protease_peptide.to_csv("./Data/Protease_Peptides.csv", sep='\t', header=True, index=False)  # save the table
protease_peptide.head(5)

  data = pd.read_csv("./Data/Substrate_search_processed.csv", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,protease,0,1,2,3,4,5,6,7
995,A01.003,-,ac,phe,phe,leu,val,his,-
996,A01.003,-,ac,phe,phe,leu,val,nh2,-
997,A01.003,-,-,ac,tyr,leu,val,his,-
998,A01.003,phe,gly,asp,leu,ser,val,thr,tyr
999,A01.003,leu,gly,glu,phe,leu,arg,thr,his


In [10]:
# Find the peptides that belong to MMP3 alone.
MMP3_protease = "M10.005"


def _peptides_of(protease_id):
    """Collect the peptide columns of every row for ``protease_id`` as a set of tuples."""
    rows = protease_peptide.loc[protease_peptide["protease"] == protease_id]
    return {tuple(residues) for residues in rows.iloc[:, 1:].values.tolist()}


# Map every protease ID present in the table to its set of peptides.
proteases = set(protease_peptide["protease"].tolist())
protease2peptides = {protease_id: _peptides_of(protease_id) for protease_id in proteases}

def search_unique_peptides(protease: str, protease2peptides: dict = protease2peptides) -> set:
    """Return the peptides that map to ``protease`` and to no other protease.

    Args:
        protease: Key of the protease of interest in ``protease2peptides``.
        protease2peptides: Mapping from protease key to a set of peptides.
            The default is the module-level mapping object that existed when
            this function was defined.

    Returns:
        A new set holding every peptide of ``protease`` that appears in no
        other entry of the mapping. The mapping itself is not modified.

    Raises:
        KeyError: If ``protease`` is not a key of ``protease2peptides``.
    """
    # Work on a copy: an in-place ``-=`` on the stored set would strip the shared
    # peptides out of the mapping and skew every later lookup for other proteases.
    unique = set(protease2peptides[protease])
    for other, other_peptides in protease2peptides.items():
        if other != protease:
            unique -= other_peptides
    return unique

# Count the peptides that map to the MMP3 protease and to no other protease in the mapping.
# NOTE(review): the inline "original: 1945" is presumably an earlier recorded count; the displayed
# output is 1946 — confirm which data change explains the difference.
len(search_unique_peptides(MMP3_protease))  # original: 1945

1946

In [2]:
# Call the project helper load_sparse_matrix on ./Cache/binary.npz and display whatever it returns.
# NOTE(review): this cell's execution count (In [2]) is lower than the cells above it, and the import
# sits outside the top import cell — confirm the notebook still runs top to bottom on a fresh kernel.
from extract_features import load_sparse_matrix
load_sparse_matrix("./Cache/binary.npz")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18578,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
18579,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
18581,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
