# Peptides10

## Import Libraries

In [5]:
# import libraries
import numpy as np
import pandas as pd
import pickle as pkl

import sklearn
from sklearn.model_selection import KFold

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from TorchCRF import CRF
import tqdm
import blosum as bl

SAVE = True

In [2]:
# import data
data = pd.read_excel("./Data/peptides10.xlsx")  # load data
data.iloc[:, 0] = data.iloc[:, 0].map(lambda x: x.strip()).map(lambda x: x[1:-1])  # 删除首尾的氨基酸和空格
data = data.iloc[:, :-1]  # 删除最后一列
data.head()

  warn(msg)


Unnamed: 0.1,Unnamed: 0,MMP1,MMP2,MMP3,MMP7,MMP8,MMP9,MMP10,MMP11,MMP12,MMP13,MMP14,MMP15,MMP16,MMP17,MMP19,MMP20,MMP24,MMP25
0,ASGGMGNK,-0.59,-0.8,-1.0,-1.35,-1.39,-1.14,-0.97,-0.87,-1.37,-1.27,-0.26,-0.6,-0.84,-0.74,-1.55,-0.72,-1.14,0.07
1,KIYNYDCE,-1.08,-1.11,-0.37,-1.07,-0.81,-0.52,-0.58,-0.77,-0.41,-0.59,-0.93,-1.0,-1.4,0.57,0.28,0.45,-0.64,-0.19
2,KIYDYDCE,-0.93,-0.39,0.17,-0.42,-0.83,-0.02,-0.39,-0.58,-0.02,-0.16,-0.76,-0.57,-0.82,0.95,0.96,0.24,-0.66,1.15
3,KIYDLDCE,-1.28,-1.29,-0.66,-0.16,-0.41,-1.04,-0.55,-0.34,-0.23,-0.58,-0.41,-0.76,-1.7,0.33,-0.98,-0.44,-1.09,-0.17
4,ASGGLGNK,-1.28,-1.0,-1.42,-1.64,-0.16,-1.07,-0.93,-1.19,-1.71,-1.35,-0.46,-1.25,-1.12,-0.31,-0.29,-0.7,-0.95,0.3


In [3]:
# 得到氨基酸序列
amino_x = data.iloc[:, 0].values.tolist()  # input数据是氨基酸序列
amino_x = [[c for c in x] for x in amino_x]

In [4]:
# 用最简单的方法对氨基酸进行编码，将字母转换为1-26的数字
amino2num = {c: i for i, c in enumerate("ACDEFGHIKLMNPQRSTVWY")}
amino_label_x = [[amino2num[amino] for amino in peptides] for peptides in amino_x]  # 用离散的label得到肽链的输入数据
amino_label_x = np.array(amino_label_x)
mmp3_y = data.iloc[:, 3].values  # 得到mmp3的y

In [7]:
# 测试xgb
kf = KFold(n_splits=5)
errors = []
for train_id, test_id in kf.split(amino_label_x):
    train_x, train_y = amino_label_x[train_id], mmp3_y[train_id]
    test_x, test_y = amino_label_x[test_id], mmp3_y[test_id]

    model = CatBoostRegressor(iterations=100, depth=5, learning_rate=1e-3)

    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    error = sklearn.metrics.mean_squared_error(test_y, pred)
    errors.append(error)
avg_error = np.average(errors)
print(avg_error)

0:	learn: 0.9957686	total: 1.6ms	remaining: 159ms
1:	learn: 0.9956485	total: 3.31ms	remaining: 162ms
2:	learn: 0.9955365	total: 4.81ms	remaining: 155ms
3:	learn: 0.9954289	total: 6.38ms	remaining: 153ms
4:	learn: 0.9953064	total: 8.04ms	remaining: 153ms
5:	learn: 0.9951878	total: 9.67ms	remaining: 151ms
6:	learn: 0.9950653	total: 11.3ms	remaining: 149ms
7:	learn: 0.9949507	total: 12.8ms	remaining: 148ms
8:	learn: 0.9948475	total: 14.3ms	remaining: 144ms
9:	learn: 0.9947285	total: 15.8ms	remaining: 142ms
10:	learn: 0.9946130	total: 17.3ms	remaining: 140ms
11:	learn: 0.9944912	total: 18.7ms	remaining: 137ms
12:	learn: 0.9943715	total: 20.1ms	remaining: 135ms
13:	learn: 0.9942555	total: 21.5ms	remaining: 132ms
14:	learn: 0.9941541	total: 22.8ms	remaining: 129ms
15:	learn: 0.9940287	total: 24.2ms	remaining: 127ms
16:	learn: 0.9939090	total: 25.6ms	remaining: 125ms
17:	learn: 0.9937899	total: 27ms	remaining: 123ms
18:	learn: 0.9936844	total: 28.6ms	remaining: 122ms
19:	learn: 0.9935821	tota

In [9]:
pred

array([-0.07461157, -0.07460934, -0.0856685 , ..., -0.08423327,
       -0.07917369, -0.1023775 ])

In [10]:
test_y

array([-1.5 , -1.25, -0.33, ..., -0.2 ,  0.33, -1.57])