# Peptides10

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import pickle as pkl

import sklearn
from sklearn.model_selection import KFold

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from TorchCRF import CRF
import tqdm
import blosum as bl

# 导入其他文件
from extract_features import load_features

SAVE = True

  warn(msg)


## 导入数据

In [2]:
# import data
data = pd.read_excel("./Data/peptides10.xlsx")  # load data
data.iloc[:, 0] = data.iloc[:, 0].map(lambda x: x.strip()).map(lambda x: x[1:-1])  # 删除首尾的氨基酸和空格
data = data.iloc[:, :-1]  # 删除最后一列
data.head()

  warn(msg)


Unnamed: 0.1,Unnamed: 0,MMP1,MMP2,MMP3,MMP7,MMP8,MMP9,MMP10,MMP11,MMP12,MMP13,MMP14,MMP15,MMP16,MMP17,MMP19,MMP20,MMP24,MMP25
0,ASGGMGNK,-0.59,-0.8,-1.0,-1.35,-1.39,-1.14,-0.97,-0.87,-1.37,-1.27,-0.26,-0.6,-0.84,-0.74,-1.55,-0.72,-1.14,0.07
1,KIYNYDCE,-1.08,-1.11,-0.37,-1.07,-0.81,-0.52,-0.58,-0.77,-0.41,-0.59,-0.93,-1.0,-1.4,0.57,0.28,0.45,-0.64,-0.19
2,KIYDYDCE,-0.93,-0.39,0.17,-0.42,-0.83,-0.02,-0.39,-0.58,-0.02,-0.16,-0.76,-0.57,-0.82,0.95,0.96,0.24,-0.66,1.15
3,KIYDLDCE,-1.28,-1.29,-0.66,-0.16,-0.41,-1.04,-0.55,-0.34,-0.23,-0.58,-0.41,-0.76,-1.7,0.33,-0.98,-0.44,-1.09,-0.17
4,ASGGLGNK,-1.28,-1.0,-1.42,-1.64,-0.16,-1.07,-0.93,-1.19,-1.71,-1.35,-0.46,-1.25,-1.12,-0.31,-0.29,-0.7,-0.95,0.3


In [3]:
# 得到氨基酸序列
amino_x = data.iloc[:, 0].values.tolist()  # input数据是氨基酸序列
amino_x = [[c for c in x] for x in amino_x]

mmp3_y = data.iloc[:, 3].values  # 得到mmp3的y

In [4]:
# 用最简单的方法对氨基酸进行编码，将字母转换为1-26的数字
amino2num = {c: i for i, c in enumerate("ACDEFGHIKLMNPQRSTVWY")}
amino_label_x = [[amino2num[amino] for amino in peptides] for peptides in amino_x]  # 用离散的label得到肽链的输入数据
amino_label_x = np.array(amino_label_x)

In [4]:
features_x = load_features().iloc[:, 1:].values

## 构建模型

In [21]:
# 测试cat(粗调) 1h
def kf_test(lr: float, iter: int) -> float:
    kf = KFold(n_splits=5)
    errors = []
    for train_id, test_id in kf.split(features_x):
        train_x, train_y = features_x[train_id], mmp3_y[train_id]
        test_x, test_y = features_x[test_id], mmp3_y[test_id]

        model = CatBoostRegressor(iterations=iter, depth=10, learning_rate=lr, loss_function="RMSE")

        model.fit(train_x, train_y)
        pred = model.predict(test_x)
        error = sklearn.metrics.mean_squared_error(test_y, pred)
        errors.append(error)
    avg_error = np.average(errors)
    return avg_error

iterations = [500, 1000, 1500]
lrs = [0.01, 0.1, 1]
errors = dict()
for iteration in iterations:
    for lr in lrs:
        avg_error = kf_test(lr, iteration)
        errors[(lr, iteration)] = avg_error

0:	learn: 0.9935694	total: 48ms	remaining: 24s
1:	learn: 0.9912648	total: 97.7ms	remaining: 24.3s
2:	learn: 0.9892149	total: 149ms	remaining: 24.7s
3:	learn: 0.9868096	total: 201ms	remaining: 24.9s
4:	learn: 0.9847176	total: 254ms	remaining: 25.1s
5:	learn: 0.9826460	total: 307ms	remaining: 25.2s
6:	learn: 0.9806320	total: 358ms	remaining: 25.2s
7:	learn: 0.9783633	total: 421ms	remaining: 25.9s
8:	learn: 0.9762318	total: 476ms	remaining: 26s
9:	learn: 0.9743956	total: 531ms	remaining: 26s
10:	learn: 0.9723617	total: 582ms	remaining: 25.9s
11:	learn: 0.9702653	total: 634ms	remaining: 25.8s
12:	learn: 0.9681831	total: 686ms	remaining: 25.7s
13:	learn: 0.9660479	total: 739ms	remaining: 25.6s
14:	learn: 0.9641647	total: 794ms	remaining: 25.7s
15:	learn: 0.9621433	total: 847ms	remaining: 25.6s
16:	learn: 0.9602725	total: 902ms	remaining: 25.6s
17:	learn: 0.9585462	total: 954ms	remaining: 25.6s
18:	learn: 0.9567874	total: 1.01s	remaining: 25.5s
19:	learn: 0.9549930	total: 1.06s	remaining: 25

In [22]:
errors

{(0.01, 500): 0.6562976223926196,
 (0.1, 500): 0.5540895155266845,
 (1, 500): 0.7918202606685946,
 (0.01, 1000): 0.6077085439090827,
 (0.1, 1000): 0.5511657178360376,
 (1, 1000): 0.8021398958193586,
 (0.01, 1500): 0.5810863866378174,
 (0.1, 1500): 0.5532692812010497,
 (1, 1500): 0.8025975024267054}

In [23]:
# 测试cat
kf = KFold(n_splits=5)
errors = []
for train_id, test_id in kf.split(features_x):
    train_x, train_y = features_x[train_id], mmp3_y[train_id]
    test_x, test_y = features_x[test_id], mmp3_y[test_id]

    model = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.1, loss_function="RMSE")

    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    error = sklearn.metrics.mean_squared_error(test_y, pred)
    errors.append(error)
avg_error = np.average(errors)
print(avg_error)

0:	learn: 0.9734561	total: 63.9ms	remaining: 1m 3s
1:	learn: 0.9546981	total: 124ms	remaining: 1m 2s
2:	learn: 0.9392692	total: 183ms	remaining: 1m
3:	learn: 0.9247670	total: 243ms	remaining: 1m
4:	learn: 0.9122572	total: 303ms	remaining: 1m
5:	learn: 0.8998286	total: 365ms	remaining: 1m
6:	learn: 0.8906351	total: 423ms	remaining: 1m
7:	learn: 0.8809426	total: 482ms	remaining: 59.8s
8:	learn: 0.8710687	total: 542ms	remaining: 59.7s
9:	learn: 0.8635751	total: 601ms	remaining: 59.5s
10:	learn: 0.8568961	total: 662ms	remaining: 59.5s
11:	learn: 0.8505684	total: 722ms	remaining: 59.5s
12:	learn: 0.8436176	total: 783ms	remaining: 59.4s
13:	learn: 0.8384545	total: 845ms	remaining: 59.5s
14:	learn: 0.8330639	total: 905ms	remaining: 59.5s
15:	learn: 0.8285515	total: 966ms	remaining: 59.4s
16:	learn: 0.8245972	total: 1.03s	remaining: 59.6s
17:	learn: 0.8210680	total: 1.09s	remaining: 59.6s
18:	learn: 0.8167802	total: 1.16s	remaining: 59.6s
19:	learn: 0.8123308	total: 1.22s	remaining: 59.7s
20:	

In [10]:
test_y

array([-1.5 , -1.25, -0.33, ..., -0.2 ,  0.33, -1.57])

In [11]:
pred

array([-0.10985887, -0.08802654, -0.09440269, ..., -0.07117601,
       -0.06751979, -0.06942579])