# Result Analysis

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, mean_absolute_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# 导入其他文件
from extract_features import load_features
from models import BioNN, BioDeepNN, BioResNet

# Notebook-wide constants
SAVE = True  # presumably toggles saving of results in later cells — not read in this section, confirm
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Model Comparison

In [2]:
# Load and slice the data
# The processed peptide table: the first column holds the peptide sequences,
# every remaining column is one MMP target.
data = pd.read_csv("./Data/processed_peptides10.csv")

# Amino-acid sequences of the peptides as a plain list of strings
peptides = data.iloc[:, 0].values.tolist()

# MMP target matrix: all columns after the sequence column
all_mmp_y = data.iloc[:, 1:].to_numpy()

# Pre-extracted feature matrix; the first column returned by load_features()
# is dropped (presumably an identifier/sequence column — confirm in extract_features)
features_x = load_features().iloc[:, 1:].to_numpy()

In [3]:
# Cross-validation helper used to score a model on every MMP target
def validate(model: object, x: np.ndarray, y: np.ndarray,
             n_splits: int = 5, threshold: float = 1.65, random_state: int = 33):
    """Evaluate ``model`` on each column of ``y`` with shuffled K-fold cross-validation.

    For every fold and every target column, ``model.fit`` is called on the
    training rows and ``model.predict`` on the held-out rows. The predictions
    are scored both as a regression and, after thresholding, as a binary
    classification.

    Args:
        model: Estimator exposing ``fit(x, y)`` and ``predict(x)``. The same
            object is fitted again for every (fold, target) pair.
        x: Feature matrix, one row per sample.
        y: Target matrix with the same rows as ``x`` and one column per target.
        n_splits: Number of cross-validation folds.
        threshold: A target or prediction strictly greater than this value is
            treated as the positive class.
        random_state: Seed for the fold shuffling.

    Returns:
        A tuple ``(rg_errors, cl_errors)`` where

        * ``rg_errors`` has shape ``(n_splits, y.shape[1], 3)`` holding
          MSE, MAE and RMSE (in that order), and
        * ``cl_errors`` has shape ``(n_splits, y.shape[1], 4)`` holding
          ROC-AUC, F1, precision and recall (in that order).
    """
    n_targets = y.shape[1]
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    rg_errors = np.zeros((n_splits, n_targets, 3))
    cl_errors = np.zeros((n_splits, n_targets, 4))  # metrics: auc, f1, precision, recall

    # Split on the `x` argument rather than the notebook-global `features_x`,
    # so the fold indices always match the data that was actually passed in.
    folds = tqdm(enumerate(kf.split(x)), desc="Testing on all MMPs", total=n_splits)
    for i, (train_id, test_id) in folds:
        for mmp_i in range(n_targets):
            target = y[:, mmp_i]
            train_x, train_y = x[train_id], target[train_id]
            test_x, test_y = x[test_id], target[test_id]

            model.fit(train_x, train_y)
            pred = model.predict(test_x)

            # Regression metrics
            mse = mean_squared_error(test_y, pred)
            rg_errors[i, mmp_i, 0] = mse
            rg_errors[i, mmp_i, 1] = mean_absolute_error(test_y, pred)
            rg_errors[i, mmp_i, 2] = np.sqrt(mse)

            # Classification metrics: binarise targets and predictions at `threshold`
            cl_pred = pred > threshold
            cl_test_y = test_y > threshold
            # ROC-AUC is a ranking metric, so it is given the continuous
            # predictions; thresholded labels would reduce the ROC curve to a
            # single operating point.
            cl_errors[i, mmp_i, 0] = roc_auc_score(cl_test_y, pred)
            cl_errors[i, mmp_i, 1] = f1_score(cl_test_y, cl_pred)
            cl_errors[i, mmp_i, 2] = precision_score(cl_test_y, cl_pred)
            cl_errors[i, mmp_i, 3] = recall_score(cl_test_y, cl_pred)
    return rg_errors, cl_errors

In [4]:
# Ridge regression, i.e. linear regression with an L2 penalty (alpha=1.0),
# cross-validated on every MMP target
lr_model = Ridge(alpha=1.0)
lr_rg_error, lr_cl_error = validate(model=lr_model, x=features_x, y=all_mmp_y)

Testing on all MMPs: 100%|██████████| 5/5 [02:42<00:00, 32.42s/it]


In [5]:
# svr
svt_model = 

SyntaxError: invalid syntax (1606605017.py, line 2)