# ML Experiment

We test the performance of several machine learning models on the MMP dataset. The models include linear regression, decision tree regression, XGBoost, CatBoost, and a simple neural network. We use 5-fold cross-validation to evaluate the performance of the models. The evaluation metrics include mean squared error, mean absolute error, root mean squared error, area under the curve, f1 score, precision, and recall. The results are saved in the `Result` folder.

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim


# import other files
from extract_features import load_features
from model_training import PeptidesDataLoader

# Compute device shared by the notebook: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Model Comparison

In [2]:
# Read the processed peptide table: column 0 holds the peptide sequences,
# every remaining column is used as an MMP label.
data = pd.read_csv("./Data/processed_peptides10.csv")

# peptide sequences as a plain Python list, e.g. ['Peptide1', ...]
peptides = data.iloc[:, 0].to_numpy().tolist()

# pre-extracted feature matrix; the first column returned by load_features()
# is dropped (presumably the peptide identifier -- confirm against extract_features)
features_x = load_features().iloc[:, 1:].to_numpy()

# MMP label matrix: one row per peptide, one column per MMP
all_mmp_y = data.iloc[:, 1:].to_numpy()

### Validate ML models

In [3]:
# define the function to validate
def validate(model: object, x: np.ndarray, y: np.ndarray, n_splits: int = 5, random_state: int = 33) -> np.ndarray:
    """Cross-validate ``model`` independently on every target column of ``y``.

    The rows are split with a shuffled K-fold. Within each fold the same
    ``model`` object is re-fitted once per target column on the training rows
    and then asked to predict the held-out rows.

    Args:
        model: Any object exposing ``fit(x, y)`` and ``predict(x)``.
        x: Feature matrix, indexed by row.
        y: 2-D target matrix with the same number of rows as ``x``; each
            column is treated as a separate regression target.
        n_splits: Number of folds (defaults to the original hard-coded 5).
        random_state: Seed of the fold shuffling (defaults to the original 33).

    Returns:
        Array of shape ``(2,) + y.shape``: index 0 holds the out-of-fold
        predictions and index 1 the matching true values.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same number of rows.
    """
    if x.shape[0] != y.shape[0]:
        raise ValueError(
            f"x and y must have the same number of rows, got {x.shape[0]} and {y.shape[0]}"
        )

    validation = np.zeros((2,) + tuple(y.shape))  # [0] = prediction, [1] = truth
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    for train_id, test_id in tqdm(kf.split(x), desc="Testing on all MMPs", total=n_splits):
        # The feature slices do not depend on the target column, so build them once per fold.
        train_x, test_x = x[train_id], x[test_id]
        for mmp_i in range(y.shape[1]):
            target = y[:, mmp_i]

            model.fit(train_x, target[train_id])

            # record prediction and true labels for the held-out rows
            validation[0, test_id, mmp_i] = model.predict(test_x)
            validation[1, test_id, mmp_i] = target[test_id]
    return validation

In [4]:
# Linear regression: cross-validate on every MMP target and store the
# stacked (prediction, truth) array on disk.
lr_model = LinearRegression()
lr_validation = validate(model=lr_model, x=features_x, y=all_mmp_y)
np.save(file="./Result/lr_validation.npy", arr=lr_validation)

Testing on all MMPs: 100%|██████████| 5/5 [04:08<00:00, 49.70s/it]


In [28]:
# DTR: decision tree regressor, cross-validated on every MMP target.
# random_state is fixed because scikit-learn permutes the features at each
# split, so equally good splits could otherwise be resolved differently
# between runs and the saved results would not be reproducible.
dtr_model = DecisionTreeRegressor(criterion="squared_error", max_depth=10, random_state=33)
dtr_validation = validate(dtr_model, features_x, all_mmp_y)
np.save("./Result/dtr_validation.npy", dtr_validation)

Testing on all MMPs: 100%|██████████| 5/5 [00:47<00:00,  9.53s/it]


In [5]:
# XGBoost: 100 boosted trees of depth <= 10, cross-validated on every MMP
# target; the stacked (prediction, truth) array is written to disk.
xgb_model = XGBRegressor(n_estimators=100, max_depth=10)
xgb_validation = validate(model=xgb_model, x=features_x, y=all_mmp_y)
np.save(file="./Result/xgb_validation.npy", arr=xgb_validation)

Testing on all MMPs: 100%|██████████| 5/5 [02:11<00:00, 26.27s/it]


In [4]:
# CatBoost: 1000 iterations, depth 10, RMSE objective, training log silenced.
# Cross-validated on every MMP target; the result array is written to disk.
cat_model = CatBoostRegressor(
    iterations=1000,
    depth=10,
    learning_rate=0.1,
    loss_function="RMSE",
    verbose=False,
)
cat_validation = validate(model=cat_model, x=features_x, y=all_mmp_y)
np.save(file="./Result/cat_validation.npy", arr=cat_validation)

Testing on all MMPs: 100%|██████████| 5/5 [1:38:47<00:00, 1185.47s/it]


In [5]:
# Neural Network
class NN(nn.Module):
    """Three-layer fully connected regressor with ``fit``/``predict`` helpers.

    The network is ``Linear -> LeakyReLU -> Dropout(0.5)`` twice, followed by
    a final ``Linear`` layer that produces ``output_size`` values per sample.
    All tensors and parameters are placed on ``self.device`` (CUDA when
    available, otherwise the CPU), so the class does not depend on any
    notebook-level ``device`` variable.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of both hidden layers.
        output_size: Number of regression outputs per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x):
        """Map a batch of feature vectors to a batch of output vectors."""
        output = F.leaky_relu(self.fc1(x))  # use leaky_relu activation
        output = self.dropout1(output)
        output = F.leaky_relu(self.fc2(output))
        output = self.dropout2(output)
        return self.fc3(output)

    def fit(self, x: np.ndarray, y: np.ndarray, epochs: int = 100, lr: float = 0.001, batch_size: int = 512) -> None:
        """Train the network with Adam on a mean-squared-error loss.

        Args:
            x: Feature matrix (NumPy array), one row per sample.
            y: Target matrix (NumPy array), one row per sample.
            epochs: Number of passes over the data.
            lr: Adam learning rate.
            batch_size: Mini-batch size handed to ``PeptidesDataLoader``.
        """
        self.to(self.device)
        self.train()
        x = torch.from_numpy(x).float()
        y = torch.from_numpy(y).float()
        # The loader's first argument is filled with placeholders: the peptide
        # entries it yields back are not used during training.
        dataloader = PeptidesDataLoader([0] * x.shape[0], x, y, batch_size=batch_size, shuffle=True)
        optimizer = optim.Adam(self.parameters(), lr=lr)
        criterion = nn.MSELoss()
        for _ in range(epochs):
            for _, batch_x, batch_y in dataloader:
                optimizer.zero_grad()
                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
                # call the module (not .forward) so predict() and fit() share one code path
                loss = criterion(self(batch_x), batch_y)
                loss.backward()
                optimizer.step()

    def predict(self, x, batch_size: int = 8192):
        """Return predictions for ``x`` as a NumPy array, one row per sample.

        Runs in evaluation mode (dropout disabled) without tracking gradients.

        Args:
            x: Feature matrix (NumPy array), one row per sample.
            batch_size: Number of samples per forward pass.

        Returns:
            NumPy array with one row per input sample; an empty
            ``(0, output_size)`` array when the loader yields no batches.
        """
        self.to(self.device)
        self.eval()
        x = torch.from_numpy(x).float()
        # The loader requires labels; pass zeros of the same shape/dtype as x.
        # They are discarded when iterating below.
        dummy_y = np.zeros(tuple(x.shape), dtype=np.float32)
        dataloader = PeptidesDataLoader([0] * x.shape[0], x, dummy_y, batch_size=batch_size, shuffle=False)
        pred = []
        with torch.no_grad():
            for _, batch_x, _ in dataloader:
                output = self(batch_x.to(self.device))
                pred.append(output.to("cpu").detach().numpy())
        if not pred:
            # np.concatenate rejects an empty list; return an empty result instead.
            return np.zeros((0, self.fc3.out_features), dtype=np.float32)
        return np.concatenate(pred, axis=0)

# test the performance of NN
# Fix the torch seed so weight initialisation and dropout are repeatable between runs.
# NOTE(review): the RNG used by PeptidesDataLoader's shuffle is not visible here;
# seed it as well if it does not draw from torch.
torch.manual_seed(33)

n_folds = 5
kf = KFold(n_splits=n_folds, random_state=33, shuffle=True)
nn_validation = np.zeros((2,) + tuple(all_mmp_y.shape))  # [0] = prediction, [1] = truth
for train_id, test_id in tqdm(kf.split(features_x), desc="Testing on all MMPs", total=n_folds):
    # a fresh network per fold; it predicts all MMP targets at once
    nn_model = NN(features_x.shape[1], 512, all_mmp_y.shape[1])
    train_x, train_y = features_x[train_id], all_mmp_y[train_id]
    test_x, test_y = features_x[test_id], all_mmp_y[test_id]
    nn_model.fit(train_x, train_y)
    pred = nn_model.predict(test_x)
    nn_validation[0, test_id, :] = pred
    nn_validation[1, test_id, :] = test_y
np.save("./Result/nn_validation.npy", nn_validation)

Testing on all MMPs: 100%|██████████| 5/5 [00:58<00:00, 11.63s/it]
