# Predicting peptide antibiofilm potential
Reference article: https://github.com/davidanastasiu/antibiofilm

In [None]:
# Packages for analysis
import pandas as pd
import numpy as np
import subprocess
import sys
import os
from pathlib import Path

# Pickle package
import pickle

# Packages for visuals
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)
#Ml
from sklearn import svm, datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV



# Dataset for Exp 1 - peptides linked with Homo sapiens immune system

In [None]:
# Load the Exp 1 peptide table (cp1252-encoded CSV) and round numeric values to 3 decimals.
peptides_raw = pd.read_csv('meus_peptides.csv', encoding='cp1252')
peptides = peptides_raw.round(3)

# Drop every row that has a missing value anywhere from the fourth column onwards.
columns_from_fourth = peptides.columns[3:]
peptides = peptides.dropna(subset=columns_from_fourth)
peptides

# The data contain more negative than positive samples.
# Antibiofilm capacity is encoded as Type == 1.

In [None]:
# Number of peptides in the positive class (Type == 1, i.e. antibiofilm capacity).
is_positive = peptides['Type'] == 1
count = sum(is_positive)
count


In [None]:
# X, i.e. the features or attributes
# The whole table is converted to a NumPy array (every column, not only the descriptors);
# later cells slice it with [:, 3:] before scaling and fitting.
# NOTE(review): confirm that the 'Type' label column is one of the first three columns,
# otherwise the [:, 3:] slices would pass the label to the models as a feature.
characters=peptides.to_numpy() 
print(characters)
# One row per peptide.

In [None]:
# y, i.e. the class attribute where 0=negative and 1=positive
# Taken from the 'Type' column of the same dataframe, so it is row-aligned with `characters`.
type_label=peptides['Type'].to_numpy()
print(type_label)
# Printed for a quick visual check of the label values.

In [None]:
# Conventional names used by the rest of the notebook.
# X is the full table as an array (all columns; cells below take X[:, 3:] as model input),
# Y is the 'Type' label vector.
X=characters #all columns of the table
Y=type_label #0 or 1

# Split dataset 80/20

Train and test splitting such that 80% of the dataset goes to training and 20% to test.

In [None]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    stratify=Y,
    random_state=42,
)

In [None]:
# Shapes of the held-out test features and labels; the row counts should match.
print(X_te.shape) #to confirm the correct division
print(y_te.shape) 

# Normalization

In [None]:
# Scale every feature to the [0, 1] range (feature minimum -> 0, feature maximum -> 1).
# This single scaler instance is shared by all model sections below and is refitted
# each time fit_transform is called, so its state always reflects the most recent fit.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)) 

# Stratified k-fold

In [None]:
# Stratified 10-fold splitter with shuffling and a fixed seed; reused by every
# cross-validation loop below so that all models see the same folds.
# Note: despite the name `sss`, this is a StratifiedKFold, not a StratifiedShuffleSplit.
sss = StratifiedKFold(n_splits =10, random_state=42, shuffle=True) #

# SVM

### Optimize hyperparameters

In [None]:
# Hyperparameter optimisation for the SVM (5-fold CV, F1 score).
# The scaler is part of the pipeline so that, inside every grid-search fold, it is fitted
# on the training part only and the SVM is tuned on the same 0-1 scaled features that the
# cross-validation and hold-out cells below evaluate on.
svm_classifier = SVC()
svm_pipeline = make_pipeline(preprocessing.MinMaxScaler(feature_range=(0, 1)), svm_classifier)

# make_pipeline names the SVC step 'svc', hence the 'svc__' prefixes.
# gamma is only searched together with the RBF kernel: the linear kernel does not use it,
# so crossing the two would only repeat identical fits.
parameters = [
    {'svc__kernel': ['linear'], 'svc__C': [0.1, 1, 10, 100]},
    {'svc__kernel': ['rbf'], 'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.1, 0.01, 0.001]},
]
grid_search = GridSearchCV(svm_pipeline, parameters, scoring='f1', cv=5)
grid_search.fit(X_tr[:, 3:], y_tr)

# Strip the pipeline step prefix so the result reads like plain SVC keyword arguments
# ('C', 'kernel' and, for the RBF kernel, 'gamma').
best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
best_params


In [None]:
# SVM evaluated in the cells below, with fixed hyperparameters.
# probability=True enables predict_proba, which the ROC cell calls.
# NOTE(review): the values are hard-coded rather than read from `best_params`; presumably
# they were copied from a grid-search run - confirm they still match its output.
# NOTE(review): gamma is a coefficient of the rbf/poly/sigmoid kernels, so it should have
# no effect with kernel='linear'.
SS_classifier = svm.SVC(kernel='linear', C=0.1, gamma=0.01,probability=True) #hyperparameters

In [None]:
# Stratified k-fold cross-validation of the SVM on the training split (folds come from `sss`).
# Accuracy, MCC and F1 of every fold are collected and summarised after the loop.
scores_ss = []
mccs_ss = []
f1s_ss = []
for train_index, test_index in sss.split(X_tr, y_tr):
    X_SS_train, X_SS_test = X_tr[train_index, 3:], X_tr[test_index, 3:]
    y_SS_train, y_SS_test = y_tr[train_index], y_tr[test_index]

    # Class balance of this fold's training part, counted on the fold itself rather than
    # on the whole of y_tr (which would print the same two numbers in every fold).
    positive = int(np.count_nonzero(y_SS_train))
    negative = int(y_SS_train.shape[0] - positive)
    print('positive sample', positive)
    print('negative sample', negative)

    # Scaler statistics come from the training part only, then are applied to the validation part.
    X_SS_train = min_max_scaler.fit_transform(X_SS_train)
    X_SS_test = min_max_scaler.transform(X_SS_test)

    SS_classifier.fit(X_SS_train, y_SS_train)
    scores_ss.append(SS_classifier.score(X_SS_test, y_SS_test))
    ypred = SS_classifier.predict(X_SS_test)
    mcc = matthews_corrcoef(y_SS_test, ypred)
    mccs_ss.append(mcc)
    f1 = f1_score(y_SS_test, ypred)
    f1s_ss.append(f1)

# Each summary line shows min, max and standard deviation over the folds.
# Averages use np.mean so they stay correct if the number of folds in `sss` changes.
print("*************************************")
print("Scores: ",np.min(scores_ss), np.max(scores_ss), np.std(scores_ss))
print("F1s: ", np.min(f1s_ss), np.max(f1s_ss), np.std(f1s_ss))
print("MCCs: ", np.min(mccs_ss), np.max(mccs_ss), np.std(mccs_ss))
print ( "avg cross-validation accuracy:", np.mean(scores_ss))
print ( "avg cross-validation f1:", np.mean(f1s_ss))
print ( "avg cross-validation mcc:", np.mean(mccs_ss))
print("*************************************")

### Train and see the performance of SVM model in train dataset

In [None]:
# Refit the scaler and the SVM on the whole training split, then score the model on that same data.
X_new = min_max_scaler.fit_transform(X_tr[:, 3:])
SS_classifier.fit(X_new, y_tr)
y_tr_predict = SS_classifier.predict(X_new)

print('f1 on Train set: ', f1_score(y_tr, y_tr_predict))
print('MCC on Train set: ', matthews_corrcoef(y_tr, y_tr_predict))

# ravel() yields the cells in the order tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
train_confusion = confusion_matrix(y_tr, y_tr_predict)
tn, fp, fn, tp = train_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Train set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Train set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Train set: ', accuracy)

### Test and see the performance of SVM model in test dataset

In [None]:
# Hold-out evaluation of the SVM. The test features are transformed with the scaler in its
# current state (as left by the preceding training cell) and are never used for fitting.
X_te_new = min_max_scaler.transform(X_te[:, 3:])
y_SS_pred = SS_classifier.predict(X_te_new)

print("*************************************")
print('f1 on Test set: ', f1_score(y_te, y_SS_pred))
print('MCC on Test set: ', matthews_corrcoef(y_te, y_SS_pred))

# ravel() yields tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
test_confusion = confusion_matrix(y_te, y_SS_pred)
tn, fp, fn, tp = test_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Test set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Test set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Test set: ', accuracy)
precision = tp / (tp + fp)
print("Precision on Test set: ", precision)



### Confusion matrix

In [None]:
# Confusion matrix of the SVM on the hold-out test set, printed and drawn as a heatmap
# (rows: actual class, columns: predicted class).
confusion = confusion_matrix(y_te, y_SS_pred)
tn, fp, fn, tp = confusion.ravel()

print("Confusion Matrix:")
print(confusion)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Greens', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Random Forest


### Optimize hyperparameters

In [None]:
# Grid search over the random forest's size, depth and split threshold (5-fold CV, F1 score).
RF_classifier = RandomForestClassifier(random_state=42)
parameters = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

grid_search = GridSearchCV(estimator=RF_classifier, param_grid=parameters, scoring='f1', cv=5)
grid_search.fit(X_tr[:, 3:], y_tr)

# Best parameter combination found by the search.
best_params_RF = grid_search.best_params_
best_params_RF


In [None]:
# Random forest evaluated in the cells below, with fixed hyperparameters and a fixed seed.
# NOTE(review): the values are hard-coded rather than read from `best_params_RF`; presumably
# they were copied from a grid-search run - confirm they still match its output.
RF_classifier = RandomForestClassifier(random_state=42, max_depth=None, min_samples_split=5, n_estimators=100)

In [None]:
# Stratified k-fold cross-validation of the random forest on the training split (folds come from `sss`).
# Accuracy, MCC and F1 of every fold are collected and summarised after the loop.
scores_rf = []
mccs_rf = []
f1s_rf = []

for train_index, test_index in sss.split(X_tr, y_tr):
    X_RF_train, X_RF_test = X_tr[train_index, 3:], X_tr[test_index, 3:]
    y_RF_train, y_RF_test = y_tr[train_index], y_tr[test_index]

    # Class balance of this fold's training part, counted on the fold itself rather than
    # on the whole of y_tr (which would print the same two numbers in every fold).
    positive = int(np.count_nonzero(y_RF_train == 1))
    negative = int(y_RF_train.shape[0] - positive)
    print('Positive samples:', positive)
    print('Negative samples:', negative)

    # Scaler statistics come from the training part only, then are applied to the validation part.
    X_RF_train_new = min_max_scaler.fit_transform(X_RF_train)
    X_RF_test_new = min_max_scaler.transform(X_RF_test)

    RF_classifier.fit(X_RF_train_new, y_RF_train)
    scores_rf.append(RF_classifier.score(X_RF_test_new, y_RF_test))
    y_pred = RF_classifier.predict(X_RF_test_new)
    mcc = matthews_corrcoef(y_RF_test, y_pred)
    mccs_rf.append(mcc)
    f1 = f1_score(y_RF_test, y_pred)
    f1s_rf.append(f1)

# Each summary line shows min, max and standard deviation over the folds.
# Averages use np.mean so they stay correct if the number of folds in `sss` changes.
print("*************************************")
print("Scores: ", np.min(scores_rf), np.max(scores_rf), np.std(scores_rf))
print("F1s: ", np.min(f1s_rf), np.max(f1s_rf), np.std(f1s_rf))
print("MCCs: ", np.min(mccs_rf), np.max(mccs_rf), np.std(mccs_rf))
print("avg cross-validation accuracy:", np.mean(scores_rf))
print("avg cross-validation f1:", np.mean(f1s_rf))
print("avg cross-validation mcc:", np.mean(mccs_rf))
print("*************************************")



### Train and see the performance of RF model in train dataset

In [None]:
# Refit the scaler and the random forest on the whole training split, then score on that same data.
X_new = min_max_scaler.fit_transform(X_tr[:, 3:])
RF_classifier.fit(X_new, y_tr)
y_tr_predict = RF_classifier.predict(X_new)

print('f1 on Train set: ', f1_score(y_tr, y_tr_predict))
print('MCC on Train set: ', matthews_corrcoef(y_tr, y_tr_predict))

# ravel() yields the cells in the order tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
train_confusion = confusion_matrix(y_tr, y_tr_predict)
tn, fp, fn, tp = train_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Train set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Train set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Train set: ', accuracy)


### Test and see the performance of RF model in test dataset


In [None]:
# Hold-out evaluation of the random forest. The test features are transformed with the scaler
# in its current state (as left by the preceding training cell) and are never used for fitting.
X_te_new = min_max_scaler.transform(X_te[:, 3:])
y_RF_pred = RF_classifier.predict(X_te_new)

print("*************************************")
print('f1 on Test set: ', f1_score(y_te, y_RF_pred))
print('MCC on Test set: ', matthews_corrcoef(y_te, y_RF_pred))

# ravel() yields tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
test_confusion = confusion_matrix(y_te, y_RF_pred)
tn, fp, fn, tp = test_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Test set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Test set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Test set: ', accuracy)
precision = tp / (tp + fp)
print("Precision on Test set: ", precision)


### Confusion matrix

In [None]:
# Confusion matrix of the random forest on the hold-out test set, printed and drawn as a heatmap
# (rows: actual class, columns: predicted class).
confusion = confusion_matrix(y_te, y_RF_pred)
tn, fp, fn, tp = confusion.ravel()

print("Confusion Matrix:")
print(confusion)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Naive Bayes

### Optimize hyperparameters

In [None]:
# Grid search over the Gaussian naive Bayes variance-smoothing term (5-fold CV, F1 score).
NB_classifier = GaussianNB()
parameters = dict(var_smoothing=[1e-12, 1e-10, 1e-9, 1e-8, 1e-7, 1e-5])

grid_search = GridSearchCV(estimator=NB_classifier, param_grid=parameters, scoring='f1', cv=5)
grid_search.fit(X_tr[:, 3:], y_tr)

# Best parameter combination found by the search.
best_params_NB = grid_search.best_params_
best_params_NB


In [None]:
# Gaussian naive Bayes with a fixed smoothing value.
# NOTE(review): hard-coded rather than read from `best_params_NB`; presumably copied from a
# grid-search run - confirm it still matches its output.
NB_classifier = GaussianNB(var_smoothing=1e-12)

# Stratified k-fold cross-validation on the training split (folds come from `sss`).
# Accuracy, MCC and F1 of every fold are collected and summarised after the loop.
scores_nb = []
mccs_nb = []
f1s_nb = []

for train_index, test_index in sss.split(X_tr, y_tr):
    X_NB_train, X_NB_test = X_tr[train_index, 3:], X_tr[test_index, 3:]
    y_NB_train, y_NB_test = y_tr[train_index], y_tr[test_index]

    # Class balance of this fold's training part, counted on the fold itself rather than
    # on the whole of y_tr (which would print the same two numbers in every fold).
    positive = int(np.count_nonzero(y_NB_train == 1))
    negative = int(y_NB_train.shape[0] - positive)
    print('Positive samples:', positive)
    print('Negative samples:', negative)

    # Scaler statistics come from the training part only, then are applied to the validation part.
    X_NB_train_new = min_max_scaler.fit_transform(X_NB_train)
    X_NB_test_new = min_max_scaler.transform(X_NB_test)

    NB_classifier.fit(X_NB_train_new, y_NB_train)
    scores_nb.append(NB_classifier.score(X_NB_test_new, y_NB_test))
    y_pred = NB_classifier.predict(X_NB_test_new)
    mcc = matthews_corrcoef(y_NB_test, y_pred)
    mccs_nb.append(mcc)
    f1 = f1_score(y_NB_test, y_pred)
    f1s_nb.append(f1)

# Each summary line shows min, max and standard deviation over the folds.
# Averages use np.mean so they stay correct if the number of folds in `sss` changes.
print("*************************************")
print("Scores: ", np.min(scores_nb), np.max(scores_nb), np.std(scores_nb))
print("F1s: ", np.min(f1s_nb), np.max(f1s_nb), np.std(f1s_nb))
print("MCCs: ", np.min(mccs_nb), np.max(mccs_nb), np.std(mccs_nb))
print("avg cross-validation accuracy:", np.mean(scores_nb))
print("avg cross-validation f1:", np.mean(f1s_nb))
print("avg cross-validation mcc:", np.mean(mccs_nb))
print("*************************************")

### Train and see the performance of NB model in train dataset


In [None]:
# Refit the scaler and the naive Bayes model on the whole training split, then score on that same data.
X_new = min_max_scaler.fit_transform(X_tr[:, 3:])
NB_classifier.fit(X_new, y_tr)
y_tr_predict = NB_classifier.predict(X_new)

print('f1 on Train set: ', f1_score(y_tr, y_tr_predict))
print('MCC on Train set: ', matthews_corrcoef(y_tr, y_tr_predict))

# ravel() yields the cells in the order tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
train_confusion = confusion_matrix(y_tr, y_tr_predict)
tn, fp, fn, tp = train_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Train set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Train set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Train set: ', accuracy)

### Test and see the performance of NB model in test dataset


In [None]:
# Hold-out predictions of the naive Bayes model. The test features are transformed with the
# scaler in its current state (as left by the preceding training cell).
X_te_new = min_max_scaler.transform(X_te[:, 3:])
y_NB_pred = NB_classifier.predict(X_te_new)


In [None]:
# Hold-out metrics of the naive Bayes model, derived from the predictions of the previous cell.
print("*************************************")
print('f1 on Test set: ', f1_score(y_te, y_NB_pred))
print('MCC on Test set: ', matthews_corrcoef(y_te, y_NB_pred))

# ravel() yields tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
test_confusion = confusion_matrix(y_te, y_NB_pred)
tn, fp, fn, tp = test_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Test set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Test set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Test set: ', accuracy)
precision = tp / (tp + fp)
print("Precision on Test set: ", precision)

### Confusion matrix

In [None]:
# Confusion matrix of the naive Bayes model on the hold-out test set, printed and drawn as a heatmap
# (rows: actual class, columns: predicted class).
confusion = confusion_matrix(y_te, y_NB_pred)
tn, fp, fn, tp = confusion.ravel()

print("Confusion Matrix:")
print(confusion)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Decision tree

### Optimize hyperparameters


In [None]:
# Grid search over depth, split/leaf sizes and split criterion of the decision tree (5-fold CV, F1 score).
# Note: `dt_classifier` (lower case) is the search's base estimator; the model evaluated later is `DT_classifier`.
dt_classifier = DecisionTreeClassifier(random_state=42)
parameters = dict(
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=parameters, scoring='f1', cv=5)
grid_search.fit(X_tr[:, 3:], y_tr)

# Best parameter combination found by the search.
best_params_DT = grid_search.best_params_
best_params_DT


In [None]:
# Decision tree with fixed hyperparameters and a fixed seed.
# NOTE(review): hard-coded rather than read from `best_params_DT`; presumably copied from a
# grid-search run - confirm they still match its output.
DT_classifier = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=5, min_samples_leaf=4, min_samples_split=2)

# Stratified k-fold cross-validation on the training split (folds come from `sss`).
# Accuracy, MCC and F1 of every fold are collected and summarised after the loop.
scores_dt = []
mccs_dt = []
f1s_dt = []

for train_index, test_index in sss.split(X_tr, y_tr):
    X_DT_train, X_DT_test = X_tr[train_index, 3:], X_tr[test_index, 3:]
    y_DT_train, y_DT_test = y_tr[train_index], y_tr[test_index]

    # Class balance of this fold's training part, counted on the fold itself rather than
    # on the whole of y_tr (which would print the same two numbers in every fold).
    positive = int(np.count_nonzero(y_DT_train == 1))
    negative = int(y_DT_train.shape[0] - positive)
    print('Positive samples:', positive)
    print('Negative samples:', negative)

    # Scaler statistics come from the training part only, then are applied to the validation part.
    X_DT_train_new = min_max_scaler.fit_transform(X_DT_train)
    X_DT_test_new = min_max_scaler.transform(X_DT_test)

    DT_classifier.fit(X_DT_train_new, y_DT_train)
    scores_dt.append(DT_classifier.score(X_DT_test_new, y_DT_test))
    y_pred = DT_classifier.predict(X_DT_test_new)
    mcc = matthews_corrcoef(y_DT_test, y_pred)
    mccs_dt.append(mcc)
    f1 = f1_score(y_DT_test, y_pred)
    f1s_dt.append(f1)

# Each summary line shows min, max and standard deviation over the folds.
# Averages use np.mean so they stay correct if the number of folds in `sss` changes.
print("*************************************")
print("Scores: ", np.min(scores_dt), np.max(scores_dt), np.std(scores_dt))
print("F1s: ", np.min(f1s_dt), np.max(f1s_dt), np.std(f1s_dt))
print("MCCs: ", np.min(mccs_dt), np.max(mccs_dt), np.std(mccs_dt))
print("avg cross-validation accuracy:", np.mean(scores_dt))
print("avg cross-validation f1:", np.mean(f1s_dt))
print("avg cross-validation mcc:", np.mean(mccs_dt))
print("*************************************")


### Train and see the performance of DT model in train dataset


In [None]:
# Refit the scaler and the decision tree on the whole training split, then score on that same data.
X_new = min_max_scaler.fit_transform(X_tr[:, 3:])
DT_classifier.fit(X_new, y_tr)
y_tr_predict = DT_classifier.predict(X_new)

print('f1 on Train set: ', f1_score(y_tr, y_tr_predict))
print('MCC on Train set: ', matthews_corrcoef(y_tr, y_tr_predict))

# ravel() yields the cells in the order tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
train_confusion = confusion_matrix(y_tr, y_tr_predict)
tn, fp, fn, tp = train_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Train set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Train set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Train set: ', accuracy)


### Test and see the performance of DT model in test dataset


In [None]:
# Hold-out evaluation of the decision tree. The test features are transformed with the scaler
# in its current state (as left by the preceding training cell) and are never used for fitting.
X_te_new = min_max_scaler.transform(X_te[:, 3:])
y_DT_pred = DT_classifier.predict(X_te_new)

print("*************************************")
print('f1 on Test set: ', f1_score(y_te, y_DT_pred))
print('MCC on Test set: ', matthews_corrcoef(y_te, y_DT_pred))

# ravel() yields tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
test_confusion = confusion_matrix(y_te, y_DT_pred)
tn, fp, fn, tp = test_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Test set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Test set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Test set: ', accuracy)
precision = tp / (tp + fp)
print("Precision on Test set: ", precision)


### Confusion matrix

In [None]:
# Confusion matrix of the decision tree on the hold-out test set, printed and drawn as a heatmap
# (rows: actual class, columns: predicted class).
confusion = confusion_matrix(y_te, y_DT_pred)
tn, fp, fn, tp = confusion.ravel()

print("Confusion Matrix:")
print(confusion)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Greens', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Logistic Regression

### Optimize hyperparameters


In [None]:
# Hyperparameter optimisation for logistic regression (5-fold CV, F1 score).
# The scaler is part of the pipeline so that the model is tuned on the same 0-1 scaled
# features that the cross-validation and hold-out cells below evaluate on, with the scaler
# fitted on the training part of each grid-search fold only.
# random_state makes the solvers that shuffle the data (liblinear, saga) reproducible.
logistic_regression = LogisticRegression(random_state=42)
lr_pipeline = make_pipeline(preprocessing.MinMaxScaler(feature_range=(0, 1)), logistic_regression)

# make_pipeline names the model step 'logisticregression', hence the prefixes.
# Each solver is paired only with penalties it supports; crossing all solvers with all
# penalties would include combinations such as liblinear+none or lbfgs+l1, whose fits fail.
# NOTE(review): 'none' is the spelling used by older scikit-learn releases; newer releases
# expect `None` instead - adjust to the installed version.
lr_shared_grid = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__max_iter': [10, 100, 200],
}
parameters = [
    {'logisticregression__solver': ['liblinear'], 'logisticregression__penalty': ['l1', 'l2'], **lr_shared_grid},
    {'logisticregression__solver': ['lbfgs'], 'logisticregression__penalty': ['l2', 'none'], **lr_shared_grid},
    {'logisticregression__solver': ['saga'], 'logisticregression__penalty': ['l1', 'l2', 'none'], **lr_shared_grid},
]

grid_search = GridSearchCV(lr_pipeline, parameters, scoring='f1', cv=5)

grid_search.fit(X_tr[:, 3:], y_tr)

# Strip the pipeline step prefix so the result reads like plain LogisticRegression keyword arguments.
best_params_LT = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
best_params_LT



In [None]:
# Logistic regression with fixed hyperparameters and a fixed seed.
# NOTE(review): hard-coded rather than read from `best_params_LT`; presumably copied from a
# grid-search run - confirm they still match its output. max_iter=10 is very low, so the
# solver may stop before converging - verify that this is intended.
LR_classifier = LogisticRegression(random_state=42, C=1, max_iter=10, penalty='l1', solver='liblinear')

# Stratified k-fold cross-validation on the training split (folds come from `sss`).
# Accuracy, MCC and F1 of every fold are collected and summarised after the loop.
scores_lr = []
mccs_lr = []
f1s_lr = []

for train_index, test_index in sss.split(X_tr, y_tr):
    X_LR_train, X_LR_test = X_tr[train_index, 3:], X_tr[test_index, 3:]
    y_LR_train, y_LR_test = y_tr[train_index], y_tr[test_index]

    # Class balance of this fold's training part, counted on the fold itself rather than
    # on the whole of y_tr (which would print the same two numbers in every fold).
    positive = int(np.count_nonzero(y_LR_train == 1))
    negative = int(y_LR_train.shape[0] - positive)
    print('Positive samples:', positive)
    print('Negative samples:', negative)

    # Scaler statistics come from the training part only, then are applied to the validation part.
    X_LR_train_new = min_max_scaler.fit_transform(X_LR_train)
    X_LR_test_new = min_max_scaler.transform(X_LR_test)

    LR_classifier.fit(X_LR_train_new, y_LR_train)
    scores_lr.append(LR_classifier.score(X_LR_test_new, y_LR_test))
    y_pred = LR_classifier.predict(X_LR_test_new)
    mcc = matthews_corrcoef(y_LR_test, y_pred)
    mccs_lr.append(mcc)
    f1 = f1_score(y_LR_test, y_pred)
    f1s_lr.append(f1)

# Each summary line shows min, max and standard deviation over the folds.
# Averages use np.mean so they stay correct if the number of folds in `sss` changes.
print("*************************************")
print("Scores: ", np.min(scores_lr), np.max(scores_lr), np.std(scores_lr))
print("F1s: ", np.min(f1s_lr), np.max(f1s_lr), np.std(f1s_lr))
print("MCCs: ", np.min(mccs_lr), np.max(mccs_lr), np.std(mccs_lr))
print("avg cross-validation accuracy:", np.mean(scores_lr))
print("avg cross-validation f1:", np.mean(f1s_lr))
print("avg cross-validation mcc:", np.mean(mccs_lr))
print("*************************************")


### Train and see the performance of LR model in train dataset



In [None]:
# Refit the scaler and the logistic regression on the whole training split, then score on that same data.
X_new = min_max_scaler.fit_transform(X_tr[:, 3:])
LR_classifier.fit(X_new, y_tr)
y_tr_predict = LR_classifier.predict(X_new)

print('f1 on Train set: ', f1_score(y_tr, y_tr_predict))
print('MCC on Train set: ', matthews_corrcoef(y_tr, y_tr_predict))

# ravel() yields the cells in the order tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
train_confusion = confusion_matrix(y_tr, y_tr_predict)
tn, fp, fn, tp = train_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Train set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Train set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Train set: ', accuracy)



### Test and see the performance of LR model in test dataset


In [None]:
# Hold-out evaluation of the logistic regression. The test features are transformed with the scaler
# in its current state (as left by the preceding training cell) and are never used for fitting.
X_te_new = min_max_scaler.transform(X_te[:, 3:])
y_LR_pred = LR_classifier.predict(X_te_new)

print("*************************************")
print('f1 on Test set: ', f1_score(y_te, y_LR_pred))
print('MCC on Test set: ', matthews_corrcoef(y_te, y_LR_pred))

# ravel() yields tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
test_confusion = confusion_matrix(y_te, y_LR_pred)
tn, fp, fn, tp = test_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Test set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Test set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Test set: ', accuracy)
precision = tp / (tp + fp)
print("Precision on Test set: ", precision)




### Confusion matrix

In [None]:
# Confusion matrix of the logistic regression on the hold-out test set, printed and drawn as a heatmap
# (rows: actual class, columns: predicted class).
confusion = confusion_matrix(y_te, y_LR_pred)
tn, fp, fn, tp = confusion.ravel()

print("Confusion Matrix:")
print(confusion)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


# ROC curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC analysis on the hold-out test set.
# The features are scaled exactly as in the evaluation cells above (scaler fitted on the
# training split only), so the curve describes the same model whose metrics are reported;
# fitting on the raw, unscaled features would evaluate a different model.
X_tr_scaled = min_max_scaler.fit_transform(X_tr[:, 3:])
X_te_scaled = min_max_scaler.transform(X_te[:, 3:])

# SVM: probability of the positive class (column 1 of predict_proba) drives the curve.
SS_classifier.fit(X_tr_scaled, y_tr)
y_svm_pred_prob = SS_classifier.predict_proba(X_te_scaled)[:, 1]
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_te, y_svm_pred_prob)
auc_svm = roc_auc_score(y_te, y_svm_pred_prob)

# The other models are currently left out of the plot; uncomment a block (and the matching
# plt.plot line below) to add its curve.

# # Random Forest
# RF_classifier.fit(X_tr_scaled, y_tr)
# y_rf_pred_prob = RF_classifier.predict_proba(X_te_scaled)[:, 1]
# fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_te, y_rf_pred_prob)
# auc_rf = roc_auc_score(y_te, y_rf_pred_prob)

# # Naive Bayes
# NB_classifier.fit(X_tr_scaled, y_tr)
# y_nb_pred_prob = NB_classifier.predict_proba(X_te_scaled)[:, 1]
# fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_te, y_nb_pred_prob)
# auc_nb = roc_auc_score(y_te, y_nb_pred_prob)

# # Decision Tree
# DT_classifier.fit(X_tr_scaled, y_tr)
# y_dt_pred_prob = DT_classifier.predict_proba(X_te_scaled)[:, 1]
# fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_te, y_dt_pred_prob)
# auc_dt = roc_auc_score(y_te, y_dt_pred_prob)

# # Logistic Regression
# LR_classifier.fit(X_tr_scaled, y_tr)
# y_lr_pred_prob = LR_classifier.predict_proba(X_te_scaled)[:, 1]
# fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_te, y_lr_pred_prob)
# auc_lr = roc_auc_score(y_te, y_lr_pred_prob)

# Plot the ROC curves
plt.plot(fpr_svm, tpr_svm, label='SVM (AUC = %0.2f)' % auc_svm)
#plt.plot(fpr_rf, tpr_rf, label='Random Forest (AUC = %0.2f)' % auc_rf)
#plt.plot(fpr_nb, tpr_nb, label='Naive Bayes (AUC = %0.2f)' % auc_nb)
#plt.plot(fpr_dt, tpr_dt, label='Decision Tree (AUC = %0.2f)' % auc_dt)
#plt.plot(fpr_lr, tpr_lr, label='Logistic Regression (AUC = %0.2f)' % auc_lr)
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal reference line (chance level)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic-ROC curve')
plt.legend(loc='lower right')
plt.show()



# New dataset - synthetic and stapled peptides


## best model is SVM

After choosing the best model, we analyze its predictive capacity on a dataset it was not trained on (to check for overfitting).

In [None]:
# Best model: a fresh, unfitted SVM with the same hyperparameters as in the SVM section.
# It is fitted in a later cell, on the whole first dataset.
SS_classifier = svm.SVC(kernel='linear', C=0.1, gamma=0.01,probability=True)

In [None]:
# New dataset (cp1252 / Latin encoding), rounded to 3 decimals like the first one.
peptides_novo = pd.read_csv('meus_peptides_dataset2.csv', encoding='cp1252')
peptides_novo = peptides_novo.round(3)

# Drop rows with a missing value from the fourth column onwards. The subset is taken from
# this dataframe's own columns (not from `peptides`), matching the positional [:, 3:] slice
# used when the features are extracted.
# NOTE(review): the model expects the same feature columns, in the same order, as the
# first dataset - confirm that both CSV files share one layout.
peptides_novo = peptides_novo.dropna(subset=peptides_novo.columns[3:])
peptides_novo


### Test the new dataset and see the performance

In [None]:
# Fit the scaler and the SVM on the complete first dataset (training and test parts together).
X_new = min_max_scaler.fit_transform(X[:, 3:])
SS_classifier.fit(X_new, Y)

# Features and labels of the new dataset, taken with the same positional column slice.
peptides_novo_array = peptides_novo.to_numpy()
X_peptides = peptides_novo_array[:, 3:]
type_label_2 = peptides_novo['Type'].to_numpy()
Y_novo = type_label_2

# Scale with the statistics of the first dataset, then predict.
X_peptides_normalizado = min_max_scaler.transform(X_peptides)
y_novo_pred = SS_classifier.predict(X_peptides_normalizado)

In [None]:

# Metrics of the SVM on the new dataset (labelled "Test set" in the printout).
print("*************************************")
print('f1 on Test set: ', f1_score(Y_novo, y_novo_pred))
print('MCC on Test set: ', matthews_corrcoef(Y_novo, y_novo_pred))

# ravel() yields tn, fp, fn, tp; the printout lists them as tn, fp, tp, fn.
novo_confusion = confusion_matrix(Y_novo, y_novo_pred)
tn, fp, fn, tp = novo_confusion.ravel()
print("tn, fp, tp, fn", tn, fp, tp, fn)

specificity = tn / (tn + fp)
print('Specificity on Test set(tn / (tn+fp)): ', specificity)
sensitivity = tp / (tp + fn)
print('Sensitivity on Test set(tp / (tp+fn)): ', sensitivity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy on Test set: ', accuracy)
precision = tp / (tp + fp)
print("Precision on Test set: ", precision)

### Confusion matrix

In [None]:
# Confusion matrix of the SVM on the new dataset, printed and drawn as a heatmap
# (rows: actual class, columns: predicted class).
confusion = confusion_matrix(Y_novo, y_novo_pred)
tn, fp, fn, tp = confusion.ravel()

print("Confusion Matrix:")
print(confusion)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Greens', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Repurposing

New dataset with anticancer, insecticidal and antiviral peptides.
Note: this dataset was processed in the exp2 file.
Can these peptides also be linked with antibiofilm capacity?

In [None]:
# Repurposing dataset (cp1252 / Latin encoding), rounded to 3 decimals like the others.
peptides_3 = pd.read_csv('anti_viral_cancer_parasita_ins.csv', encoding='cp1252')
peptides_3 = peptides_3.round(3)

# Drop rows with a missing value from the fourth column onwards. The subset is taken from
# this dataframe's own columns (not from `peptides`), matching the positional [:, 3:] slice
# used when the features are extracted.
# NOTE(review): the model expects the same feature columns, in the same order, as the
# first dataset - confirm that both CSV files share one layout.
peptides_3 = peptides_3.dropna(subset=peptides_3.columns[3:])
peptides_3

In [None]:
# Refit the SVM on the scaled first dataset. `X_new` and the state of `min_max_scaler` are
# those left by the cell that fitted on the complete first dataset, so that cell must run first.
SS_classifier.fit(X_new, Y)

# Features of the repurposing peptides, scaled with the same scaler and classified.
peptides_3_array = peptides_3.to_numpy()
X_peptides_3 = peptides_3_array[:, 3:]
X_peptides_normalizado_3 = min_max_scaler.transform(X_peptides_3)
y_peptides_predict = SS_classifier.predict(X_peptides_normalizado_3)


In [None]:
# Number of peptides predicted in each class. minlength=2 keeps an entry for both classes
# (with a zero count) even when every peptide is predicted as class 0, so the counts always
# line up with the two bar positions.
counts = np.bincount(y_peptides_predict, minlength=2)

colors = ['blue', 'green']
legend_labels = ['No Repurposing', 'Repurposing']

# Bar 0: predicted without antibiofilm capacity; bar 1: predicted with it.
bars = plt.bar([0, 1], counts, color=colors)
plt.xticks([0, 1])
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Histogram')

# The two bars already carry the class colours, so they serve directly as legend handles.
legend_handles = list(bars)
plt.legend(legend_handles, legend_labels)

plt.show()


Guilherme Sousa; Anália Lourenço; Maria Oliveira