In [1]:
import pandas as pd
import numpy as np
from importnb import Notebook
import sys, io
import matplotlib
import glob, os
# add the folder vonheijine that contains our vonheijine functions
sys.path.append('../Feature_Selection/')
# avoid to print useless things when importing another notebook
matplotlib.use('Agg')  # blocca la visualizzazione grafica
old_stdout = sys.stdout
sys.stdout = io.StringIO()  

with Notebook():
    import custom_features, feauture_selection
sys.stdout = old_stdout

#remove temporary files created by feature_selection
for f in glob.glob("*_features_*.npz"):
    os.remove(f)

  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()


In [2]:
print(feauture_selection.features_to_use)

['VhonHeijne', 'C', 'tm_tendency_max', 'chou_fasman_h_mean', 'max_miyazawa_mean', 'D', 'T', 'R', 'chou_fasma_b_max', 'N', 'flexibility_max', 'punta_max', 'bulkiness_mean', 'M', 'argos_max']


In [3]:
def update_vonheijne(sets, matrix): 
    seq_features=[]
    for seq in sets:
        seq=seq.replace("X" , "")
        seq=seq.replace("U" , "C")
        vonhejine=custom_features.vonheijne_feature(matrix, seq) #get the von heijne feature for that sequence
        seq_features.append(vonhejine)
    hejine_col = np.array(seq_features) #transform the list that contains all the features in an array
    return hejine_col

In [4]:
#Load the npz files of training, testing and validation sets for each iteration
# load training
# 5th iteration was: validation set 1 , training set 2,3,4 , testing set 5
loaded_data_train = np.load('../Feature_Selection/training_features_5.npz')
x_train = loaded_data_train['matrix']
y_train = loaded_data_train['target']

# load test
loaded_data_test = np.load('../Feature_Selection/testing_features_5.npz')
x_test = loaded_data_test['matrix']
y_test = loaded_data_test['target']

# load validation
loaded_data_validation = np.load('../Feature_Selection/validation_features_5.npz')
x_validation = loaded_data_validation['matrix']
y_validation = loaded_data_validation['target']

#concatenate the matrices in the correct order
x_training_conc = np.concatenate((x_train , x_test), axis=0) #order is maintained: 1,2,3,4,5
y_training_conc = np.concatenate((y_train, y_test), axis=0)


In [5]:
#Load the benchmark set and encode it
dataset = pd.read_csv("../Data_Preparation/train_bench.tsv", sep = "\t")
benchmark=dataset.query("Set=='Benchmark'")
training= pd.concat([dataset.query("Set=='2'"), dataset.query(" Set=='3' or Set=='4' or Set=='5' "), dataset.query("Set=='1'")],axis=0,ignore_index=True)
matrix_training=custom_features.get_pswm(training , 13 , 2)
#replace the old VonHejine feature with the new VonHejine basing on the updated PSWM
x_training_conc[:, 17] = update_vonheijne(training["Sequence"], matrix_training)
feature_set_benchmark , feature_order_training = custom_features.get_all_features(benchmark["Sequence"] , matrix_training, 15 )
vector_neg_pos = benchmark["Class"]
vector_proper = vector_neg_pos.map({"Positive": 1, "Negative": 0})
target_benchmark_vector = vector_proper.to_numpy()

In [6]:
np.savez('benchmark_features.npz', matrix=x_benchmark, target=target_benchmark_vector)
np.savez('training_features.npz', matrix=x_training_conc, target=y_training_conc)
np.savez('validation_features.npz', matrix=x_training_conc, target=y_training_conc)

## Hyperparameter Tuning

In [7]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
#! pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef , confusion_matrix

In [8]:
#create the pipeline
pipeline = Pipeline([("scaler" , StandardScaler()) , ("svm" , SVC(cache_size=1500))])

In [9]:
# parameter search using BayesSearchCV

# search space for rbf kernel
search_space = {
        "svm__kernel": Categorical(["rbf"]),
        "svm__C": Real(0.01, 100, prior="log-uniform"),                
        "svm__gamma": Real(0.01, 100, prior="log-uniform"), 
    }


In [10]:
#set up the BayesSearch
bayes = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring="matthews_corrcoef",   
    n_jobs=-1,
    refit=False,                 
    random_state=42,
    verbose=2,
    cv=5,
    n_iter=60
)
bayes.fit(x_training_conc, y_training_conc)  # here we perform the bayes search

print("\n[Best parameters found:] ")
print(bayes.best_params_)
print(f"[Best MCC validation] {bayes.best_score_:.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [11]:
#predict the benchmark set
pipeline.set_params(**bayes.best_params_).fit(x_training_conc, y_training_conc) #parameters of BayesSearchCV
bench_pred = pipeline.predict(x_benchmark)
#compute the mcc
mcc_bayes = matthews_corrcoef(target_benchmark_vector , bench_pred)
print(f"MCC on testing set (bayesian search): {mcc_bayes}")

MCC on testing set (bayesian search): 0.8081691772560898


In [12]:
conf_mat = confusion_matrix(target_benchmark_vector , bench_pred)
print(conf_mat)

[[1749   38]
 [  37  182]]


Save the model for later use

In [13]:
import pickle
with open('SignalPeptideSVM.pkl', 'wb') as f:
    pickle.dump(pipeline, f)