__ProPythia__

This file is a simulation using the antioxidant dataset.

I intend to run propythia here so that in the future I can make comparisons with the results obtained with omnia.

I will run this simulation with the antioxidant data, which is unbalanced.

# imports

In [0]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
import csv
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

from scipy.stats import loguniform
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, matthews_corrcoef,f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier

from propythia.protein.descriptors import ProteinDescritors
from propythia.protein.encoding import Encoding
from propythia.ml.shallow_ml import ShallowML
from propythia.protein.sequence import ReadSequence

from propythia.ml.deep_ml import DeepML

from propythia.feature_selection import FeatureSelection

from propythia.preprocess import Preprocess

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from itertools import product
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


# Encoding

__Note__: 600 is a high value for our dataset, but it is the value most often mentioned in the literature.

# Reading the data is the first step -- do not forget!

In [0]:
# Clean the raw sequences before they are padded and encoded.
# NOTE(review): x_train_bert / x_test_bert are not assigned in the visible part
# of this notebook; they must be loaded by an earlier cell.
# The single-letter keyword arguments presumably give the replacement for each
# non-standard residue code (B->N, Z->Q, U->C, O->K, J->I, X removed) --
# confirm against ReadSequence.par_preprocessing.
read_seqs = ReadSequence()
x_train_bert = read_seqs.par_preprocessing(
    dataset=x_train_bert,
    col='sequence',
    B='N', Z='Q', U='C', O='K', J='I', X='',
)
x_test_bert = read_seqs.par_preprocessing(
    dataset=x_test_bert,
    col='sequence',
    B='N', Z='Q', U='C', O='K', J='I', X='',
)

In [0]:
def pad_and_truncate_sequences(df, seq_col, max_length, padding_value='#'):
    """
    Pad and truncate the protein sequences in a DataFrame to a specific length.

    Sequences longer than ``max_length`` are cut to their first ``max_length``
    characters; shorter ones are right-padded with ``padding_value``. The
    result is written to a new column named ``padded_and_truncated_sequence``.

    Note that the column is added to ``df`` itself (the input is modified in
    place) and that same object is returned.

    :param df: DataFrame containing the protein sequences.
    :param seq_col: Name of the column in df that contains the protein sequences.
    :param max_length: The maximum length for all sequences.
    :param padding_value: The value to use for padding the sequences. If it is
        longer than one character, the padded sequence is still cut to exactly
        ``max_length`` characters.
    :return: DataFrame with the padded and truncated sequences.
    """
    def pad_and_truncate(seq):
        missing = max_length - len(seq)
        # Pad the sequence if it's not long enough
        if missing > 0:
            seq = seq + padding_value * missing
        # Truncate last: this cuts over-long sequences and also trims the
        # overshoot a multi-character padding_value would otherwise produce.
        return seq[:max_length]

    df['padded_and_truncated_sequence'] = df[seq_col].apply(pad_and_truncate)
    return df

# Bring every sequence to a fixed length of 600 characters: shorter ones are
# padded with the default '#', longer ones are cut.
x_train_bert_encode = pad_and_truncate_sequences(
    df=x_train_bert, seq_col='sequence', max_length=600
)
x_test_bert_encode = pad_and_truncate_sequences(
    df=x_test_bert, seq_col='sequence', max_length=600
)

In [0]:
# Wrap each padded dataset in a ProPythia Encoding object that reads the
# fixed-length sequence column.
# NOTE: the 'enconde' spelling is kept because the next cell refers to this name.
enconde_train_df = Encoding(
    dataset=x_train_bert_encode,
    col='padded_and_truncated_sequence',
)
encode_test_df = Encoding(
    dataset=x_test_bert_encode,
    col='padded_and_truncated_sequence',
)

In [0]:
# Compute the ProtBERT encoding for both splits and collect the per-sequence
# arrays from the 'protbert' column into one float64 NumPy array per split.
# From here on x_train_bert / x_test_bert hold these arrays instead of the
# sequence tables they held before.
# NOTE(review): presumably every entry of the 'protbert' column is an ndarray
# of the same shape -- confirm against Encoding.get_protbert.
protbert_train = enconde_train_df.get_protbert()
x_train_bert = np.array(
    [embedding.astype(np.float64) for embedding in protbert_train['protbert']]
)

protbert_test = encode_test_df.get_protbert()
x_test_bert = np.array(
    [embedding.astype(np.float64) for embedding in protbert_test['protbert']]
)



# Deep Learning

The aim is to recreate the deep learning models implemented in omnia, as well as the ability to optimise their hyperparameters by grid search (using a param_grid).

# RNN

In [0]:
#RNN and propythia
def create_rnn_model(
    rnn_type='LSTM',
    bidirectional=False,
    num_rnn_layers=1,
    hidden_dim=64,
    num_dense_layers=1,
    neurons_dense=32,
    output_dim=1,
    drop=0.3,
    activation='relu',
    last_layers_activations='sigmoid',
):
    """
    Build and compile a Keras Sequential model made of recurrent layers
    followed by dense layers.

    :param rnn_type: Recurrent layer to use: 'LSTM', 'GRU' or 'SimpleRNN'.
    :param bidirectional: If True, each recurrent layer is wrapped in Bidirectional.
    :param num_rnn_layers: Number of recurrent layers.
    :param hidden_dim: Units of the first recurrent layer; layer i uses hidden_dim // 2**i.
    :param num_dense_layers: Number of hidden dense layers.
    :param neurons_dense: Units of the first dense layer; layer i > 0 uses neurons_dense // 2**i.
    :param output_dim: Units of the output layer.
    :param drop: Dropout rate applied after every recurrent and hidden dense layer.
    :param activation: Activation used in the recurrent and hidden dense layers.
    :param last_layers_activations: Activation of the output layer.
    :return: The model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    :raises ValueError: If a recurrent layer is requested with an unsupported rnn_type.
    """
    layers = tf.keras.layers
    supported_rnn_types = ('LSTM', 'GRU', 'SimpleRNN')

    network = tf.keras.models.Sequential()

    # Recurrent stack
    last_rnn_index = num_rnn_layers - 1
    for depth in range(num_rnn_layers):
        units = hidden_dim // (2 ** depth)
        if rnn_type not in supported_rnn_types:
            raise ValueError("Invalid RNN type. Supported types are 'LSTM', 'GRU', and 'SimpleRNN'.")

        # The layer class carries the same name as the rnn_type string.
        recurrent = getattr(layers, rnn_type)(
            units,
            # Only the last recurrent layer collapses the sequence dimension.
            return_sequences=(depth != last_rnn_index),
            activation=activation,
        )
        if bidirectional:
            recurrent = layers.Bidirectional(recurrent)

        network.add(recurrent)
        network.add(layers.Dropout(drop))

    # Dense stack
    for depth in range(num_dense_layers):
        if depth == 0:
            units = neurons_dense
        else:
            units = neurons_dense // (2 ** depth)
        network.add(layers.Dense(units, activation=activation))
        network.add(layers.Dropout(drop))

    # Output layer
    network.add(layers.Dense(output_dim, activation=last_layers_activations))

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Hand the encoded train/test data to ProPythia's DeepML helper for a binary
# problem; no separate validation set is passed (x_dval / y_dval are None).
# NOTE(review): y_train_bert / y_test_bert are not assigned in the visible part
# of this notebook; they must come from an earlier cell.
dl = DeepML(
    x_train_bert, y_train_bert, x_test_bert, y_test_bert,
    number_classes=2,
    problem_type='binary',
    x_dval=None,
    y_dval=None,
    epochs=100,
    batch_size=32,
    path='',
    report_name=None,
    verbose=1,
    early_stopping_patience=20,
    reduce_lr_patience=10,
    reduce_lr_factor=0.2,
    reduce_lr_min=0.00001,
)
# Wrap the model builder in the Keras scikit-learn classifier wrapper.
model = KerasClassifier(build_fn=create_rnn_model)


In [0]:
%%capture captured

def generate_param_grid(num_rnn_layers: list, num_dense_layers: list):
    """
    Build the hyperparameter search space for create_rnn_model.

    :param num_rnn_layers: Candidate numbers of recurrent layers.
    :param num_dense_layers: Candidate numbers of hidden dense layers.
    :return: Dict mapping each create_rnn_model argument name to its list of
        candidate values. The two list arguments are stored as given (not
        copied); all other candidates are fixed.
    """
    return dict(
        rnn_type=['LSTM', 'GRU', 'SimpleRNN'],
        bidirectional=[True, False],
        num_rnn_layers=num_rnn_layers,
        hidden_dim=[64, 128],
        num_dense_layers=num_dense_layers,
        neurons_dense=[64, 128],
        output_dim=[1],
        drop=[0.1, 0.3],
        activation=['relu'],
        last_layers_activations=['sigmoid'],
    )
# Candidate depths for the recurrent and dense stacks.
num_rnn_layers = [2, 3]
num_dense_layers = [2, 3]
param_grid = generate_param_grid(
    num_rnn_layers=num_rnn_layers,
    num_dense_layers=num_dense_layers,
)
# Hyperparameter search scored with F1.
# NOTE(review): presumably optType='randomizedSearch' samples n_iter_search=80
# configurations with 5-fold cross-validation -- confirm against
# DeepML.get_opt_params.
best_classifier_rnn = dl.get_opt_params(
    param_grid,
    model,
    scoring=make_scorer(f1_score),
    optType='randomizedSearch',
    cv=5,
    n_iter_search=80,
)
best_classifier_rnn

In [0]:
# Save the output captured from the search cell (the `captured` object created
# by its `%%capture captured` magic) to a log file.
# The encoding is set explicitly so the write does not depend on the platform
# default, which can fail on non-ASCII characters in the captured output.
with open('output_data_rnn_protbert.log', 'w', encoding='utf-8') as f:
    f.write(str(captured))

In [0]:
# Evaluate the best classifier found by the search on the held-out test set.
# The call returns four objects; only `scores` is displayed here.
scores, report, cm, cm2 = dl.score_testset_classification(best_classifier_rnn)
scores

In [0]:
from sklearn.metrics import confusion_matrix

# Predict the test set with the tuned classifier, build the confusion matrix
# from true vs. predicted labels, and pass it to DeepML's seaborn table helper.
y_pred = best_classifier_rnn.predict(dl.x_test)
conf_mat = confusion_matrix(y_true=dl.y_test, y_pred=y_pred)
dl.conf_matrix_seaborn_table(conf_matrix=conf_mat)