In [0]:
from omnia.generics import Pipeline, pd, np
from omnia.generics.pipeline_optimization.pipeline_optimization import PipelineOptimization
from sklearn.model_selection import train_test_split
from omnia.proteins.standardization import ProteinStandardizer
from omnia.proteins.encoding import Esm2Encoder
from omnia.proteins.feature_extraction import ProteinDescriptor
from omnia.generics import RNNModelClassifier, Pipeline, pd, TabularPredictor, MLPClassifier,GradientBoostingClassifier,RandomForestClassifier,CNN1DModelClassifier
from omnia.generics.model.autogluon_models.random_forest import RandomForestModel
from omnia.generics.model.autogluon_models.mlp import MultilayerPerceptronNN
from omnia.generics.model.autogluon_models.cat_boost import CatBoostModel
from omnia.generics.model.autogluon_models.knn import KNNModel
from omnia.generics.model.autogluon_models.lgb import LGBModel
from omnia.generics.model.autogluon_models.linear import LinearModel
from omnia.generics.model.autogluon_models.nn import FastAINN
from omnia.generics.model.autogluon_models.svm import SupportVectorMachineModel
from omnia.generics.model.autogluon_models.xg_boost import XGBoostModel
from omnia.generics.model.autogluon_models.xt import XTModel
from omnia.generics.model.autogluon_models import VowpalWabbitModel


In [0]:
# Read the antioxidant case-study table and separate the 'label' column
# (kept as a one-column frame) from the remaining columns.
df_antioxidant_case_study = pd.read_csv("df_antioxidant_case_study.csv")
x = df_antioxidant_case_study.drop(columns=['label'])
y = df_antioxidant_case_study[['label']]

# Hold out 15% of the rows as the test set, stratified on the label, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Write the four splits to CSV without the index column ...
x_train.to_csv('x_train_antioxidant.csv', index=False)
x_test.to_csv('x_test_antioxidant.csv', index=False)
y_train.to_csv('y_train_antioxidant.csv', index=False)
y_test.to_csv('y_test_antioxidant.csv', index=False)

# ... and load them straight back, so the rest of the notebook works on the
# on-disk version of each split.
x_train = pd.read_csv('x_train_antioxidant.csv')
x_test = pd.read_csv('x_test_antioxidant.csv')
y_train = pd.read_csv('y_train_antioxidant.csv')
y_test = pd.read_csv('y_test_antioxidant.csv')

# Split 15% of the reloaded training rows off as a validation set (stratified, same seed).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=42,
)



In [0]:
# Pipeline chaining ProteinStandardizer -> Esm2Encoder -> TabularPredictor,
# fitted on the train/validation splits and scored on the test split.
standardizer = ProteinStandardizer()

esm_encoder = Esm2Encoder(
    pretrained_model="35M",
    max_seq_len=600,
    two_dimensional_embeddings=False,
    preset="features",
)

# Model classes handed to the TabularPredictor, in the order given here.
predictor = TabularPredictor(
    models=[
        RandomForestModel,
        MultilayerPerceptronNN,
        CatBoostModel,
        KNNModel,
        LGBModel,
        LinearModel,
        FastAINN,
        VowpalWabbitModel,
        XTModel,
        XGBoostModel,
    ]
)

pipeline = Pipeline(
    steps=[
        ('standardizer', standardizer),
        ('protein_encoder', esm_encoder),
        ('prediction', predictor),
    ],
    path="antioxidant_esm_pipeline",
)

# Fit as a binary problem, passing the validation split alongside the training split.
pipeline.fit(x_train, y_train, x_val, y_val, problem_type='binary')

# Kept as the cell's final expression so the returned scores are displayed.
pipeline.score(
    x_test,
    y_test,
    metrics=['accuracy', 'balanced_accuracy', 'roc_auc', 'f1', 'recall', 'matthews_corrcoef'],
)


In [0]:
# Pipeline chaining ProteinStandardizer -> ProteinDescriptor -> MLPClassifier,
# fitted on the train/validation splits and scored on the test split.
standardizer = ProteinStandardizer()
protein_descriptor = ProteinDescriptor()
predictor = MLPClassifier()

# NOTE(review): this same `path` value is reused by other pipelines in this
# notebook - confirm they are not meant to have separate locations.
pipeline = Pipeline(
    steps=[
        ('standardizer', standardizer),
        ('protein_encoder', protein_descriptor),
        ('prediction', predictor),
    ],
    path="antioxidant_pd_pipeline",
)

# Fit as a binary problem, passing the validation split alongside the training split.
pipeline.fit(x_train, y_train, x_val, y_val, problem_type='binary')

# Kept as the cell's final expression so the returned scores are displayed.
pipeline.score(
    x_test,
    y_test,
    metrics=['accuracy', 'balanced_accuracy', 'roc_auc', 'f1', 'recall', 'matthews_corrcoef'],
)

In [0]:
# Pipeline chaining ProteinStandardizer -> ProteinDescriptor -> GradientBoostingClassifier,
# fitted on the train/validation splits and scored on the test split.
standardizer = ProteinStandardizer()
protein_descriptor = ProteinDescriptor()
predictor = GradientBoostingClassifier()

# NOTE(review): this same `path` value is reused by other pipelines in this
# notebook - confirm they are not meant to have separate locations.
pipeline = Pipeline(
    steps=[
        ('standardizer', standardizer),
        ('protein_encoder', protein_descriptor),
        ('prediction', predictor),
    ],
    path="antioxidant_pd_pipeline",
)

# Fit as a binary problem, passing the validation split alongside the training split.
pipeline.fit(x_train, y_train, x_val, y_val, problem_type='binary')

# Kept as the cell's final expression so the returned scores are displayed.
pipeline.score(
    x_test,
    y_test,
    metrics=['accuracy', 'balanced_accuracy', 'roc_auc', 'f1', 'recall', 'matthews_corrcoef'],
)

In [0]:
import os
import numpy as np
from omnia.generics import Pipeline, pd, np
from omnia.generics.pipeline_optimization.pipeline_optimization import PipelineOptimization
from sklearn.model_selection import train_test_split
from omnia.proteins.standardization import ProteinStandardizer
from omnia.proteins.encoding import Esm2Encoder
from omnia.proteins.feature_extraction import ProteinDescriptor
from omnia.generics import RNNModelClassifier, Pipeline, pd, TabularPredictor, MLPClassifier,GradientBoostingClassifier,RandomForestClassifier,CNN1DModelClassifier

# Load the pre-split train/test tables from the *_deepalgpro.csv files.
x_train = pd.read_csv('x_train_deepalgpro.csv')
y_train = pd.read_csv('y_train_deepalgpro.csv')
x_test = pd.read_csv('x_test_deepalgpro.csv')
y_test = pd.read_csv('y_test_deepalgpro.csv')

# Hold out 15% of the training rows as a validation set (fixed seed).
# NOTE(review): unlike the antioxidant validation split, this one is not
# stratified on the label - confirm that is intended.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.15, random_state=42)

# Metrics requested from pipeline.score, listed in the same order as the
# results-table columns that follow 'trial' ('matthews_corrcoef' fills 'mcc').
# The table has six metric columns, so all six must be requested: asking for
# fewer yields a row shorter than the column list and the .loc assignment
# below fails with a mismatched-columns ValueError.
score_metrics = ['accuracy', 'balanced_accuracy', 'roc_auc', 'f1', 'recall', 'matthews_corrcoef']
metrics_df = pd.DataFrame(columns=['trial','accuracy','balanced_accuracy','roc_auc','f1','recall','mcc'])

# Pipeline chaining ProteinStandardizer -> ProteinDescriptor -> RandomForestClassifier.
standardizer = ProteinStandardizer()
protein_descriptor = ProteinDescriptor()

predictor =RandomForestClassifier()

# NOTE(review): `path` still says "antioxidant" although this cell trains on
# the deepalgpro files, and the same value is used by earlier pipelines in
# this notebook - confirm whether a dedicated path is wanted.
pipeline = Pipeline(steps=[('standardizer', standardizer),
                            ('protein_encoder', protein_descriptor),
                            ('prediction', predictor)],
                    path="antioxidant_pd_pipeline")

# Fit as a binary problem, passing the validation split alongside the training split.
pipeline.fit(x_train, y_train, x_val, y_val, problem_type='binary')

metrics = pipeline.score(x_test, y_test, metrics=score_metrics)
# Row layout: the literal "trial" label followed by one value per returned metric.
# Assumes score() returns its entries in the requested order - TODO confirm.
metrics_df.loc[0] = ["trial"] + [metrics[metric] for metric in metrics]
metrics_df.to_csv('results.csv')