In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

# Load the merged table. Judging by the displayed header it holds identifier
# columns first, then the numeric feature columns, and finally the three
# binary activity labels (is_antibac, is_antifung, cytotoxic).
df_merged = pd.read_csv("data/dataset_merged.csv")

# Label encoder instance kept available for later cells; nothing in the
# cells visible here calls it.
le = LabelEncoder()

# Bare last expression so the notebook renders the frame with its rich repr.
df_merged

Unnamed: 0,cluster_name,BGC_id,PKSI-KR_m1,PKSI-KS_m3,PKSI-KS_m4,PKSI-KS_m5,PKSI-KS_m6,PKSI-AT-mM_m1,PKSI-AT-mM_m2,PKSI-AT-M_m3,...,GO:0098657,GO:0043937,GO:0030436,GO:0034219,GO:0006974,GO:0000287,GO:0019732,is_antibac,is_antifung,cytotoxic
0,methymycin,BGC0000094,5.0,7.0,3.0,7.0,7.0,6.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,7_deoxypactamycin,BGC0000118,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,kinamycin,BGC0000236,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0
3,lactonamycin,BGC0000238,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,1
4,lactonamycin_z,BGC0000238,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,TglA-thiaGlu,BGC0002027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
214,frigocyclinone,BGC0002028,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1
215,kitasetaline,BGC0002031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
216,pentamycin,BGC0002032,14.0,14.0,0.0,14.0,14.0,13.0,14.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0


Antibacterial

In [2]:
# Pipeline: standardise every feature, then fit an RBF-kernel SVM.
# Keeping the scaler inside the pipeline means it is re-fit on the training
# part of every CV fold rather than on the full data set.
steps = [
    ('scaler', StandardScaler()),
    ('model', SVC(C=10, gamma=0.001, kernel='rbf', probability=True, random_state=42)),
]
model = Pipeline(steps=steps)

# Feature matrix: positional slice from column 2 up to (but excluding) the
# last three columns, which the displayed header shows to be the labels.
# NOTE(review): this depends on the exact column order of the CSV; selecting
# feature columns by name would be more robust - confirm which identifier
# columns are meant to be excluded.
X = df_merged.iloc[:, 2:-3]
y = df_merged['is_antibac']

# Score both metrics in ONE cross-validation pass. The previous version ran
# cross_val_score twice, fitting the pipeline 20 times instead of 10 for the
# same folds and the same numbers.
# NOTE(review): some rows share a BGC_id (e.g. lactonamycin / lactonamycin_z)
# and could land in different folds; consider a grouped splitter such as
# StratifiedGroupKFold if those rows are near-duplicates - TODO confirm.
cv_results = cross_validate(model, X, y, cv=10, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
scores_acc = cv_results['test_accuracy']
scores_auc = cv_results['test_roc_auc']

print('ACC: {:.4f}, AUC: {:.4f}'.format(scores_acc.mean(), scores_auc.mean()))

ACC: 0.6799, AUC: 0.6507


Antifungal

In [3]:
# Target: antifungal activity label. `model` and `X` are not defined in this
# cell and must already exist in the kernel.
y = df_merged['is_antifung']

# Score both metrics in ONE cross-validation pass instead of running the
# full 10-fold CV once per metric (same folds, same scores, half the fits).
# NOTE(review): if positives are rare for this label, accuracy alone can look
# high for a near-constant classifier; read it together with the AUC.
cv_results = cross_validate(model, X, y, cv=10, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
scores_acc = cv_results['test_accuracy']
scores_auc = cv_results['test_roc_auc']

print('ACC: {:.4f}, AUC: {:.4f}'.format(scores_acc.mean(), scores_auc.mean()))

ACC: 0.9268, AUC: 0.7025


Cytotoxic

In [4]:
# Target: cytotoxicity label. `model` and `X` are not defined in this cell
# and must already exist in the kernel.
y = df_merged['cytotoxic']

# Score both metrics in ONE cross-validation pass instead of running the
# full 10-fold CV once per metric (same folds, same scores, half the fits).
cv_results = cross_validate(model, X, y, cv=10, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
scores_acc = cv_results['test_accuracy']
scores_auc = cv_results['test_roc_auc']

print('ACC: {:.4f}, AUC: {:.4f}'.format(scores_acc.mean(), scores_auc.mean()))

ACC: 0.8853, AUC: 0.7774
