# Train the model with the complete dataset

## Train ML model

In [25]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import VotingClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC


In [26]:
# Load the protein-target training table from the merged_data directory.
# The comma delimiter is the pandas default, so it is not passed explicitly.
df_proteins = pd.read_csv('../merged_data/df_proteintargets_targettype_appr_fail.csv')

In [29]:
# Split the table: 'Outcome' is the label, every remaining column stays in X.
y = df_proteins['Outcome']
X = df_proteins.drop(columns=['Outcome'])

In [31]:
# Model definition: one-hot encode the categorical columns, oversample with
# SMOTENC, then classify with a voting ensemble.

# Only these columns reach the model; ColumnTransformer drops every other
# column of X by default (remainder='drop').
categorical_features = ['KEGG_pathway', 'Biochemical_class', 'Enzyme_class', 'PDB_structure', 'Mode_of_action']

# handle_unknown='ignore': categories not seen during fit are encoded as
# all-zero indicator columns at predict time instead of raising.
# The encoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so it is not passed here. Dense output
# is requested on the ColumnTransformer instead (sparse_threshold=0), which
# gives the same dense array on both old and new scikit-learn versions.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('OHE', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

# Voting ensemble (hard voting is the VotingClassifier default) over a
# logistic regression and a 120-tree random forest.
# NOTE(review): the forest has no random_state, so repeated fits can differ.
models = [
    ('logreg', LogisticRegression()),
    ('forest', RandomForestClassifier(n_estimators=120)),
]
m = VotingClassifier(models)

# imblearn's Pipeline is used (not sklearn's) because it accepts a sampler
# step; the sampler is applied while fitting only, not while predicting.
# NOTE(review): SMOTENC runs after the one-hot step, so the indices 0-4 refer
# to the first five encoded indicator columns, not to the five original
# features; all remaining indicator columns are treated as continuous.
# Confirm this is intended.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('smote', SMOTENC(random_state=11, categorical_features=[0, 1, 2, 3, 4])),
        ('models', m),
    ]
)

In [32]:
# Switch scikit-learn's estimator display to the HTML diagram view
from sklearn import set_config
set_config(display="diagram")
# Last expression of the cell, so the notebook renders the pipeline
pipeline

In [37]:
# Preview the resampled training data. fit_resample is only available on a
# pipeline whose last step is a sampler, so calling it on the full pipeline
# (which ends in the classifier) fails; slice off the classifier first.
# The slice shares its step objects with `pipeline`; pipeline.fit refits them.
pipeline[:-1].fit_resample(X, y)

In [38]:
# Fit on the complete dataset. fit() returns the pipeline itself, so `model`
# is another name for the fitted `pipeline`.
pipeline.fit(X, y)
model = pipeline

In [None]:
# Save the fitted model as a pickle file
import pickle

with open("model_predicting_druggability.pkl", mode="wb") as model_file:
    # Serialize in memory, then write the bytes out in one call
    model_file.write(pickle.dumps(model))

### Predict PROTEIN1

In [42]:
# Load the PROTEIN1 test table (comma-separated, the pandas default)
df_protein1 = pd.read_csv('df_protein1.csv')

In [46]:
# Predict the outcome class for PROTEIN1 with the fitted pipeline
pred1 = pipeline.predict(df_protein1)
# Last expression of the cell, so the notebook prints the prediction array
pred1

array([0.])

### Predict PROTEIN2

In [47]:
# Load the PROTEIN2 test table (comma-separated, the pandas default)
df_protein2 = pd.read_csv('df_protein2.csv')

In [48]:
# Predict the outcome class for PROTEIN2 with the fitted pipeline
pred2 = pipeline.predict(df_protein2)
# Last expression of the cell, so the notebook prints the prediction array
pred2

array([1.])