In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the peptide table from a CSV file located in the current working directory.
df = pd.read_csv(filepath_or_buffer="CycPeptMPDB_peptide.csv")

In [3]:
# Print every column label of the loaded frame, one label per line.
for column in df.columns.tolist():
    print(column)

CycPeptMPDB_ID
Source
Year
Original_Name_in_Source_Literature
Structurally_Unique_ID
SMILES
Monomer_Length
Monomer_Length_in_Main_Chain
Molecule_Shape
Permeability
PAMPA
Caco2
MDCK
RRCK
MaxEStateIndex
MinEStateIndex
MaxAbsEStateIndex
MinAbsEStateIndex
qed
MolWt
HeavyAtomMolWt
ExactMolWt
NumValenceElectrons
NumRadicalElectrons
MaxPartialCharge
MinPartialCharge
MaxAbsPartialCharge
MinAbsPartialCharge
FpDensityMorgan1
FpDensityMorgan2
FpDensityMorgan3
BCUT2D_MWHI
BCUT2D_MWLOW
BCUT2D_CHGHI
BCUT2D_CHGLO
BCUT2D_LOGPHI
BCUT2D_LOGPLOW
BCUT2D_MRHI
BCUT2D_MRLOW
BalabanJ
BertzCT
Chi0
Chi0n
Chi0v
Chi1
Chi1n
Chi1v
Chi2n
Chi2v
Chi3n
Chi3v
Chi4n
Chi4v
HallKierAlpha
Ipc
Kappa1
Kappa2
Kappa3
LabuteASA
PEOE_VSA1
PEOE_VSA10
PEOE_VSA11
PEOE_VSA12
PEOE_VSA13
PEOE_VSA14
PEOE_VSA2
PEOE_VSA3
PEOE_VSA4
PEOE_VSA5
PEOE_VSA6
PEOE_VSA7
PEOE_VSA8
PEOE_VSA9
SMR_VSA1
SMR_VSA10
SMR_VSA2
SMR_VSA3
SMR_VSA4
SMR_VSA5
SMR_VSA6
SMR_VSA7
SMR_VSA8
SMR_VSA9
SlogP_VSA1
SlogP_VSA10
SlogP_VSA11
SlogP_VSA12
SlogP_VSA2
SlogP_VSA3
S

In [4]:
# Remove the four listed columns in place before the NaN filter that follows.
# NOTE(review): presumably these are assay-specific readings that are sparse — confirm.
df.drop(columns=['PAMPA', 'Caco2', 'MDCK', 'RRCK'], inplace=True)

In [5]:
# Keep only rows that have no missing value in any remaining column.
df = df.dropna(axis=0, how='any')

In [6]:
# Remove the listed identifier / provenance columns in place.
df.drop(
    columns=[
        'SMILES',
        'Structurally_Unique_ID',
        'Original_Name_in_Source_Literature',
        'Year',
        'Source',
        'CycPeptMPDB_ID',
    ],
    inplace=True,
)

In [7]:
# Discard rows whose Permeability equals -10.
# NOTE(review): -10 looks like a sentinel for "not measurable" — confirm against the dataset docs.
df = df.loc[df['Permeability'].ne(-10)]

In [8]:
# One-hot encode Molecule_Shape: build the indicator columns, then swap them
# in for the original categorical column (indicators are appended at the end).
df1 = pd.get_dummies(df['Molecule_Shape'])
df = pd.concat([df.drop(columns='Molecule_Shape'), df1], axis=1)

In [9]:
# Remove the 'Lariat' indicator column in place.
df.drop(columns=['Lariat'], inplace=True)

In [10]:
# Turn the continuous Permeability value into a binary label:
# 1 when the value is >= -6, otherwise 0 (stored as 64-bit integers).
df['Permeability'] = df['Permeability'].ge(-6).astype('int64')

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [12]:
from xgboost import XGBClassifier

In [13]:
# Feature matrix: the nine descriptor columns used as model inputs.
x2 = df.loc[
    :,
    [
        'MinPartialCharge',
        'PEOE_VSA6',
        'SlogP_VSA8',
        'VSA_EState6',
        'VSA_EState9',
        'NumAromaticRings',
        'NumRotatableBonds',
        'MolLogP',
        'fr_Al_OH',
    ],
]

In [14]:
# Target vector: the binary Permeability label.
y2 = df.loc[:, 'Permeability']

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x2,
    y2,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions (both become NumPy arrays).
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

In [16]:
# Hyperparameters handed to the classifier; everything else stays at the library default.
params = dict(learning_rate=0.3)

model = XGBClassifier(**params)

In [17]:
# Train the classifier on the scaled training features and their labels.
model.fit(X=xtrain, y=ytrain)

In [18]:
# Predicted class labels for the scaled hold-out features.
ypred = model.predict(X=xtest)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [21]:
# Fraction of hold-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
accuracy

0.8072122052704577

In [22]:
# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.71      0.61      0.66       434
           1       0.84      0.89      0.87      1008

    accuracy                           0.81      1442
   macro avg       0.78      0.75      0.76      1442
weighted avg       0.80      0.81      0.80      1442



In [23]:
# F1 score on the hold-out set (scikit-learn's default binary setting, positive label 1).
print(f1_score(y_true=ytest, y_pred=ypred))

0.8660886319845857


In [26]:
import pickle

In [27]:
# Serialize the trained classifier to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails (the bare open() call previously left it to the garbage collector).
# NOTE: only the classifier is written here; the fitted StandardScaler is not
# part of this file, so any consumer must scale inputs the same way separately.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [29]:
# Restore the classifier from disk; the context manager closes the file handle
# deterministically instead of leaking it.
# SECURITY: pickle.load executes arbitrary code embedded in the file — only load
# model files that were produced by a trusted source.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [31]:
# Raw (unscaled) descriptor values for a single peptide: nine numbers, one per
# feature column of x2.
# NOTE(review): assumed to be in the same order as the columns of x2 — confirm.
input_data = (-0.390126486, 109.084361, 0, 12.32715344, 9.915555435, 0, 15, 3.269, 1)

input_data_as_numpy_array = np.asarray(input_data)

# Shape (1, n_features): a batch containing exactly one sample.
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The classifier was fitted on StandardScaler-transformed features, so a raw
# sample must go through the *same fitted scaler* before prediction; feeding
# unscaled values puts the input on a different scale than the training data.
# Wrapping the row in a DataFrame with the training column names lets the
# scaler match features by name.
input_frame = pd.DataFrame(input_data_reshaped, columns=x2.columns)
input_data_scaled = scaler.transform(input_frame)

prediction = loaded_model.predict(input_data_scaled)
print(prediction)

# Label 0 was assigned to Permeability < -6, label 1 to Permeability >= -6.
if prediction[0] == 0:
    print('Low Permeability')
else:
    print('Good Permeability')

[0]
Low Permeability
