In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
# Load the CycPeptMPDB peptide table; the path is resolved against the
# notebook's working directory.
peptide_csv_path = "CycPeptMPDB_Peptide.csv"
df = pd.read_csv(peptide_csv_path)

In [3]:
# Columns removed BEFORE the missing-value filter, so any gaps in them cannot
# cause rows to be discarded.
assay_columns = ['PAMPA', 'Caco2', 'MDCK', 'RRCK']

# Columns removed AFTER the missing-value filter, so rows with gaps in them
# are still filtered out first.
post_filter_columns = [
    'SMILES', 'Structurally_Unique_ID', 'Original_Name_in_Source_Literature',
    'Year', 'Source', 'CycPeptMPDB_ID',
    'Molecule_Shape', 'Monomer_Length_in_Main_Chain', 'Monomer_Length',
]

df = (
    df.drop(columns=assay_columns)
    .dropna()
    .drop(columns=post_filter_columns)
)

# Exclude rows whose Permeability is exactly -10.
# NOTE(review): -10 looks like a placeholder rather than a measurement -
# confirm against the dataset documentation.
df = df.loc[df['Permeability'] != -10]

In [4]:
def classify_permeability(value):
    """Map a numeric permeability value to one of three class labels.

    Returns:
        'good'          when value >= -5
        'moderate/low'  when -7 < value < -5
        'impermeable'   for everything else (value <= -7, and any value that
                        fails both comparisons, such as NaN)
    """
    if value >= -5:
        return 'good'
    # Both bounds are tested explicitly so that non-comparable values (NaN)
    # fall through to the final label instead of the middle band.
    in_middle_band = value > -7 and value < -5
    return 'moderate/low' if in_middle_band else 'impermeable'

# Attach the class label derived from 'Permeability', then retire the numeric
# 'Permeability' column now that the classified version exists.
df = (
    df.assign(Permeability_class=df['Permeability'].map(classify_permeability))
    .drop('Permeability', axis=1)
)

# Show how many rows fall into each class.
class_counts = df['Permeability_class'].value_counts()
print(class_counts)

Permeability_class
moderate/low    5718
good            1053
impermeable      438
Name: count, dtype: int64


In [5]:
# Separate the class label (target) from every remaining column (features).
target_column = 'Permeability_class'
y = df[target_column]
X = df.drop(target_column, axis=1)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary from the string classes, then replace `y` with
# its integer encoding. `le` is kept so the codes can be mapped back to names.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Show which string label each integer code stands for (position == code).
print(f"Encoded classes: {le.classes_}")

Encoded classes: ['good' 'impermeable' 'moderate/low']


In [7]:
# 80/20 split with a fixed seed, stratified on the encoded labels so both
# partitions keep the same class proportions.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [8]:
import xgboost as xgb

# Fit an XGBoost classifier on the training partition of the full feature set.
xgb_params = dict(use_label_encoder=False, eval_metric='mlogloss')
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(Xtrain, ytrain)

# Score the held-out partition.
xgb_preds = xgb_model.predict(Xtest)
report_text = classification_report(ytest, xgb_preds, target_names=le.classes_)
accuracy = accuracy_score(ytest, xgb_preds)
weighted_f1 = f1_score(ytest, xgb_preds, average='weighted')

print(report_text)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {weighted_f1}")

              precision    recall  f1-score   support

        good       0.81      0.69      0.75       211
 impermeable       0.76      0.51      0.61        87
moderate/low       0.91      0.96      0.94      1144

    accuracy                           0.89      1442
   macro avg       0.83      0.72      0.76      1442
weighted avg       0.89      0.89      0.89      1442

Accuracy: 0.8938973647711512
F1 Score: 0.8885519918669946


In [9]:
import pickle

In [10]:
# Serialize the model currently bound to `xgb_model` to disk.
filename = 'trained_model.sav'
# A context manager guarantees the handle is flushed and closed, even if
# pickling raises; a bare open(...) leaves that to garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [11]:
# Read the pickled model back from disk. The context manager closes the file
# handle as soon as loading finishes (or fails).
# SECURITY: pickle.load can execute arbitrary code - only load files you
# created yourself or otherwise trust.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Reduced feature set: six descriptor columns, in the order the second model
# is trained on.
descriptor_columns = ['MolLogP', 'TPSA', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds']
Xn = df.loc[:, descriptor_columns]

# Re-encode the class labels with a fresh encoder (rebinds `le`).
le = LabelEncoder()
yn = le.fit_transform(df['Permeability_class'])

# Show which string label each integer code stands for (position == code).
print(f"Encoded classes: {le.classes_}")

Encoded classes: ['good' 'impermeable' 'moderate/low']


In [13]:
# 80/20 split of the reduced feature set with a fixed seed.
# Stratify on `yn` - the labels actually being split - rather than on `y`,
# a variable left over from the earlier full-feature experiment.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    Xn,
    yn,
    test_size=0.2,
    random_state=10,
    stratify=yn,
)

In [14]:
import xgboost as xgb

# Fit a new XGBoost classifier on the current training partition. This
# rebinds `xgb_model`, replacing whatever model the name held before.
xgb_params = dict(use_label_encoder=False, eval_metric='mlogloss')
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(Xtrain, ytrain)

# Score the held-out partition.
xgb_preds = xgb_model.predict(Xtest)
report_text = classification_report(ytest, xgb_preds, target_names=le.classes_)
accuracy = accuracy_score(ytest, xgb_preds)
weighted_f1 = f1_score(ytest, xgb_preds, average='weighted')

print(report_text)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {weighted_f1}")

              precision    recall  f1-score   support

        good       0.81      0.65      0.72       211
 impermeable       0.73      0.49      0.59        87
moderate/low       0.90      0.96      0.93      1144

    accuracy                           0.89      1442
   macro avg       0.81      0.70      0.75      1442
weighted avg       0.88      0.89      0.88      1442

Accuracy: 0.8862690707350902
F1 Score: 0.8800158397463771


In [15]:
# Source column name -> name used by the training features.
# NOTE(review): the tPSA key looks mis-encoded; it is kept verbatim because it
# has to match the header exactly as pandas reads it from the CSV - confirm
# the file's encoding before changing it.
column_aliases = {
    'MW': 'MolWt',
    'miLogP': 'MolLogP',
    'HBD': 'NumHDonors',
    'HBA': 'NumHAcceptors',
    'RotB': 'NumRotatableBonds',
    'tPSA (Ã…Â²)': 'TPSA',
}

# Load, rename to the training names, discard the two columns that are not
# used downstream, and force the two descriptor columns to float.
data = (
    pd.read_csv('Cleaned_classification.csv')
    .rename(columns=column_aliases)
    .drop(columns=['Compound', 'F (%)'])
    .astype({'MolLogP': float, 'TPSA': float})
)
data

Unnamed: 0,MolWt,MolLogP,NumHDonors,NumHAcceptors,NumRotatableBonds,TPSA,0
0,541,1.6,4.0,10.0,2.0,143.0,2
1,836,-0.8,12.0,18.0,13.0,287.0,2
2,561,-2.3,9.0,15.0,8.0,236.0,2
3,566,3.9,5.0,10.0,10.0,145.0,2
4,679,4.7,6.0,12.0,12.0,174.0,2
...,...,...,...,...,...,...,...
131,1184,3.5,5.0,22.0,14.0,276.0,2
132,1170,3.7,5.0,21.0,13.0,258.0,2
133,1203,3.6,5.0,23.0,15.0,279.0,2
134,1322,2.9,6.0,25.0,19.0,302.0,2


In [16]:
# Column order the model expects - the same order used to build `Xn`.
desired_feature_order = ['MolLogP', 'TPSA', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds']

# Select by column LABEL rather than by position. The previous version
# translated names into positions through a hand-typed list of the frame's
# current layout, which silently picks the wrong columns if that list drifts
# from the real frame. Label-based selection raises KeyError on a missing
# column instead.
X_pred = data[desired_feature_order]

In [17]:
# Predict an encoded class for every row of `X_pred`, using whichever model
# `xgb_model` is bound to at this point in the kernel.
# NOTE(review): `loaded_model` (unpickled earlier) is never used - confirm
# which of the two models is meant to produce these predictions.
xgb_comparison = xgb_model.predict(X_pred)

In [18]:
# Wrap the raw predictions in a DataFrame so they can be written out as CSV.
ypred2 = pd.DataFrame(data=xgb_comparison)

In [19]:
# Persist the predictions without the row index.
predictions_csv = 'output.csv'
ypred2.to_csv(predictions_csv, index=False)

In [20]:
# Read the saved predictions back from disk.
da = pd.read_csv(filepath_or_buffer="output.csv")

In [21]:
# Pairwise correlation between the columns of `da`.
# NOTE(review): the recorded output is a 1x1 matrix (one column correlated
# with itself), so this compares the predictions against nothing - confirm
# what they were meant to be compared with.
da.corr()

Unnamed: 0,0
0,1.0
