##### Imports:

In [20]:
### Install

%pip install -U pandas scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

Note: you may need to restart the kernel to use updated packages.


##### MLPRegressor:

In [37]:
# Read the protein-ligand docking table from disk.
csv_file_path = "data/proteinliganddocking.csv"
data = pd.read_csv(csv_file_path)

# Columns present in the CSV:
# id,protein_id,ligand_id,protein_HELM,ligand_HELM,affinity,resolution,pdb_code,classification,source

# Identifier/bookkeeping columns plus the target itself are excluded from the
# feature matrix; whatever remains is one-hot encoded by pd.get_dummies.
non_feature_columns = ["id", "protein_id", "ligand_id", "affinity", "pdb_code", "source"]
X = pd.get_dummies(data.drop(columns=non_feature_columns))

# Regression target.
y = data["affinity"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Network configuration, collected in one place for readability.
# NOTE(review): per the scikit-learn docs, `learning_rate` applies only to
# solver='sgd' and `learning_rate_init` only to 'sgd'/'adam'; with 'lbfgs'
# they should have no effect - confirm before tuning them further.
mlp_settings = {
    "hidden_layer_sizes": (100, 50),
    "activation": 'relu',
    "solver": 'lbfgs',
    "learning_rate": 'adaptive',
    "learning_rate_init": 0.01,
    "max_iter": 1500,
    "alpha": 0.0001,
    "random_state": 42,
}
mlp = MLPRegressor(**mlp_settings)

# Train on the training split, then evaluate on the held-out split.
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {round(mse, 4)}")


Mean Squared Error: 0.8421


The Mean Squared Error started at 3.2383; with the help of RandomizedSearchCV, the optimal parameters were searched.

In [38]:
# Candidate values for each hyperparameter; RandomizedSearchCV samples
# combinations from these lists.
param_dist = {
    'hidden_layer_sizes': [(100,), (100, 50), (150, 100, 50), (100, 100, 10)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [500, 1000, 1500]
}

mlp = MLPRegressor(random_state=42)

# Score candidates by negated mean squared error so the cross-validation metric
# is the same quantity that is reported on the test set and stored below.
# Without an explicit `scoring`, the search uses the estimator's default score
# (R^2 for regressors), which must not be labelled as an MSE.
random_search = RandomizedSearchCV(
    estimator=mlp,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# One record per sampled candidate: its hyperparameters plus the mean
# cross-validated MSE. The scorer returns -MSE (higher is better), so the sign
# is flipped back here. A comprehension is used so that no loop variable leaks
# into the notebook namespace and overwrites the test-set `mse` computed earlier.
cv_results = random_search.cv_results_
results_list = [
    {**params, 'MSE': -neg_mse}
    for params, neg_mse in zip(cv_results['params'], cv_results['mean_test_score'])
]

# Persist the full search table for later inspection.
results_df = pd.DataFrame(results_list)
results_df.to_csv('hyperparameter_results.csv', index=False)

# Estimator refit on the whole training set with the best hyperparameters.
best_model = random_search.best_estimator_
print(best_model.get_params())

Best Hyperparameters: {'solver': 'lbfgs', 'max_iter': 1500, 'learning_rate_init': 0.01, 'learning_rate': 'constant', 'hidden_layer_sizes': (100, 50), 'alpha': 0.0001, 'activation': 'relu'}
{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant', 'learning_rate_init': 0.01, 'max_fun': 15000, 'max_iter': 1500, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'solver': 'lbfgs', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
