In [1]:
import multiprocessing
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [20]:
# Load the data
data = pd.read_csv("Xy_bladder.csv", index_col=0)
X = data.drop("label", axis=1)
y = data["label"]

X_neg = X[data['label'] == 'negative']
X_pos = X[data['label'] == 'positive']
y_neg = y[y=='negative']
y_pos = y[y=='positive']

# Define the models
models = {
    "SVM": SVC(),
    "LR": LogisticRegression(),
    "GBM": GradientBoostingClassifier(),
    "RF": RandomForestClassifier(),
    "ANN": MLPClassifier(max_iter=1000)
}

# Define the hyperparameters for each model
params = {
    "ANN": {
        "hidden_layer_sizes": [(100,), (50,50), (25,25,25)],
        "alpha": [0.001, 0.01, 0.1]
    },
    "SVM": {
        "C": [0.1, 1, 10],
        "gamma": [0.1, 1, "scale", "auto"]
    },
    "LR": {
        "C": [0.1, 1, 10]
    },
    "GBM": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1],
        "max_depth": [3, 4, 5]
    },
    "RF": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 4, 5],
        "max_features": ["sqrt", "log2"]
    }
}

# Define the function to run the models
def run_model(model_name, model, X, y):
    # Define the cross-validation method
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # Define the grid search with cross-validation
    grid_search = GridSearchCV(model, params[model_name], scoring="roc_auc", cv=cv, n_jobs=-1, error_score='raise')
    # Fit the grid search to the data
    grid_search.fit(X, y)
    # Print the best hyperparameters and the best score
    print(model_name + " best hyperparameters: " + str(grid_search.best_params_))
    print(model_name + " best score: " + str(grid_search.best_score_))
    # Save the best model and its parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    return best_model, best_params

for i in range(10):
    X = pd.concat((X_pos, X_neg[i::10]), axis=0)
    y = pd.concat((y_pos, y_neg[i::10]), axis=0)
    # Run the models in parallel
    pool = multiprocessing.Pool(processes=len(models))
    results = [pool.apply_async(run_model, args=(name, model, X, y)) for name, model in models.items()]
    output = [p.get() for p in results]

    # Save the best models and their parameters
    best_models = {}
    best_params = {}
    for i, (name, _) in enumerate(models.items()):
        best_models[name] = output[i][0]
        best_params[name] = output[i][1]

GBM best hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
GBM best score: 0.81875
RF best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 200}
RF best score: 0.8387499999999999


  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-

LR best hyperparameters: {'C': 0.1}
LR best score: 0.6537499999999999
SVM best hyperparameters: {'C': 10, 'gamma': 0.1}
SVM best score: 0.66625
ANN best hyperparameters: {'alpha': 0.01, 'hidden_layer_sizes': (100,)}
ANN best score: 0.70625
RF best hyperparameters: {'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 100}
RF best score: 0.75625
GBM best hyperparameters: {'learning_rate': 1, 'max_depth': 4, 'n_estimators': 100}
GBM best score: 0.6575


  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-

LR best hyperparameters: {'C': 0.1}
LR best score: 0.74
SVM best hyperparameters: {'C': 10, 'gamma': 'scale'}
SVM best score: 0.80375
ANN best hyperparameters: {'alpha': 0.1, 'hidden_layer_sizes': (50, 50)}
ANN best score: 0.7675
RF best hyperparameters: {'max_depth': 4, 'max_features': 'log2', 'n_estimators': 100}
RF best score: 0.8262499999999999
GBM best hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
GBM best score: 0.73625


  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-

LR best hyperparameters: {'C': 1}
LR best score: 0.8387499999999999
SVM best hyperparameters: {'C': 10, 'gamma': 'auto'}
SVM best score: 0.8162499999999999
ANN best hyperparameters: {'alpha': 0.001, 'hidden_layer_sizes': (25, 25, 25)}
ANN best score: 0.835
RF best hyperparameters: {'max_depth': 5, 'max_features': 'log2', 'n_estimators': 100}
RF best score: 0.79
GBM best hyperparameters: {'learning_rate': 1, 'max_depth': 4, 'n_estimators': 200}
GBM best score: 0.765


  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-

LR best hyperparameters: {'C': 10}
LR best score: 0.8550000000000001
SVM best hyperparameters: {'C': 10, 'gamma': 'scale'}
SVM best score: 0.8099999999999999
ANN best hyperparameters: {'alpha': 0.01, 'hidden_layer_sizes': (100,)}
ANN best score: 0.85625
GBM best hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
GBM best score: 0.875625
RF best hyperparameters: {'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 200}
RF best score: 0.85125


  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-

LR best hyperparameters: {'C': 1}
LR best score: 0.805
SVM best hyperparameters: {'C': 0.1, 'gamma': 1}
SVM best score: 0.81875
ANN best hyperparameters: {'alpha': 0.001, 'hidden_layer_sizes': (25, 25, 25)}
ANN best score: 0.8375
RF best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 200}
RF best score: 0.8087500000000001
GBM best hyperparameters: {'learning_rate': 1, 'max_depth': 3, 'n_estimators': 100}
GBM best score: 0.79625


In [21]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
import multiprocessing

data = pd.read_csv("Xy_bladder.csv", index_col=0)
X = data.drop("label", axis=1)
y = data["label"]

X_neg = X[data['label'] == 'negative']
X_pos = X[data['label'] == 'positive']
y_neg = y[y=='negative']
y_pos = y[y=='positive']

# Define the parameters for the ANN
param_grid = {
    "hidden_layer_sizes": [(50,), (100,), (150,), (200,)],
    "alpha": [0.0001, 0.001, 0.01],
    "max_iter": [500, 1000, 2000],
}

# Create the ANN classifier object
classifier = MLPClassifier()

# Define the 10-fold cross-validation object
cv = StratifiedKFold(n_splits=2)

# Define the GridSearchCV object with AUROC as scoring metric
grid_search = GridSearchCV(
    classifier,
    param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=multiprocessing.cpu_count(),
)

for i in range(10):
    X = pd.concat((X_pos, X_neg[i::10]), axis=0)
    y = pd.concat((y_pos, y_neg[i::10]), axis=0)

# Fit the GridSearchCV object to the data
    grid_search.fit(X, y.values.ravel())

    # Print the best parameters and AUROC score
    print("Best parameters: ", grid_search.best_params_)
    print("Best AUROC score: ", grid_search.best_score_)

    # Save the best model parameters for future predictions
    best_params = grid_search.best_params_
    best_model = MLPClassifier(**best_params)
    best_model.fit(X, y.values.ravel())


Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'max_iter': 2000}
Best AUROC score:  0.7892857142857143
Best parameters:  {'alpha': 0.01, 'hidden_layer_sizes': (50,), 'max_iter': 1000}
Best AUROC score:  0.6714285714285715
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'max_iter': 1000}
Best AUROC score:  0.825
Best parameters:  {'alpha': 0.001, 'hidden_layer_sizes': (50,), 'max_iter': 2000}
Best AUROC score:  0.6726190476190477
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'max_iter': 1000}
Best AUROC score:  0.786904761904762
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'max_iter': 500}
Best AUROC score:  0.7357142857142857
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (200,), 'max_iter': 2000}
Best AUROC score:  0.7785714285714287
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'max_iter': 1000}
Best AUROC score:  0.8285714285714286




Best parameters:  {'alpha': 0.01, 'hidden_layer_sizes': (50,), 'max_iter': 500}
Best AUROC score:  0.7619047619047619




Best parameters:  {'alpha': 0.001, 'hidden_layer_sizes': (200,), 'max_iter': 2000}
Best AUROC score:  0.8523809523809524


In [23]:
best_model

### Predict top 10 possible bladder cancer target genes using ANN

In [59]:
X_predict = pd.read_csv('all_gene_dataset.csv', index_col=0)
# create a list of words to exclude
exclude_words = list(data.index)

# create a Boolean mask to exclude rows containing the exclude words
exclude_mask = ~X_predict.index.str.contains('|'.join(exclude_words))

# apply the Boolean mask to the DataFrame to exclude the rows
X_filtered = X_predict[exclude_mask]

# make predictions on the test data using the best_model
y_predict_pred_proba = best_model.predict_proba(X_predict)[:, 1]

# add the predicted probabilities to the X_predict DataFrame
X_predict['predicted_proba'] = y_predict_pred_proba

# select the top 10 observations with the highest predicted probabilities
best_observations = X_predict.nlargest(10, 'predicted_proba')

# print the top 10 observations with their predicted probabilities
print(best_observations[['predicted_proba']])


                  predicted_proba
Gene                             
MUC12 (10071)                 1.0
TP53 (7157)                   1.0
CYP2D6 (1565)                 1.0
TTN (7273)                    1.0
KIR2DL1 (3802)                1.0
CDKN2A (1029)                 1.0
DNAH5 (1767)                  1.0
FREM1 (158326)                1.0
LAMA1 (284217)                1.0
ADAMTSL1 (92949)              1.0


Feature names must be in the same order as they were in fit.

