In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the feature matrix and the label table from disk
features = pd.read_csv('/content/GEO_HG_PPI.csv', header=0)
labels = pd.read_csv('/content/labels_GEO_HG.csv')

# Flip the label table and name its single resulting column 'Label'
# (assumes the transposed table has exactly one column and lines up
# row-for-row with the feature rows -- TODO confirm against the CSV)
labels = labels.T
labels.columns = ['Label']

# Index the features by the 'probe' column, then transpose so that each row
# is one sample and each column is one probe
features = features.set_index('probe').T

# Keep the row index so it can be re-attached after scaling
sample_ids = features.index

# Rescale every column with MinMaxScaler and wrap the resulting array back
# into a DataFrame carrying the original row/column labels.
# NOTE(review): the scaler is fit on every sample here, before the
# train/test splits performed in later cells, so held-out samples influence
# the scaling -- consider fitting on the training split only.
scaler = MinMaxScaler()
features_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    index=sample_ids,
    columns=features.columns,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hold out 30% of the samples for testing; stratify on the label column so
# both splits keep the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    features_scaled,
    labels['Label'],
    test_size=0.3,
    random_state=42,
    stratify=labels['Label']
)

# Base (level-0) learners whose predictions feed the meta-learner
base_models = [
    # max_iter is raised above the default cap: with the default the solver
    # stopped at its iteration limit (ConvergenceWarning) on this data.
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('nb', GaussianNB()),
    ('knn', KNeighborsClassifier()),
    # probability=True so the SVC exposes predict_proba
    ('svc', SVC(probability=True, random_state=42))
]

# Meta-learner (level-1) that combines the base models' outputs
meta_learner = LogisticRegression(random_state=42)

# Stacked ensemble; cv=5 sets the cross-validation used inside the stack
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5
)

# Fit on the training split
stacked_model.fit(X_train, y_train)

# Score on the held-out split
predictions = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Stacked Model Accuracy: {accuracy}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Stacked Model Accuracy: 0.7938144329896907


In [None]:
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable
Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
   --------------------------------------- 413.4/413.4 kB 12.6 MB/s eta 0:00:00
Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
   --------------------------------------- 233.4/233.4 kB 13.9 MB/s eta 0:00:00
Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
   ---------------------------------------- 78.7/78.7 kB 4.3 MB/s eta 0:00:00
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3

In [None]:
import optuna
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def objective(trial):
    """Score one hyperparameter configuration of the stacked ensemble.

    Draws the random-forest size/depth and the SVC regularisation strength
    from ``trial``, builds a ``StackingClassifier`` around them, and
    evaluates it with 3-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna trial object
        Used to suggest ``rf_n_estimators``, ``rf_max_depth`` and ``svc_C``.

    Returns
    -------
    float
        Mean cross-validated accuracy for the suggested configuration.
    """
    # Suggest hyperparameters
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 100, 300, step=50)
    rf_max_depth = trial.suggest_int("rf_max_depth", 10, 30, step=10)
    svc_C = trial.suggest_float("svc_C", 0.1, 10, log=True)

    # Base models; only the random forest and the SVC are tuned
    base_models = [
        # max_iter raised above the default cap to match the later cells of
        # this notebook and avoid the solver stopping at its iteration limit
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=42)),
        ('nb', GaussianNB()),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(C=svc_C, probability=True, random_state=42))
    ]

    # Meta-learner is fixed across trials
    meta_learner = LogisticRegression(random_state=42)

    # Stacked model built from the suggested base models
    stacked_model = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_learner,
        cv=5  # cross-validation used inside the stack
    )

    # Outer 3-fold cross-validation on the training split; the mean accuracy
    # is the value the study maximises
    scores = cross_val_score(stacked_model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
    return scores.mean()

# Seed the sampler so the hyperparameter search is reproducible, in line with
# the fixed random_state=42 used for the models themselves.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)  # Adjust the number of trials as needed

# Output the best hyperparameters
best_params = study.best_trial.params
print("Best trial:", best_params)

# Rebuild the tuned base models from the best trial
optimized_rf = RandomForestClassifier(
    n_estimators=best_params['rf_n_estimators'],
    max_depth=best_params['rf_max_depth'],
    random_state=42,
)
optimized_svc = SVC(C=best_params['svc_C'], probability=True, random_state=42)

optimized_base_models = [
    # max_iter raised above the default cap: with the default the solver
    # stopped at its iteration limit (ConvergenceWarning) on this data.
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ('rf', optimized_rf),
    ('nb', GaussianNB()),
    ('knn', KNeighborsClassifier()),
    ('svc', optimized_svc)
]

# Define the meta-learner in this cell instead of relying on a variable left
# behind by an earlier cell.
meta_learner = LogisticRegression(random_state=42)

optimized_stacked_model = StackingClassifier(
    estimators=optimized_base_models,
    final_estimator=meta_learner,
    cv=5
)

# Fit on the training split and score on the held-out split
optimized_stacked_model.fit(X_train, y_train)
optimized_predictions = optimized_stacked_model.predict(X_test)
optimized_accuracy = accuracy_score(y_test, optimized_predictions)
print(f"Optimized Stacked Model Accuracy: {optimized_accuracy}")


[I 2024-02-06 00:09:11,803] A new study created in memory with name: no-name-c33b9a4d-6735-4010-a5f2-cb9e5e835742
[I 2024-02-06 00:11:36,158] Trial 0 finished with value: 0.7256637168141592 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 20, 'svc_C': 0.3050331173050926}. Best is trial 0 with value: 0.7256637168141592.
[I 2024-02-06 00:13:59,130] Trial 1 finished with value: 0.7212389380530974 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 30, 'svc_C': 3.4078396638085353}. Best is trial 0 with value: 0.7256637168141592.
[I 2024-02-06 00:17:16,003] Trial 2 finished with value: 0.728613569321534 and parameters: {'rf_n_estimators': 250, 'rf_max_depth': 30, 'svc_C': 1.9229635235789775}. Best is trial 2 with value: 0.728613569321534.
[I 2024-02-06 00:20:41,117] Trial 3 finished with value: 0.7300884955752212 and parameters: {'rf_n_estimators': 300, 'rf_max_depth': 30, 'svc_C': 0.29586040430767313}. Best is trial 3 with value: 0.7300884955752212.
[I 2024-02-06 00:23:37,764]

[I 2024-02-06 01:36:50,264] Trial 38 finished with value: 0.7227138643067846 and parameters: {'rf_n_estimators': 300, 'rf_max_depth': 20, 'svc_C': 0.7083445029658395}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 01:38:30,993] Trial 39 finished with value: 0.7241887905604719 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 20, 'svc_C': 0.5360815552532207}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 01:40:20,293] Trial 40 finished with value: 0.724188790560472 and parameters: {'rf_n_estimators': 300, 'rf_max_depth': 30, 'svc_C': 1.4774077297267267}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 01:42:17,148] Trial 41 finished with value: 0.7300884955752213 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 20, 'svc_C': 1.161435030958048}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 01:43:42,608] Trial 42 finished with value: 0.7241887905604719 and parameters: {'rf_n_estimators': 150, 'rf_max_depth': 

[I 2024-02-06 02:43:40,777] Trial 76 finished with value: 0.7271386430678466 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 20, 'svc_C': 2.0919055520184187}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 02:45:30,847] Trial 77 finished with value: 0.7300884955752213 and parameters: {'rf_n_estimators': 250, 'rf_max_depth': 20, 'svc_C': 1.1050190631121954}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 02:47:31,479] Trial 78 finished with value: 0.7271386430678466 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 20, 'svc_C': 1.794129409355013}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 02:49:09,241] Trial 79 finished with value: 0.7212389380530974 and parameters: {'rf_n_estimators': 150, 'rf_max_depth': 30, 'svc_C': 1.212837206993083}. Best is trial 12 with value: 0.7300884955752213.
[I 2024-02-06 02:51:08,514] Trial 80 finished with value: 0.728613569321534 and parameters: {'rf_n_estimators': 250, 'rf_max_depth': 2

Best trial: {'rf_n_estimators': 250, 'rf_max_depth': 30, 'svc_C': 1.2640016632692503}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Optimized Stacked Model Accuracy: 0.7938144329896907


In [None]:
from joblib import dump, load

# Persist the tuned stacking ensemble with joblib, then read it straight back
dump(optimized_stacked_model, filename='optimized_stacked_model.joblib')
loaded_model = load(filename='optimized_stacked_model.joblib')

# Round-trip check: score the restored model on the held-out split
loaded_model_predictions = loaded_model.predict(X_test)
loaded_model_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=loaded_model_predictions,
)
print(f"Loaded Model Accuracy: {loaded_model_accuracy}")


Loaded Model Accuracy: 0.7938144329896907


In [None]:
import pickle

# Serialise the tuned stacking ensemble with the standard-library pickle
with open('optimized_stacked_model.pkl', 'wb') as sink:
    pickle.dump(optimized_stacked_model, sink)

# Restore it from the file that was just written.
# NOTE: only unpickle files you created yourself; pickle can execute
# arbitrary code when loading untrusted data.
with open('optimized_stacked_model.pkl', 'rb') as source:
    loaded_model = pickle.load(source)

# Round-trip check: score the restored model on the held-out split
loaded_model_predictions = loaded_model.predict(X_test)
loaded_model_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=loaded_model_predictions,
)
print(f"Loaded Model Accuracy: {loaded_model_accuracy}")


Loaded Model Accuracy: 0.7938144329896907


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# xgboost is used in the base-model list below; import it here so this cell
# runs on a fresh kernel instead of relying on a later cell's import.
import xgboost as xgb

# Hold out 20% of the samples for testing; stratify on the label column so
# both splits keep the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    features_scaled,
    labels['Label'],
    test_size=0.2,
    random_state=42,
    stratify=labels['Label']
)


# Base (level-0) learners, now including an MLP and an XGBoost classifier
base_models = [
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('nb', GaussianNB()),
    ('knn', KNeighborsClassifier()),
    # probability=True so the SVC exposes predict_proba
    ('svc', SVC(probability=True, random_state=42)),
    ('mlp', MLPClassifier(max_iter=1000, random_state=42)),
    ('xgb_model', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
]

# Meta-learner (level-1) that combines the base models' outputs
meta_learner = LogisticRegression(random_state=42)

# Stacked ensemble; cv=5 sets the cross-validation used inside the stack
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5
)

# Fit on the training split
stacked_model.fit(X_train, y_train)

# Score on the held-out split
predictions = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Stacked Model Accuracy: {accuracy}")


Stacked Model Accuracy: 0.8041237113402062


In [None]:
import xgboost as xgb
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Step 1: Data
# Nothing is loaded here; X_train / X_test / y_train / y_test are the splits
# already present in the notebook namespace.


# Step 2: Base models (logistic regression, random forest, XGBoost) and a
# logistic-regression meta-learner
base_models = [
    ('glm_model', LogisticRegression(max_iter=1000)),
    ('drf_model', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('xgb_model', xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
]
meta_learner = LogisticRegression()

# Step 3: Assemble the stacked ensemble
stacked_ensemble = StackingClassifier(
    final_estimator=meta_learner,
    estimators=base_models,
)

# Step 4: Fit on the training split
stacked_ensemble.fit(X_train, y_train)

# Step 5: Score on the held-out split
predictions = stacked_ensemble.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Stacked Model Accuracy: {accuracy}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Stacked Model Accuracy: 0.788659793814433


In [None]:
# Display the sample-ID index taken from the transposed feature table
sample_ids

Index(['GSM177885', 'GSM177887', 'GSM177894', 'GSM177895', 'GSM177899',
       'GSM177900', 'GSM177901', 'GSM177902', 'GSM177909', 'GSM177918',
       ...
       'GSM615702', 'GSM615703', 'GSM615704', 'GSM615705', 'GSM615761',
       'GSM615763', 'GSM615764', 'GSM615766', 'GSM615768', 'GSM615775'],
      dtype='object', length=969)