In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.neural_network import MLPClassifier
import math
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

#RF
def fitness_function_rf(hyperparameters, X, y, cv=5):
    n_estimators = int(hyperparameters[0])
    # max_depth = int(hyperparameters[1])
    min_samples_split = int(hyperparameters[1])
    min_samples_leaf = int(hyperparameters[2])
    model = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf, random_state=100)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)

#SVM
def fitness_function_svm(hyperparameters, X, y, cv=5):
    C = int(hyperparameters[0])
    gamma = hyperparameters[1]
    #kernel = hyperparameters[2]
    model = SVC(C=C, gamma=gamma, kernel='rbf', random_state=100)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)

#DT
def fitness_function_dt(hyperparameters, X, y, cv=5):
    min_samples_split = int(hyperparameters[0])
    min_samples_leaf = int(hyperparameters[1])
    model = DecisionTreeClassifier( min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=100)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)
#LR
def fitness_function_lr(hyperparameters, X, y, cv=5):
    C = hyperparameters[0]
    #penalty = hyperparameters[1]
    model = LogisticRegression(C=C, penalty='l2', solver='liblinear', random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)
#KNN
def fitness_function_knn(hyperparameters, X, y, cv=5):
    n_neighbors = int(hyperparameters[0])
    #weights = hyperparameters[1]
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)
#MLP
def fitness_function_mlp(hyperparameters, X, y, cv=5):
    layer_1_size = int(hyperparameters[0])
    layer_2_size = int(hyperparameters[1])
    dropout_rate = hyperparameters[2]
    #learning_rate = hyperparameters[3]
    batch_size = int(hyperparameters[3])
    model = MLPClassifier(solver='sgd', alpha=1e-5,
                    hidden_layer_sizes=(layer_1_size, layer_2_size), activation= 'logistic',random_state=1, batch_size=batch_size, max_iter= 1000)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)


def fitness_function_stacking(hyperparameters, X, y, cv=5):
    # Define base models based on the hyperparameters

    base_models = [
         ('rf', RandomForestClassifier(n_estimators=int(hyperparameters[0]),min_samples_split=int(hyperparameters[1]),min_samples_leaf=int(hyperparameters[2]), random_state=100)),
         ('svm', SVC(C=int(hyperparameters[3]), gamma=hyperparameters[4], kernel='rbf', random_state=100)),
         ('dt', DecisionTreeClassifier(max_depth=int(hyperparameters[5]), min_samples_split=int(hyperparameters[6]),min_samples_leaf=int(hyperparameters[7]), random_state=100))
    ]
    # Define meta-learner
    meta_learner = LogisticRegression(random_state=100)  # Simple Logistic Regression for this example

    model = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=cv)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(cv_scores)


In [None]:
# Define Search Spaces
search_space_rf = {
    'n_estimators': (100, 300),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 5)
}
search_space_svm = {
    "C": (1, 1000),
    "gamma": (0.0001, 0.001)
}
search_space_dt = {
    "min_samples_split": (2, 10),
    "min_samples_leaf": (1, 5)
}
search_space_lr = {
    "C": (0.1, 10.0)
}
search_space_knn = {
    "n_neighbors": (3, 30)
}
search_space_mlp = {
    "layer_1_size": (100, 250),
    "layer_2_size": (50, 150),
    "dropout_rate": (0.0, 0.4),
    #"learning_rate": (0.0001, 0.01),
    "batch_size": (32, 128)
}

search_space_stacking = {
  'n_estimators': (100, 300),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 5),
    "C": (1, 1000),
    "gamma": (0.0001, 0.001),
    "max_depth": (3, 10),
    "min_samples_split_dt": (2, 10),
    "min_samples_leaf_dt": (1, 5)
}


#**TSO**

In [None]:
def initialize_population(population_size, search_space):
    population = []
    for _ in range(population_size):
        tuna = []
        for _, (min_val, max_val) in search_space.items():
            val = random.uniform(min_val, max_val)
            tuna.append(val)
        population.append(tuna)
    return population


def evaluate_population(population, fitness_function, X, y):
    fitness_values = []
    for tuna in population:
        fitness = fitness_function(tuna, X, y)
        fitness_values.append(fitness)
    return fitness_values

def update_tuna_positions(population, fitness_values, search_space, iteration, max_iterations, a=0.3, z=0.06):
    best_index = np.argmax(fitness_values)
    best_tuna = np.array(population[best_index])
    new_population = []

    for i, tuna in enumerate(population):
        tuna = np.array(tuna)
        new_tuna = np.zeros_like(tuna)
        alpha1 = a + (1 - a) * iteration / max_iterations
        alpha2 = (1 - a) - (1 - a) * iteration / max_iterations
        p = (1 - iteration / max_iterations + 1e-8) ** (iteration / max_iterations)
        num_dimensions = len(search_space)
        min_vals = np.array([val[0] for val in search_space.values()])
        max_vals = np.array([val[1] for val in search_space.values()])

        if random.random() < z:  # Equation 1
            new_tuna = np.random.uniform(min_vals, max_vals)
        else:
            if random.random() < 0.5: # Equation 6
              rand_vals = np.random.random(num_dimensions)
              rand_vals_uniform = np.random.uniform(-1,1, num_dimensions)
              new_tuna = best_tuna + rand_vals * (best_tuna - tuna) + rand_vals_uniform * p**2 * (best_tuna - tuna)
            else:
                if iteration / max_iterations >= random.random():  # Equation 7
                  beta = np.exp(np.random.uniform(0, 1, num_dimensions) * iteration) * np.cos(np.random.uniform(0, 1, num_dimensions) * iteration)
                  new_tuna = alpha1 * best_tuna + beta * (best_tuna - tuna) + alpha2 * tuna
                else:  # Equation 2
                  rand_vals_uniform = np.random.uniform(min_vals, max_vals, num_dimensions)
                  beta = np.exp(np.random.uniform(0, 1, num_dimensions) * iteration) * np.cos(np.random.uniform(0, 1, num_dimensions) * iteration)
                  new_tuna = alpha1 * rand_vals_uniform + beta * (rand_vals_uniform - tuna) + alpha2 * tuna

            new_tuna = np.maximum(np.minimum(new_tuna, max_vals), min_vals)

        new_population.append(new_tuna.tolist())

    return new_population
def tso_optimization(fitness_function, search_space, X, y, population_size=30, max_iterations=30, terminate=5):
    population = initialize_population(population_size, search_space)
    best_fitness = float('-inf')
    best_solution = None
    counter = 0
    last_best_fitness = float('-inf')

    for iteration in range(max_iterations):
      fitness_values = evaluate_population(population, fitness_function, X, y)

      current_best_index = np.argmax(fitness_values)
      current_best_fitness = fitness_values[current_best_index]

      if current_best_fitness > best_fitness:
        best_fitness = current_best_fitness
        best_solution = population[current_best_index]
        counter=0
      else:
        counter +=1

      print(f"Iteration {iteration + 1}/{max_iterations}, Best Fitness: {best_fitness}")

      if counter >= terminate:
        print(f"Best Fitness: {best_fitness}")
        break

      last_best_fitness = best_fitness
      population = update_tuna_positions(population, fitness_values, search_space, iteration, max_iterations)
    return best_solution, best_fitness

In [None]:
data = pd.read_csv('synthesized_data_from_original.csv')
mapping = {'inactive': 0, 'intermediate': 1, 'active': 2}
data['labels'] = data['label'].map(mapping)
print(data.head)


<bound method NDFrame.head of        PubchemFP0  PubchemFP1  PubchemFP2  PubchemFP3  PubchemFP4  PubchemFP5  \
0               1           1           0           0           0           0   
1               1           1           1           1           0           0   
2               1           1           1           1           0           0   
3               1           1           1           1           0           0   
4               1           1           1           1           0           0   
...           ...         ...         ...         ...         ...         ...   
13474           1           1           0           0           0           0   
13475           1           1           1           0           0           0   
13476           1           1           1           1           0           0   
13477           1           1           1           1           0           0   
13478           1           1           1           1           0           0  

In [None]:

x = data.drop(['labels','label', 'pIC50'], axis=1)
y = data.labels
print(x, y)

       PubchemFP0  PubchemFP1  PubchemFP2  PubchemFP3  PubchemFP4  PubchemFP5  \
0               1           1           0           0           0           0   
1               1           1           1           1           0           0   
2               1           1           1           1           0           0   
3               1           1           1           1           0           0   
4               1           1           1           1           0           0   
...           ...         ...         ...         ...         ...         ...   
13474           1           1           0           0           0           0   
13475           1           1           1           0           0           0   
13476           1           1           1           1           0           0   
13477           1           1           1           1           0           0   
13478           1           1           1           1           0           0   

       PubchemFP6  PubchemF

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
x = selection.fit_transform(x)
x = pd.DataFrame(x)
x.to_csv('X_low_variance_removed_3class.csv', index=False)






In [None]:
# on the k7 previous resampling
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
import pandas as pd

rfc = RandomForestClassifier(random_state=100)
rfecv = RFECV(estimator=rfc, step=1, cv=5, scoring='accuracy')
rfecv.fit(x, y)
selected_features = x.columns[rfecv.support_]

# Print the optimal number of features and selected feature names
print("Optimal number of features: %d" % rfecv.n_features_)
print("Selected Features:", selected_features.tolist())  # Convert index to list for better readability

# Create DataFrame with the selected features
X_selected = x[selected_features]

# Save the selected features to a CSV file
X_selected.to_csv('X_recursive_feature_elimination.csv', index=False)

Optimal number of features: 119
Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 65, 67, 68, 69, 70, 71, 73, 74, 75, 77, 79, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 96, 97, 98, 99, 100, 101, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133]


In [None]:
X_selected = x[selected_features]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV, SelectKBest, mutual_info_classif
import pandas as pd

# Apply SelectKBest to the RFECV selected features
k = 60  # desired number of features
selector = SelectKBest(score_func=mutual_info_classif, k=k) # you can use other score functions
selector.fit(X_selected, y)
selected_features_kbest = X_selected.columns[selector.get_support()]
X_selected = X_selected[selected_features_kbest]


In [None]:
X_selected

Unnamed: 0,0,96
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
5171,0,0
5172,0,0
5173,0,0
5174,0,0


In [None]:
data_df = pd.concat([X_selected, y], axis=1)

X  = data_df.drop(['labels'], axis=1)
Y = data_df.labels


In [None]:

best_hyperparameters, best_fitness = tso_optimization(fitness_function_rf, search_space_rf, x, y)
print("Best Hyperparameters:", best_hyperparameters)
print("Best Fitness:", best_fitness)

Iteration 1/30, Best Fitness: 0.7738654668773364
Iteration 2/30, Best Fitness: 0.7873683268829517
Iteration 3/30, Best Fitness: 0.79323037496628
Iteration 4/30, Best Fitness: 0.79323037496628
Iteration 5/30, Best Fitness: 0.79323037496628
Iteration 6/30, Best Fitness: 0.79323037496628
Iteration 7/30, Best Fitness: 0.79323037496628
Iteration 8/30, Best Fitness: 0.79323037496628
Best Fitness: 0.79323037496628
Best Hyperparameters: [123.01022527902597, 4.2438925270400185, 1.0]
Best Fitness: 0.79323037496628


In [None]:
best_hyperparameters, best_fitness = tso_optimization(fitness_function_svm, search_space_svm, X, y)
print("Best Hyperparameters:", best_hyperparameters)
print("Best Fitness:", best_fitness)

Iteration 1/30, Best Fitness: 0.7136964540334614
Iteration 2/30, Best Fitness: 0.7218583154869717
Iteration 3/30, Best Fitness: 0.7218583154869717


In [None]:
best_hyperparameters, best_fitness = tso_optimization(fitness_function_knn, search_space_knn, X, y)
print("Best Hyperparameters:", best_hyperparameters)
print("Best Fitness:", best_fitness)

Iteration 1/30, Best Fitness: 0.7613322286022581
Iteration 2/30, Best Fitness: 0.7613322286022581
Iteration 3/30, Best Fitness: 0.7613322286022581
Iteration 4/30, Best Fitness: 0.7613322286022581
Iteration 5/30, Best Fitness: 0.7613322286022581
Iteration 6/30, Best Fitness: 0.7613322286022581
Best Fitness: 0.7613322286022581
Best Hyperparameters: [4.567498511795449]
Best Fitness: 0.7613322286022581


In [None]:
best_hyperparameters, best_fitness = tso_optimization(fitness_function_dt, search_space_dt, X, y)
print("Best Hyperparameters:", best_hyperparameters)
print("Best Fitness:", best_fitness)

Iteration 1/30, Best Fitness: 0.765851422845912
Iteration 2/30, Best Fitness: 0.765851422845912
Iteration 3/30, Best Fitness: 0.7685221836239217
Iteration 4/30, Best Fitness: 0.7685221836239217
Iteration 5/30, Best Fitness: 0.7685221836239217
Iteration 6/30, Best Fitness: 0.7685221836239217
Iteration 7/30, Best Fitness: 0.7685221836239217
Iteration 8/30, Best Fitness: 0.7685221836239217
Best Fitness: 0.7685221836239217
Best Hyperparameters: [3.841846460951925, 1.0]
Best Fitness: 0.7685221836239217


In [None]:
best_hyperparameters, best_fitness = tso_optimization(fitness_function_stacking, search_space_stacking, X, y)
print("Best Hyperparameters for Stacking:", best_hyperparameters)
print("Best Fitness for Stacking:", best_fitness)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
best_hyperparameters, best_fitness = tso_optimization(fitness_function_mlp, search_space_mlp, X, y)
print("Best Hyperparameters:", best_hyperparameters)
print("Best Fitness:", best_fitness)

NameError: name 'X' is not defined