# MealPy

Suponga que tiene una población de x individuos/soluciones, cada uno de los cuales se evalúa una sola vez por iteración. Esto significa que, en cada iteración del algoritmo, la función de evaluación se llama x veces (una por cada individuo/solución). Por lo tanto, se tiene la relación: `Número de evaluaciones = Número de iteraciones * x`
Por lo general, los investigadores establecen el número máximo de evaluaciones de funciones como criterio de parada para asegurarse de que los algoritmos que se comparan hayan muestreado el espacio de búsqueda el mismo número de veces.

In [None]:
!pip install mealpy

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
sns.set()

In [20]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import mealpy
from mealpy import FloatVar, ALO, BinaryVar
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

In [33]:
#MODELS
def models(name:str='dt', random_state=None):
  """Build a fresh, unfitted scikit-learn classifier selected by a short key.

  Args:
    name: Key of the classifier to build. Supported keys:
      'dt'  -> DecisionTreeClassifier
      'ab'  -> AdaBoostClassifier
      'nb'  -> MultinomialNB
      'rf'  -> RandomForestClassifier
      'knn' -> KNeighborsClassifier(n_neighbors=9)
      'svm' -> SVC
    random_state: Optional seed forwarded to the estimators that accept one
      ('dt', 'ab', 'rf', 'svm'). The default None leaves them unseeded,
      which is what a call without this argument has always produced.

  Returns:
    A new estimator instance; apart from the settings listed above it uses
    the library defaults.

  Raises:
    KeyError: if name is not one of the supported keys.
  """
  # Zero-argument factories: only the requested estimator is instantiated,
  # instead of constructing all six on every call.
  factories = {
    'dt': lambda: DecisionTreeClassifier(random_state=random_state),
    'ab': lambda: AdaBoostClassifier(random_state=random_state),
    'nb': lambda: MultinomialNB(),
    'rf': lambda: RandomForestClassifier(random_state=random_state),
    'knn': lambda: KNeighborsClassifier(n_neighbors=9),
    'svm': lambda: SVC(random_state=random_state),
  }
  try:
    build = factories[name]
  except KeyError:
    # Same exception type as before, but the message lists the valid keys.
    raise KeyError(f"Unknown model '{name}'; expected one of {sorted(factories)}") from None
  return build()

In [31]:
#PRE-SETS
# Load the dataset and split it into a feature matrix, encoded labels and feature names.
df_hd = pd.read_csv('HD_filtered.csv')

# Drop the non-feature columns by name once, and take both the matrix and the
# column names from that same frame so X[:, i] always corresponds to clases[i],
# wherever 'Samples' and 'Grade' sit in the CSV.
features_df = df_hd.drop(columns=['Samples','Grade'])
X = features_df.to_numpy()
clases = list(features_df.columns)

# Encode the 'Grade' labels as integers for the classifiers.
encoder = LabelEncoder()
y = encoder.fit_transform(df_hd.Grade.to_numpy())

In [6]:
# Univariate pre-filter: keep the 100 features with the highest f_classif score.
kbest = SelectKBest(score_func=f_classif, k=100)
kbest.fit(X, y)
X_select = kbest.transform(X)

# Column positions (relative to X) that survived the filter, computed once.
support_idx = kbest.get_support(indices=True)
print("Columnas seleccionadas:", len(support_idx))

# selected_features[j] is the name of column j of X_select.
selected_features = [clases[col] for col in support_idx]
print(f"Features seleccionadas: {selected_features}")


Columnas seleccionadas: 100
Features seleccionadas: ['ENSG00000114948', 'ENSG00000056736', 'ENSG00000078328', 'ENSG00000184545', 'ENSG00000106617', 'ENSG00000153310', 'ENSG00000135439', 'ENSG00000171208', 'ENSG00000111218', 'ENSG00000084090', 'ENSG00000145012', 'ENSG00000100852', 'ENSG00000100027', 'ENSG00000111785', 'ENSG00000069966', 'ENSG00000157680', 'ENSG00000102401', 'ENSG00000127152', 'ENSG00000164068', 'ENSG00000157087', 'ENSG00000154556', 'ENSG00000175471', 'ENSG00000276644', 'ENSG00000130477', 'ENSG00000153885', 'ENSG00000198363', 'ENSG00000196236', 'ENSG00000100592', 'ENSG00000112697', 'ENSG00000188786', 'ENSG00000197006', 'ENSG00000152413', 'ENSG00000169871', 'ENSG00000113916', 'ENSG00000106610', 'ENSG00000109919', 'ENSG00000198142', 'ENSG00000143774', 'ENSG00000139218', 'ENSG00000158195', 'ENSG00000169567', 'ENSG00000075415', 'ENSG00000122012', 'ENSG00000134376', 'ENSG00000104723', 'ENSG00000144909', 'ENSG00000140443', 'ENSG00000091879', 'ENSG00000198740', 'ENSG00000104863

## Random Forest

In [18]:
# Sanity check only: the forest is scored on the very rows it was fitted on.
model = models('rf')
model.fit(X_select, y)
baseline_score = model.score(X_select, y)
print(f"Base line score: {baseline_score}")

Base line score: 1.0


In [17]:
# 4-fold cross-validated accuracy of a fresh Random Forest on the pre-filtered features.
model = models('rf')
scores = cross_val_score(estimator=model, X=X_select, y=y, scoring='accuracy', cv=4)
acc = scores.mean()
print(f"Cross val score: {acc}")

Cross val score: 0.5371287128712872


In [16]:
# Stratified 70/30 hold-out evaluation of a Random Forest on the pre-filtered features.
model = models('rf')
xtrain, xtest, ytrain, ytest = train_test_split(
  X_select, y, test_size=0.3, random_state=42, stratify=y
)
model.fit(xtrain, ytrain)

train_score = model.score(xtrain, ytrain)
print(f"Train score: {train_score}")
test_score = model.score(xtest, ytest)
print(f"Test score: {test_score}")

Train score: 1.0
Test score: 0.6147540983606558


In [27]:
#OBJECTIVE
def fitness_function(solution):
  """Feature-selection objective for the Random Forest run (to be maximised).

  Args:
    solution: 1-D mask over the columns of X_select; every non-zero entry
      marks a selected feature.

  Returns:
    Hold-out accuracy minus the fraction of selected features, or -1.0
    (the lower bound of that expression) when no feature is selected.
  """
  # NOTE(review): any non-zero value counts as "selected"; confirm this matches
  # how mealpy encodes BinaryVar positions before they reach the objective.
  selected_indices = np.flatnonzero(solution)

  # An empty mask would hand the classifier a 0-column matrix and make fit()
  # raise, aborting the whole optimisation run; score it as the worst case.
  if selected_indices.size == 0:
    return -1.0

  X_new = X_select[:, selected_indices]

  model = models('rf')
  # Pin the forest's seed so the same mask always receives the same fitness;
  # otherwise the optimiser compares candidates on random noise.
  model.set_params(random_state=42)
  xtrain,xtest,ytrain,ytest = train_test_split(X_new,y,test_size=0.2,random_state=42,stratify=y)
  model.fit(xtrain,ytrain)
  acc = accuracy_score(ytest, model.predict(xtest))

  # Penalise large subsets: fraction of the pre-filtered features in use.
  penalizacion = len(selected_indices)/X_select.shape[1]
  return acc - penalizacion

# Problem: one binary decision variable per pre-filtered feature, maximising fitness_function.
# NOTE(review): no "log_to" key is set; confirm whether mealpy uses "log_file" without it.
n_features = X_select.shape[1]
problem_dict = dict(
  bounds=BinaryVar(n_vars=n_features),
  obj_func=fitness_function,
  minmax="max",
  log_file="rf_result.log",
)

# Optimizer: Ant Lion Optimizer with 10 epochs and a population of 5.
optimizer = ALO.OriginalALO(epoch=10, pop_size=5)
g_best = optimizer.solve(problem_dict)

INFO:mealpy.swarm_based.ALO.OriginalALO:Solving single objective optimization problem.
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 1, Current best: 0.30962962962962964, Global best: 0.30962962962962964, Runtime: 2.15422 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 2, Current best: 0.30962962962962964, Global best: 0.30962962962962964, Runtime: 1.89003 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 3, Current best: 0.36962962962962964, Global best: 0.36962962962962964, Runtime: 1.48353 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 4, Current best: 0.36962962962962964, Global best: 0.36962962962962964, Runtime: 1.54905 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 5, Current best: 0.38197530864197526, Global best: 0.38197530864197526, Runtime: 1.50857 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 6, Current best: 0.38197530864197526, Global best: 0.381

In [28]:
# Translate the best mask back into feature names. The mask positions refer to
# the columns of X_select, whose names are stored in selected_features; indexing
# df_hd.columns with them is only right if the filter kept the leading columns.
selected_indices = np.flatnonzero(g_best.solution)
selected_variables = [selected_features[i] for i in selected_indices]

print(f"Variables seleccionadas: {list(selected_variables)}")
print(f"Cantidad de variables seleccionadas: {len(selected_variables)}")
print(f"Mejor valor de aptitud: {g_best.target.fitness}")

Variables seleccionadas: ['ENSG00000078328', 'ENSG00000084090', 'ENSG00000111785', 'ENSG00000157087', 'ENSG00000175471', 'ENSG00000130477', 'ENSG00000198363', 'ENSG00000109919', 'ENSG00000198142', 'ENSG00000075415', 'ENSG00000122012', 'ENSG00000144909', 'ENSG00000104863', 'ENSG00000119547', 'ENSG00000069702', 'ENSG00000169282', 'ENSG00000162733', 'ENSG00000144445', 'ENSG00000176142', 'ENSG00000182240', 'ENSG00000006377', 'ENSG00000158528', 'ENSG00000105784', 'ENSG00000135643', 'ENSG00000049759', 'ENSG00000163904']
Cantidad de variables seleccionadas: 26
Mejor valor de aptitud: 0.38197530864197526


## K-Nearest Neighbors

In [34]:
# Stratified 70/30 hold-out evaluation of the KNN classifier on the pre-filtered features.
model = models('knn')
xtrain, xtest, ytrain, ytest = train_test_split(
  X_select, y, test_size=0.3, random_state=42, stratify=y
)
model.fit(xtrain, ytrain)

train_score = model.score(xtrain, ytrain)
print(f"Train score: {train_score}")
test_score = model.score(xtest, ytest)
print(f"Test score: {test_score}")

Train score: 0.5709219858156028
Test score: 0.5901639344262295


In [35]:
#OBJECTIVE
def fitness_function(solution):
  """Feature-selection objective for the KNN run (to be maximised).

  Args:
    solution: 1-D mask over the columns of X_select; every non-zero entry
      marks a selected feature.

  Returns:
    Hold-out accuracy minus the fraction of selected features, or -1.0
    (the lower bound of that expression) when no feature is selected.
  """
  # NOTE(review): any non-zero value counts as "selected"; confirm this matches
  # how mealpy encodes BinaryVar positions before they reach the objective.
  selected_indices = np.flatnonzero(solution)

  # An empty mask would hand the classifier a 0-column matrix and make fit()
  # raise, aborting the whole optimisation run; score it as the worst case.
  if selected_indices.size == 0:
    return -1.0

  X_new = X_select[:, selected_indices]

  model = models('knn')
  xtrain,xtest,ytrain,ytest = train_test_split(X_new,y,test_size=0.2,random_state=42,stratify=y)
  model.fit(xtrain,ytrain)
  acc = accuracy_score(ytest, model.predict(xtest))

  # Penalise large subsets: fraction of the pre-filtered features in use.
  penalizacion = len(selected_indices)/X_select.shape[1]
  return acc - penalizacion

# Problem: one binary decision variable per pre-filtered feature, maximising fitness_function.
# NOTE(review): no "log_to" key is set; confirm whether mealpy uses "log_file" without it.
n_features = X_select.shape[1]
problem_dict = dict(
  bounds=BinaryVar(n_vars=n_features),
  obj_func=fitness_function,
  minmax="max",
  log_file="knn_result.log",
)

# Optimizer: Ant Lion Optimizer with 10 epochs and a population of 5.
optimizer = ALO.OriginalALO(epoch=10, pop_size=5)
g_best = optimizer.solve(problem_dict)

INFO:mealpy.swarm_based.ALO.OriginalALO:Solving single objective optimization problem.
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 1, Current best: 0.2938271604938271, Global best: 0.2938271604938271, Runtime: 0.08870 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 2, Current best: 0.2938271604938271, Global best: 0.2938271604938271, Runtime: 0.11134 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 3, Current best: 0.2938271604938271, Global best: 0.2938271604938271, Runtime: 0.09057 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 4, Current best: 0.3214814814814815, Global best: 0.3214814814814815, Runtime: 0.08918 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 5, Current best: 0.3214814814814815, Global best: 0.3214814814814815, Runtime: 0.08791 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 6, Current best: 0.3214814814814815, Global best: 0.32148148148148

In [36]:
# Translate the best mask back into feature names. The mask positions refer to
# the columns of X_select, whose names are stored in selected_features; indexing
# df_hd.columns with them is only right if the filter kept the leading columns.
selected_indices = np.flatnonzero(g_best.solution)
selected_variables = [selected_features[i] for i in selected_indices]

print(f"Variables seleccionadas: {list(selected_variables)}")
print(f"Cantidad de variables seleccionadas: {len(selected_variables)}")
print(f"Mejor valor de aptitud: {g_best.target.fitness}")

Variables seleccionadas: ['ENSG00000056736', 'ENSG00000078328', 'ENSG00000154556', 'ENSG00000169871', 'ENSG00000198142', 'ENSG00000091879', 'ENSG00000123064', 'ENSG00000172530', 'ENSG00000100330', 'ENSG00000137996', 'ENSG00000158528', 'ENSG00000147421', 'ENSG00000049759', 'ENSG00000164292', 'ENSG00000163904', 'ENSG00000106344']
Cantidad de variables seleccionadas: 16
Mejor valor de aptitud: 0.3214814814814815


In [40]:
# Export every history chart of the most recently run optimizer under "hello/".
# Each entry pairs a History method name with the output file stem it writes to;
# the order matches the sequence in which the charts are produced.
chart_exports = (
  ("save_global_objectives_chart", "hello/goc"),
  ("save_local_objectives_chart", "hello/loc"),
  ("save_global_best_fitness_chart", "hello/gbfc"),
  ("save_local_best_fitness_chart", "hello/lbfc"),
  ("save_runtime_chart", "hello/rtc"),
  ("save_exploration_exploitation_chart", "hello/eec"),
  ("save_diversity_chart", "hello/dc"),
)
for method_name, target in chart_exports:
  getattr(optimizer.history, method_name)(filename=target)

## Adaboost Classifier

In [42]:
# Stratified 70/30 hold-out evaluation of the AdaBoost classifier on the pre-filtered features.
model = models('ab')
xtrain, xtest, ytrain, ytest = train_test_split(
  X_select, y, test_size=0.3, random_state=42, stratify=y
)
model.fit(xtrain, ytrain)

train_score = model.score(xtrain, ytrain)
print(f"Train score: {train_score}")
test_score = model.score(xtest, ytest)
print(f"Test score: {test_score}")

Train score: 0.524822695035461
Test score: 0.5081967213114754


In [47]:
#OBJECTIVE
def fitness_function(solution):
  """Feature-selection objective for the AdaBoost run (to be maximised).

  The score is a weighted sum of two terms, each in [0, 1]:
  alfa * (1 - fraction of selected features) + beta * hold-out accuracy,
  with alfa = 0.1 and beta = 0.9.

  Args:
    solution: 1-D mask over the columns of X_select; every non-zero entry
      marks a selected feature.

  Returns:
    The weighted score, or 0.0 (the lower bound of that expression) when
    no feature is selected.
  """
  # NOTE(review): any non-zero value counts as "selected"; confirm this matches
  # how mealpy encodes BinaryVar positions before they reach the objective.
  selected_indices = np.flatnonzero(solution)

  # An empty mask would hand the classifier a 0-column matrix and make fit()
  # raise, aborting the whole optimisation run; score it as the worst case.
  if selected_indices.size == 0:
    return 0.0

  X_new = X_select[:, selected_indices]

  model = models('ab')
  # Pin the ensemble's seed so the same mask always receives the same fitness.
  model.set_params(random_state=42)
  xtrain,xtest,ytrain,ytest = train_test_split(X_new,y,test_size=0.2,random_state=42,stratify=y)
  model.fit(xtrain,ytrain)
  acc = accuracy_score(ytest, model.predict(xtest))

  alfa = 0.1
  beta = 1 - alfa
  # First term of the aggregate function: reward for using few features.
  sparsity = 1.0 - (len(selected_indices)/X_select.shape[1])
  return (alfa * sparsity) + (beta * acc)

# Problem: one binary decision variable per pre-filtered feature, maximising fitness_function.
# NOTE(review): no "log_to" key is set; confirm whether mealpy uses "log_file" without it.
problem_dict = {
  "bounds": BinaryVar(n_vars=X_select.shape[1]),
  "obj_func": fitness_function,
  "minmax": "max",
  # This is the AdaBoost run: give it its own log name instead of reusing the
  # KNN one, which was a copy-paste leftover.
  "log_file":"ab_result.log"
}

# Optimizer: Ant Lion Optimizer with 10 epochs and a population of 5.
optimizer = ALO.OriginalALO(epoch=10, pop_size=5)
g_best = optimizer.solve(problem_dict)

INFO:mealpy.swarm_based.ALO.OriginalALO:Solving single objective optimization problem.
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 1, Current best: 0.5998765432098765, Global best: 0.5998765432098765, Runtime: 1.29021 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 2, Current best: 0.5998765432098765, Global best: 0.5998765432098765, Runtime: 1.27866 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 3, Current best: 0.5998765432098765, Global best: 0.5998765432098765, Runtime: 1.27233 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 4, Current best: 0.5998765432098765, Global best: 0.5998765432098765, Runtime: 1.34366 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 5, Current best: 0.6422222222222222, Global best: 0.6422222222222222, Runtime: 1.20468 seconds
INFO:mealpy.swarm_based.ALO.OriginalALO:>>>Problem: P, Epoch: 6, Current best: 0.6422222222222222, Global best: 0.64222222222222

In [48]:
# Translate the best mask back into feature names. The mask positions refer to
# the columns of X_select, whose names are stored in selected_features; indexing
# df_hd.columns with them is only right if the filter kept the leading columns.
selected_indices = np.flatnonzero(g_best.solution)
selected_variables = [selected_features[i] for i in selected_indices]

print(f"Variables seleccionadas: {list(selected_variables)}")
print(f"Cantidad de variables seleccionadas: {len(selected_variables)}")
print(f"Mejor valor de aptitud: {g_best.target.fitness}")

Variables seleccionadas: ['ENSG00000114948', 'ENSG00000078328', 'ENSG00000084090', 'ENSG00000276644', 'ENSG00000130477', 'ENSG00000153885', 'ENSG00000197006', 'ENSG00000143774', 'ENSG00000148341', 'ENSG00000064763', 'ENSG00000169282', 'ENSG00000101940', 'ENSG00000141469', 'ENSG00000137996', 'ENSG00000105784', 'ENSG00000110880']
Cantidad de variables seleccionadas: 16
Mejor valor de aptitud: 0.6422222222222222
