In [1]:
from typing import Any

import plotly.graph_objects as go

from numpy.random import randint
import numpy as np
# Seed NumPy's global generator (also used by the imported `randint`) so runs are repeatable.
np.random.seed(101)

from deap import creator
from deap import tools
from deap import base

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.metrics import fbeta_score

In [2]:
def random_params() -> np.array:
    """
    Function to draw one random set of random forest hyperparameters.

    The genes are, in order: number of estimators from [1, 50),
    ccp_alpha from [0, 1), max depth from [1, 15) and
    min_samples_split from [2, 10).

    Returns an array of shape (1, 4) holding the four genes.
    """

    # The draw order is fixed so that a seeded generator yields the same genes.
    n_trees = randint(1, 50)
    pruning_alpha = np.random.random()
    tree_depth = randint(1, 15)
    split_minimum = randint(2, 10)

    # Building the array from a nested list gives the (1, 4) shape directly.
    return np.array([[n_trees, pruning_alpha, tree_depth, split_minimum]])

def cross_over(child_1: list, child_2: list) -> tuple:
    """
    Function to exchange one randomly chosen gene (hyperparameter)
    between two chromosomes.

    Both chromosomes are modified in place and returned as a pair.
    """

    # Position of the gene to exchange, drawn from [0, len(child_1)).
    gene = randint(0, len(child_1))

    held = child_1[gene]
    child_1[gene] = child_2[gene]
    child_2[gene] = held

    return child_1, child_2

def evaluate(individual: list) -> list:
    """
    Function to evaluate the fitness of one individual.

    The genes are read from ``individual[0][0]`` in the order
    n_estimators, ccp_alpha, max_depth, min_samples_split. A random
    forest is fitted on the module-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``.

    Returns a one-element list with the micro-averaged F1 score, or
    ``[0]`` when the genes do not form a valid configuration.
    """

    genes = individual[0][0]
    params = {"n_estimators": int(genes[0]),
              "ccp_alpha": float(genes[1]),
              "max_depth": int(genes[2]),
              "min_samples_split": int(genes[3])}

    # Reject settings scikit-learn would refuse instead of letting the whole
    # run crash. An integer min_samples_split must be at least 2, and
    # ccp_alpha must be non-negative. Such values can show up after mutation.
    is_invalid = (params["n_estimators"] < 1
                  or params["max_depth"] < 1
                  or params["min_samples_split"] < 2
                  or params["ccp_alpha"] < 0)
    if is_invalid:
        return [0]

    model = RandomForestClassifier(**params)

    model = model.fit(X_train, y_train)
    predict = model.predict(X_test)

    # beta=1 makes the F-beta score the F1 score.
    return [fbeta_score(y_test, predict, beta=1, average='micro')]

def create_plot(logs: dict) -> None:
    """
    Function to draw the mean score per generation as a line chart.

    ``logs`` maps a generation number to the mean score of that
    generation; keys go on the x axis and values on the y axis.
    """

    generations = list(logs.keys())
    mean_f_score = list(logs.values())

    trace = go.Scatter(
        x=generations,
        y=mean_f_score,
        mode='lines+markers',
        name='f_beta score across generations',
    )
    fig = go.Figure(data=trace)

    fig.update_xaxes(title_text="Generation number")
    fig.update_yaxes(title_text="Mean f-1 score across population")

    fig.show()

In [3]:
# Models are trained on the DIGITS dataset.
# =================   ==============
# Classes                         10
# Samples per class             ~180
# Samples total                 1797
# Dimensionality                  64
# Features             integers 0-16
# =================   ==============

# Load the features and labels, then hold out 60% of the samples for testing.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.6,
    random_state=101,
)

In [4]:
# Single-objective fitness; the positive weight means the score is maximised.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a list that carries a FitnessMax instance.
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# --- genetic operators ---

# Tournament selection for breeding the next generation;
# tournsize is the number of individuals competing in each tournament.
toolbox.register("select", tools.selTournament, tournsize=4)

# Crossover operator.
toolbox.register("mate", cross_over)

# Mutation operator; 0.05 is the independent probability for each attribute to be flipped.
# NOTE(review): mutFlipBit is meant for boolean genes, while the genes here are
# numeric hyperparameters - confirm this operator is the intended one.
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

# Fitness function.
toolbox.register("evaluate", evaluate)

# --- individuals and population ---

# One individual wraps one model: a single call to random_params.
toolbox.register("individual", tools.initRepeat, creator.Individual, random_params, n=1)

# The population is a list of such individuals.
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [10]:
def evolution(number_of_generations: int, threshold: float,
              population_size: int = 10, crossover_prob: float = 0.4,
              mutation_prob: float = 0.2) -> dict:
    """
    Function to run the evolution. It stops after the max number of
    generations, or earlier once the mean f beta score of the population
    exceeds the threshold.

    Parameters:
        number_of_generations: upper bound on the generations to run.
        threshold: mean population score above which the run stops early.
        population_size: number of individuals in the population.
        crossover_prob: probability that a pair of offspring is mated.
        mutation_prob: probability that an offspring is mutated.

    Returns a dict mapping generation number (starting at 1) to the mean
    score of that generation.

    Raises ValueError if population_size is smaller than 1.
    """
    if population_size < 1:
        raise ValueError("population_size must be at least 1")

    generation_logs = {}
    pop = toolbox.population(n=population_size)

    print("Start of evolution")

    # Every individual needs a fitness before the first tournament selection.
    for ind, fit in zip(pop, map(toolbox.evaluate, pop)):
        ind.fitness.values = fit

    # Begin the evolution
    for g in range(1, number_of_generations + 1):

        # Select the next generation individuals and clone them, so that the
        # in-place operators below never touch the current population.
        offspring = [toolbox.clone(ind) for ind in toolbox.select(pop, len(pop))]

        # Apply crossover on neighbouring pairs of the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):

            # cross two individuals with probability
            if np.random.random() < crossover_prob:
                # The gene vector lives at [0][0]; mate the two *different*
                # children so that a gene is really exchanged between them.
                toolbox.mate(child1[0][0], child2[0][0])

                # fitness values of the children must be recalculated later
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:

            # mutate an individual with probability
            if np.random.random() < mutation_prob:
                toolbox.mutate(mutant[0][0])
                del mutant.fitness.values

        # Evaluate only the individuals whose fitness was invalidated above
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        for ind, fit in zip(invalid_ind, map(toolbox.evaluate, invalid_ind)):
            ind.fitness.values = fit

        # The population is entirely replaced by the offspring
        pop[:] = offspring

        # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]
        mean = float(np.mean(fits))
        std = float(np.std(fits))  # population standard deviation

        print("Population metrics")
        print(f"Mean f_beta: {mean}")
        print(f"Std {std}")

        generation_logs[g] = mean

        if mean > threshold:
            break

    print("Evolution done")

    best_ind = tools.selBest(pop, 1)[0]
    params = best_ind[0][0]

    print(f"Best individual has f1 score = {best_ind.fitness} with: \n\nn_estimators = {params[0]} \nccp = {params[1]} \nmax_depth = {params[2]} \nmin_split = {params[3]}")

    return generation_logs

In [11]:
if __name__ == "__main__":
    # Run at most 100 generations, stopping early once the mean score of the
    # population exceeds 0.80, then plot the mean score of every generation.
    logs = evolution(number_of_generations=100, threshold=0.80)
    create_plot(logs)

Start of evolution
Population metrics
Mean f_beta: 0.09314179796107508
Std 0.0032437442075995983
Population metrics
Mean f_beta: 0.09314179796107508
Std 0.0032437442075995983
Population metrics
Mean f_beta: 0.09443929564411492
Std 0.0029729406732798367
Population metrics
Mean f_beta: 0.09249304911955517
Std 0.0031782072658905233
Population metrics
Mean f_beta: 0.0916589434661724
Std 0.0027320487438333542
Population metrics
Mean f_beta: 0.09249304911955517
Std 0.0031782072658905233
Population metrics
Mean f_beta: 0.09443929564411493
Std 0.0029729406732795453
Population metrics
Mean f_beta: 0.09573679332715476
Std 0.0019462465245600085
Population metrics
Mean f_beta: 0.16672845227062094
Std 0.2539552195655446
Population metrics
Mean f_beta: 0.4109360518999073
Std 0.4103418869458054
Population metrics
Mean f_beta: 0.8359592215013902
Std 0.24735100866632487
Evolution done
Best individual has f1 score = (0.9240037071362373,) with: 

n_estimators = 10.0 
ccp = 0.0 
max_depth = 6.0 
min_split