In [None]:
import numpy as np
import random
import warnings
import copy
import pandas as pd
import time
import matplotlib
from numpy.random import default_rng
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import rankdata
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, ShuffleSplit

In [None]:
def initial_population(size_population,n_variables, n_max=None, n_min=1, n_max_estricto=False,verbose=False): 
    """Create a random binary population for the genetic algorithm.

    Each individual is a row of 0/1 flags, one per candidate predictor.

    Parameters
    ----------
    size_population : int
        Number of individuals (rows) to create.
    n_variables : int
        Number of candidate predictors (columns).
    n_max : int or None
        Maximum number of active predictors per individual. Only enforced
        when ``n_max_estricto`` is True.
    n_min : int
        Accepted for interface compatibility; currently unused. Every
        individual is only guaranteed to have at least one active predictor.
    n_max_estricto : bool
        When True (and ``n_max`` is given), randomly switch off predictors
        of any individual that exceeds ``n_max``.
    verbose : bool
        Print every generated individual.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size_population, n_variables)`` with 0/1 values.
    """
    individuos=np.zeros((size_population,n_variables))
    n_predictores=np.zeros((size_population,))
    rng = default_rng()
    for i in range(size_population):
        individuos[i,:] = np.random.randint(low  = 0, high = 2, size = n_variables)

        # An individual without predictors cannot be evaluated: switch one on.
        # randrange excludes its upper bound, so the index is always valid
        # (random.randint(0, n_variables) could return n_variables itself).
        if np.sum(individuos[i,:]) == 0:
            individuos[i,random.randrange(n_variables)]=1

        if n_max is not None and n_max_estricto:
            activos = np.flatnonzero(individuos[i,:] == 1)
            exceso = int(activos.size - n_max)
            if exceso > 0:
                # Randomly drop the surplus predictors.
                individuos[i,rng.choice(activos, size=exceso, replace=False)]=0

        n_predictores[i]=np.sum(individuos[i,:])

        if verbose:
            print("Individuo " +str(i+1)+ ":  " + str(individuos[i,:])+ ', Predictores: '+ str(n_predictores[i]))     
    return (individuos)

In [None]:
def fitness(individuos, x, y, tipo_modelo, modelo, metrica,cv=5,
                      test_size=0.2, cv_seed=123, nivel_referencia = None,                     
                      rf_n_estimators = 100, verbose = False):
    """Score every individual by cross-validating a model on its predictors.

    Parameters
    ----------
    individuos : numpy 2d array
        Binary population; row ``i`` flags the columns of ``x`` used by
        individual ``i``.
    x : numpy 2d array
        Predictor matrix.
    y : numpy 1d array
        Response variable.
    tipo_modelo : {"regresion", "clasificacion"}
        Only consulted when ``modelo == "randomforest"``.
    modelo : {"lineal", "glm", "logistico", "randomforest"}
        "glm" and "logistico" both select ``LogisticRegression``.
    metrica : str
        scikit-learn scoring name, e.g. "neg_mean_squared_error" or
        "accuracy". Larger is treated as better.
    cv : int
        Number of ``ShuffleSplit`` repetitions.
    test_size : float
        Fraction of samples held out in each split.
    cv_seed : int
        Seed for the splitter, so every individual sees the same splits.
    nivel_referencia :
        Accepted for interface compatibility; currently unused.
    rf_n_estimators : int
        Number of trees in the random forest models (default 100).
    verbose : bool
        Print the best individual found.

    Returns
    -------
    (numpy.ndarray, list)
        The per-individual mean CV score, and
        ``[[best_index], [best_score], [best_individual]]``.

    Raises
    ------
    ValueError
        If the ``modelo`` / ``tipo_modelo`` combination is not supported.
    """
    if modelo == "lineal":
        mod = LinearRegression()
    elif modelo in ("glm", "logistico"):
        mod = LogisticRegression()
    elif modelo == "randomforest" and tipo_modelo == "regresion":
        mod = RandomForestRegressor(
                        n_estimators = rf_n_estimators,
                        random_state = 1234,
                        bootstrap    = False
                      )      
    elif modelo == "randomforest" and tipo_modelo == "clasificacion":
        mod = RandomForestClassifier(
                        n_estimators = rf_n_estimators,
                        random_state = 1234,
                        bootstrap    = False
                      ) 
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `mod` further down.
        raise ValueError(
            "Unsupported combination modelo=" + repr(modelo)
            + ", tipo_modelo=" + repr(tipo_modelo)
        )

    splitter = ShuffleSplit(n_splits= cv, test_size= test_size, random_state = cv_seed)
    scores=np.zeros((individuos.shape[0],))
    for i in range(individuos.shape[0]):  
        # The boolean mask keeps only the predictors switched on for this individual.
        metrica_cv = cross_val_score(
                        estimator = mod,
                        X         = x[:,individuos[i,:].astype(bool)],
                        y         = y,
                        cv        = splitter,
                        scoring   = metrica,
                        n_jobs    = 1
                     )
        scores[i]=metrica_cv.mean()

    ind_best=np.argmax(scores)
    the_best=[[ind_best], [scores[ind_best]],[individuos[ind_best,:]]]
    if verbose:
        print("the best individuo is: " +str(ind_best)+"  fiteness= "+ str(scores[ind_best]) +"\nstring= " +str(individuos[ind_best,:]) )
    return(scores,the_best)

In [None]:
def selection(fitness_population, individuos,size_population,selection_method="tournament",elitismo= 0.3,verbosa=False): 
    """Choose the parents of the next generation and, optionally, an elite.

    fitness_population : score of every individual (larger is better)
    individuos         : population, indexable by individual
    size_population    : number of individuals considered
    selection_method   : only "tournament" selects anything; any other
                         value yields an empty selection
    elitismo           : fraction of the population copied aside as elite
                         (rounded up), or None for no elite
    verbosa            : print each selected individual

    Returns ``(selected_population, elite)`` as two lists. The selection
    holds ``size_population - len(elite)`` individuals.
    """
    if elitismo is None:
        n_elitismo = 0
        elite = []
    else:
        n_elitismo = int(np.ceil(size_population*elitismo))
        # Indices ordered from best to worst score.
        ranking = np.argsort(fitness_population)[::-1]
        elite = [copy.deepcopy(individuos[k]) for k in ranking[:n_elitismo]]

    selected_population = []
    if selection_method != "tournament":
        return selected_population, elite

    # One random rival is drawn per individual; position `pos` competes
    # against its rival and the higher score wins (ties go to `pos`).
    rivals = [random.randint(0, size_population-1) for _ in range(size_population)]
    for pos, rival in zip(range(size_population-n_elitismo), rivals):
        pair = (pos, rival)
        winner = pair[int(np.argmax([fitness_population[pos], fitness_population[rival]]))]
        selected_population.append(individuos[winner])
        if verbosa:
            print("Individuo seleccionado: " +str(selected_population[pos]) )       
    return selected_population, elite

In [None]:
def crossover(selected_population,size_population,n_variables,elite=None,verbosa=False): 
    """Build the next generation by single-point crossover plus elitism.

    Parameters
    ----------
    selected_population : sequence of 1d arrays
        Parents chosen by the selection step; the first
        ``size_population - len(elite)`` entries form the mating pool.
    size_population : int
        Number of individuals in the generation being built.
    n_variables : int
        Length of each individual.
    elite : sequence of 1d arrays or None
        Individuals copied unchanged into the last rows of the result.
    verbosa : bool
        Print every individual of the new generation.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size_population, n_variables)``. The first
        rows are children, the remaining rows are the elite.
    """
    n_elitismo = len(elite) if elite is not None else 0
    n_children = max(0, size_population-n_elitismo)

    # Crossover point in [1, n_variables-2]. The max() keeps the range valid
    # for individuals with fewer than three variables.
    max_point = max(1, n_variables-2)
    pointC= [random.randint(1, max_point) for _ in range(n_children)]

    new_generation=np.zeros((size_population,n_variables))
    rng = default_rng()
    for k in range(n_children):
        if n_children >= 2:
            # Two distinct parents from the mating pool.
            ind_a, ind_b = rng.choice(n_children, size=2, replace=False)
        else:
            # A single parent slot: the child is a copy of that parent.
            ind_a = ind_b = 0
        parent_a = np.asarray(selected_population[int(ind_a)], dtype=float)
        parent_b = np.asarray(selected_population[int(ind_b)], dtype=float)
        # Head of the first parent followed by the tail of the second.
        new_generation[k,:] = np.concatenate((parent_a[0:pointC[k]],parent_b[pointC[k]:]))

    # Elite individuals fill whatever rows remain after the children.
    for row in range(n_children,size_population):
        new_generation[row,:]=elite[row-n_children]

    if verbosa:
        for i in range(size_population):
            print("Individuo " + str(i+1)+" seleccionado: "+ str(new_generation[i]))
    return new_generation

In [None]:
def mutation(individuos,size_population,n_variables, prob_mut=0.02, verbosa=False,
             n_max=None, n_max_estricto=None): 
    """Flip random bits of the population and repair invalid individuals.

    Parameters
    ----------
    individuos : numpy 2d array
        Binary population; modified in place and also returned.
    size_population : int
        Number of individuals (rows) to mutate.
    n_variables : int
        Number of variables (columns) per individual.
    prob_mut : float
        Probability that each bit is flipped. Should be low.
    verbosa : bool
        Print how many bits were mutated.
    n_max : int or None
        Maximum number of active predictors. When omitted, a module-level
        ``n_max`` is used if one exists (backward compatibility with callers
        that relied on that global); prefer passing it explicitly.
    n_max_estricto : bool or None
        Enforce ``n_max`` when True. When omitted, a module-level
        ``n_max_estricto`` is used if one exists, otherwise False.

    Returns
    -------
    numpy.ndarray
        The same ``individuos`` array after mutation and repair.
    """
    # Backward compatibility: earlier versions read these two settings
    # straight from the global namespace.
    if n_max is None:
        n_max = globals().get("n_max")
    if n_max_estricto is None:
        n_max_estricto = globals().get("n_max_estricto", False)

    # Each bit flips independently with probability prob_mut.
    flips=np.random.uniform(low=0,high=1,size=(size_population,n_variables))<prob_mut
    rows,cols=np.nonzero(flips)
    n_mut= len(rows)
    individuos[rows,cols]=1-individuos[rows,cols]

    rng = default_rng()
    for i in range(size_population):
        # An individual without predictors cannot be evaluated: switch one on.
        # randrange excludes its upper bound, so the index is always valid.
        if np.sum(individuos[i,:])==0:
            individuos[i,random.randrange(n_variables)]=1

        if n_max is not None and n_max_estricto:
            activos=np.flatnonzero(individuos[i,:] == 1)
            exceso=int(activos.size-n_max)
            if exceso>0:
                # Randomly drop the surplus predictors.
                individuos[i,rng.choice(activos, size=exceso, replace=False)]=0
    if verbosa:
        print("numbers of mutated bits: " + str(n_mut))
    return individuos

## Test 1

In [None]:
# Test 1: feature selection on the Boston housing data with a random forest regressor.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2. On newer
# versions, rebuild the same (X, y) pair from the original source, following the
# recipe given in scikit-learn's deprecation notice.
try:
    boston = datasets.load_boston(return_X_y= True)
except (ImportError, AttributeError):
    raw_boston = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston", sep=r"\s+", skiprows=22, header=None)
    boston = (
        np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        raw_boston.values[1::2, 2],
    )

# GA configuration. n_max and n_max_estricto are also read by mutation() from
# the notebook's global scope.
size_population=10
n_variables=boston[0].shape[1]
n_min=1
n_max=6
n_max_estricto=True
generations=30

# Arguments shared by every fitness evaluation in this experiment.
fitness_kwargs = dict(
                x  = boston[0],y  = boston[1], cv = 5,
                test_size = 0.2,
                tipo_modelo = "regresion",
                modelo      = "randomforest",
                metrica     = "neg_mean_squared_error",
                verbose     = True
            )

individuos=initial_population(size_population=size_population,n_variables=n_variables, n_max=n_max, n_min=n_min, n_max_estricto=n_max_estricto,verbose=False)

for i in range(generations):
    print('Iteration: '+str(i))
    fitness_population, best=fitness(individuos, **fitness_kwargs)
    
    # No elitism in this run: the whole generation comes from crossover.
    selected_population, elite=selection(fitness_population, individuos,size_population,selection_method="tournament",elitismo= None,verbosa=False)     
    individuos=crossover(selected_population,size_population,n_variables,elite,verbosa=False)
    individuos=mutation(individuos,size_population,n_variables, prob_mut=0.02, verbosa=False)
print("----------------  finish  ----------------")    
# Evaluate the final generation.
fitness_population, best=fitness(individuos, **fitness_kwargs)


## Test 2

In [None]:
from sklearn.datasets import make_friedman1

# Synthetic regression data generated with scikit-learn's make_friedman1.
datos = make_friedman1(n_samples=500, n_features=10, noise=1, random_state=None)
x, y = datos

# 20 additional columns of normally distributed random values are appended
# to the predictors.
ruido = np.random.normal(size = (500,20))
x = np.hstack((x, ruido))
print(x.shape,y.shape)

In [None]:
# Test 2: GA configuration for the synthetic data built in the previous cell.
size_population=10
n_variables=x.shape[1]
n_min=1
n_max=5
# mutation() reads n_max / n_max_estricto from the notebook's global scope.
# Define the flag here so this cell does not depend on a value left over from
# a previously executed cell.
n_max_estricto=False
generations=50

# Arguments shared by every fitness evaluation in this experiment.
fitness_kwargs = dict(
                x  = x,y  = y, cv = 3,
                test_size = 0.2,
                tipo_modelo = "regresion",
                modelo      = "randomforest",
                metrica     = "neg_mean_squared_error",
                rf_n_estimators = 100,
                verbose     = True
            )

# The initial population has the same size as every later generation;
# selection/crossover/mutation all work with `size_population` individuals.
individuos=initial_population(size_population=size_population,n_variables=n_variables, n_max=n_max, n_min=n_min, n_max_estricto=n_max_estricto,verbose=False)
history=[]
for i in range(generations):
    print('Iteration: '+str(i))
    fitness_population, best=fitness(individuos, **fitness_kwargs)
    # Keep the best individual of each generation for the plots below.
    history.append(best)

    selected_population, elite=selection(fitness_population, individuos,size_population,selection_method="tournament",elitismo= 0.01,verbosa=False)     
    individuos=crossover(selected_population,size_population,n_variables,elite,verbosa=False)
    individuos=mutation(individuos,size_population,n_variables, prob_mut=0.1, verbosa=False)
    
print("----------------  finish  ----------------")    
# Evaluate the final generation.
fitness_population, best=fitness(individuos, **fitness_kwargs)



In [None]:
# --- Fitness of the best individual, generation by generation ---
plt.style.use('ggplot')
gene = np.arange(0, generations)
fitn = [history[g][1] for g in range(0, generations)]
fig, ax = plt.subplots()
ax.plot(gene, fitn)
ax.set_title('Evolución del mejor Individuo')
ax.set_xlabel('generacion')
ax.set_ylabel('fitness');


# --- How often each predictor appears in the per-generation best individual ---
variable = np.arange(0, n_variables)
predictores = [history[g][2] for g in range(0, generations)]
conteo = np.sum(predictores, axis=0)
per_pred = (100*(conteo/generations)).reshape((n_variables, ))

tabla = pd.DataFrame({"predictor": variable, "frecuencia": per_pred})
frecuencia_selecion = tabla.sort_values(by=["frecuencia"], ascending=False)

plt.style.use('ggplot')
fig, ax = plt.subplots()
frecuencia_selecion.plot.barh(x="predictor", y="frecuencia", ax=ax)
ax.set_title('Frecuencia de selección')
ax.set_xlabel('frecuencia')
ax.set_ylabel('predictor')
ax.legend().set_visible(False)

## Test 3

In [None]:
##########################################
# Test 3: feature selection on the Pima Indians diabetes data with a random
# forest classifier.
# filename = 'test1/pima-indians-diabetes.csv'
# names = ['pregnant', 'plasma', 'pressure', 'skin', 'test', 'BMI', 'pedigree', 'age', 'class']
# dataframe = pd.read_csv(filename, names=names)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the predictors, the ninth is the class label.
X = array[:,0:8]
Y = array[:,8]

#########################################

# GA configuration. n_max and n_max_estricto are also read by mutation() from
# the notebook's global scope.
size_population=14
n_variables=8
n_min=1
n_max=3
n_max_estricto=True
generations=30

# Arguments shared by every fitness evaluation in this experiment.
fitness_kwargs = dict(
                x  = X,y  = Y, cv = 5,
                test_size = 0.2,
                tipo_modelo = "clasificacion",
                modelo      = "randomforest",
                metrica     = "accuracy",
                verbose     = True
            )

# Use the same strictness flag as the mutation step, so the n_max cap applies
# from the first generation instead of only after the first mutation.
individuos=initial_population(size_population=size_population,n_variables=n_variables, n_max=n_max, n_min=n_min, n_max_estricto=n_max_estricto,verbose=False)

for i in range(generations):
    print('Iteration: '+str(i))
    fitness_population, best=fitness(individuos, **fitness_kwargs)
    
    selected_population, elite=selection(fitness_population, individuos,size_population,selection_method="tournament",elitismo= 0.2,verbosa=False)     
    individuos=crossover(selected_population,size_population,n_variables,elite,verbosa=False)
    individuos=mutation(individuos,size_population,n_variables, prob_mut=0.02, verbosa=False)
print("----------------  finish  ----------------")    
# Evaluate the final generation.
fitness_population, best=fitness(individuos, **fitness_kwargs)
print("The best features:")
# best[2][0] is the 0/1 mask of the best individual; the last entry of
# `names` is the class column, so it is skipped.
for nombre, activo in zip(names[:-1], best[2][0]):
    if activo:
        print(nombre)


## Test 4

In [None]:
# Test 4: read the training table and split it into predictors and response.
df = pd.read_csv('test3/train.csv')
data_top = df.columns
print(data_top)
print(data_top[-1])

array = df.values
# Columns 0..369 are used as predictors, column 370 as the response.
X, Y = array[:,0:370], array[:,370]

In [None]:
# Test 4: GA configuration. n_max and n_max_estricto are also read by
# mutation() from the notebook's global scope.
size_population=8
n_variables=370
n_min=1
n_max=4
n_max_estricto=True
generations=20
history=[]

# Arguments shared by every fitness evaluation in this experiment.
fitness_kwargs = dict(
    x=X, y=Y, cv=5,
    test_size=0.2,
    tipo_modelo="clasificacion",
    modelo="randomforest",
    metrica="accuracy",
    verbose=True,
)

individuos = initial_population(
    size_population=size_population,
    n_variables=n_variables,
    n_max=n_max,
    n_min=n_min,
    n_max_estricto=n_max_estricto,
    verbose=False,
)

for i in range(generations):
    print('Iteration: '+str(i))
    fitness_population, best = fitness(individuos, **fitness_kwargs)
    # Keep the best individual of each generation for the plots below.
    history.append(best)

    selected_population, elite = selection(
        fitness_population, individuos, size_population,
        selection_method="tournament", elitismo=0.01, verbosa=False,
    )
    individuos = crossover(selected_population, size_population, n_variables, elite, verbosa=False)
    individuos = mutation(individuos, size_population, n_variables, prob_mut=0.02, verbosa=False)
print("----------------  finish  ----------------")    
# Evaluate the final generation.
fitness_population, best = fitness(individuos, **fitness_kwargs)
print("The best features:")
# best[2][0] is the 0/1 mask of the best individual.
for indice in range(n_variables):
    if best[2][0][indice]:
        print(indice)


In [None]:
# Fitness of the best individual, generation by generation.
plt.style.use('ggplot')
gene = np.arange(0, generations)
fitn = [history[g][1] for g in range(0, generations)]
fig, ax = plt.subplots()
ax.plot(gene, fitn)
# Narrow y-range so small changes in fitness are visible.
ax.set_ylim(0.958, 0.96)
ax.set_title('Evolución del mejor Individuo')
ax.set_xlabel('generacion')
ax.set_ylabel('fitness');

In [None]:
# How often each predictor appears in the per-generation best individual.
variable = np.arange(0, n_variables)
predictores = [history[g][2] for g in range(0, generations)]
conteo = np.sum(predictores, axis=0)
per_pred = (100*(conteo/generations)).reshape((n_variables, ))

tabla = pd.DataFrame({"predictor": variable, "frecuencia": per_pred})
frecuencia_selecion = tabla.sort_values(by=["frecuencia"], ascending=False)

plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(8,16))
# Only the 50 most frequently selected predictors are drawn.
frecuencia_selecion[0:50].plot.barh(x="predictor", y="frecuencia", ax=ax)
ax.set_title('Frecuencia de selección')
ax.set_xlabel('frecuencia')
ax.set_ylabel('predictor')
ax.legend().set_visible(False)

## Test 5

In [90]:
# Test 5: read the encoded training table and fill missing values.
df = pd.read_csv('test4/train_encoder.csv')
# First fill with the rounded column means, then fill anything still missing
# with the rounded column medians (computed after the first fill).
df = df.fillna(df.mean().round(1))
df = df.fillna(df.median().round(1))

data_top = df.columns
print(data_top)
print(data_top[-1])

array = df.values
# Column 0 is skipped; columns 1..79 are the predictors and column 80 the response.
X, Y = array[:,1:80], array[:,80]
print(X)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [91]:
# Test 5: GA configuration. n_max and n_max_estricto are also read by
# mutation() from the notebook's global scope.
size_population=10
n_variables=X.shape[1]
n_min=1
n_max=5
n_max_estricto=True
generations=50
history=[]

# Arguments shared by every fitness evaluation in this experiment.
fitness_kwargs = dict(
    x=X, y=Y, cv=3,
    test_size=0.2,
    tipo_modelo="regresion",
    modelo="randomforest",
    metrica="neg_mean_squared_error",
    rf_n_estimators=100,
    verbose=True,
)

individuos = initial_population(
    size_population=size_population,
    n_variables=n_variables,
    n_max=n_max,
    n_min=n_min,
    n_max_estricto=n_max_estricto,
    verbose=False,
)

for i in range(generations):
    print('Iteration: '+str(i))
    fitness_population, best = fitness(individuos, **fitness_kwargs)
    # Keep the best individual of each generation.
    history.append(best)

    selected_population, elite = selection(
        fitness_population, individuos, size_population,
        selection_method="tournament", elitismo=0.01, verbosa=False,
    )
    individuos = crossover(selected_population, size_population, n_variables, elite, verbosa=False)
    individuos = mutation(individuos, size_population, n_variables, prob_mut=0.1, verbosa=False)

print("----------------  finish  ----------------")    
# Evaluate the final generation.
fitness_population, best = fitness(individuos, **fitness_kwargs)


Iteration: 0
the best individuo is: 2  fiteness= -3066256894.0478425
string= [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0.]
Iteration: 1
the best individuo is: 7  fiteness= -3176168099.979641
string= [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0.]
Iteration: 2
the best individuo is: 9  fiteness= -2265724832.586662
string= [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0.]
Iteration: 3
the best individuo is: 9  fitene

KeyboardInterrupt: 