# Paralelización de la evaluación de los hiperparámetros para un clasificador tipo Random Forest

Este ejercicio trata de evaluar un conjunto de hiperparámetros para un clasificador tipo Random Forest.

Los hiperparámetros que se van a variar son:
- El número de árboles (10-210) 
- El criterio para realizar la partición del conjunto de datos (entropy, gini)


In [1]:
# Import the necessary dependencies
import multiprocess
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Ask multiprocess how many CPU cores it reports for this machine and print the result
cores = multiprocess.cpu_count()
print('Cores in the computer:', cores)

Cores in the computer: 8


In [4]:
# Build the grid to evaluate: every tree count paired with every split criterion.
# All 'gini' pairs come first, then all 'entropy' pairs. range() excludes its
# upper bound, so the tree counts run from 10 to 209.
hyperparameters = [
    [n_trees, split_criterion]
    for split_criterion in ('gini', 'entropy')
    for n_trees in range(10, 210)
]

print(hyperparameters)

[[10, 'gini'], [11, 'gini'], [12, 'gini'], [13, 'gini'], [14, 'gini'], [15, 'gini'], [16, 'gini'], [17, 'gini'], [18, 'gini'], [19, 'gini'], [20, 'gini'], [21, 'gini'], [22, 'gini'], [23, 'gini'], [24, 'gini'], [25, 'gini'], [26, 'gini'], [27, 'gini'], [28, 'gini'], [29, 'gini'], [30, 'gini'], [31, 'gini'], [32, 'gini'], [33, 'gini'], [34, 'gini'], [35, 'gini'], [36, 'gini'], [37, 'gini'], [38, 'gini'], [39, 'gini'], [40, 'gini'], [41, 'gini'], [42, 'gini'], [43, 'gini'], [44, 'gini'], [45, 'gini'], [46, 'gini'], [47, 'gini'], [48, 'gini'], [49, 'gini'], [50, 'gini'], [51, 'gini'], [52, 'gini'], [53, 'gini'], [54, 'gini'], [55, 'gini'], [56, 'gini'], [57, 'gini'], [58, 'gini'], [59, 'gini'], [60, 'gini'], [61, 'gini'], [62, 'gini'], [63, 'gini'], [64, 'gini'], [65, 'gini'], [66, 'gini'], [67, 'gini'], [68, 'gini'], [69, 'gini'], [70, 'gini'], [71, 'gini'], [72, 'gini'], [73, 'gini'], [74, 'gini'], [75, 'gini'], [76, 'gini'], [77, 'gini'], [78, 'gini'], [79, 'gini'], [80, 'gini'], [81, 

In [8]:
def evaluate_set(hyperparameter_set, p_id, lock):
    """
    Train and score one Random Forest per hyperparameter pair on the iris data.

    The iris dataset is loaded and split once per call (80% train / 20% test,
    stratified by class). Every entry of ``hyperparameter_set`` is then fitted
    on the training part and its accuracy on the test part is printed.

    Args:
    hyperparameter_set: iterable of ``[n_trees, criterion]`` pairs. ``n_trees``
        is converted with ``int()``, so it may arrive as a string (as happens
        when the pairs have been stored in a NumPy array).
    p_id: identifier of the calling worker; only used in the printed messages.
    lock: lock shared by the workers (any object usable in a ``with``
        statement). It is held while each accuracy line is printed.

    Returns:
    None. Results are only written to standard output.

    Note:
    ``train_test_split`` is called without ``random_state``, so every call
    draws its own split; accuracies printed by different workers are not
    measured on the same test samples.
    """
    # Imports live inside the function, presumably so the worker is
    # self-contained when it runs in a child process (TODO confirm).
    import datetime
    print('Yo soy el proceso', p_id, 'Comence a las',datetime.datetime.now())
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    # Load the dataset
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Stratified 80-20 split into training and evaluation sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        stratify=y,
                                                        test_size=0.20)
    for s in hyperparameter_set:
        clf = RandomForestClassifier(n_estimators=int(s[0]), criterion=s[1])
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # Score outside the critical section so the lock is held only for the
        # print itself.
        accuracy = accuracy_score(y_test, y_pred)
        # The context manager releases the lock even if printing raises, so a
        # failing worker cannot leave the other workers blocked.
        with lock:
            print('Accuracy en el proceso',p_id,':',accuracy)

In [9]:
# Evaluate the hyperparameter grid in parallel, one worker process per chunk.
# NOTE: the names below say "thread", but each worker is a multiprocess.Process.
if __name__ == '__main__':
    threads=[]
    N_THREADS=8
    # array_split (unlike split) also accepts a number of rows that is not an
    # exact multiple of N_THREADS; it always returns N_THREADS chunks.
    splits=np.array_split(np.array(hyperparameters), N_THREADS)
    lock=multiprocess.Lock()
    # Create one worker per chunk. Iterating over every chunk ensures the last
    # one is evaluated too (the previous range(N_THREADS-1) skipped it).
    for i, split in enumerate(splits):
        threads.append(multiprocess.Process(target=evaluate_set, args=[split, i, lock]))


    start_time = time.perf_counter()
    # Launch all workers
    for thread in threads:
        thread.start()

    # and wait until every one of them has finished
    for thread in threads:
        thread.join()

    finish_time = time.perf_counter()
    print(f"Program finished in {finish_time-start_time} seconds")

Program finished in 24.18581679998897 seconds
