In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data_m1 = pd.read_excel('data/M1.xlsx')

# Features (X) and target (y)
X_m1 = data_m1.drop('WPR', axis=1)
y_m1 = data_m1['WPR']

# Show the first 20 rows of the data
print(data_m1.head(20))

# Split data into training and testing sets
X_train_m1, X_test_m1, y_train_m1, y_test_m1 = train_test_split(X_m1, y_m1, test_size=0.2, random_state=42)

print("X_train_m1 shape:", X_train_m1.shape)
print("y_train_m1 shape:", y_train_m1.shape)


       Qa     Qc     Qd   Qhx   Ir     Qe       WPR
0   0.000  0.000  0.000  0.00    0  0.000  0.000000
1   0.000  0.000  0.000  0.00    0  0.000  0.000000
2   0.000  0.000  0.000  0.00    0  0.000  0.000000
3   0.000  0.000  0.000  0.00    0  0.000  0.000000
4   0.000  0.000  0.000  0.00    0  0.000  0.000000
5   0.000  0.000  0.000  0.00    0  0.000  0.000000
6   0.000  0.000  0.000  0.00    6  0.000  0.000000
7   0.000  0.000  0.000  0.00  122  0.000  0.000000
8   2.601  1.889  2.645  1.24  333  1.845  0.100440
9   5.770  4.702  5.911  1.70  517  4.560  1.051920
10  7.868  6.516  8.093  1.99  636  6.292  1.505088
11  8.792  7.302  9.057  2.11  670  7.037  1.938276
12  9.446  7.855  9.742  2.19  695  7.560  2.405268
13  8.669  7.198  8.928  2.09  634  6.938  2.019600
14  6.688  5.500  6.864  1.83  497  5.324  0.753840
15  3.562  2.751  3.632  1.39  302  2.681  0.000000
16  0.000  0.000  0.000  0.00   91  0.000  0.000000
17  0.000  0.000  0.000  0.00    2  0.000  0.000000
18  0.000  0

In [22]:
# Train the baseline Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred = rf_model.predict(X_train_m1)
y_test_pred = rf_model.predict(X_test_m1)

# Evaluate the model on the test data
mse = mean_squared_error(y_test_m1, y_test_pred)
r2 = r2_score(y_test_m1, y_test_pred)

print(f'Baseline RF Model - Mean Squared Error: {mse}')
print(f'Baseline RF Model - R^2 Score: {r2}')

# Save predictions vs. ground truth to Excel
baseline_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred})
baseline_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred})

baseline_train_results.to_excel('outputs/baseline_rf_train_predictions.xlsx', index=False)
baseline_test_results.to_excel('outputs/baseline_rf_test_predictions.xlsx', index=False)


Baseline RF Model - Mean Squared Error: 0.7181106254132201
Baseline RF Model - R^2 Score: 0.24688267288857202


In [23]:
from pyswarm import pso

# Objective function to minimize
def rf_pso(params):
    n_estimators = int(params[0])
    max_depth = int(params[1])
    
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_m1, y_train_m1)
    y_pred = rf_model.predict(X_test_m1)
    
    return mean_squared_error(y_test_m1, y_pred)

# PSO parameter bounds
lb = [10, 1]  # Lower bounds for n_estimators and max_depth
ub = [100, 20]  # Upper bounds for n_estimators and max_depth

# Run PSO
optimal_params, optimal_mse = pso(rf_pso, lb, ub, swarmsize=10, maxiter=10)

print(f'Optimized Parameters (PSO): {optimal_params}')
print(f'Optimized Mean Squared Error (PSO): {optimal_mse}')


Stopping search: maximum iterations reached --> 10
Optimized Parameters (PSO): [41.70629072  3.85779957]
Optimized Mean Squared Error (PSO): 0.5905351801080733


In [6]:
# Retrain the Random Forest with optimized parameters from PSO
n_estimators_optimized = int(optimal_params[0])
max_depth_optimized = int(optimal_params[1])

rf_model_optimized = RandomForestRegressor(
    n_estimators=n_estimators_optimized, 
    max_depth=max_depth_optimized, 
    random_state=42
)

rf_model_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_optimized = rf_model_optimized.predict(X_train_m1)
y_test_pred_optimized = rf_model_optimized.predict(X_test_m1)

# Evaluate the optimized model on the test data
mse_optimized = mean_squared_error(y_test_m1, y_test_pred_optimized)
r2_optimized = r2_score(y_test_m1, y_test_pred_optimized)

print(f'Optimized RF Model (PSO) - Mean Squared Error: {mse_optimized}')
print(f'Optimized RF Model (PSO) - R^2 Score: {r2_optimized}')

# Save predictions vs. ground truth to Excel
optimized_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_optimized})
optimized_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_optimized})

optimized_train_results.to_excel('outputs/optimized_rf_pso_train_predictions.xlsx', index=False)
optimized_test_results.to_excel('outputs/optimized_rf_pso_test_predictions.xlsx', index=False)


Optimized RF Model (PSO) - Mean Squared Error: 0.5926641543108092
Optimized RF Model (PSO) - R^2 Score: 0.37844445135111004


In [7]:
# Compare results of baseline and Optimized RF Model (PSO
print("Baseline RF Model")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

print("\nOptimized RF Model (PSO)")
print(f"Mean Squared Error: {mse_optimized}")
print(f"R^2 Score: {r2_optimized}")


Baseline RF Model
Mean Squared Error: 0.7181106254132201
R^2 Score: 0.24688267288857202

Optimized RF Model (PSO)
Mean Squared Error: 0.5926641543108092
R^2 Score: 0.37844445135111004


In [8]:
#Optimize Random Forest with Genetic Algorithm (GA)
import random
from deap import base, creator, tools, algorithms
from sklearn.metrics import mean_squared_error

def rf_ga(individual):
    n_estimators = int(individual[1])
    max_depth = int(individual[1])
    
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_m1, y_train_m1)
    y_pred = rf_model.predict(X_test_m1)
    
    return mean_squared_error(y_test_m1, y_pred),

# Create types
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

# Register functions
toolbox = base.Toolbox()
toolbox.register("attr_float", random.uniform, 0.01, 0.3)
toolbox.register("attr_int", random.randint, 3, 10)
toolbox.register("attr_int2", random.randint, 50, 300)
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_float, toolbox.attr_int, toolbox.attr_int2), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", rf_ga)
toolbox.register("mate", tools.cxBlend, alpha=0.5)

# Mutation function that constrains the learning_rate to [0.01, 0.3]
def constrained_mutation(individual, indpb):
    if random.random() < indpb:
        individual[0] = min(max(individual[0] + random.gauss(0, 0.05), 0.01), 0.3)
    if random.random() < indpb:
        individual[1] = random.randint(3, 10)
    if random.random() < indpb:
        individual[2] = random.randint(50, 300)
    return individual,

toolbox.register("mutate", constrained_mutation, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# Define population and evolution
population = toolbox.population(n=10)
algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=40, verbose=True)

# Extract best individual
best_individual = tools.selBest(population, 1)[0]

# Extracting the best individual
best_individual = tools.selBest(population, k=1)[0]
optimal_params_ga = [int(best_individual[0]), int(best_individual[1])]
print(f'Optimized Parameters (GA): {optimal_params_ga}')
print(f'GA optimized parameters: {best_individual}')


gen	nevals
0  	10    
1  	3     
2  	10    
3  	9     
4  	9     
5  	8     
6  	6     
7  	10    
8  	7     
9  	9     
10 	9     
11 	9     
12 	9     
13 	9     
14 	7     
15 	8     
16 	8     
17 	6     
18 	8     
19 	5     
20 	9     
21 	10    
22 	8     
23 	6     
24 	8     
25 	4     
26 	9     
27 	8     
28 	8     
29 	7     
30 	5     
31 	9     
32 	8     
33 	8     
34 	8     
35 	8     
36 	7     
37 	8     
38 	6     
39 	5     
40 	7     
Optimized Parameters (GA): [0, 4]
GA optimized parameters: [0.1781056152379834, 4.250662476314364, 226.21730517442415]


In [9]:
# Retrain the Random Forest with optimized parameters from GA
rf_model_ga_optimized = RandomForestRegressor(
    n_estimators=optimal_params_ga[1], 
    max_depth=optimal_params_ga[1], 
    random_state=42
)

rf_model_ga_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_ga_optimized = rf_model_ga_optimized.predict(X_train_m1)
y_test_pred_ga_optimized = rf_model_ga_optimized.predict(X_test_m1)

# Evaluate the optimized model on the test data
mse_ga_optimized = mean_squared_error(y_test_m1, y_test_pred_ga_optimized)
r2_ga_optimized = r2_score(y_test_m1, y_test_pred_ga_optimized)

print(f'Optimized RF Model (GA) - Mean Squared Error: {mse_ga_optimized}')
print(f'Optimized RF Model (GA) - R^2 Score: {r2_ga_optimized}')

# Save predictions vs. ground truth to Excel
optimized_ga_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_ga_optimized})
optimized_ga_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_ga_optimized})

optimized_ga_train_results.to_excel('outputs/optimized_rf_ga_train_predictions.xlsx', index=False)
optimized_ga_test_results.to_excel('outputs/optimized_rf_ga_test_predictions.xlsx', index=False)


Optimized RF Model (GA) - Mean Squared Error: 0.5920059751706191
Optimized RF Model (GA) - R^2 Score: 0.37913471563251544


In [10]:
#Optimized RF Model (PSO) vs. Optimized RF Model (GA)

In [11]:
import optuna
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 1, 20)

    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    
    rf_model.fit(X_train_m1, y_train_m1)
    y_pred = rf_model.predict(X_test_m1)
    
    mse = mean_squared_error(y_test_m1, y_pred)
    
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

optimal_params_optuna = study.best_params
print(f'Optimized Parameters (Optuna): {optimal_params_optuna}')

# Retrain the model with the optimized parameters
rf_model_optuna_optimized = RandomForestRegressor(
    n_estimators=optimal_params_optuna['n_estimators'],
    max_depth=optimal_params_optuna['max_depth'],
    random_state=42
)

rf_model_optuna_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_optuna_optimized = rf_model_optuna_optimized.predict(X_train_m1)
y_test_pred_optuna_optimized = rf_model_optuna_optimized.predict(X_test_m1)

# Save predictions vs. ground truth to Excel
optimized_optuna_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_optuna_optimized})
optimized_optuna_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_optuna_optimized})

optimized_optuna_train_results.to_excel('outputs/optimized_rf_optuna_train_predictions.xlsx', index=False)
optimized_optuna_test_results.to_excel('outputs/optimized_rf_optuna_test_predictions.xlsx', index=False)


[I 2024-08-31 09:52:05,572] A new study created in memory with name: no-name-39bef4b7-6876-45bb-84c8-b3f2e722b3fd
[I 2024-08-31 09:52:05,870] Trial 0 finished with value: 0.6055196711348385 and parameters: {'n_estimators': 27, 'max_depth': 6}. Best is trial 0 with value: 0.6055196711348385.
[I 2024-08-31 09:52:06,577] Trial 1 finished with value: 0.6113878942635281 and parameters: {'n_estimators': 77, 'max_depth': 7}. Best is trial 0 with value: 0.6055196711348385.
[I 2024-08-31 09:52:06,661] Trial 2 finished with value: 0.6153310681176575 and parameters: {'n_estimators': 29, 'max_depth': 1}. Best is trial 0 with value: 0.6055196711348385.
[I 2024-08-31 09:52:07,517] Trial 3 finished with value: 0.6925110642253975 and parameters: {'n_estimators': 64, 'max_depth': 14}. Best is trial 0 with value: 0.6055196711348385.
[I 2024-08-31 09:52:08,511] Trial 4 finished with value: 0.6581772870716005 and parameters: {'n_estimators': 83, 'max_depth': 11}. Best is trial 0 with value: 0.605519671134

Optimized Parameters (Optuna): {'n_estimators': 100, 'max_depth': 3}


In [14]:
import numpy as np
from niapy.task import Task
from niapy.algorithms.basic import HarrisHawksOptimization
from niapy.problems.problem import Problem
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define a custom benchmark problem
class RandomForestOptimizationProblem(Problem):
    def __init__(self, X_train, y_train, X_test, y_test):
        # Define the problem dimension and bounds (2 parameters to optimize)
        super().__init__(dimension=2, lower=[10, 1], upper=[100, 20])
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def _evaluate(self, x):
        # Extract the parameters
        n_estimators = int(x[0])
        max_depth = int(x[1])

        # Create and train the Random Forest model
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        rf_model.fit(self.X_train, self.y_train)

        # Predict and calculate the mean squared error
        y_pred = rf_model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, y_pred)
        
        return mse

# Initialize the problem with your dataset
problem = RandomForestOptimizationProblem(X_train_m1, y_train_m1, X_test_m1, y_test_m1)


In [15]:
# Define the task for the HHO algorithm
task = Task(problem=problem, max_iters=100)

# Initialize the HHO algorithm
algo = HarrisHawksOptimization(population_size=30)

# Run the optimization
best_params, best_mse = algo.run(task)

print(f'Optimized Parameters (HHO): n_estimators = {int(best_params[0])}, max_depth = {int(best_params[1])}')
print(f'Best MSE achieved: {best_mse}')


Optimized Parameters (HHO): n_estimators = 99, max_depth = 3
Best MSE achieved: 0.58787776732519


In [16]:
# Train the model with the optimized parameters
rf_model_hho_optimized = RandomForestRegressor(
    n_estimators=int(best_params[0]),
    max_depth=int(best_params[1]),
    random_state=42
)

rf_model_hho_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_hho_optimized = rf_model_hho_optimized.predict(X_train_m1)
y_test_pred_hho_optimized = rf_model_hho_optimized.predict(X_test_m1)

# Save predictions vs. ground truth to Excel
optimized_hho_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_hho_optimized})
optimized_hho_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_hho_optimized})

optimized_hho_train_results.to_excel('outputs/optimized_rf_hho_train_predictions.xlsx', index=False)
optimized_hho_test_results.to_excel('outputs/optimized_rf_hho_test_predictions.xlsx', index=False)
