In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler

# Load the dataset
data_m1 = pd.read_excel('data/M2.xlsx')

# Features (X) and target (y)
X_m1 = data_m1.drop('WPR', axis=1)
y_m1 = data_m1['WPR']

# Show the first 20 rows of the data
print(data_m1.head(20))

# Split data into training and testing sets
X_train_m1, X_test_m1, y_train_m1, y_test_m1 = train_test_split(X_m1, y_m1, test_size=0.2, random_state=42)

print("X_train_m1 shape:", X_train_m1.shape)
print("y_train_m1 shape:", y_train_m1.shape)

scaler = MinMaxScaler()
X_train_m1 = scaler.fit_transform(X_train_m1)
X_test_m1 = scaler.transform(X_test_m1)



     effa   effc   effd      mu     T20       win    COP       WPR
0   0.000  0.000  0.000  0.0000   0.000  0.002411  0.000  0.000000
1   0.000  0.000  0.000  0.0000   0.000  0.002006  0.000  0.000000
2   0.000  0.000  0.000  0.0000   0.000  0.002001  0.000  0.000000
3   0.000  0.000  0.000  0.0000   0.000  0.002102  0.000  0.000000
4   0.000  0.000  0.000  0.0000   0.000  0.002078  0.000  0.000000
5   0.000  0.000  0.000  0.0000   0.000  0.002011  0.000  0.000000
6   0.000  0.000  0.000  0.0000   0.000  0.001874  0.000  0.000000
7   0.000  0.000  0.000  0.0000   0.000  0.001938  0.000  0.000000
8   0.689  0.641  0.497  0.1590  -6.700  0.002231  0.698  0.100440
9   0.666  0.641  0.478  0.2288 -13.250  0.002167  0.771  1.051920
10  0.653  0.641  0.469  0.2544 -16.520  0.002276  0.777  1.505088
11  0.648  0.641  0.465  0.2702 -16.210  0.002703  0.777  1.938276
12  0.645  0.641  0.462  0.2800 -16.050  0.003149  0.776  2.405268
13  0.649  0.641  0.465  0.2815 -13.430  0.003044  0.777  2.01

In [2]:
from sklearn.impute import SimpleImputer

# Impute missing values in X_train_m1
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'constant'
X_train_m1 = imputer.fit_transform(X_train_m1)
X_test_m1 = imputer.transform(X_test_m1)

In [3]:
# Train the baseline Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred = rf_model.predict(X_train_m1)
y_test_pred = rf_model.predict(X_test_m1)

# Evaluate the model on the test data
mse = mean_squared_error(y_test_m1, y_test_pred)
r2 = r2_score(y_test_m1, y_test_pred)

print(f'Baseline RF Model - Mean Squared Error: {mse}')
print(f'Baseline RF Model - R^2 Score: {r2}')

# Save predictions vs. ground truth to Excel
baseline_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred})
baseline_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred})

baseline_train_results.to_excel('outputs/M2baseline_rf_train_predictions.xlsx', index=False)
baseline_test_results.to_excel('outputs/M2baseline_rf_test_predictions.xlsx', index=False)


Baseline RF Model - Mean Squared Error: 0.4047362462462565
Baseline RF Model - R^2 Score: 0.5755335331757621


In [4]:
from pyswarm import pso

# Objective function to minimize
def rf_pso(params):
    n_estimators = int(params[0])
    max_depth = int(params[1])
    
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_m1, y_train_m1)
    y_pred = rf_model.predict(X_test_m1)
    
    return mean_squared_error(y_test_m1, y_pred)

# PSO parameter bounds
lb = [10, 1]  # Lower bounds for n_estimators and max_depth
ub = [100, 20]  # Upper bounds for n_estimators and max_depth

# Run PSO
optimal_params, optimal_mse = pso(rf_pso, lb, ub, swarmsize=10, maxiter=10)

print(f'Optimized Parameters (PSO): {optimal_params}')
print(f'Optimized Mean Squared Error (PSO): {optimal_mse}')


Stopping search: maximum iterations reached --> 10
Optimized Parameters (PSO): [35.01825789  7.70033045]
Optimized Mean Squared Error (PSO): 0.35410820046814506


In [5]:
# Retrain the Random Forest with optimized parameters from PSO
n_estimators_optimized = int(optimal_params[0])
max_depth_optimized = int(optimal_params[1])

rf_model_optimized = RandomForestRegressor(
    n_estimators=n_estimators_optimized, 
    max_depth=max_depth_optimized, 
    random_state=42
)

rf_model_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_optimized = rf_model_optimized.predict(X_train_m1)
y_test_pred_optimized = rf_model_optimized.predict(X_test_m1)

# Evaluate the optimized model on the test data
mse_optimized = mean_squared_error(y_test_m1, y_test_pred_optimized)
r2_optimized = r2_score(y_test_m1, y_test_pred_optimized)

print(f'Optimized RF Model (PSO) - Mean Squared Error: {mse_optimized}')
print(f'Optimized RF Model (PSO) - R^2 Score: {r2_optimized}')

# Save predictions vs. ground truth to Excel
optimized_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_optimized})
optimized_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_optimized})

optimized_train_results.to_excel('outputs/M2optimized_rf_pso_train_predictions.xlsx', index=False)
optimized_test_results.to_excel('outputs/M2optimized_rf_pso_test_predictions.xlsx', index=False)


Optimized RF Model (PSO) - Mean Squared Error: 0.35410820046814506
Optimized RF Model (PSO) - R^2 Score: 0.6286296121974948


In [6]:
# Compare results of baseline and Optimized RF Model (PSO
print("Baseline RF Model")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

print("\nOptimized RF Model (PSO)")
print(f"Mean Squared Error: {mse_optimized}")
print(f"R^2 Score: {r2_optimized}")


Baseline RF Model
Mean Squared Error: 0.4047362462462565
R^2 Score: 0.5755335331757621

Optimized RF Model (PSO)
Mean Squared Error: 0.35410820046814506
R^2 Score: 0.6286296121974948


In [7]:
#Optimize Random Forest with Genetic Algorithm (GA)
import random
from deap import base, creator, tools, algorithms
from sklearn.metrics import mean_squared_error

def rf_ga(individual):
    n_estimators = int(individual[1])
    max_depth = int(individual[1])
    
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_m1, y_train_m1)
    y_pred = rf_model.predict(X_test_m1)
    
    return mean_squared_error(y_test_m1, y_pred),

# Create types
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

# Register functions
toolbox = base.Toolbox()
toolbox.register("attr_float", random.uniform, 0.01, 0.3)
toolbox.register("attr_int", random.randint, 3, 10)
toolbox.register("attr_int2", random.randint, 50, 300)
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_float, toolbox.attr_int, toolbox.attr_int2), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", rf_ga)
toolbox.register("mate", tools.cxBlend, alpha=0.5)

# Mutation function that constrains the learning_rate to [0.01, 0.3]
def constrained_mutation(individual, indpb):
    if random.random() < indpb:
        individual[0] = min(max(individual[0] + random.gauss(0, 0.05), 0.01), 0.3)
    if random.random() < indpb:
        individual[1] = random.randint(3, 10)
    if random.random() < indpb:
        individual[2] = random.randint(50, 300)
    return individual,

toolbox.register("mutate", constrained_mutation, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# Define population and evolution
population = toolbox.population(n=10)
algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=40, verbose=True)

# Extract best individual
best_individual = tools.selBest(population, 1)[0]

# Extracting the best individual
best_individual = tools.selBest(population, k=1)[0]
optimal_params_ga = [int(best_individual[0]), int(best_individual[1])]
print(f'Optimized Parameters (GA): {optimal_params_ga}')
print(f'GA optimized parameters: {best_individual}')


gen	nevals
0  	10    
1  	10    
2  	8     
3  	10    
4  	9     
5  	8     
6  	9     
7  	7     
8  	5     
9  	6     
10 	8     
11 	6     
12 	6     
13 	6     
14 	8     
15 	10    
16 	9     
17 	5     
18 	5     
19 	10    
20 	8     
21 	8     
22 	6     
23 	8     
24 	8     
25 	10    
26 	9     
27 	7     
28 	6     
29 	8     
30 	8     
31 	10    
32 	6     
33 	7     
34 	10    
35 	6     
36 	10    
37 	8     
38 	9     
39 	7     
40 	8     
Optimized Parameters (GA): [0, 8]
GA optimized parameters: [0.06800087993174751, 8.640490283125306, 121.49124043910572]


In [8]:
# Retrain the Random Forest with optimized parameters from GA
rf_model_ga_optimized = RandomForestRegressor(
    n_estimators=optimal_params_ga[1], 
    max_depth=optimal_params_ga[1], 
    random_state=42
)

rf_model_ga_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_ga_optimized = rf_model_ga_optimized.predict(X_train_m1)
y_test_pred_ga_optimized = rf_model_ga_optimized.predict(X_test_m1)

# Evaluate the optimized model on the test data
mse_ga_optimized = mean_squared_error(y_test_m1, y_test_pred_ga_optimized)
r2_ga_optimized = r2_score(y_test_m1, y_test_pred_ga_optimized)

print(f'Optimized RF Model (GA) - Mean Squared Error: {mse_ga_optimized}')
print(f'Optimized RF Model (GA) - R^2 Score: {r2_ga_optimized}')

# Save predictions vs. ground truth to Excel
optimized_ga_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_ga_optimized})
optimized_ga_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_ga_optimized})

optimized_ga_train_results.to_excel('outputs/M2optimized_rf_ga_train_predictions.xlsx', index=False)
optimized_ga_test_results.to_excel('outputs/M2optimized_rf_ga_test_predictions.xlsx', index=False)


Optimized RF Model (GA) - Mean Squared Error: 0.35962710430243205
Optimized RF Model (GA) - R^2 Score: 0.6228416709567263


In [9]:
#Optimized RF Model (PSO) vs. Optimized RF Model (GA)

In [10]:
import optuna
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 1, 20)

    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    
    rf_model.fit(X_train_m1, y_train_m1)
    y_pred = rf_model.predict(X_test_m1)
    
    mse = mean_squared_error(y_test_m1, y_pred)
    
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

optimal_params_optuna = study.best_params
print(f'Optimized Parameters (Optuna): {optimal_params_optuna}')

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-08-31 23:18:50,630] A new study created in memory with name: no-name-b7e6ef12-1a84-4aa7-94fd-1025b9a43a0f
[I 2024-08-31 23:18:50,759] Trial 0 finished with value: 0.35863664627367975 and parameters: {'n_estimators': 15, 'max_depth': 8}. Best is trial 0 with value: 0.35863664627367975.
[I 2024-08-31 23:18:51,104] Trial 1 finished with value: 0.7130983551709307 and parameters: {'n_estimators': 75, 'max_depth': 1}. Best is trial 0 with value: 0.35863664627367975.
[I 2024-08-31 23:18:51,746] Trial 2 finished with value: 0.3585013046007807 and parameters: {'n_estimators': 63, 'max_depth': 7}. Best is trial 2 with value: 0.3585013046007807.
[I 2024-08-31 23:18:51,949] Trial 3 finished with value: 0.40272765194297144 and parameters: {'n_estimators': 13, 'max_depth': 17}. Best is trial 2 with value: 0.3585013046007807.
[I 2024-08-31 23:18:52,353] Trial 4 finished with value: 0.3569705186864074 and parameters: {'n_estimators': 48, 'max_d

Optimized Parameters (Optuna): {'n_estimators': 95, 'max_depth': 8}


In [11]:
# Retrain the model with the optimized parameters
rf_model_optuna_optimized = RandomForestRegressor(
    n_estimators=optimal_params_optuna['n_estimators'],
    max_depth=optimal_params_optuna['max_depth'],
    random_state=42
)

rf_model_optuna_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_optuna_optimized = rf_model_optuna_optimized.predict(X_train_m1)
y_test_pred_optuna_optimized = rf_model_optuna_optimized.predict(X_test_m1)

# Save predictions vs. ground truth to Excel
optimized_optuna_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_optuna_optimized})
optimized_optuna_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_optuna_optimized})

optimized_optuna_train_results.to_excel('outputs/M2optimized_rf_optuna_train_predictions.xlsx', index=False)
optimized_optuna_test_results.to_excel('outputs/M2optimized_rf_optuna_test_predictions.xlsx', index=False)

In [12]:
import numpy as np
from niapy.task import Task
from niapy.algorithms.basic import HarrisHawksOptimization
from niapy.problems.problem import Problem
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define a custom benchmark problem
class RandomForestOptimizationProblem(Problem):
    def __init__(self, X_train, y_train, X_test, y_test):
        # Define the problem dimension and bounds (2 parameters to optimize)
        super().__init__(dimension=2, lower=[10, 1], upper=[100, 20])
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def _evaluate(self, x):
        # Extract the parameters
        n_estimators = int(x[0])
        max_depth = int(x[1])

        # Create and train the Random Forest model
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        rf_model.fit(self.X_train, self.y_train)

        # Predict and calculate the mean squared error
        y_pred = rf_model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, y_pred)
        
        return mse

# Initialize the problem with your dataset
problem = RandomForestOptimizationProblem(X_train_m1, y_train_m1, X_test_m1, y_test_m1)


In [13]:
# Define the task for the HHO algorithm
task = Task(problem=problem, max_iters=100)

# Initialize the HHO algorithm
algo = HarrisHawksOptimization(population_size=30)

# Run the optimization
best_params, best_mse = algo.run(task)

print(f'Optimized Parameters (HHO): n_estimators = {int(best_params[0])}, max_depth = {int(best_params[1])}')
print(f'Best MSE achieved: {best_mse}')


Optimized Parameters (HHO): n_estimators = 21, max_depth = 7
Best MSE achieved: 0.35303886323718014


In [14]:
# Train the model with the optimized parameters
rf_model_hho_optimized = RandomForestRegressor(
    n_estimators=int(best_params[0]),
    max_depth=int(best_params[1]),
    random_state=42
)

rf_model_hho_optimized.fit(X_train_m1, y_train_m1)

# Make predictions on both training and test data
y_train_pred_hho_optimized = rf_model_hho_optimized.predict(X_train_m1)
y_test_pred_hho_optimized = rf_model_hho_optimized.predict(X_test_m1)

# Save predictions vs. ground truth to Excel
optimized_hho_train_results = pd.DataFrame({'Ground Truth': y_train_m1, 'Prediction': y_train_pred_hho_optimized})
optimized_hho_test_results = pd.DataFrame({'Ground Truth': y_test_m1, 'Prediction': y_test_pred_hho_optimized})

optimized_hho_train_results.to_excel('outputs/M2optimized_rf_hho_train_predictions.xlsx', index=False)
optimized_hho_test_results.to_excel('outputs/M2optimized_rf_hho_test_predictions.xlsx', index=False)
