# Datast 3 : Car Evaluation

In this dataset we will explore how our plumber optimizer performs on imbalenced multiclass classification where we use f1_score 

### Importing the libraries 

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import time
import psutil
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score

### Importinng the PlumberOptimizer

In [None]:
import sys
sys.path.append('../')
from Plumber import PlumberOptimizer

### Loading the Data and performing feature engineering:

In [3]:
file_path = './car.data'
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
car_data = pd.read_csv(file_path, header=None, names=column_names)

# One-hot encode the categorical variables
encoder = OneHotEncoder()
X = encoder.fit_transform(car_data.drop('class', axis=1))

# Label encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(car_data['class'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Testing the performance and efficiency of the PlumberOptimizer

In [None]:
# Arrays to store execution times, memory usage, CPU usage, and F1 scores
plumber_time = np.zeros(5,'float')
plumber_mem = np.zeros(5,'float')
plumber_cpu =  np.zeros(5,'float')
plumber_score = np.zeros(5,'float')


for i in range(5):
    opt = PlumberOptimizer(X_train, y_train, f1=True)

    # Monitor the current process, record the start time, initial CPU usage, and memory usage in MB
    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu = process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 * 1024) 
    best = opt.optimize()

    
    end_time = time.time()
    end_cpu = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 * 1024)  

    # Calculating execution time, memory usage, and CPU usage
    plumber_time[i] = end_time - start_time
    plumber_mem[i] = end_mem - start_mem
    plumber_cpu[i] = end_cpu - start_cpu

    # Training the model using the best parameters
    rf = RandomForestClassifier(**best , random_state=42 )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    plumber_score[i] = f1_score(y_test, y_pred,average='weighted')

# summary statistics for the Plumber Optimizer
print("\n" *3)
print("*************")
print(f"Execution Time: mean={np.mean(plumber_time):.2f} seconds, std={np.std(plumber_time)} seconds, max={np.max(plumber_time)} seconds, min={np.min(plumber_time)} seconds")
print(f"CPU Usage: mean={np.mean(plumber_cpu):.2f}%, std={np.std(plumber_cpu):.2f}%, max={np.max(plumber_cpu):.2f}%, min={np.min(plumber_cpu):.2f}%")
print(f"Memory Usage: mean={np.mean(plumber_mem):.2f} MB, std={np.std(plumber_mem):.2f} MB, max={np.max(plumber_mem):.2f} MB, min={np.min(plumber_mem):.2f} MB")
print(f"Validation Score: mean={np.mean(plumber_score)} , std={np.std(plumber_score)} , max={np.max(plumber_score)} , min={np.min(plumber_score)} ")

A single model takes  0.08180832862854004 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best Score: 0.9500393440720188
A single model takes  0.07026886940002441 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best Score: 0.9500393440720188
A single model takes  0.06702589988708496 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best Score: 0.9500393440720188
A single model takes  0.06731224060058594 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 75, 'max_depth': 25, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best Score: 0.9492042533373249
A

In [None]:
# Generating visualizations for parameter importance and parallel coordinate study

fig1, fig2 = opt.visualize()

fig1.show()
fig2.show()

fig1: After the first phase, This is how important each variable is

fig2: Visualizing the focused study plot parallel coordinate


### Testing the performance and efficiency of the GridSearch

In [None]:

# Defining the parameter grid for GridSearch
param_grid = {
    'n_estimators': [100, 200 , 300, 400, 500], 
    'max_features': ['sqrt', 'log2'],  
    'max_depth': [10, 20, 30, 40, 50],  
    'min_samples_split': [2, 5, 10, 15, 20], 
    'min_samples_leaf': [ 2, 4 , 6, 8, 10] 
}
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')

# Monitor the current process, record the start time, initial CPU usage, and memory usage in MB
process = psutil.Process(os.getpid())
start_time = time.time()
start_cpu = process.cpu_percent(interval=None)
start_mem = process.memory_info().rss / (1024 * 1024)  

grid_search.fit(X_train, y_train)

end_time = time.time()
end_cpu = process.cpu_percent(interval=None)
end_mem = process.memory_info().rss / (1024 * 1024)  

# Print metrics for GridSearch
print(f"Execution Time: {end_time - start_time} seconds")
print(f"CPU Usage: {end_cpu - start_cpu}%")
print(f"Memory Usage: {end_mem - start_mem} MB")

# Training the model with the best parameters and evaluate
rf = RandomForestClassifier(** grid_search.best_params_, random_state=42 )
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

f1 = f1_score(y_test, y_pred,average='weighted')
print(f'validation f1_score: {f1}')

Execution Time: 79.20469880104065 seconds
CPU Usage: 2.9%
Memory Usage: 3.25 MB
validation f1_score: 0.9487251078407137



invalid value encountered in cast



### Testing the performance and efficiency of the RandomSearch

In [None]:
# Defining the parameter distribution for RandomizedSearch
param_dist = {
    'n_estimators': np.arange(50, 500),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.arange(10, 31),
    'min_samples_split': np.arange(2, 50),
    'min_samples_leaf': np.arange(1, 50)
}

# Arrays to store execution times, memory usage, CPU usage, and F1 scores
rand_time = np.zeros(5,'float')
rand_mem = np.zeros(5,'float')
rand_cpu =  np.zeros(5,'float')
rand_score = np.zeros(5,'float')


for i in range(5):

    rf = RandomForestClassifier(random_state=42)
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                    n_iter=250, cv=3, n_jobs=-1,scoring='f1_weighted')
    
    # Monitor the current process, record the start time, initial CPU usage, and memory usage in MB
    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu = process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 * 1024)  

    random_search.fit(X_train, y_train)

    end_time = time.time()
    end_cpu = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 * 1024)  

    # Calculating execution time, memory usage, and CPU usage
    rand_time[i] = end_time - start_time
    rand_mem[i] = end_mem - start_mem
    rand_cpu[i] = end_cpu - start_cpu

    # Training using best parameters
    rf = RandomForestClassifier(** random_search.best_params_ , random_state=42 )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    rand_score[i] = f1_score(y_test, y_pred,average='weighted')

# Summary metrics for RandomizedSearch
print(f"Execution Time: mean={np.mean(rand_time):.2f} seconds, std={np.std(rand_time)} seconds, max={np.max(rand_time)} seconds, min={np.min(rand_time)} seconds")
print(f"CPU Usage: mean={np.mean(rand_cpu):.2f}%, std={np.std(rand_cpu):.2f}%, max={np.max(rand_cpu):.2f}%, min={np.min(rand_cpu):.2f}%")
print(f"Memory Usage: mean={np.mean(rand_mem):.2f} MB, std={np.std(rand_mem):.2f} MB, max={np.max(rand_mem):.2f} MB, min={np.min(rand_mem):.2f} MB")
print(f"Validation Score: mean={np.mean(rand_score)} , std={np.std(rand_score)} , max={np.max(rand_score)} , min={np.min(rand_score)} ")



invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast



Execution Time: mean=13.41 seconds, std=0.43932979196647676 seconds, max=14.19255781173706 seconds, min=12.92927598953247 seconds
CPU Usage: mean=4.58%, std=0.52%, max=5.50%, min=4.10%
Memory Usage: mean=0.84 MB, std=0.80 MB, max=2.14 MB, min=0.00 MB
Validation Score: mean=0.953596218702686 , std=0.012131639932962164 , max=0.9664836991463859 , min=0.9333270820108261 



invalid value encountered in cast



### Testing the performance and efficiency of the BayesSearchCV

In [None]:
# Define the parameter distribution for BayesSearchCV
param_dist = {
    'n_estimators': Integer(50, 500),
    'max_features': Categorical(['sqrt', 'log2']),
    'max_depth': Integer(10, 50),
    'min_samples_split': Integer(2, 50),
    'min_samples_leaf': Integer(1, 50)
}

# Arrays to store execution times, memory usage, CPU usage, and F1 scores
rand_time = np.zeros(5,'float')
rand_mem = np.zeros(5,'float')
rand_cpu =  np.zeros(5,'float')
rand_score = np.zeros(5,'float')
for i in range(5):
    print(i)
    rf = RandomForestClassifier(random_state=42)
    bayes_search = BayesSearchCV(estimator=rf, search_spaces=param_dist,
                             n_iter=100, cv=3, n_jobs=-1,scoring='f1_weighted')

    # Monitor the current process, record the start time, initial CPU usage, and memory usage in MB
    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu = process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 * 1024) 

    bayes_search.fit(X_train, y_train)

    end_time = time.time()
    end_cpu = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 * 1024)  

    # Calculating execution time, memory usage, and CPU usage
    rand_time[i] = end_time - start_time
    rand_mem[i] = end_mem - start_mem
    rand_cpu[i] = end_cpu - start_cpu

    # Training using best parameters
    rf = RandomForestClassifier(** bayes_search.best_params_ , random_state=42 )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    rand_score[i] = f1_score(y_test, y_pred,average='weighted')


# Summary metrics for BayesSearchCV
print(f"Execution Time: mean={np.mean(rand_time):.2f} seconds, std={np.std(rand_time)} seconds, max={np.max(rand_time)} seconds, min={np.min(rand_time)} seconds")
print(f"CPU Usage: mean={np.mean(rand_cpu):.2f}%, std={np.std(rand_cpu):.2f}%, max={np.max(rand_cpu):.2f}%, min={np.min(rand_cpu):.2f}%")
print(f"Memory Usage: mean={np.mean(rand_mem):.2f} MB, std={np.std(rand_mem):.2f} MB, max={np.max(rand_mem):.2f} MB, min={np.min(rand_mem):.2f} MB")
print(f"Validation Score: mean={np.mean(rand_score)} , std={np.std(rand_score)} , max={np.max(rand_score)} , min={np.min(rand_score)} ")


0
1
2
3



The objective has been evaluated at point [np.int64(50), np.str_('log2'), np.int64(1), np.int64(2), np.int64(500)] before, using random point [np.int64(21), 'sqrt', np.int64(18), np.int64(46), np.int64(415)]



4
Execution Time: mean=89.89 seconds, std=3.208992615543136 seconds, max=95.37893891334534 seconds, min=86.04194712638855 seconds
CPU Usage: mean=81.32%, std=0.61%, max=82.00%, min=80.40%
Memory Usage: mean=21.89 MB, std=33.64 MB, max=86.84 MB, min=0.00 MB
Validation Score: mean=0.9687250974369241 , std=0.004751394555981951 , max=0.9725060924394234 , min=0.9596102106818903 
