# Dataset 4: Auto-MPG

With this dataset we explore how our Plumber optimizer performs on a simple regression task with a relatively small dataset.

### Importing the libraries 

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import time
import psutil
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score

### Importing the PlumberOptimizer

In [None]:
import sys
sys.path.append('../')
from Plumber import PlumberOptimizer

### Loading the Data and performing feature engineering:

In [None]:
# Read the raw Auto-MPG table: whitespace-separated, no header row, and "?"
# marks a missing value.
file_path = "./auto-mpg.data"
column_names = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model_year", "origin", "car_name",
]

data = pd.read_csv(file_path, sep=r'\s+', names=column_names, na_values="?")

# Fill the missing horsepower readings with the column median.
horsepower_median = data["horsepower"].median()
data["horsepower"] = data["horsepower"].fillna(horsepower_median)

# Feature engineering: one-hot encode `origin`, dropping the first level so
# the resulting dummy columns are not redundant.
encoder = OneHotEncoder(drop='first', sparse_output=False)
origin_encoded = encoder.fit_transform(data[['origin']])
origin_encoded_df = pd.DataFrame(
    origin_encoded,
    columns=encoder.get_feature_names_out(['origin']),
)

# Append the dummy columns, then discard the raw `origin` and `car_name` columns.
data = pd.concat([data, origin_encoded_df], axis=1).drop(columns=["origin", "car_name"])

# Target is `mpg`; every remaining column is a feature.
y = data["mpg"]
X = data.drop(columns=["mpg"])

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Reference point: an untuned 100-tree random forest, scored by mean squared
# error on the held-out test split.
baseline_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = baseline_forest.fit(X_train, y_train)  # fit() returns the fitted estimator itself

y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 4.83


### Testing the performance and efficiency of the PlumberOptimizer

In [None]:
# Benchmark the PlumberOptimizer: run the search 5 times and record, per run,
# the wall-clock time, the change in resident memory, the CPU-usage reading
# and the test-set MSE of a forest refit with the suggested hyperparameters.

# Arrays to store execution times, memory usage, CPU usage, and evaluation scores
plumber_time = np.zeros(5,'float')
plumber_mem = np.zeros(5,'float')
plumber_cpu =  np.zeros(5,'float')
plumber_score = np.zeros(5,'float')

for i in range(5):
    opt = PlumberOptimizer(X_train, y_train,classification=False)

    # Monitor the current process, record the start time, initial CPU usage, and memory usage in MB.
    # psutil.Process() with no pid targets the running process, so this cell no
    # longer needs `os` (which the notebook never imports -> NameError on a
    # fresh kernel).
    process = psutil.Process()
    start_time = time.time()
    # psutil documents the first cpu_percent(interval=None) call on a Process
    # object as a priming call that returns 0.0; the later call reports usage
    # since this one.
    start_cpu = process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 * 1024)

    best = opt.optimize()

    end_time = time.time()
    end_cpu = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 * 1024)

    # Calculating execution time, memory usage, and CPU usage
    plumber_time[i] = end_time - start_time
    plumber_mem[i] = end_mem - start_mem
    plumber_cpu[i] = end_cpu - start_cpu

    # Training and evaluating the model using the best parameters
    # (`best` is unpacked as RandomForestRegressor keyword arguments).
    rf = RandomForestRegressor(**best , random_state=42 )
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)

    # The stored score is the mean squared error on the test split (lower is better).
    plumber_score[i] = mean_squared_error(y_test, y_pred)

# summary for the Plumber Optimizer
print("\n" *3)
print("*************")
print(f"Execution Time: mean={np.mean(plumber_time):.2f} seconds, std={np.std(plumber_time)} seconds, max={np.max(plumber_time)} seconds, min={np.min(plumber_time)} seconds")
print(f"CPU Usage: mean={np.mean(plumber_cpu):.2f}%, std={np.std(plumber_cpu):.2f}%, max={np.max(plumber_cpu):.2f}%, min={np.min(plumber_cpu):.2f}%")
print(f"Memory Usage: mean={np.mean(plumber_mem):.2f} MB, std={np.std(plumber_mem):.2f} MB, max={np.max(plumber_mem):.2f} MB, min={np.min(plumber_mem):.2f} MB")
print(f"Validation Score: mean={np.mean(plumber_score)} , std={np.std(plumber_score)} , max={np.max(plumber_score)} , min={np.min(plumber_score)} ")

A single model takes  0.07674717903137207 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best Score: -9.456518132311913
A single model takes  0.05961298942565918 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best Score: -9.448879200591401
A single model takes  0.05901598930358887 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 50, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best Score: -9.499934884780803
A single model takes  0.05967998504638672 seconds to run
Suggested number of trails is 150 
Best Parameters: {'n_estimators': 300, 'max_depth': 25, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best Score: -9.458525990164155
A

### Testing the performance and efficiency of the GridSearch

In [None]:
# Benchmark an exhaustive GridSearchCV over a fixed hyperparameter grid:
# a single timed run, followed by a refit on the training split and an MSE
# evaluation on the test split.

# Defining GridSearch parameters
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [2, 4, 6, 8,12]
}

rf = RandomForestRegressor(random_state=42)

# 3-fold CV; candidates are ranked by negated MSE (higher is better).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Monitor the current process, record the start time, initial CPU usage, and memory usage in MB.
# psutil.Process() with no pid targets the running process, so this cell no
# longer needs `os` (which the notebook never imports -> NameError on a fresh
# kernel).
process = psutil.Process()
start_time = time.time()
# psutil documents the first cpu_percent(interval=None) call on a Process
# object as a priming call that returns 0.0; the later call reports usage
# since this one.
start_cpu = process.cpu_percent(interval=None)
start_mem = process.memory_info().rss / (1024 * 1024)

grid_search.fit(X_train, y_train)

end_time = time.time()
end_cpu = process.cpu_percent(interval=None)
end_mem = process.memory_info().rss / (1024 * 1024)

# Print metrics for GridSearch
print(f"Execution Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {end_cpu - start_cpu:.2f}%")
print(f"Memory Usage: {end_mem - start_mem:.2f} MB")

# Training the model with the best parameters and evaluate
rf = RandomForestRegressor(** grid_search.best_params_, random_state=42 )
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# NOTE: despite the variable name and the printed label, this value is the
# mean squared error on the test split (lower is better), not an accuracy.
accuracy = mean_squared_error(y_test, y_pred)
print(f"Validation Accuracy: {accuracy}")




Execution Time: 55.61 seconds
CPU Usage: 5.20%
Memory Usage: 0.39 MB
Validation Accuracy: 4.759129346682876


  _data = np.array(data, dtype=dtype, copy=copy,


### Testing the performance and efficiency of the RandomSearch

In [None]:
# Benchmark RandomizedSearchCV: run a 250-candidate search 5 times and record,
# per run, the wall-clock time, the change in resident memory, the CPU-usage
# reading and the test-set MSE of a forest refit with the best parameters.

# Defining the parameter for RandomizedSearch
param_dist = {
    'n_estimators': np.arange(50, 500),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.arange(10, 31),
    'min_samples_split': np.arange(2, 50),
    'min_samples_leaf': np.arange(1, 50)
}
# Arrays to store execution times, memory usage, CPU usage, and evaluation scores
rand_time = np.zeros(5,'float')
rand_mem = np.zeros(5,'float')
rand_cpu =  np.zeros(5,'float')
rand_score = np.zeros(5,'float')
for i in range(5):
    rf = RandomForestRegressor(random_state=42)
    # No random_state is passed to the search itself, so each repeat samples
    # a different set of candidates.
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                    n_iter=250, cv=3, n_jobs=-1,scoring='neg_mean_squared_error')

    # Monitor the current process, record the start time, initial CPU usage, and memory usage in MB.
    # psutil.Process() with no pid targets the running process, so this cell no
    # longer needs `os` (which the notebook never imports -> NameError on a
    # fresh kernel).
    process = psutil.Process()
    start_time = time.time()
    # psutil documents the first cpu_percent(interval=None) call on a Process
    # object as a priming call that returns 0.0; the later call reports usage
    # since this one.
    start_cpu = process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 * 1024)

    random_search.fit(X_train, y_train)

    end_time = time.time()
    end_cpu = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 * 1024)

    # Calculating execution time, memory usage, and CPU usage
    rand_time[i] = end_time - start_time
    rand_mem[i] = end_mem - start_mem
    rand_cpu[i] = end_cpu - start_cpu

    # Training the model with the best parameters and evaluate
    rf = RandomForestRegressor(** random_search.best_params_ , random_state=42 )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # The stored score is the mean squared error on the test split (lower is better).
    rand_score[i] = mean_squared_error(y_test, y_pred)

#RandomizedSearchCV results
print(f"Execution Time: mean={np.mean(rand_time):.2f} seconds, std={np.std(rand_time)} seconds, max={np.max(rand_time)} seconds, min={np.min(rand_time)} seconds")
print(f"CPU Usage: mean={np.mean(rand_cpu):.2f}%, std={np.std(rand_cpu):.2f}%, max={np.max(rand_cpu):.2f}%, min={np.min(rand_cpu):.2f}%")
print(f"Memory Usage: mean={np.mean(rand_mem):.2f} MB, std={np.std(rand_mem):.2f} MB, max={np.max(rand_mem):.2f} MB, min={np.min(rand_mem):.2f} MB")
print(f"Validation Score: mean={np.mean(rand_score)} , std={np.std(rand_score)} , max={np.max(rand_score)} , min={np.min(rand_score)} ")


  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


Execution Time: mean=9.69 seconds, std=0.2744911821236582 seconds, max=10.126244068145752 seconds, min=9.30179476737976 seconds
CPU Usage: mean=5.80%, std=0.52%, max=6.40%, min=5.10%
Memory Usage: mean=0.69 MB, std=0.53 MB, max=1.53 MB, min=0.00 MB
Validation Score: mean=4.7608038748004 , std=0.1876776310729053 , max=4.978418353616898 , min=4.470734293932664 


  _data = np.array(data, dtype=dtype, copy=copy,


### Testing the performance and efficiency of the BayesSearchCV

In [None]:
# Benchmark BayesSearchCV: run a 50-iteration search 5 times and record, per
# run, the wall-clock time, the change in resident memory, the CPU-usage
# reading and the test-set MSE of a forest refit with the best parameters.

# Define the parameter for BayesSearchCV
param_dist = {
    'n_estimators': Integer(50, 500),
    'max_features': Categorical(['sqrt', 'log2']),
    'max_depth': Integer(10, 50),
    'min_samples_split': Integer(2, 50),
    'min_samples_leaf': Integer(1, 50)
}
# Arrays to store execution times, memory usage, CPU usage, and evaluation scores.
# NOTE: these reuse the `rand_*` names (and `param_dist`) from the
# RandomizedSearch cell, so running this cell overwrites those results.
rand_time = np.zeros(5,'float')
rand_mem = np.zeros(5,'float')
rand_cpu =  np.zeros(5,'float')
rand_score = np.zeros(5,'float')

for i in range(5):
    # Progress marker: index of the repeat currently running.
    print(i)
    rf = RandomForestRegressor(random_state=42)
    bayes_search = BayesSearchCV(estimator=rf, search_spaces=param_dist,
                             n_iter=50, cv=3, n_jobs=-1,scoring='neg_mean_squared_error')

    # Monitor the current process, record the start time, initial CPU usage, and memory usage in MB.
    # psutil.Process() with no pid targets the running process, so this cell no
    # longer needs `os` (which the notebook never imports -> NameError on a
    # fresh kernel).
    process = psutil.Process()
    start_time = time.time()
    # psutil documents the first cpu_percent(interval=None) call on a Process
    # object as a priming call that returns 0.0; the later call reports usage
    # since this one.
    start_cpu = process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 * 1024)

    bayes_search.fit(X_train, y_train)

    end_time = time.time()
    end_cpu = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 * 1024)

    # Calculating execution time, memory usage, and CPU usage
    rand_time[i] = end_time - start_time
    rand_mem[i] = end_mem - start_mem
    rand_cpu[i] = end_cpu - start_cpu

    # Training the model with the best parameters and evaluate
    rf = RandomForestRegressor(** bayes_search.best_params_ , random_state=42 )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # The stored score is the mean squared error on the test split (lower is better).
    rand_score[i] = mean_squared_error(y_test, y_pred)


# BayesSearchCV results
print(f"Execution Time: mean={np.mean(rand_time):.2f} seconds, std={np.std(rand_time)} seconds, max={np.max(rand_time)} seconds, min={np.min(rand_time)} seconds")
print(f"CPU Usage: mean={np.mean(rand_cpu):.2f}%, std={np.std(rand_cpu):.2f}%, max={np.max(rand_cpu):.2f}%, min={np.min(rand_cpu):.2f}%")
print(f"Memory Usage: mean={np.mean(rand_mem):.2f} MB, std={np.std(rand_mem):.2f} MB, max={np.max(rand_mem):.2f} MB, min={np.min(rand_mem):.2f} MB")
print(f"Validation Score: mean={np.mean(rand_score)} , std={np.std(rand_score)} , max={np.max(rand_score)} , min={np.min(rand_score)} ")


0
1




2




3
4




Execution Time: mean=23.59 seconds, std=1.7086194567197954 seconds, max=26.98505401611328 seconds, min=22.485191106796265 seconds
CPU Usage: mean=72.96%, std=1.25%, max=75.10%, min=71.20%
Memory Usage: mean=34.35 MB, std=55.40 MB, max=144.25 MB, min=0.00 MB
Validation Score: mean=4.745903558755072 , std=0.04455957154211209 , max=4.8101351064797795 , min=4.691229684685018 
