In [1]:
# Data processing libraries
import datasets
import pandas as pd
import os
import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import SVC classifier
from sklearn.svm import SVC
from SVC_optimizer_exe import SVC_Optimizer

# Import XGB classifier
import xgboost as xgb
from XGB_optimizer_exe import XGB_optimizer

# Import Gaussian classifier
from sklearn.naive_bayes import GaussianNB

# Import metrics to compute accuracy
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

# Import optimizer
import optuna

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Import functools
from functools import partial
import joblib

#Import preprocessor
from Preprocessor_exe import Preprocessor
from Preprocessor_caller import caller

2024-09-26 13:40:45.964405: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-26 13:40:46.059698: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-26 13:40:46.636143: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-26 13:40:47.201398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-26 13:40:47.643190: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

# Dataframe Builder

In [3]:
# Load the dataset through the `datasets` module imported in the first cell.
# NOTE(review): `gdd` is not defined in this notebook; presumably a project-local
# loader that returns a pandas DataFrame containing a "Label" column — confirm.
dtf = datasets.gdd.load_data()

In [14]:

# Run the project preprocessing entry point on the loaded frame and keep the
# frame exposed through the `.df` attribute of the object `caller` returns.
# NOTE(review): rebinding `dtf` makes this cell non-idempotent — re-running it
# feeds the already processed frame back into `caller`.
dtf = caller(dtf, "Label").df

# Separate the feature columns from the target column.
x = dtf.drop(columns="Label")
y = dtf["Label"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)


Epoch 108: early stopping
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 573us/step
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 430us/step
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 438us/step
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424us/step


# Optimizers for SVC and XGB

In [7]:

def SVC_Optimizer(X_train, X_test, y_train, y_test, param_grid=None, cv=5):
    '''Tunes an SVC with an exhaustive, cross-validated grid search and returns the best model.

    The search only uses the training data and is scored on accuracy. Because the
    grid search is created with ``refit=True``, the returned estimator has already
    been refitted on the whole training set with the winning parameters.

    NOTE(review): defining this function in the notebook shadows the
    ``SVC_Optimizer`` imported from ``SVC_optimizer_exe`` in the imports cell.

    Parameters:
    ----------
    X_train : array-like
        Training feature data.
    X_test : array-like
        Test feature data. Not read by the search; accepted so the function can be
        called with the full result of a train-test split.
    y_train : array-like
        Training target data.
    y_test : array-like
        Test target data. Not read by the search (see ``X_test``).
    param_grid : dict or list of dict, optional
        Grid handed to ``GridSearchCV``. When omitted, an RBF-kernel grid over
        ``C`` in {1, 10, 100, 1000} and ``gamma`` in {0.1, ..., 0.9} is used.
    cv : int, optional
        Number of cross-validation folds, default is 5.

    Returns:
    -------
    estimator
        ``best_estimator_`` of the fitted grid search.
    '''
    if param_grid is None:
        param_grid = [{'C': [1, 10, 100, 1000],
                       'kernel': ['rbf'],
                       'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
                      ]

    # Exhaustive search over the grid; n_jobs=-1 asks for all available cores.
    grid_search = GridSearchCV(estimator=SVC(),
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=cv,
                               refit=True,
                               verbose=0,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Print the best parameters for reference
    print(f"Best parameters found: {best_params}")

    return best_model
           



In [8]:
def XGB_optimizer(X_train, X_test, y_train, y_test, n_trials=10, timeout=600):
    '''Tunes an XGBoost classifier with Optuna and returns the refitted best model.

    Each trial samples a set of hyperparameters, fits an ``XGBClassifier`` on the
    training data and is scored by its accuracy on ``X_test`` / ``y_test``. The
    best configuration is then refitted on the training data.

    The exact constructor arguments of every trial are stored in the trial's user
    attributes under ``"xgb_params"``. ``trial.params`` is keyed by the Optuna
    suggestion names (``obj``, ``alpha``, ``lambda``, ...) and does not contain the
    fixed ``n_estimators``, so it cannot be used to rebuild the scored model.

    NOTE(review): hyperparameters are selected on the test split, so the accuracy
    reported for the best trial is optimistic — consider a separate validation set.
    NOTE(review): the objective search space mixes ``reg:*`` objectives with
    ``multi:softmax`` for a classifier — confirm this is intended.
    NOTE(review): defining this function in the notebook shadows the
    ``XGB_optimizer`` imported from ``XGB_optimizer_exe`` in the imports cell.

    Parameters:
    ----------
    X_train : array-like
        Training feature data.
    X_test : array-like
        Test feature data, used to score every trial.
    y_train : array-like
        Training target data.
    y_test : array-like
        Test target data, used to score every trial.
    n_trials : int, optional
        Maximum number of Optuna trials, default is 10.
    timeout : float, optional
        Time budget in seconds for the whole study, default is 600.

    Returns:
    -------
    tuple
        ``(best_model, best_trial)``: the classifier refitted with the best
        configuration and the Optuna trial that produced it.
    '''

    def objective(trial):
        '''Fits one candidate model and returns its accuracy on the test split.

        Parameters:
        ----------
        trial : optuna.trial.Trial
            A single call of the objective function corresponds to one trial of the optimization.

        Returns:
        -------
        float
            The accuracy (rounded to 4 decimals) of the model with the current set of hyperparameters.
        '''
        # Define the hyperparameter search space
        param = {
            'objective': trial.suggest_categorical("obj", ['reg:squarederror', 'reg:logistic', 'multi:softmax']),
            'booster': trial.suggest_categorical("booster", ["gbtree", "dart"]),
            'colsample_bynode': trial.suggest_float("colsample_bynode", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
            'n_estimators': 150,
            'reg_alpha': trial.suggest_float("alpha", 1e-8, 1.0, log=True),  # L1 regularization
            'reg_lambda': trial.suggest_float("lambda", 1e-8, 1.0, log=True),  # L2 regularization
            'subsample': trial.suggest_float("subsample", 0.2, 1.0),
        }

        # Tree parameters: both boosters in the search space are tree based,
        # so these are sampled for every trial.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 5, step=2)
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["learning_rate"] = trial.suggest_float("learning_rate", 0.008, 0.2)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

        # Dropout parameters only exist for the 'dart' booster
        if param["booster"] == "dart":
            param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

        # Remember the exact constructor arguments so the winning model can be rebuilt
        trial.set_user_attr("xgb_params", dict(param))

        # Create and train the XGBoost model with the suggested hyperparameters
        xgb_model = xgb.XGBClassifier(**param)
        xgb_model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = xgb_model.predict(X_test)
        return round(accuracy_score(y_test, y_pred), 4)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    best_trial = study.best_trial
    best_params = best_trial.params

    # Train the final model with the same constructor arguments that were scored
    best_model = xgb.XGBClassifier(**best_trial.user_attrs["xgb_params"])
    best_model.fit(X_train, y_train)

    # Print best hyperparameters
    print(f"Best hyperparameters found: {best_params}")

    # Return the best model and the best trial
    return best_model, best_trial


## Calls

In [9]:
# Keep the tuned models under their own names. Binding the XGBoost result to
# `xgb` would replace the `xgboost` module alias from the imports cell with a
# (model, trial) tuple, and every later `xgb.XGBClassifier(...)` call would fail.
svc_model = SVC_Optimizer(X_train, X_test, y_train, y_test)
xgb_model, xgb_best_trial = XGB_optimizer(X_train, X_test, y_train, y_test)

[I 2024-09-26 13:00:50,267] A new study created in memory with name: no-name-d6c16826-8bc4-4a6a-b11a-0d71212d9b92


Best parameters found: {'C': 1000, 'gamma': 0.2, 'kernel': 'rbf'}


[I 2024-09-26 13:00:50,927] Trial 0 finished with value: 0.9238 and parameters: {'obj': 'multi:softmax', 'booster': 'gbtree', 'colsample_bynode': 0.6179065209820528, 'colsample_bytree': 0.8891933764019502, 'alpha': 2.549759591989523e-07, 'lambda': 0.0003151835935466623, 'subsample': 0.710531342207821, 'max_depth': 3, 'min_child_weight': 2, 'learning_rate': 0.1372631862063397, 'gamma': 2.14838443195522e-08, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.9238.
[I 2024-09-26 13:01:19,434] Trial 1 finished with value: 0.909 and parameters: {'obj': 'multi:softmax', 'booster': 'dart', 'colsample_bynode': 0.8363227384301146, 'colsample_bytree': 0.6326759672576856, 'alpha': 6.564312726861813e-06, 'lambda': 0.02144911689260386, 'subsample': 0.2833391840860571, 'max_depth': 5, 'min_child_weight': 2, 'learning_rate': 0.03738092839194132, 'gamma': 0.4602095881905746, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.12634801076675134, 'skip_

Best hyperparameters found: {'obj': 'reg:logistic', 'booster': 'gbtree', 'colsample_bynode': 0.9499189514019735, 'colsample_bytree': 0.8860527795097257, 'alpha': 0.0003110064036822101, 'lambda': 0.6152146730576613, 'subsample': 0.55026940355463, 'max_depth': 5, 'min_child_weight': 5, 'learning_rate': 0.15874833519225046, 'gamma': 7.738208384103265e-05, 'grow_policy': 'depthwise'}


In [2]:
class Model_Executer:
    '''Creates, optimizes, trains and evaluates three models: XGBoost (XGB), Support Vector Classifier (SVC), and Naive Bayes (NB).

    Constructing an instance runs the whole pipeline: preprocessing, train/test
    split, hyperparameter optimization, training and evaluation. The class also
    includes a method for saving the fitted models.

    Attributes:
    -----------
    df : pd.DataFrame
        The input dataframe, exactly as passed in (it is not modified).
    label : str
        The name of the target column.
    df_imputed : pd.DataFrame
        The dataframe returned by the preprocessing step.
    X_train : pd.DataFrame
        The training data features.
    X_test : pd.DataFrame
        The test data features.
    y_train : pd.Series
        The training data labels.
    y_test : pd.Series
        The test data labels.
    booster, svc, nb : estimator
        The fitted XGBoost, SVC and Naive Bayes models (set by ``model_creator``).
    metrics_dict : dict
        A dictionary containing performance metrics for each model.

    Methods:
    --------
    preprocess(df, limit_try=3000, target="Label"):
        Drops the "Timestamp" column (on a copy) and runs the project ``Preprocessor``.
    split(df, label, test_size=0.2, random_state=None):
        Splits the dataframe into training and test sets.
    optimize_model(df, label="Label"):
        Optimizes and returns three models: XGBoost, SVC, and Naive Bayes.
    metrics(y_pred):
        Calculates and returns performance metrics (accuracy, RMSE, MAE, R2) for the predicted values.
    model_creator():
        Creates and trains the models, then returns a dictionary of performance metrics.
    saver(model_to_save, path, filename):
        Saves the specified model as a pickle file inside a date-stamped sub-directory of the given path.
    '''

    def __init__(self, df, label: str = "Label") -> None:
        '''Initializes the Model_Executer class and runs the full pipeline.

        Parameters:
        ----------
        df : pd.DataFrame
            The input dataframe.
        label : str, optional
            The name of the target column to predict, default is "Label".
        '''
        self.df = df
        self.label = label
        # Forward the label so a non-default target reaches every pipeline stage.
        self.df_imputed = self.preprocess(self.df, target=label)
        self.X_train, self.X_test, self.y_train, self.y_test = self.split(self.df_imputed, label)
        self.metrics_dict = self.model_creator()

    def preprocess(self, df, limit_try: int = 3000, target: str = "Label"):
        '''Drops the "Timestamp" column and runs the project preprocessor.

        The first ``limit_try`` rows are handed to ``Preprocessor``; the remaining
        rows are passed to its ``predictor`` method, whose result is returned. The
        caller's dataframe is left untouched.

        Parameters:
        ----------
        df : pd.DataFrame
            The input dataframe to preprocess.
        limit_try : int, optional
            The number of leading rows given to the preprocessor, default is 3000.
        target : str, optional
            The target column name, default is "Label".

        Returns:
        -------
        pd.DataFrame
            The frame returned by ``Preprocessor.predictor`` for the remaining rows.
        '''
        if "Timestamp" in df.columns:
            # Non-mutating drop: an in-place drop would alter the caller's frame.
            df = df.drop(columns="Timestamp")

        df_veri = df.iloc[:limit_try, :]
        df_try = df.iloc[limit_try:, :]

        obj = Preprocessor(df=df_veri, target=target)
        return obj.predictor(df_try)

    def split(self, df, label, test_size: float = 0.2, random_state=None):
        '''Splits the dataframe into training and test sets.

        Parameters:
        ----------
        df : pd.DataFrame
            The input dataframe to split.
        label : str
            The name of the target column.
        test_size : float, optional
            Fraction of rows held out for testing, default is 0.2.
        random_state : int or None, optional
            Seed for the shuffle; ``None`` (default) gives a different split on every call.

        Returns:
        -------
        tuple
            A tuple containing the training and test features and labels (X_train, X_test, y_train, y_test).
        '''
        X = df.drop(label, axis=1).copy()
        y = df[label]

        return model_selection.train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    def optimize_model(self, df: pd.DataFrame, label: str = "Label"):
        '''Optimizes and returns three models: XGBoost, SVC, and Naive Bayes.

        The optimizers work on the split stored on the instance; ``df`` and
        ``label`` are not read and are kept for backward compatibility.

        Parameters:
        ----------
        df : pd.DataFrame
            The imputed dataframe (unused).
        label : str, optional
            The target column name, default is "Label" (unused).

        Returns:
        -------
        tuple
            A tuple containing the (unfitted XGBoost, SVC, unfitted Naive Bayes) models.
        '''

        def build_xgb():
            # Imported locally: the notebook defines a function named
            # `XGB_optimizer` with a different API and rebinds `xgb`, both of
            # which would shadow the module-level names this method relies on.
            from XGB_optimizer_exe import XGB_optimizer as xgb_optimizer_cls
            from xgboost import XGBClassifier

            print("Entering optimization 1")
            opti = xgb_optimizer_cls(self.X_train, self.X_test, self.y_train, self.y_test)
            trial = opti.optimization()
            # NOTE(review): `trial.params` is keyed by the Optuna suggestion names;
            # confirm they all map onto XGBClassifier constructor arguments.
            xgb_model = XGBClassifier(**trial.params)
            print(f"XGB model created with params: {xgb_model.get_params()}")
            return xgb_model

        def build_svc():
            # Imported locally for the same shadowing reason as in build_xgb.
            from SVC_optimizer_exe import SVC_Optimizer as svc_optimizer_cls

            print("Entering optimization 2")
            svc_o = svc_optimizer_cls(self.X_train, self.X_test, self.y_train, self.y_test)
            best_svc_model = svc_o.svc_opt_model()
            print("SVC model created")
            return best_svc_model

        def build_naive_bayes():
            print("Entering optimization 3")
            gnb = GaussianNB()
            print("Naive Bayes model created")
            return gnb

        return build_xgb(), build_svc(), build_naive_bayes()

    def metrics(self, y_pred):
        '''Calculates and returns performance metrics (accuracy, RMSE, MAE, R2) for the predicted values.

        Predictions are compared against ``self.y_test``.

        Parameters:
        ----------
        y_pred : array-like
            The predicted values.

        Returns:
        -------
        dict
            A dictionary with the keys "accuracy", "rmse", "mae" and "r2".
        '''
        accuracy = round(accuracy_score(self.y_test, y_pred), 4)
        # Take the root explicitly instead of passing `squared=False`, which is
        # not accepted by recent scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        mae = mean_absolute_error(self.y_test, y_pred)
        r2 = r2_score(self.y_test, y_pred)

        return {"accuracy": accuracy, "rmse": rmse, "mae": mae, "r2": r2}

    def model_creator(self):
        '''Creates and trains the models, then returns a dictionary of performance metrics.

        The fitted models are stored on the instance as ``booster`` (XGBoost),
        ``svc`` and ``nb``.

        Returns:
        -------
        dict
            A dictionary containing performance metrics for each model, keyed "XGB", "SVC" and "NB".
        '''
        xgb_o, svc_o, naive_bayes_o = self.optimize_model(self.df_imputed, self.label)

        metrics_dict = {}

        # Train and evaluate the XGBoost model
        self.booster = xgb_o.fit(self.X_train, self.y_train)
        metrics_dict["XGB"] = self.metrics(self.booster.predict(self.X_test))

        # Train and evaluate the SVC model
        self.svc = svc_o.fit(self.X_train, self.y_train)
        metrics_dict["SVC"] = self.metrics(self.svc.predict(self.X_test))

        # Train and evaluate the Naive Bayes model
        self.nb = naive_bayes_o.fit(self.X_train, self.y_train)
        metrics_dict["NB"] = self.metrics(self.nb.predict(self.X_test))

        return metrics_dict

    def saver(self, model_to_save, path, filename):
        '''Saves the specified model as ``<cwd>/<path>/<dd_mm_YYYY>/<filename>.pkl``.

        Paths are built with ``os.path.join`` so the method works on every
        platform, and missing directories (including parents) are created.

        Parameters:
        ----------
        model_to_save : object
            The model to be saved.
        path : str
            The directory path (relative to the current working directory) where the model will be saved.
        filename : str
            The name of the file, without extension, to save the model as.

        Returns:
        -------
        str or None
            The path of the written file, or ``None`` when the dump failed.
        '''
        cwd = os.getcwd()
        path = os.path.join(cwd, path)
        date = datetime.datetime.now().strftime("%d_%m_%Y")
        target_dir = os.path.join(path, date)
        target_file = os.path.join(target_dir, f"{filename}.pkl")

        if not os.path.isdir(target_dir):
            os.makedirs(target_dir, exist_ok=True)
            print(f"Directory {path} created at {cwd}")

        try:
            joblib.dump(model_to_save, target_file)
        except Exception as exc:
            # Saving stays best-effort, but the failure is reported instead of hidden.
            print(f"Could not save model to {target_file}: {exc}")
            return None

        return target_file


In [3]:
# Reload the dataset; `dtf` was rebound to the preprocessed frame in an earlier cell.
dtf = datasets.gdd.load_data()
# Constructing Model_Executer runs its whole pipeline (preprocess, split,
# optimize, train, evaluate) and stores the results on the instance.
model_ex = Model_Executer(dtf)

I0000 00:00:1727350860.461053  791698 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-09-26 13:41:00.470112: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 161: early stopping
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 565us/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 587us/step
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423us/step


[I 2024-09-26 13:41:05,469] A new study created in memory with name: no-name-7097583e-e3c7-4963-9154-7dbea1d6c468


Entering optimization 1


[I 2024-09-26 13:41:11,518] Trial 0 finished with value: 0.7813 and parameters: {'obj': 'reg:squarederror', 'booster': 'dart', 'colsample_bynode': 0.5438463507638324, 'colsample_bytree': 0.9948004339427853, 'alpha': 0.047092244786536716, 'lambda': 0.046866331074994196, 'subsample': 0.2641420452002266, 'max_depth': 3, 'min_child_weight': 3, 'learning_rate': 0.02049990266905863, 'gamma': 0.33566335509193096, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.7633198542737005, 'skip_drop': 0.0032921318906337167}. Best is trial 0 with value: 0.7813.
[I 2024-09-26 13:41:35,430] Trial 1 finished with value: 0.8782 and parameters: {'obj': 'multi:softmax', 'booster': 'dart', 'colsample_bynode': 0.779974468394286, 'colsample_bytree': 0.740388351407157, 'alpha': 7.545454408390574e-08, 'lambda': 0.01966373258814042, 'subsample': 0.6706771270190572, 'max_depth': 3, 'min_child_weight': 4, 'learning_rate': 0.06646748902823546, 'gamma': 0.014361465147437589

XGB model created with params: {'objective': 'binary:logistic', 'base_score': None, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': 0.7987432838017517, 'colsample_bytree': 0.7997256981170161, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 2.848076242548765e-08, 'grow_policy': 'depthwise', 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.127212426454724, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 5, 'max_leaves': None, 'min_child_weight': 10, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.5039035443834982, 'tree_method': None, 'validate_parameters': None, 'verbosity

# Retrieving the optimized models 

In [8]:
# Show the evaluation metrics (accuracy, RMSE, MAE, R2) stored for each model ("XGB", "SVC", "NB").
model_ex.metrics_dict

{'XGB': {'accuracy': 0.8792,
  'rmse': 1.7009839445590746,
  'mae': 0.4334637964774951,
  'r2': 0.28149639610429167},
 'SVC': {'accuracy': 0.8635,
  'rmse': 1.7606261117238258,
  'mae': 0.46966731898238745,
  'r2': 0.2302267781056463},
 'NB': {'accuracy': 0.5044,
  'rmse': 2.4568686416869934,
  'mae': 1.3091976516634052,
  'r2': -0.49896812053859474}}

In [13]:
# Pull the fitted estimators that Model_Executer.model_creator stored on the instance.
# NOTE(review): binding `xgb` here replaces the `xgboost` module alias created by
# `import xgboost as xgb`; code that calls `xgb.XGBClassifier` fails if it runs after this cell.
xgb = model_ex.booster
svc = model_ex.svc
nb = model_ex.nb

In [39]:
# Score the fitted XGBoost model on the notebook-level hold-out split.
# NOTE(review): `X_test` / `y_test` come from the earlier `train_test_split` cell,
# not from `model_ex`'s own split — confirm this is the intended evaluation set.
pred = xgb.predict(X_test)

# Count agreeing and disagreeing (prediction, label) pairs separately.
n_rows = len(y_test)
n_equal = sum(1 for predicted, actual in zip(pred, y_test) if actual == predicted)
n_different = sum(1 for predicted, actual in zip(pred, y_test) if actual != predicted)

# Same layout as before: label, percentage, label, percentage.
true_percentage = [
    "True:",
    n_equal / n_rows * 100,
    "False:",
    n_different / n_rows * 100,
]


In [40]:
# Display the percentages of matching ("True:") and non-matching ("False:") predictions.
true_percentage

['True:', 81.33756771903606, 'False:', 18.662432280963944]