In [1]:
import matplotlib.pyplot as plt
import os
from datetime import datetime
import time
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc, recall_score, roc_auc_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.utils import Bunch


### Datas preparation

In [2]:
def csv_data_into_bunch(filenames):
    """
    This function assume that the csv file will have label egal to 1 for anomalie and 0 for normal value.
    The csv have a header too.
    """
    datasets = []
    for i in filenames:
        name = os.path.splitext(os.path.basename(i))[0]

        data = pd.read_csv(i).to_numpy()
    
        x = data[:,:-1]
        y = data[:,-1]

        # Standardisation
        scaler = StandardScaler()
        x_scaled = scaler.fit_transform(x)

        # Store in a Bunch
        datasets.append(Bunch(name=name, data=x_scaled, target=y))

    return datasets


        
def list_all_file_name_in_folder(folder="../data_rapport"):
    return [os.path.join(folder, filename) for filename in os.listdir(folder) if filename.endswith(".csv")]

filenames = list_all_file_name_in_folder()
print(filenames)

datasets = csv_data_into_bunch(filenames)
speech = datasets[0]

['../data_rapport/speech.csv', '../data_rapport/kddcup99 http.csv', '../data_rapport/kddcup2014 donneurs .csv', '../data_rapport/satellite.csv', '../data_rapport/fraude.csv', '../data_rapport/cancer du sein.csv', '../data_rapport/shuttle.csv']


### Benchmark

This function takes two parameters: `datasets` and `algorithms`.

- `datasets`: a list of `Bunch` objects, each containing:
  - `name`: the name of the dataset  
  - `data`: the datas  
  - `target`: the label 



- `algorithms`: a list of tuples in the following form:

  ```python
  algorithms = [
      ("name", # Algorithm name 
       run_name, # runner function
       {"param1": ..., "param2": ...}), # dict of paramaters 
      # …
  ]

This function will return three pandas dataFramme : auc, recall, time

In [3]:
def benchmark(datasets, algorithms):
    """
    Run a benchmark of multiple algorithms on multiple datasets.
    
     datasets : list of Bunch
            Each Bunch must have attributes:
              - name : dataset name
              - data : feature data
              - target : ground truth labels
        algorithms : list of tuples
            Each tuple is (algo_name, runner_function, params_dict):
              - algo_name : descriptive name of the algorithm
              - runner_function : function(bunch, params) - > dict
              - params_dict: parameters to pass to the runner


    Returns
    -------
    df_auc : pandas.DataFrame
        AUC scores for each (dataset × algorithm).
    df_recall : pandas.DataFrame
        Recall scores for each (dataset × algorithm).
    df_time : pandas.DataFrame
        Execution time (seconds) for each (dataset × algorithm).
    """
    
    auc_list, recall_list, time_list = [], [], []
    
    for bunch in datasets:
        print(bunch.name, " : départ")
        auc_info, recall_info, time_info = {"dataset": bunch.name}, {"dataset": bunch.name}, {"dataset": bunch.name}
        
        for algo_name, algo_runner, params in algorithms:
            print("  ", algo_name, ": départ")
            res = algo_runner(bunch, params) 
            
            auc_info[algo_name]    = res['auc']
            recall_info[algo_name] = res['recall']
            time_info[algo_name]   = res['time']
            print("AUC :", res['auc'], "RAPPEL :", res['recall'],"TEMPS :", res['time'])
            print("  ", algo_name, ": fini")
            
        auc_list.append(auc_info)
        recall_list.append(recall_info)
        time_list.append(time_info)
        print(bunch.name, " : fini")
        
    df_auc    = pd.DataFrame(auc_list)
    df_recall = pd.DataFrame(recall_list)
    df_time   = pd.DataFrame(time_list)
    
    return df_auc, df_recall, df_time
        

### Algorithms 

Each runner function must accept exactly two arguments and return a dictionary with the same keys:


- `Parameters`

    - `bunch` : like in the datasets list of the benchmark parameters
    
    - `params` :  like in the algorithms list of the benchmark parameters

- `Returns`

- `dict`  
  ```python
  {
    "auc":    <float>,  // ROC AUC score
    "recall": <float>,  // Recall score
    "time":   <float>   // Elapsed time (seconds)
  }


In [4]:
def run_iforest(bunch, params):
    """
    Fit and evaluate IsolationForest on a single dataset.

    Parameters
    ----------
    bunch : Bunch
        Must have attributes:
          - data: feature data
          - target: ground truth labels
    params : dict
        Keyword arguments for IsolationForest constructor.

    Returns
    -------
    dict
        {
          'auc': float,      # ROC AUC score
          'recall': float,   # recall score
          'time': float      # elapsed time in seconds
        }
    """
    
    x, y_ground_truth = bunch.data, bunch.target
    IF = IsolationForest(**params)
    print(type(IF))
    
    # Fit + predict 
    start = time.perf_counter()
    IF.fit(x)
    y_pred = IF.predict(x) 
    elapsed = time.perf_counter() - start

    # Binairisation of the prediction and scores calculation 
    y_pred = (y_pred == -1)
    scores = -IF.decision_function(x)

    # metrics
    auc = roc_auc_score(y_ground_truth, scores)
    recall = recall_score(y_ground_truth, y_pred)

    return {
        'auc': auc,
        'recall': recall,
        'time' : elapsed
    }


def run_lof(bunch, params) :
    """
    Fit and evaluate LocalOutlierFactor on a single dataset.

    Parameters
    ----------
    bunch : Bunch
        Must have attributes:
          - data: feature data
          - target: ground truth labels
    params : dict
        Keyword arguments for LocalOutlierFactor constructor.

    Returns
    -------
    dict
        {
          'auc': float,      # ROC AUC score
          'recall': float,   # recall score
          'time': float      # elapsed time in seconds
        }
    """
    x, y_ground_truth = bunch.data, bunch.target
    LOF = LocalOutlierFactor(** params)
    
    # Fit + predict 
    start = time.perf_counter()
    y_pred = LOF.fit_predict(x)
    elapsed = time.perf_counter() - start

    # Binairisation of the prediction and scores calculation 
    y_pred = (y_pred == -1)
    scores = -LOF.negative_outlier_factor_

    # metrics
    auc = roc_auc_score(y_ground_truth, scores)
    recall = recall_score(y_ground_truth, y_pred)

    return {
        'auc': auc,
        'recall': recall,
        'time' : elapsed
    }

In [6]:
algorithms = [
     ("IForest", run_iforest, {'n_estimators':200, 'contamination':0.1}),
     ("LOF", run_lof, {'n_neighbors':250, 'contamination':0.1}),
]


#df_auc, df_recall, df_time = benchmark(datasets, algorithms)
df_auc, df_recall, df_time = benchmark([speech], algorithms)

speech  : départ
   IForest : départ
<class 'sklearn.ensemble._iforest.IsolationForest'>
AUC : 0.4608253250423968 RAPPEL : 0.13114754098360656 TEMPS : 0.2788610110001173
   IForest : fini
   LOF : départ
AUC : 0.47594347088750705 RAPPEL : 0.11475409836065574 TEMPS : 0.3211567149992334
   LOF : fini
speech  : fini


In [7]:
def add_hlines(latex):
    return latex.replace("\\\\", "\\\\ \\hline")
        
def pandas_to_latex(df_auc, df_recall, df_time):
    """
    Convert three pandas DataFrames into LaTeX-formatted tables with horizontal lines.

    Parameters
    ----------
    df_auc : pandas.DataFrame
        DataFrame of AUC scores.
    df_recall : pandas.DataFrame
        DataFrame of recall scores.
    df_time : pandas.DataFrame
        DataFrame of execution times.

    Returns
    -------
    tuple of str
        A tuple (latex_auc, latex_recall, latex_time), where each element is the
        LaTeX code for the corresponding DataFrame, with added \\hline commands.
    """
    latex_auc = add_hlines(df_auc.to_latex(index=False,float_format="%.2f"))
    latex_recall = add_hlines(df_recall.to_latex(index=False,float_format="%.2f"))
    latex_time = add_hlines(df_time.to_latex(index=False,float_format="%.2f"))

    return latex_auc, latex_recall, latex_time

latex_auc, latex_recall, latex_time = pandas_to_latex(df_auc, df_recall, df_time)



In [8]:
def write_latex_in_file(latex_auc, latex_recall,latex_time, algorithms, filepath = "./latex_results.txt"):
    """
    Prepend LaTeX tables and algorithm info to a text file with a timestamp.

    Parameters
    ----------
    latex_auc : str
        LaTeX code for the AUC table.
    latex_recall : str
        LaTeX code for the recall table.
    latex_time : str
        LaTeX code for the time table.
    algorithms : list of tuples
        Same list passed to benchmark, each tuple is
        (name, runner_function, params_dict).
    filepath : str, optional
        Path to the output text file (default "./latex_results.txt").

    Notes
    -----
    - The current timestamp is written at the top.
    - Algorithm names and their parameter settings are listed.
    - New content is prepended to preserve previous runs.
    """
    # Actual time
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Informations about algorithms used : name1 : param1 = 0, param2=0 ; ...
    infos = []
    for name, _, params in algorithms:
        param_str = ", ".join(f"{k}={v}" for k, v in params.items())
        infos.append(f"{name}: {param_str}")
    info_algo = "; ".join(infos)

    content = (
        f"{now}\n"
        f"{info_algo}\n\n"
        r"%=== AUC table ===" "\n"
        f"{latex_auc}\n\n"
        r"%=== Recall table ===" "\n"
        f"{latex_recall}\n\n"
        r"%=== Time table ===" "\n"
        f"{latex_time}\n"
    )

    # Add at the start of the file without delete the old content 
    old = ""
    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            old = f.read()
        with open(filepath, "w") as f:
            f.write(content)
            f.write(old)
    
    

### Combination 

In [9]:
def automatisatison(datasets, algorithms, filepath = "./latex_results.txt"):
    """
    Run the full pipeline: benchmark algorithms, convert results to LaTeX, and write to file.

    Parameters
    ----------
    datasets : list of Bunch
        List of dataset objects (each with .name, .data, .target).
    algorithms : list of tuples
        Each tuple is (name, runner_function, params_dict), as for benchmark().
    filepath : str, optional
        Path where the LaTeX output will be written (default "./latex_results.txt").

    """
    start = time.perf_counter()
    
    # Calculate all the metrics
    df_auc, df_recall, df_time = benchmark(datasets, algorithms)

    # Transform the metrics into latex tab 
    latex_auc, latex_recall, latex_time = pandas_to_latex(df_auc, df_recall, df_time)

    # Write these latex tab in a file
    write_latex_in_file(latex_auc, latex_recall,latex_time, algorithms, filepath)

    elapsed = time.perf_counter() - start


In [None]:
automatisation(datasets, algorithms)

### PRINT, CSV, 

python3 benchmark.py 2>&1 | tee prints_and_errors.txt  

In [24]:
def pandas_to_csv(df_auc, df_recall, df_time, name_csv, new_folder_path="./csv_results/"):
    # Creation of a new folder
    os.makedirs(new_folder_path + name_csv, exist_ok=True)
    
    df_auc.to_csv(new_folder_path + name_csv + "/" +name_csv + "_auc.csv", index=False, header=True, float_format='%.2f')
    df_recall.to_csv(new_folder_path + name_csv + "/" +name_csv +"_recall.csv", index=False, header=True,  float_format='%.2f')
    df_time.to_csv(new_folder_path + name_csv + "/" +name_csv +"_time.csv", index=False, header=True, float_format='%.2f')

pandas_to_csv(df_auc, df_recall, df_time, "IF_LOF_Cancer_Speech")

### NaN Values 

```python 
Traceback (most recent call last):
  File "/home/khalil/4eme/stage/tests/benchmark.py", line 483, in <module>
    automatisation(datasets, algorithms)
  File "/home/khalil/4eme/stage/tests/benchmark.py", line 432, in automatisation
    df_auc, df_recall, df_time = benchmark(datasets, algorithms)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/khalil/4eme/stage/tests/benchmark.py", line 308, in benchmark
    res = algo_runner(bunch, params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/khalil/4eme/stage/tests/benchmark.py", line 184, in run_vae
    VAE_model.fit(x)
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/pyod/models/base_dl.py", line 202, in fit
    self.decision_scores_ = self.decision_function(X)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/pyod/models/base_dl.py", line 290, in decision_function
    anomaly_scores = self.evaluate(data_loader)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/pyod/models/base_dl.py", line 315, in evaluate
    score = self.evaluating_forward(batch_data)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/pyod/models/vae.py", line 264, in evaluating_forward
    score = pairwise_distances_no_broadcast(x.numpy(),
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/pyod/utils/stat_models.py", line 43, in pairwise_distances_no_broadcast
    Y = check_array(Y)
        ^^^^^^^^^^^^^^
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1107, in check_array
    _assert_all_finite(
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/sklearn/utils/validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/home/khalil/ls/envs/sfml/lib/python3.12/site-packages/sklearn/utils/validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input contains NaN.

In [8]:
for bunch in datasets:
    has_nan = np.isnan(bunch.data).any()
    print(f"{bunch.name} has NaN?  {has_nan}")

speech has NaN?  False
satellite has NaN?  False
fraude has NaN?  False
cancer du sein has NaN?  False
shuttle has NaN?  False


### Function Benchmark 2, where we do for each algo all the datasets first

In [None]:
def benchmark(datasets, algorithms):
    """
    Run a benchmark of multiple algorithms on multiple datasets.
    
     datasets : list of Bunch
            Each Bunch must have attributes:
              - name : dataset name
              - data : feature data
              - target : ground truth labels
        algorithms : list of tuples
            Each tuple is (algo_name, runner_function, params_dict):
              - algo_name : descriptive name of the algorithm
              - runner_function : function(bunch, params) - > dict
              - params_dict: parameters to pass to the runner


    Returns
    -------
    df_auc : pandas.DataFrame
        AUC scores for each (dataset × algorithm).
    df_recall : pandas.DataFrame
        Recall scores for each (dataset × algorithm).
    df_time : pandas.DataFrame
        Execution time (seconds) for each (dataset × algorithm).
    """
    
    auc_list, recall_list, time_list = [], [], []
    
    for algo_name, algo_runner, params in algorithms:
        print(algo_name, " : départ")
        auc_info, recall_info, time_info = {"dataset": bunch.name}, {"dataset": bunch.name}, {"dataset": bunch.name}
        for bunch in datasets:
        
            print("  ", algo_name, ": départ")
            res = algo_runner(bunch, params) 
            
            auc_info[algo_name]    = res['auc']
            recall_info[algo_name] = res['recall']
            time_info[algo_name]   = res['time']
            print("AUC :", res['auc'], "RAPPEL :", res['recall'],"TEMPS :", res['time'])
            print("  ", algo_name, ": fini")
            
        auc_list.append(auc_info)
        recall_list.append(recall_info)
        time_list.append(time_info)
        print(bunch.name, " : fini")
        
    df_auc    = pd.DataFrame(auc_list)
    df_recall = pd.DataFrame(recall_list)
    df_time   = pd.DataFrame(time_list)
    
    return df_auc, df_recall, df_time