In [185]:
from pathlib import Path
import pandas as pd
import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from pymoo.config import Config

# Turn off pymoo's "not_compiled" warning flag so it does not clutter the notebook output
Config.warnings["not_compiled"] = False


## Data reading

**Input Data Structure**:
- Let $X = \{\mathbf{x}_i\}_{i=1}^N \subset \mathbb{R}^n$ be the set of Pareto-optimal solutions
- Each solution $\mathbf{x}_i = (f_1^{(i)}, f_2^{(i)}, ...,f_n^{(i)})$ represents a trade-off between:
  - $f_1$: Travel time (minutes)
  - $f_2$: Energy consumption (kWh)
  - $f_3, \dots, f_n$: Other data variables

Then, select the data needed for training.

In [186]:
# Column names assigned to the Pareto front array stored in each raw pickle file
COLUMNS = [
    "accel_ms2",
    "decel_ms2",
    "time_min",
    "energy_kwh",
    "weighted_time",
    "weighted_energy",
]


def read_data(data_dir: "str | os.PathLike | None" = None) -> pd.DataFrame:
    """Read and merge Pareto front data from pickle files.

    Every ``*.pkl`` file (extension matched case-insensitively) in the data
    directory is loaded, its ``"pareto_front"`` entry is turned into a
    DataFrame with the ``COLUMNS`` names, and a ``distance_km`` column is
    filled from ``metadata["distance_km"]`` (``None`` when absent).

    Files that cannot be processed (e.g. missing ``"pareto_front"`` key) are
    reported on stdout and skipped; they do not abort the whole load.

    Args:
        data_dir: Directory holding the raw pickle files. Defaults to
            ``<parent of current working directory>/data/raw``.

    Returns:
        pd.DataFrame: Combined DataFrame containing Pareto front data from all
        readable files, with a fresh 0..N-1 index. Files are processed in
        sorted path order so the row order is reproducible.

    Raises:
        FileNotFoundError: If the directory doesn't exist or holds no pickle files
        ValueError: If none of the files yielded valid Pareto front data
    """
    if data_dir is None:
        data_dir = Path(os.getcwd()).parent / "data" / "raw"
    else:
        data_dir = Path(data_dir)

    if not data_dir.exists():
        raise FileNotFoundError(f"Data directory not found: {data_dir.resolve()}")

    # Find all .pkl files (case insensitive). glob() yields entries in
    # filesystem order, so sort to keep the merged row order deterministic.
    data_files = sorted(data_dir.glob("*.[pP][kK][lL]"))

    if not data_files:
        raise FileNotFoundError(f"No pickle files found in {data_dir.resolve()}")

    print(f"Found {len(data_files)} data files:")

    dfs = []
    for file_path in data_files:
        try:
            # SECURITY: unpickling can execute arbitrary code; only load
            # pickle files that come from a trusted source.
            data = pd.read_pickle(file_path)

            pf_df = pd.DataFrame(data["pareto_front"], columns=COLUMNS)
            pf_df["distance_km"] = data.get("metadata", {}).get("distance_km", None)
        except Exception as e:
            # Best effort: report the broken file and carry on with the rest.
            print(f"Error processing {file_path.name}: {e}")
            continue

        dfs.append(pf_df)

    if not dfs:
        raise ValueError("No valid Pareto front data found in any files")

    # ignore_index=True already produces a clean RangeIndex.
    return pd.concat(dfs, ignore_index=True)


# Load all raw Pareto fronts into one DataFrame and print its column/dtype summary
pareto_data = read_data()
pareto_data.info()

Found 4 data files:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   accel_ms2        800 non-null    float64
 1   decel_ms2        800 non-null    float64
 2   time_min         800 non-null    float64
 3   energy_kwh       800 non-null    float64
 4   weighted_time    800 non-null    float64
 5   weighted_energy  800 non-null    float64
 6   distance_km      800 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 43.9 KB


In [187]:
# Preview the merged raw Pareto data (values are still in their original units)
pareto_data

Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,weighted_time,weighted_energy,distance_km
0,0.213321,0.209042,17.447130,1.565973,0.006693,0.993307,20
1,1.999999,1.999592,16.749167,1.594058,0.993307,0.006693,20
2,1.226421,1.205027,16.801731,1.591707,0.985297,0.014703,20
3,1.980324,1.979324,16.749995,1.591878,0.990107,0.009893,20
4,0.220566,0.213304,17.421467,1.567303,0.010157,0.989843,20
...,...,...,...,...,...,...,...
795,1.997366,1.966372,8.415944,0.834022,0.992390,0.007610,10
796,0.595512,1.756692,8.612372,0.823187,0.822677,0.177323,10
797,1.085927,1.987673,8.485979,0.830383,0.976364,0.023636,10
798,0.351023,1.996438,8.807303,0.816624,0.263026,0.736974,10



## Data preparation
**Normalization**: Scale each selected column $f_k$ (decision variables, objectives and distance) to the $[0,1]$ range for training stability:
$$
\hat{f}_k = \frac{f_k - f_{k}^{min}}{f_{k}^{max} - f_{k}^{min}}, \quad \text{for each scaled column } k
$$

**Standardization**:
$$
\hat{f}_k^{(i)} = \frac{f_k^{(i)} - \mu_k}{\sigma_k}, \quad \text{for } k=1,2
$$
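where $\mu_k$, $\sigma_k$ are the mean and standard deviation of each objective. Standardization is listed here as an alternative; the cells below apply only Min-Max normalization.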


In [188]:
# Inspect the min/max of both objectives to check whether the data needs to be normalized
pareto_data.aggregate({"time_min": ["min", "max"], "energy_kwh": ["min", "max"]})


Unnamed: 0,time_min,energy_kwh
min,4.249167,0.427002
max,17.44713,1.594058


In [189]:
# Min-Max scale the selected columns to [0, 1]; weighted_time / weighted_energy
# are left untouched.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split further down, so test rows influence the min/max used for scaling.
# NOTE(review): the raw columns are overwritten in place; keep `scaler` if the
# original units must be recovered later via inverse_transform.
feature_cols = ["accel_ms2", "decel_ms2", "time_min", "energy_kwh", "distance_km"]

scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(pareto_data[feature_cols].to_numpy())

pareto_data[feature_cols] = X_normalized

In [190]:
# Preview the data after Min-Max scaling (weighted_time / weighted_energy were not rescaled)
pareto_data

Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,weighted_time,weighted_energy,distance_km
0,0.000000,0.003435,1.000000,0.975935,0.006693,0.993307,1.000000
1,1.000000,0.999795,0.947116,1.000000,0.993307,0.006693,1.000000
2,0.567030,0.557656,0.951099,0.997986,0.985297,0.014703,1.000000
3,0.988988,0.988517,0.947179,0.998132,0.990107,0.009893,1.000000
4,0.004055,0.005807,0.998056,0.977075,0.010157,0.989843,1.000000
...,...,...,...,...,...,...,...
795,0.998526,0.981310,0.315714,0.348758,0.992390,0.007610,0.333333
796,0.213911,0.864632,0.330597,0.339474,0.822677,0.177323,0.333333
797,0.488396,0.993163,0.321020,0.345640,0.976364,0.023636,0.333333
798,0.077072,0.998040,0.345367,0.333850,0.263026,0.736974,0.333333


## Saving processed data

In [191]:
# Split data (80% train, 20% test); the fixed random_state keeps the split reproducible
train, test = train_test_split(pareto_data, test_size=0.2, random_state=42)
assert len(train) + len(test) == len(pareto_data)

# Bundle both splits into one dict so they are stored together in a single pickle file
data = {"train": train, "test": test}
output_dir = Path(os.getcwd()).parent / "data" / "processed"
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "pareto_data.pkl"
# Pickle the split dict itself -- passing output_dir here would store only the
# directory Path object and lose the train/test data.
pd.to_pickle(obj=data, filepath_or_buffer=output_file)
print(f"Data saved to {output_file.resolve()}")

Data saved to /Users/nicolaibrahim/Desktop/proj/Pareto-Optimization-/data/processed/pareto_data.pkl
