In [30]:
from pathlib import Path
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pymoo.config import Config

# Switch off pymoo's "not_compiled" warning flag so it does not clutter cell output.
Config.warnings["not_compiled"] = False


## Data reading

**Input Data Structure**:
- Let $X = \{\mathbf{x}_i\}_{i=1}^N \subset \mathbb{R}^n$ be the set of Pareto-optimal solutions
- Each solution $\mathbf{x}_i = (f_1^{(i)}, f_2^{(i)}, ...,f_n^{(i)})$ represents a trade-off between:
  - $f_1$: Travel time (minutes)
  - $f_2$: Energy consumption (kWh)
  - $f_n$: Other data variables

Then, select the columns needed for training.

In [31]:
# Project paths, resolved relative to the parent of the current working directory
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data" / "raw"
PROCE_DIR = PROJECT_ROOT / "data" / "processed"

# Columns of interest: the accel/decel settings plus the time and energy objectives
COLUMNS = ["accel_ms2", "decel_ms2", "time_min", "energy_kwh"]

# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
pareto_df = pd.read_pickle(PROCE_DIR / "preprocessed_pareto_data.pkl")
pareto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   accel_ms2      800 non-null    float64
 1   decel_ms2      800 non-null    float64
 2   time_min       800 non-null    float64
 3   energy_kwh     800 non-null    float64
 4   distance_km    800 non-null    int64  
 5   time_weight    800 non-null    float64
 6   energy_weight  800 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 43.9 KB



## Data preparation
**Normalization**: Scale objectives to $[0,1]$ range for training stability:
$$
\hat{f}_k = \frac{f_k - f_{k}^{min}}{f_{k}^{max} - f_{k}^{min}}, \quad \text{for } k=1,2
$$

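**Standardization** (an alternative scaling, shown for reference; the code cells below apply min–max normalization only):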
$$
\hat{f}_k^{(i)} = \frac{f_k^{(i)} - \mu_k}{\sigma_k}, \quad \text{for } k=1,2
$$
where $\mu_k$, $\sigma_k$ are the mean and standard deviation of each objective.


In [32]:
# Inspect the min/max of both objectives to see whether the data needs to be normalized
pareto_df[["time_min", "energy_kwh"]].agg(["min", "max"])


Unnamed: 0,time_min,energy_kwh
min,4.249167,0.427002
max,17.44713,1.594058


In [33]:
# Preview the first rows in their original units (before any scaling is applied).
pareto_df.head()

Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,distance_km,time_weight,energy_weight
0,0.213321,0.209042,17.44713,1.565973,20,0.469955,0.530045
1,1.999999,1.999592,16.749167,1.594058,20,0.565723,0.434277
2,1.226421,1.205027,16.801731,1.591707,20,0.558342,0.441658
3,1.980324,1.979324,16.749995,1.591878,20,0.56335,0.43665
4,0.220566,0.213304,17.421467,1.567303,20,0.473798,0.526202


In [34]:
from joblib import dump

# Columns rescaled to [0, 1]; every other column (e.g. the weights) is left untouched.
# Declared once so the fit and the write-back cannot drift apart.
SCALED_COLUMNS = ["accel_ms2", "decel_ms2", "time_min", "energy_kwh", "distance_km"]

# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split, so test-set ranges influence the scaling — confirm this is intended.
# NOTE: this cell overwrites pareto_df in place. Re-running it without reloading
# the data would fit (and save) a scaler on already-normalized values.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(pareto_df[SCALED_COLUMNS].to_numpy())
pareto_df[SCALED_COLUMNS] = X_normalized

# Persist the fitted scaler so the same transform (and its inverse) can be reused later.
# Same location as the former "../models" relative path, but built with pathlib and
# created on demand so the save does not fail when the directory does not exist yet.
models_dir = Path(os.getcwd()).parent / "models"
models_dir.mkdir(parents=True, exist_ok=True)
dump(scaler, models_dir / "scaler.joblib")


['../models/scaler.joblib']

In [35]:
# Preview the first rows again; the scaled columns should now lie in [0, 1].
pareto_df.head()

Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,distance_km,time_weight,energy_weight
0,0.0,0.003435,1.0,0.975935,1.0,0.469955,0.530045
1,1.0,0.999795,0.947116,1.0,1.0,0.565723,0.434277
2,0.56703,0.557656,0.951099,0.997986,1.0,0.558342,0.441658
3,0.988988,0.988517,0.947179,0.998132,1.0,0.56335,0.43665
4,0.004055,0.005807,0.998056,0.977075,1.0,0.473798,0.526202


## Saving processed data

In [36]:
# Hold out 20% of the rows as the test set (80% train); fixed seed keeps the split reproducible
train, test = train_test_split(pareto_df, random_state=42, test_size=0.2)

# Bundle both partitions in one dict so they are pickled together in a single file
data = dict(train=train, test=test)

output_file = Path.cwd().parent / "data" / "processed" / "final_pareto_data.pkl"
output_dir = output_file.parent
output_dir.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
pd.to_pickle(data, output_file)
print(f"Data saved to {output_file.resolve()}")

Data saved to /Users/nicolaibrahim/Desktop/proj/Pareto-Optimization-/data/processed/final_pareto_data.pkl
