In [25]:
from pathlib import Path
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pymoo.config import Config

# Switch off pymoo's "not_compiled" warning entry so later pymoo usage stays quiet.
# NOTE(review): presumably this is the notice about missing compiled extensions —
# confirm against the pymoo documentation.
Config.warnings["not_compiled"] = False

## Data reading

**Input Data Structure**:
- Let $X = \{\mathbf{x}_i\}_{i=1}^N \subset \mathbb{R}^n$ be the set of Pareto-optimal solutions
- Each solution $\mathbf{x}_i = (f_1^{(i)}, f_2^{(i)}, ...,f_n^{(i)})$ represents a trade-off between:
  - $f_1$: Travel time (minutes)
  - $f_2$: Energy consumption (kWh)
  - $f_3, \dots, f_n$: Other data variables

Then, select the data needed for training.

In [None]:
# Notebook-wide constants: column names of interest and data locations.

# Acceleration, deceleration and the two objectives (travel time, energy).
COLUMNS = ["accel_ms2", "decel_ms2", "time_min", "energy_kwh"]

# All data folders sit under <parent of the working directory>/data, so the
# notebook is expected to run from a direct sub-folder of the project root.
DATA_DIR = Path.cwd().parent.joinpath("data", "raw")
INTERIM_DIR = DATA_DIR.parent / "interim"
EXPORT_PATH = DATA_DIR.parent / "processed" / "final_pareto_data.pkl"
PREPROC_PARETO_PATH = INTERIM_DIR.joinpath("preproc_pareto_data.pkl")

In [27]:
# Load the preprocessed Pareto data from the interim folder and print its schema.
# NOTE(review): read_pickle can execute code embedded in the file — only load
# pickles that this project produced itself.
pareto_df = pd.read_pickle(PREPROC_PARETO_PATH)
pareto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   accel_ms2        800 non-null    float64
 1   decel_ms2        800 non-null    float64
 2   time_min         800 non-null    float64
 3   energy_kwh       800 non-null    float64
 4   distance_km      800 non-null    int64  
 5   max_speed_mps    800 non-null    float64
 6   max_bettery_kwh  800 non-null    float64
 7   time_weight      800 non-null    float64
 8   energy_weight    800 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 56.4 KB



## Data preparation
**Normalization**: Scale objectives to $[0,1]$ range for training stability:
$$
\hat{f}_k^{(i)} = \frac{f_k^{(i)} - f_{k}^{\min}}{f_{k}^{\max} - f_{k}^{\min}}, \quad \text{for } k=1,2
$$

**Standardization**:
$$
\hat{f}_k^{(i)} = \frac{f_k^{(i)} - \mu_k}{\sigma_k}, \quad \text{for } k=1,2
$$
where $\mu_k$, $\sigma_k$ are the mean and standard deviation of each objective.


In [28]:
# Inspect the raw min/max of both objectives to decide whether scaling is needed.
pareto_df[["time_min", "energy_kwh"]].agg(["min", "max"])

Unnamed: 0,time_min,energy_kwh
min,4.249167,0.427002
max,17.44713,1.594058


In [29]:
# Preview the first rows in their original units, before any scaling is applied.
pareto_df.head()

Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,distance_km,max_speed_mps,max_bettery_kwh,time_weight,energy_weight
0,0.213321,0.209042,17.44713,1.565973,20,20.0,10.0,0.006693,0.993307
1,1.999999,1.999592,16.749167,1.594058,20,20.0,10.0,0.993307,0.006693
2,1.226421,1.205027,16.801731,1.591707,20,20.0,10.0,0.98952,0.01048
3,1.980324,1.979324,16.749995,1.591878,20,20.0,10.0,0.990028,0.009972
4,0.220566,0.213304,17.421467,1.567303,20,20.0,10.0,0.007773,0.992227


In [30]:
from joblib import dump

# Columns rescaled to [0, 1]. Defined once so the selection and the write-back
# below cannot drift apart.
SCALED_COLUMNS = ["accel_ms2", "decel_ms2", "time_min", "energy_kwh", "distance_km"]
# Where the fitted scaler is persisted (relative to the working directory).
SCALER_FILE = "../models/scaler.joblib"

# NOTE(review): this cell overwrites pareto_df in place. Running it a second time
# fits the scaler on already-scaled values and saves a scaler that can no longer
# map back to the original units — re-run the loading cell first if you repeat it.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the saving cell, so test rows influence the min/max. Consider fitting on the
# training rows only.

# Normalize to [0, 1]
scaler = MinMaxScaler()
# Pass a plain array (as before) so the scaler is fitted without column names.
X_normalized = scaler.fit_transform(pareto_df[SCALED_COLUMNS].to_numpy())
pareto_df[SCALED_COLUMNS] = X_normalized

# Make sure the target folder exists; dump() does not create it.
Path(SCALER_FILE).parent.mkdir(parents=True, exist_ok=True)
dump(scaler, SCALER_FILE)

['../models/scaler.joblib']

In [31]:
# Preview the same rows after scaling to confirm the selected columns now lie in [0, 1].
pareto_df.head()

Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,distance_km,max_speed_mps,max_bettery_kwh,time_weight,energy_weight
0,0.0,0.003435,1.0,0.975935,1.0,20.0,10.0,0.006693,0.993307
1,1.0,0.999795,0.947116,1.0,1.0,20.0,10.0,0.993307,0.006693
2,0.56703,0.557656,0.951099,0.997986,1.0,20.0,10.0,0.98952,0.01048
3,0.988988,0.988517,0.947179,0.998132,1.0,20.0,10.0,0.990028,0.009972
4,0.004055,0.005807,0.998056,0.977075,1.0,20.0,10.0,0.007773,0.992227


## Saving processed data

In [32]:
# Split data (80% train, 20% test); the fixed random_state keeps the split reproducible.
train, test = train_test_split(pareto_df, test_size=0.2, random_state=42)

# Store both splits together in a single pickle, keyed by "train" and "test".
data = {"train": train, "test": test}
# Create the output folder if needed; to_pickle() does not create missing folders.
EXPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
pd.to_pickle(obj=data, filepath_or_buffer=EXPORT_PATH)