In [19]:
from pathlib import Path
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from pymoo.config import Config

# Turn off the pymoo warning registered under the "not_compiled" key
# (presumably the notice about missing compiled extensions — confirm against the pymoo docs).
Config.warnings["not_compiled"] = False

In [20]:
# Define constants at the top

# Column names applied, in this order, to the rows of each file's "pareto_front".
COLUMNS = [
    "accel_ms2",
    "decel_ms2",
    "time_min",
    "energy_kwh",
]

# Both directories are resolved against the parent of the current working
# directory (assumes the kernel is started from a sub-folder of the project
# root — TODO confirm for your setup).
DATA_DIR = Path.cwd().parent / "data" / "raw"
INTERIM_DIR = Path.cwd().parent / "data" / "interim"

# Destination of the combined, weight-augmented DataFrame.
EXPORT_PATH = INTERIM_DIR / "preproc_pareto_data.pkl"

# Find all .pkl files (case insensitive).
# Sorted because glob yields entries in a filesystem-dependent order; a fixed
# order keeps the row order of the concatenated result reproducible.
data_file_paths = sorted(DATA_DIR.glob("*.[pP][kK][lL]"))

## Data reading

**Input Data Structure**:
- Let $X = \{\mathbf{x}_i\}_{i=1}^N \subset \mathbb{R}^n$ be the set of Pareto-optimal solutions
- Each solution $\mathbf{x}_i = (f_1^{(i)}, f_2^{(i)}, ...,f_n^{(i)})$ represents a trade-off between:
  - $f_1$: Travel time (minutes)
  - $f_2$: Energy consumption (kWh)
  - $f_n$: Other data variables

Then, extract the data needed for training.

In [21]:
# Data Reading Phase
def read_individual_files(data_file_paths: list[Path]) -> list[pd.DataFrame]:
    """Load each pickle into a raw DataFrame, skipping files that fail.

    Every file is expected to unpickle to a mapping with a "pareto_front"
    entry (rows laid out according to COLUMNS) and a "metadata" entry. The
    metadata values are broadcast onto every row of that file's frame. No
    further processing is done here.

    Args:
        data_file_paths: Paths of the pickle files to load.

    Returns:
        One DataFrame per successfully read file, in input order.

    Raises:
        ValueError: If not a single file could be read.
    """
    frames = []

    for file_path in data_file_paths:
        try:
            # NOTE: unpickling can execute arbitrary code — only load trusted files.
            payload = pd.read_pickle(file_path)

            frame = pd.DataFrame(payload["pareto_front"], columns=COLUMNS)

            # distance_km is optional and falls back to NaN when absent.
            frame["distance_km"] = payload.get("metadata", {}).get(
                "distance_km", np.nan
            )

            # The vehicle settings are mandatory; a missing key skips the file.
            vehicle_cfg = payload["metadata"]["config"]["vehicle"]
            frame["max_speed_mps"] = vehicle_cfg["max_speed_mps"]
            # Column name spelling ("bettery") is kept as-is; downstream cells use it.
            frame["max_bettery_kwh"] = vehicle_cfg["max_battery_kwh"]
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error reading {file_path.name}: {str(e)}")
        else:
            frames.append(frame)

    if len(frames) == 0:
        raise ValueError("No valid files could be read")

    return frames


# Load every discovered pickle into its own raw DataFrame (one frame per file).
pareto_dfs = read_individual_files(data_file_paths)

## Feature manipulation

### 1. Preference Weight Definitions

#### Basic Linear Weights:
$$
w_i^t = 1 - t_i', \quad 
w_i^e = 1 - e_i'
$$

#### Softmax-based Weights (with temperature parameter $\beta$):
$$
\begin{aligned}
w_i^t &= \frac{\exp(-\beta\,t_i')}{\exp(-\beta\,t_i') + \exp(-\beta\,e_i')}, \\
w_i^e &= 1 - w_i^t
\end{aligned}
$$

### Parameter Explanation

| Symbol | Description                          | Typical Range     |
|--------|--------------------------------------|-------------------|
| $\beta$| Temperature parameter controls spread| $\beta > 0$       |
| $w_i^t$| Time-preference weight               | $0 \leq w_i^t \leq 1$ |
| $w_i^e$| Energy-preference weight             | $0 \leq w_i^e \leq 1$ |

### Key Properties
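1. **Normalization**: $t_i'$ and $e_i'$ are the travel time and energy of solution $i$ rescaled to the unit interval (computed per file with a quantile transform):
   $$ 0 \leq t_i', e_i' \leq 1 $$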
   
2. **Weight Relationships**:
   $$ w_i^t + w_i^e = 1 $$

3. **Parameter Sensitivity**:
   - $\beta \to 0$: Equal weights ($w_i^t \approx w_i^e \approx 0.5$)
   - $\beta \to \infty$: Binary preference (winner-takes-all)

In [16]:
def calculate_weights(df: pd.DataFrame, beta: float = 5.0) -> pd.DataFrame:
    """Calculate normalized time/energy weights for Pareto front solutions.

    Time and energy are each quantile-transformed independently, then turned
    into a pair of softmax weights that sum to one per row. A solution with a
    lower normalized time than normalized energy gets the larger time weight
    (for positive beta).

    Args:
        df: DataFrame containing columns 'time_min' and 'energy_kwh'
        beta: Temperature parameter for softmax weighting; larger values push
            the weights towards 0/1.

    Returns:
        A new DataFrame with added 'time_weight' and 'energy_weight' columns.
        The input frame is not modified.

    Raises:
        KeyError: If 'time_min' or 'energy_kwh' is missing from df.
    """
    # QuantileTransformer defaults to 1000 quantiles and emits a warning (then
    # clamps) whenever the data has fewer samples. Capping explicitly gives the
    # same transform without the warning on every small Pareto front.
    n_quantiles = max(1, min(1000, len(df)))

    # Normalize time and energy using sklearn
    time_norm = (
        QuantileTransformer(n_quantiles=n_quantiles)
        .fit_transform(df[["time_min"]])
        .ravel()
    )
    energy_norm = (
        QuantileTransformer(n_quantiles=n_quantiles)
        .fit_transform(df[["energy_kwh"]])
        .ravel()
    )

    # Softmax-based weighting. Both logits are shifted by their row-wise
    # maximum before exponentiation: the larger term becomes exp(0) == 1, so
    # the denominator can never underflow to 0 (which produced NaN weights for
    # large beta), while the ratio itself is mathematically unchanged.
    logit_t = -beta * time_norm
    logit_e = -beta * energy_norm
    shift = np.maximum(logit_t, logit_e)
    exp_t = np.exp(logit_t - shift)
    exp_e = np.exp(logit_e - shift)
    time_weight = exp_t / (exp_t + exp_e)

    return df.assign(time_weight=time_weight, energy_weight=1.0 - time_weight)


# Data Processing Phase
def process_individual_files(
    raw_dfs: list[pd.DataFrame], beta: float = 5.0
) -> pd.DataFrame:
    """Process individual DataFrames with weight calculation.

    Weights are computed separately for each input frame (so normalization is
    per file), and the results are stacked into a single frame with a fresh
    0..N-1 index. A frame whose weight calculation fails is reported and
    skipped rather than aborting the whole run.

    Args:
        raw_dfs: List of DataFrames from read_individual_files
        beta: Softmax temperature parameter

    Returns:
        Combined DataFrame with processed weights

    Raises:
        ValueError: If raw_dfs is empty or no frame could be processed.
    """
    processed_dfs = []

    for i, df in enumerate(raw_dfs):
        try:
            # Calculate weights for this specific dataset; work on a copy so
            # the caller's raw frame is never touched.
            processed_dfs.append(calculate_weights(df.copy(), beta))
        except Exception as e:
            # Best effort: report and continue with the remaining frames.
            print(f"Error processing file {i}: {str(e)}")

    # pd.concat([]) would fail with an unhelpful "No objects to concatenate";
    # raise the same exception type with a message that names the real cause.
    if not processed_dfs:
        raise ValueError("No DataFrames could be processed")

    return pd.concat(processed_dfs, ignore_index=True)


# Processing phase: compute weights per file, then stack all files into one frame
pareto_df = process_individual_files(pareto_dfs)

# Bare expression so the notebook renders the combined frame as the cell output
pareto_df



Unnamed: 0,accel_ms2,decel_ms2,time_min,energy_kwh,distance_km,max_speed_mps,max_bettery_kwh,time_weight,energy_weight
0,0.213321,0.209042,17.447130,1.565973,20,20.0,10.0,0.006693,0.993307
1,1.999999,1.999592,16.749167,1.594058,20,20.0,10.0,0.993307,0.006693
2,1.226421,1.205027,16.801731,1.591707,20,20.0,10.0,0.989520,0.010480
3,1.980324,1.979324,16.749995,1.591878,20,20.0,10.0,0.990028,0.009972
4,0.220566,0.213304,17.421467,1.567303,20,20.0,10.0,0.007773,0.992227
...,...,...,...,...,...,...,...,...,...
795,1.997366,1.966372,8.415944,0.834022,10,20.0,10.0,0.992605,0.007395
796,0.595512,1.756692,8.612372,0.823187,10,20.0,10.0,0.894323,0.105677
797,1.085927,1.987673,8.485979,0.830383,10,20.0,10.0,0.982795,0.017205
798,0.351023,1.996438,8.807303,0.816624,10,20.0,10.0,0.336620,0.663380


In [17]:
# Sanity check: list the distinct battery capacities present in the combined data.
# The column really is spelled "max_bettery_kwh" — that is the name assigned in read_individual_files.
pareto_df["max_bettery_kwh"].unique()

array([10.])

## Saving data

In [18]:
# to_pickle does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
EXPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
pareto_df.to_pickle(EXPORT_PATH)