In [18]:
from dataclasses import dataclass
from dataclasses import replace
from functools import partial

import jax
import jax.numpy as jnp
from flax import struct
import optax as ox

# Packages that actually performs Sinkhorn algorithm
from ott.geometry.pointcloud import PointCloud
from ott.problems.linear.linear_problem import LinearProblem
from ott.solvers.linear.sinkhorn import Sinkhorn

from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.gaussian_process import kernels
## Preprocessing step
from sklearn.preprocessing import StandardScaler


import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns

import h5py
#import plotly.graph_objects as go
#from mpl_toolkits.mplot3d import Axes3D

def read_cgns_coordinates(file_path):
    with h5py.File(file_path, 'r') as file:
        # We retrieve coordinate by coordinate.
        # ! Notice the space before the data. This is due to the naming in the files themselves.
        x = np.array(file['Base_2_3/Zone/GridCoordinates/CoordinateX'].get(' data'))
        y = np.array(file['Base_2_3/Zone/GridCoordinates/CoordinateY'].get(' data'))
        z = np.array(file['Base_2_3/Zone/GridCoordinates/CoordinateZ'].get(' data'))

    return x, y, z



@struct.dataclass
class WeightedPointCloud:
  """A weighted point cloud.
  
  Attributes:
    cloud: Array of shape (n, d) where n is the number of points and d the dimension.
    weights: Array of shape (n,) where n is the number of points.
  """
  cloud: jnp.array
  weights: jnp.array

  def __len__(self):
    return self.cloud.shape[0]


@struct.dataclass
class VectorizedWeightedPointCloud:
  """Vectorized version of WeightedPointCloud.

  Assume that b clouds are all of size n and dimension d.
  
  Attributes:
    _private_cloud: Array of shape (b, n, d) where n is the number of points and d the dimension.
    _private_weights: Array of shape (b, n) where n is the number of points.
  
  Methods:
    unpack: returns the cloud and weights.
  """
  _private_cloud: jnp.array
  _private_weights: jnp.array

  def __getitem__(self, idx):
    return WeightedPointCloud(self._private_cloud[idx], self._private_cloud[idx])
  
  def __len__(self):
    return self._private_cloud.shape[0]
  
  def __iter__(self):
    for i in range(len(self)):
      yield self[i]

  def unpack(self):
    return self._private_cloud, self._private_weights

def pad_point_cloud(point_cloud, max_cloud_size, fail_on_too_big=True):
  """Pad a single point cloud with zeros to have the same size.
  
  Args:
    point_cloud: a weighted point cloud.
    max_cloud_size: the size of the biggest point cloud.
    fail_on_too_big: if True, raise an error if the cloud is too big for padding.
  
  Returns:
    a WeightedPointCloud with padded cloud and weights.
  """
  cloud, weights = point_cloud.cloud, point_cloud.weights
  delta = max_cloud_size - cloud.shape[0]
  if delta <= 0:
    if fail_on_too_big:
      assert False, 'Cloud is too big for padding.'
    return point_cloud

  ratio = 1e-3  # less than 0.1% of the total mass.
  smallest_weight = jnp.min(weights) / delta * ratio
  small_weights = jnp.ones(delta) * smallest_weight

  weights = weights * (1 - ratio)  # keep 99.9% of the mass.
  weights = jnp.concatenate([weights, small_weights], axis=0)

  cloud = jnp.pad(cloud, pad_width=((0, delta), (0,0)), mode='mean')

  point_cloud = WeightedPointCloud(cloud, weights)

  return point_cloud

def pad_point_clouds(cloud_list):
  """Pad the point clouds with zeros to have the same size.

  Note: this function should be used outside of jax.jit because the computation graph
        is huge. O(len(cloud_list)) nodes are generated.

  Args:
    cloud_list: a list of WeightedPointCloud.
  
  Returns:
    a VectrorizedWeightedPointCloud with padded clouds and weights.
  """
  # sentinel for unified processing of all clouds, including biggest one.
  max_cloud_size = max([len(cloud) for cloud in cloud_list]) + 1
  sentinel_padder = partial(pad_point_cloud, max_cloud_size=max_cloud_size)

  cloud_list = list(map(sentinel_padder, cloud_list))
  coordinates = jnp.stack([cloud.cloud for cloud in cloud_list])
  weights = jnp.stack([cloud.weights for cloud in cloud_list])
  return VectorizedWeightedPointCloud(coordinates, weights)

def clouds_barycenter(points):
  """Compute the barycenter of a set of clouds.
  
  Args:
    points: a VectorizedWeightedPointCloud.
    
  Returns:
    a barycenter of the clouds of points, of shape (1, d) where d is the dimension.
  """
  clouds, weights = points.unpack()
  barycenter = jnp.sum(clouds * weights[:,:,jnp.newaxis], axis=1)
  barycenter = jnp.mean(barycenter, axis=0, keepdims=True)
  return barycenter


def to_simplex(mu):
  """Project weights to the simplex.
  
  Args: 
    mu: a WeightedPointCloud.
    
  Returns:
    a WeightedPointCloud with weights projected to the simplex."""
  if mu.weights is None:
    mu_weights = None
  else:
    mu_weights = jax.nn.softmax(mu.weights)
  return replace(mu, weights=mu_weights)


def reparametrize_mu(mu, cloud_barycenter, scale):
  """Re-parametrize mu to be invariant by translation and scaling.

  Args:
    mu: a WeightedPointCloud.
    cloud_barycenter: Array of shape (1, d) where d is the dimension.
    scale: float, scaling parameter for the re-parametrization of mu.
  
  Returns:
    a WeightedPointCloud with re-parametrized weights and cloud.
  """
  # invariance by translation : recenter mu around its mean
  mu_cloud = mu.cloud - jnp.mean(mu.cloud, axis=0, keepdims=True)  # center.
  mu_cloud = scale * jnp.tanh(mu_cloud)  # re-parametrization of the domain.
  mu_cloud = mu_cloud + cloud_barycenter  # re-center toward barycenter of all clouds.
  return replace(mu, cloud=mu_cloud)


def clouds_to_dual_sinkhorn(points, 
                            mu, 
                            init_dual=(None, None),
                            scale=1.,
                            has_aux=False,
                            sinkhorn_solver_kwargs=None, 
                            parallel: bool = True,
                            batch_size: int = -1):
  """Compute the embeddings of the clouds with regularized OT towards mu.
  
  Args:
    points: a VectorizedWeightedPointCloud.
    init_dual: tuple of two arrays of shape (b, n) and (b, m) where b is the number of clouds,
               n is the number of points in each cloud, and m the number of points in mu.
    scale: float, scaling parameter for the re-parametrization of mu.
    has_aux: bool, whether to return the full Sinkhorn output or only the dual variables.
    sinkhorn_solver_kwargs: dict, kwargs for the Sinkhorn solver.
      Must contain the key 'epsilon' for the regularization parameter.

  Returns:
    a tuple (dual, init_dual) with dual variables of shape (n, m) where n is the number of points
    and m the number of points in mu, and init_dual a tuple (init_dual_cloud, init_dual_mu) 
  """
  sinkhorn_epsilon = sinkhorn_solver_kwargs.pop('epsilon')
  
  # weight projection
  barycenter = clouds_barycenter(points)
  mu = to_simplex(mu)

  # cloud projection
  mu = reparametrize_mu(mu, barycenter, scale)

  def sinkhorn_single_cloud(cloud, weights, init_dual):
    geom = PointCloud(cloud, mu.cloud,
                      epsilon=sinkhorn_epsilon)
    ot_prob = LinearProblem(geom,
                            weights,
                            mu.weights)
    solver = Sinkhorn(**sinkhorn_solver_kwargs)
    ot = solver(ot_prob, init=init_dual)
    return ot
  
  if parallel:
    if batch_size == -1:
        parallel_sinkhorn = jax.vmap(sinkhorn_single_cloud,
                                    in_axes=(0, 0, (0, 0)),
                                    out_axes=0)
        outs = parallel_sinkhorn(*points.unpack(), init_dual)
        return outs.g
    else:
      raise ValueError("Not coded yet") 
  else:
    list_of_g_potentials = []
    clouds, weights = points.unpack()
    for i in range(len(clouds)):
      ot_problem = sinkhorn_single_cloud(clouds[i], weights[i], init_dual)
      list_of_g_potentials.append(ot_problem.g)
    g_potentials_array = jnp.stack(list_of_g_potentials)
    return g_potentials_array
  

# Set the seed for reproducibility
np.random.seed(42)

In [19]:
## Number of blade one want to consider.
_many_blades = 50

## Creating the list of all file numbers.
padded_numbers = [str(i).zfill(9) for i in range(_many_blades)]

## Lists that will holds the cloud points and the associated efficiency.
distributions = []
efficiency = []
omega = []
P = []

for number in padded_numbers:
    ## File paths Google Colab
    #cgns_file_path = f'/content/drive/MyDrive/Developer/GraduationProject/Experimental Part/Rotor37/dataset/samples/sample_{number}/meshes/mesh_000000000.cgns'
    #coefficient_file_path = f'/content/drive/MyDrive/Developer/GraduationProject/Experimental Part/Rotor37/dataset/samples/sample_{number}/scalars.csv'
    ## File paths Personal Computer
    cgns_file_path = f'Rotor37/dataset/samples/sample_{number}/meshes/mesh_000000000.cgns'
    coefficient_file_path = f'Rotor37/dataset/samples/sample_{number}/scalars.csv'
    ## Computing the coordinates
    x, y, z = read_cgns_coordinates(cgns_file_path)
    blade = np.column_stack((x, y, z))
    ## Computing the coefficient
    scalars = pd.read_csv(coefficient_file_path)
    ## Adding to our data
    distributions.append(blade)
    efficiency.append(scalars["Efficiency"][0])
    omega.append(scalars["Omega"][0])
    P.append(scalars["P"][0])


## Use array objects and reshape them to have the correct form. This is important for the preprocessing steps later.
omega = np.array(omega)
omega = omega.reshape(-1, 1)

P = np.array(P)
P = P.reshape(-1, 1)

efficiency = np.array(efficiency)
efficiency = efficiency.reshape(-1, 1)

mu = distributions[0]

sinkhorn_potentials = pd.read_csv("sinkhorn_potentials_50.csv", sep = ";", header = None)

In [22]:
feature_matrix = np.hstack((sinkhorn_potentials, P, omega))

x_train, x_test, y_train, y_test = train_test_split(feature_matrix, efficiency, train_size = 0.7, random_state = 42)

## Define the kernels for different parts of the feature matrix
kernel_sinkhorn = kernels.RBF()
kernel_scalars = kernels.RBF(length_scale = np.array([1, 1])) # anisotropic kernel

sinkhorn_train = x_train[: , 0:len(mu)]
scalars_train = x_train[: , len(mu):]

# Train the normalizer on the train data
normalize = StandardScaler().fit(scalars_train)
scalars_train = normalize.transform(scalars_train)

## Kernels matrices
kernel_matrix_sinkhorn_train = kernel_sinkhorn(sinkhorn_train)
kernel_matrix_scalars_train = kernel_scalars(scalars_train)
k_train = kernel_matrix_sinkhorn_train*kernel_matrix_scalars_train
print(k_train.shape)

sinkhorn_test = x_test[: , 0:len(mu)]
scalars_test = x_test[: , len(mu):]
# Apply the normalizer on the test data
scalars_test = normalize.transform(scalars_test)
## Kernels matrices
kernel_matrix_sinkhorn_test = kernel_sinkhorn(sinkhorn_test, sinkhorn_train)
kernel_matrix_scalars_test = kernel_scalars(scalars_test, scalars_train)
k_test = kernel_matrix_sinkhorn_test*kernel_matrix_scalars_test
print(k_test.shape)

(35, 35)
(15, 35)


In [25]:
## We perform the Kernel Ridge Regression
krr = KernelRidge(kernel = "precomputed")

## Define the parameter grid
## We can only optimize on the regularization parameter because we use a precomputed kernel.
param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1, 10, 100]}

## Create GridSearchCV object
grid_search = GridSearchCV(estimator = krr, 
                           param_grid = param_grid, 
                           scoring = 'neg_mean_squared_error', 
                           cv = 5)

## Fit the model ie training the model
grid_search.fit(X = k_train, y = y_train)

## Get the best parameters from the cross validation
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

## Get the best model
my_model = grid_search.best_estimator_


## Obtain predictions for the test set
predictions = my_model.predict(X = k_test)

## Compute the MSE
mse = mean_squared_error(y_true = y_test, y_pred = predictions)

# Compute the EVS
evs = explained_variance_score(y_true = y_test, y_pred = predictions)

# Print the MSE and EVS
print(f'Mean Square Error on the Test Set: {mse}')
print(f'Explained Variance Score on the Test Set: {evs}')

plt.figure()

sns.set_theme(context='paper', font_scale=1.5)
sns.scatterplot(x=y_test, y=predictions, color='blue', alpha=0.5)

## Adding the x=y line and the text
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red', linewidth=2, alpha = 0.8)
plt.text(1, 1, f'EVS = {evs:.3f}', ha='left', va='top', color='black', fontsize=10, weight='bold')

plt.title('Predicted vs. Actual Values of Y')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

#plt.savefig('Images/regression_toy_experiment.png', dpi=300, bbox_inches='tight',format="png")
plt.show()

Best Parameters: {'alpha': 0.001}
Mean Square Error on the Test Set: 0.7166156821348948
Explained Variance Score on the Test Set: -165.96731306542324


ValueError: Per-column arrays must each be 1-dimensional

<Figure size 640x480 with 0 Axes>

In [None]:
## Creating the features matrix
feature_matrix = np.hstack((sinkhorn_potentials, P, omega))
feature_matrix_wout_pot = np.hstack((P, omega))
feature_matrix_only_pot = sinkhorn_potentials


## Define the kernels for different parts of the feature matrix
kernel_sinkhorn = kernels.RBF()
kernel_scalars = kernels.RBF(length_scale = np.array([1, 1])) # anisotropic kernel

## Kernels matrices
kernel_matrix_sinkhorn = kernel_sinkhorn(feature_matrix_only_pot)
kernel_matrix_scalars = kernel_scalars(feature_matrix_wout_pot)

kernel_matrix = kernel_matrix_sinkhorn*kernel_matrix_scalars

In [None]:
print(feature_matrix_only_pot.shape)
print(feature_matrix_wout_pot.shape)
print(feature_matrix.shape)

print(kernel_matrix_sinkhorn.shape)
print(kernel_matrix_scalars.shape)

print(kernel_matrix.shape)

In [None]:
x_train[: , len(mu):]