In [25]:
import numpy as np
import matplotlib.pyplot as plt

# Charger le fichier .npy
#scatters_0 = np.load("save/order_0_traindataset_L_2_J_1_sigma_2.0_MNO_(32, 32, 32)_powers_[0.5, 1.0, 2.0, 3.0].npy")
scatters_train = np.load("save/order_0_traindataset_L_3_J_3_sigma_2.0_MNO_(160, 112, 80)_powers_[0.5, 1.0, 2.0, 3.0].npy")

# Afficher le contenu
print(scatters_train.shape)

(6591, 12)


In [26]:
centered_scatters = scatters_train - scatters_train.mean(0)

In [27]:
nb_mol = centered_scatters.shape[0]
nb_viz = nb_mol//1000 + 1

### **Test de régression avec les scatterings centrés**

In [28]:
import numpy as np
import torch
import time
import os

from sklearn import (linear_model, model_selection, preprocessing,
                     pipeline)
from scipy.spatial.distance import pdist

from kymatio.torch import HarmonicScattering3D

from kymatio.scattering3d.backend.torch_backend \
    import TorchBackend3D

from kymatio.scattering3d.utils \
    import generate_weighted_sum_of_gaussians

from kymatio.datasets import fetch_qm7
from kymatio.caching import get_cache_dir

import random

import pandas as pd
from ase.io import read

from xgboost import XGBRegressor

In [29]:
# J = 2
# L = 3
# J = 1
# L = 2
J = 3
L = 3
integral_powers = [0.5, 1.0, 2.0, 3.0]

In [30]:
#M, N, O = 192, 128, 96
#M, N, O = 32, 32, 32
M, N, O = 160, 112, 80

grid = np.mgrid[-M//2:-M//2+M, -N//2:-N//2+N, -O//2:-O//2+O]
grid = np.fft.ifftshift(grid)

In [31]:
# Charger les données
train_energies = pd.read_csv('../data/energies/train.csv')
molecule_ids = train_energies['id'].values
energies = train_energies['energy'].values

# Initialiser les listes pour stocker toutes les positions et charges
all_positions = []
all_charges = []

# Lire tous les fichiers .xyz avec ASE
for mol_id in molecule_ids:
    xyz_path = f'../data/atoms/train/id_{mol_id}.xyz'
    atoms = read(xyz_path)
    
    # Obtenir les positions et numéros atomiques
    coords = atoms.get_positions()
    atomic_numbers = atoms.get_atomic_numbers()
    
    # Padding pour avoir une taille fixe
    max_atoms = 23  # Ajuster selon le nombre maximum d'atomes
    padded_coords = np.zeros((max_atoms, 3))
    padded_charges = np.zeros(max_atoms)
    
    n_atoms = len(coords)
    padded_coords[:n_atoms] = coords
    padded_charges[:n_atoms] = atomic_numbers
    
    all_positions.append(padded_coords)
    all_charges.append(padded_charges)

# Convertir en arrays numpy
pos = np.array(all_positions)
full_charges = np.array(all_charges)
target = energies

# Le reste du code reste identique à partir de la définition de n_molecules
n_molecules = len(pos)

# ...existing code...

In [32]:
overlapping_precision = 1e-1
sigma = 2.0
min_dist = np.inf

for i in range(n_molecules):
    n_atoms = np.sum(full_charges[i] != 0)
    pos_i = pos[i, :n_atoms, :]
    min_dist = min(min_dist, pdist(pos_i).min())

delta = sigma * np.sqrt(-8 * np.log(overlapping_precision))
pos = pos * delta / min_dist

In [33]:
scattering = HarmonicScattering3D(J=J, shape=(M, N, O),
                                  L=L, sigma_0=sigma,
                                  integral_powers=integral_powers)

In [34]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
scattering.to(device)

HarmonicScattering3D()

In [35]:
# order_0 = np.load("save/order_0_traindataset_L_2_J_1_sigma_2.0_MNO_(32, 32, 32)_powers_[0.5, 1.0, 2.0, 3.0].npy")
# orders_1_and_2 = np.load("save/orders_1_and_2traindataset_L_2_J_1_sigma_2.0_MNO_(32, 32, 32)_powers_[0.5, 1.0, 2.0, 3.0].npy")

order_0 = np.load("save/order_0_traindataset_L_3_J_3_sigma_2.0_MNO_(160, 112, 80)_powers_[0.5, 1.0, 2.0, 3.0].npy")
orders_1_and_2 = np.load("save/orders_1_and_2traindataset_L_3_J_3_sigma_2.0_MNO_(160, 112, 80)_powers_[0.5, 1.0, 2.0, 3.0].npy")

In [36]:
orders_1_and_2.shape

(6591, 480)

In [37]:
centered_orders_0 = order_0 - order_0.mean(0)
centered_orders_1_and_2 = orders_1_and_2 - orders_1_and_2.mean(0)

In [38]:
scattering_coef = np.concatenate([centered_orders_0, centered_orders_1_and_2], axis=1)

In [39]:
scattering_coef.shape

(6591, 492)

In [21]:
# from ase.build import molecule
from dscribe.descriptors import CoulombMatrix
import os
from ase.io import read
import pandas as pd
import numpy as np

In [9]:
path_to_test = '../data/atoms/test'
path_to_train = '../data/atoms/train'

# Read all the atoms files in the test directory
test_files = [os.path.join(path_to_test, f) for f in os.listdir(path_to_test) if f.endswith('.xyz')]
# Read all the atoms files in the train directory
train_files = [os.path.join(path_to_train, f) for f in os.listdir(path_to_train) if f.endswith('.xyz')]
# Read the atoms objects
test_atoms = [read(f) for f in test_files]
train_atoms = [read(f) for f in train_files]
# Create a DataFrame for test atoms
test_df = pd.DataFrame({
    'file': test_files,
    'atoms': test_atoms
})
# Create a DataFrame for train atoms
train_df = pd.DataFrame({
    'file': train_files,
    'atoms': train_atoms
})

In [10]:
# Compute the Coulomb matrix for each atom in the test set
# Avec "eigenspectrum" --> 
def compute_coulomb_matrix(atoms, nb_max):
    cm = CoulombMatrix(
        n_atoms_max=nb_max,
        permutation="eigenspectrum",
    )
    matrix = cm.create(atoms)  # Create the Coulomb matrix for the given atoms
    return matrix

# Get the maximum number of atoms in the test set
nb_max = max(len(atoms) for atoms in test_atoms + train_atoms)

# Apply the function to compute Coulomb matrices for test atoms
test_df['coulomb_matrix'] = test_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Apply the function to compute Coulomb matrices for train atoms
train_df['coulomb_matrix'] = train_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Save the DataFrames to CSV files
# test_df.to_csv('../data/atoms/test_coulomb_matrices.csv', index=False)
# train_df.to_csv('../data/atoms/train_coulomb_matrices.csv', index=False)

In [45]:
train_coulomb_matrix = np.array(train_df.coulomb_matrix.to_list())

In [46]:
train_coulomb_matrix.shape

(6591, 23)

In [48]:
scattering_coef_and_coulomb_matrix = np.concatenate([scattering_coef, train_coulomb_matrix], axis=1)

In [79]:
xgb = XGBRegressor(
        n_estimators = 100,
        learning_rate = 0.1,
        max_depth = 10,
        random_state = 42
    )


xgb.fit(scattering_coef_and_coulomb_matrix, target)

In [80]:
batch_size = 8
#batch_size = 16
n_batches = int(np.ceil(n_molecules / batch_size))

In [None]:
"""
# Charger et préparer les données de test
test_positions = []
test_charges = []

# Lister tous les fichiers .xyz dans le dossier test
test_files = sorted(os.listdir('../data/atoms/test'))
test_ids = [int(f.split('_')[1].split('.')[0]) for f in test_files]  # Extraire les IDs

# Lire tous les fichiers .xyz de test
for xyz_file in test_files:
    xyz_path = os.path.join('../data/atoms/test', xyz_file)
    atoms = read(xyz_path)
    
    # Obtenir les positions et numéros atomiques
    coords = atoms.get_positions()
    atomic_numbers = atoms.get_atomic_numbers()
    
    # Padding pour avoir une taille fixe
    padded_coords = np.zeros((max_atoms, 3))
    padded_charges = np.zeros(max_atoms)
    
    n_atoms = len(coords)
    padded_coords[:n_atoms] = coords
    padded_charges[:n_atoms] = atomic_numbers
    
    test_positions.append(padded_coords)
    test_charges.append(padded_charges)

# Convertir en arrays numpy
test_pos = np.array(test_positions)
test_full_charges = np.array(test_charges)
n_test_molecules = len(test_pos)

# Appliquer la même normalisation que pour les données d'entraînement
test_pos = test_pos * delta / min_dist

# Calculer les coefficients de scattering pour les données de test
test_order_0 = []
test_orders_1_2 = []

for i in range(0, n_test_molecules, batch_size):
    end = min(i + batch_size, n_test_molecules)
    pos_batch = test_pos[i:end]
    charges_batch = test_full_charges[i:end]
    
    # Calculer les valence charges
    mask = charges_batch <= 2
    val_batch = charges_batch * mask
    mask = np.logical_and(charges_batch > 2, charges_batch <= 10)
    val_batch += (charges_batch - 2) * mask
    mask = np.logical_and(charges_batch > 10, charges_batch <= 18)
    val_batch += (charges_batch - 10) * mask
    
    # Calculer les descripteurs comme pour l'entraînement
    full_density = generate_weighted_sum_of_gaussians(grid, pos_batch, charges_batch, sigma)
    full_density = torch.from_numpy(full_density).to(device).float()
    
    val_density = generate_weighted_sum_of_gaussians(grid, pos_batch, val_batch, sigma)
    val_density = torch.from_numpy(val_density).to(device).float()
    
    core_density = full_density - val_density
    
    # Calculer les coefficients
    full_0 = TorchBackend3D.compute_integrals(full_density, integral_powers)
    full_s = scattering(full_density)
    
    val_0 = TorchBackend3D.compute_integrals(val_density, integral_powers)
    val_s = scattering(val_density)
    
    core_0 = TorchBackend3D.compute_integrals(core_density, integral_powers)
    core_s = scattering(core_density)
    
    # Empiler les coefficients
    batch_0 = torch.stack((full_0, val_0, core_0), dim=-1)
    batch_s = torch.stack((full_s, val_s, core_s), dim=-1)
    
    test_order_0.append(batch_0.cpu())
    test_orders_1_2.append(batch_s.cpu())

# Concatener tous les batches
test_order_0 = torch.cat(test_order_0, dim=0).numpy()
test_orders_1_2 = torch.cat(test_orders_1_2, dim=0).numpy()

test_order_0_centered = test_order_0 - test_order_0.mean(0)
test_orders_1_2_centered = test_orders_1_2 - test_orders_1_2.mean(0)


# Reshape comme pour l'entraînement
test_order_0_centered = test_order_0_centered.reshape((n_test_molecules, -1))
test_orders_1_2_centered = test_orders_1_2_centered.reshape((n_test_molecules, -1))

# Concaténer les descripteurs
test_scattering_coef = np.concatenate([test_order_0_centered, test_orders_1_2_centered], axis=1)
"""

In [82]:
test_order_0 = order_0 = np.load("save/order_0_testdataset_L_3_J_3_sigma_2.0_MNO_(160, 112, 80)_powers_[0.5, 1.0, 2.0, 3.0].npy")
test_orders_1_and_2 = np.load("save/orders_1_and_2testdataset_L_3_J_3_sigma_2.0_MNO_(160, 112, 80)_powers_[0.5, 1.0, 2.0, 3.0].npy")

In [83]:
n_test_molecules = test_order_0.shape[0]

test_order_0_centered = test_order_0 - test_order_0.mean(0)
test_orders_1_2_centered = test_orders_1_and_2 - test_orders_1_and_2.mean(0)


# Reshape comme pour l'entraînement
test_order_0_centered = test_order_0_centered.reshape((n_test_molecules, -1))
test_orders_1_2_centered = test_orders_1_2_centered.reshape((n_test_molecules, -1))

# Concaténer les descripteurs
test_scattering_coef = np.concatenate([test_order_0_centered, test_orders_1_2_centered], axis=1)

In [84]:
test_coulomb_matrix = np.array(test_df.coulomb_matrix.to_list())

In [85]:
test_scattering_coef_and_coulomb_matrix = np.concatenate([test_scattering_coef, test_coulomb_matrix], axis=1)

In [86]:
test_prediction = xgb.predict(test_scattering_coef_and_coulomb_matrix)
train_prediction = xgb.predict(scattering_coef_and_coulomb_matrix)

In [87]:
from sklearn.metrics import mean_squared_error

In [88]:
rmse_train = np.sqrt(mean_squared_error(target, train_prediction))

In [89]:
rmse_train

0.025152416477474765

In [90]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

alphas = [1e-11, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10, 100]
rmse_results = []

for alpha in alphas:
    ridge_pipeline = make_pipeline(
        StandardScaler(),
        Ridge(alpha=alpha)
    )
    ridge_pipeline.fit(scattering_coef_and_coulomb_matrix, target)
    train_pred = ridge_pipeline.predict(scattering_coef_and_coulomb_matrix)
    rmse = np.sqrt(mean_squared_error(target, train_pred))
    rmse_results.append(rmse)
    print(f"alpha={alpha:<7} --> RMSE train: {rmse:.4f}")

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


alpha=1e-11   --> RMSE train: 0.0403
alpha=1e-09   --> RMSE train: 0.0405
alpha=1e-08   --> RMSE train: 0.0412
alpha=1e-07   --> RMSE train: 0.0427
alpha=1e-06   --> RMSE train: 0.0450
alpha=1e-05   --> RMSE train: 0.0483
alpha=0.0001  --> RMSE train: 0.0544
alpha=0.001   --> RMSE train: 0.0656
alpha=0.01    --> RMSE train: 0.0876
alpha=0.1     --> RMSE train: 0.1260
alpha=1       --> RMSE train: 0.1863
alpha=10      --> RMSE train: 0.3091
alpha=100     --> RMSE train: 0.5856


In [93]:
# Entraîner le modèle final sur toutes les données d'entraînement
final_scaler = preprocessing.StandardScaler()
final_ridge = Ridge(alpha=0.0001)  # Utiliser le meilleur alpha trouvé
final_model = pipeline.make_pipeline(final_scaler, final_ridge)
final_model.fit(scattering_coef_and_coulomb_matrix, target)

# Faire les prédictions sur les données de test
test_predictions = final_model.predict(test_scattering_coef_and_coulomb_matrix)

In [94]:
# Lister tous les fichiers .xyz dans le dossier test
test_files = sorted(os.listdir('../data/atoms/test'))
test_ids = [int(f.split('_')[1].split('.')[0]) for f in test_files]  # Extraire les IDs

In [95]:
# Créer et sauvegarder le fichier de prédictions
predictions_df = pd.DataFrame({
    'id': test_ids,
    'energy': test_predictions
})
predictions_df.to_csv(f'../data/energies/test_pred_scattering_fin_et_coulomb_ridge_alpha_0.0001.csv', index=False)
#print("Prédictions sauvegardées dans test_pred_scattering_xgboost.csv")