# Prediction of the energy with Coulomb matrices

In [26]:
# !pip install ase
# !pip install dscribe
from ase.build import molecule
from dscribe.descriptors import CoulombMatrix
from ase.io import read
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import os
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

In [27]:
path_to_test = '../data/atoms/test'
path_to_train = '../data/atoms/train'

In [28]:
# Read all the atoms files in the test directory
test_files = [os.path.join(path_to_test, f) for f in os.listdir(path_to_test) if f.endswith('.xyz')]
# Read all the atoms files in the train directory
train_files = [os.path.join(path_to_train, f) for f in os.listdir(path_to_train) if f.endswith('.xyz')]
# Read the atoms objects
test_atoms = [read(f) for f in test_files]
train_atoms = [read(f) for f in train_files]
# Create a DataFrame for test atoms
test_df = pd.DataFrame({
    'file': test_files,
    'atoms': test_atoms
})
# Create a DataFrame for train atoms
train_df = pd.DataFrame({
    'file': train_files,
    'atoms': train_atoms
})

In [29]:
# Compute the Coulomb matrix for each atom in the test set
# Avec "eigenspectrum" --> 
def compute_coulomb_matrix(atoms, nb_max):
    cm = CoulombMatrix(
        n_atoms_max=nb_max,
        permutation="eigenspectrum",
    )
    matrix = cm.create(atoms)  # Create the Coulomb matrix for the given atoms
    return matrix

# Get the maximum number of atoms in the test set
nb_max = max(len(atoms) for atoms in test_atoms + train_atoms)

# Apply the function to compute Coulomb matrices for test atoms
test_df['coulomb_matrix'] = test_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Apply the function to compute Coulomb matrices for train atoms
train_df['coulomb_matrix'] = train_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Save the DataFrames to CSV files
test_df.to_csv('../data/atoms/test_coulomb_matrices.csv', index=False)
train_df.to_csv('../data/atoms/train_coulomb_matrices.csv', index=False)
# Print the first few rows of the test DataFrame
print(test_df.head())
# Print the first few rows of the train DataFrame
print(train_df.head())

                             file  \
0  ../data/atoms/test/id_7475.xyz   
1  ../data/atoms/test/id_8126.xyz   
2  ../data/atoms/test/id_6986.xyz   
3  ../data/atoms/test/id_7252.xyz   
4  ../data/atoms/test/id_7830.xyz   

                                               atoms  \
0  (Atom('C', [1.361622, 2.59097, 1.656713], inde...   
1  (Atom('C', [-1.401807, 1.300455, 1.370504], in...   
2  (Atom('N', [-1.899187, -0.508295, 0.543062], i...   
3  (Atom('C', [-0.839082, -0.01128, 2.179564], in...   
4  (Atom('C', [1.228525, 0.356839, -1.70174], ind...   

                                      coulomb_matrix  
0  [131.27033922447575, 51.885033674108264, 32.56...  
1  [146.53662206037504, 49.121422928152725, 40.65...  
2  [137.99911848227063, 47.15879348174412, 40.445...  
3  [159.24598901252412, 50.72511775792264, 34.887...  
4  [151.11817684977424, 43.97989994328557, 31.437...  
                              file  \
0   ../data/atoms/train/id_438.xyz   
1  ../data/atoms/train/id_4184.xyz

In [30]:
# Print the size of the Coulomb matrix for the first test atom
print("Size of the Coulomb matrix for the first test atom:", test_df['coulomb_matrix'].iloc[0].shape)
# Print the size of the Coulomb matrix for the first train atom
print("Size of the Coulomb matrix for the first train atom:", train_df['coulomb_matrix'].iloc[0].shape)

Size of the Coulomb matrix for the first test atom: (23,)
Size of the Coulomb matrix for the first train atom: (23,)


In [31]:
# Aplatir les matrices de Coulomb
X_train = [m.flatten() for m in train_df['coulomb_matrix']]
X_test = [m.flatten() for m in test_df['coulomb_matrix']]

# Extraire les IDs à partir des noms de fichiers
# Forcer les deux colonnes à être de type string
train_df['ID'] = train_df['file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0].replace("id_", ""))
train_energies = pd.read_csv('../data/energies/train.csv')
train_energies['id'] = train_energies['id'].astype(str)

# Renommer la colonne 'id' en 'ID' pour la fusion
train_energies.rename(columns={'id': 'ID'}, inplace=True)

# Fusionner correctement
train_df = train_df.merge(train_energies, on='ID')

# Extraire y_train
y_train = train_df['energy'].values

# Vérifier la distribution des énergies
print(f"Énergies min: {y_train.min():.6f}, max: {y_train.max():.6f}")
print(f"Énergies négatives: {np.sum(y_train < 0)} / {len(y_train)}")

# Transformation logarithmique sécurisée
# Décaler les valeurs pour éviter les problèmes avec les nombres négatifs
min_energy = y_train.min()
if min_energy <= 0:
    # Décaler pour que toutes les valeurs soient positives
    shift = abs(min_energy) + 1e-6
    y_train_shifted = y_train + shift
    print(f"Décalage appliqué: +{shift:.6f}")
else:
    y_train_shifted = y_train
    shift = 0

# Appliquer le logarithme
y_train_log = np.log(y_train_shifted)
print(f"y_train_log range: [{y_train_log.min():.6f}, {y_train_log.max():.6f}]")

# --- Définir l'espace de recherche des hyperparamètres ---
param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 6, 8, 10],
    'subsample': [0.5, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5, 1],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.1, 0.5, 1, 5]
}

# --- Définir le modèle de base ---
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Utiliser la MSE négative comme score (convention de scikit-learn)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# --- Lancer la recherche aléatoire (RandomizedSearchCV) ---
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=50,      # nombre d'itérations pour explorer l'espace
    scoring=scorer,
    cv=10,           # validation croisée à 10 plis
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Entraîner la recherche sur les données d'entraînement
search.fit(X_train, y_train_log)

# Meilleurs paramètres
print("Meilleurs paramètres :", search.best_params_)

# Utiliser le meilleur modèle
best_model = search.best_estimator_
y_pred_log = best_model.predict(X_test)

# IMPORTANT: Inverser la transformation logarithmique
y_pred = np.exp(y_pred_log) - shift  # Inverser le décalage aussi
print(f"Prédictions range: [{y_pred.min():.6f}, {y_pred.max():.6f}]")

predictions_df = pd.DataFrame({
    'ID': [os.path.splitext(os.path.basename(f))[0].replace("id_", "") for f in test_files],
    'energy': -np.exp(y_pred),  # Énergies dans l'échelle originale
})

# Vérification finale des IDs
print("Exemple d'IDs générés:", predictions_df['ID'].head().tolist())
predictions_df.to_csv('../results/predicted_energies_eigen_XGBoost_log.csv', index=False)
print(f"Prédictions sauvegardées: {len(predictions_df)} lignes")


Énergies min: -103.413076, max: -19.313757
Énergies négatives: 6591 / 6591
Décalage appliqué: +103.413077
y_train_log range: [-13.815511, 4.431998]
Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=10, min_child_weight=1, n_estimators=200, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=   2.4s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=10, min_child_weight=1, n_estimators=200, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=   2.5s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=10, min_child_weight=1, n_estimators=200, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=   2.4s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=10, min_child_weight=1, n_estimators=200, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=   2.5s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=10, min_child_we

In [33]:
# Vérification: les IDs sont déjà au bon format
print("Format actuel des IDs:", predictions_df['ID'].head().tolist())
print("Nombre de prédictions:", len(predictions_df))

# Optionnel: double vérification du format
# predictions_df['ID'] = predictions_df['ID'].astype(int).astype(str)
# predictions_df.to_csv('../results/predicted_energies_eigen_XGBoost_log.csv', index=False)

Format actuel des IDs: ['7475', '8126', '6986', '7252', '7830']
Nombre de prédictions: 1647
