# Prediction of the energy with Coulomb matrices

In [9]:
# !pip install ase
# !pip install dscribe
from ase.build import molecule
from dscribe.descriptors import CoulombMatrix
from ase.io import read
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import os
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

In [10]:
path_to_test = '../data/atoms/test'
path_to_train = '../data/atoms/train'

In [11]:
# Read all the atoms files in the test directory
test_files = [os.path.join(path_to_test, f) for f in os.listdir(path_to_test) if f.endswith('.xyz')]
# Read all the atoms files in the train directory
train_files = [os.path.join(path_to_train, f) for f in os.listdir(path_to_train) if f.endswith('.xyz')]
# Read the atoms objects
test_atoms = [read(f) for f in test_files]
train_atoms = [read(f) for f in train_files]
# Create a DataFrame for test atoms
test_df = pd.DataFrame({
    'file': test_files,
    'atoms': test_atoms
})
# Create a DataFrame for train atoms
train_df = pd.DataFrame({
    'file': train_files,
    'atoms': train_atoms
})

In [12]:
# Compute the Coulomb matrix for each atom in the test set
def compute_coulomb_matrix(atoms, nb_max):
    cm = CoulombMatrix(
        n_atoms_max=nb_max,
        permutation="sorted_l2",
    )
    matrix = cm.create(atoms)  # Create the Coulomb matrix for the given atoms
    return matrix

# Get the maximum number of atoms in the test set
nb_max = max(len(atoms) for atoms in test_atoms + train_atoms)

# Apply the function to compute Coulomb matrices for test atoms
test_df['coulomb_matrix'] = test_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Apply the function to compute Coulomb matrices for train atoms
train_df['coulomb_matrix'] = train_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Save the DataFrames to CSV files
test_df.to_csv('../data/atoms/test_coulomb_matrices.csv', index=False)
train_df.to_csv('../data/atoms/train_coulomb_matrices.csv', index=False)
# Print the first few rows of the test DataFrame
print(test_df.head())
# Print the first few rows of the train DataFrame
print(train_df.head())

                             file  \
0  ../data/atoms/test/id_7475.xyz   
1  ../data/atoms/test/id_8126.xyz   
2  ../data/atoms/test/id_6986.xyz   
3  ../data/atoms/test/id_7252.xyz   
4  ../data/atoms/test/id_7830.xyz   

                                               atoms  \
0  (Atom('C', [1.361622, 2.59097, 1.656713], inde...   
1  (Atom('C', [-1.401807, 1.300455, 1.370504], in...   
2  (Atom('N', [-1.899187, -0.508295, 0.543062], i...   
3  (Atom('C', [-0.839082, -0.01128, 2.179564], in...   
4  (Atom('C', [1.228525, 0.356839, -1.70174], ind...   

                                      coulomb_matrix  
0  [36.85810519942594, 23.71796061787803, 14.2611...  
1  [53.3587073998281, 28.550156641732208, 28.7658...  
2  [53.3587073998281, 13.045690459915974, 28.3082...  
3  [53.3587073998281, 20.719548024459403, 30.4262...  
4  [73.51669471981023, 33.42958695343809, 33.6380...  
                              file  \
0   ../data/atoms/train/id_438.xyz   
1  ../data/atoms/train/id_4184.xyz

In [13]:
# Aplatir les matrices de Coulomb
X_train = [m.flatten() for m in train_df['coulomb_matrix']]
X_test = [m.flatten() for m in test_df['coulomb_matrix']]

# Extraire les IDs à partir des noms de fichiers
# Forcer les deux colonnes à être de type string
train_df['ID'] = train_df['file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0].replace("id_", ""))
train_energies = pd.read_csv('../data/energies/train.csv')
train_energies['id'] = train_energies['id'].astype(str)

# Renommer la colonne 'id' en 'ID' pour la fusion
train_energies.rename(columns={'id': 'ID'}, inplace=True)

# Fusionner correctement
train_df = train_df.merge(train_energies, on='ID')

# Extraire y_train
y_train = train_df['energy'].values

In [14]:
# Entraîner le modèle
model = RandomForestRegressor()
# Tester avec XGBoost et RandomForestRegressor
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
model.fit(X_train, y_train)

# Prédictions sur le test set
y_pred = model.predict(X_test)

# RMSE de train
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE on training set:", rmse_train)

RMSE on training set: 0.14762184669441467


In [15]:
predictions_df = pd.DataFrame({
    'ID': [os.path.splitext(os.path.basename(f))[0].replace("id_", "") for f in test_files],
    'energy': y_pred
})
predictions_df.to_csv('../results/predicted_energies_coulomb_xgbregressor_sorted_l2.csv', index=False)

In [None]:
# --- Définir l'espace de recherche des hyperparamètres ---
param_distributions = {
    'n_estimators': [300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [6, 8, 10],
    'subsample': [0.5, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5, 1],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.1, 0.5, 1, 5]
}

# --- Définir le modèle de base ---
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Utiliser la MSE négative comme score (convention de scikit-learn)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# --- Lancer la recherche aléatoire (RandomizedSearchCV) ---
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=50,      # nombre d'itérations pour explorer l'espace
    scoring=scorer,
    cv=10,           # validation croisée à 10 plis
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Entraîner la recherche sur les données d'entraînement
search.fit(X_train, y_train)

# Meilleurs paramètres
print("Meilleurs paramètres :", search.best_params_)

# Utiliser le meilleur modèle
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

# Print RMSE sur l'ensemble d'entraînement avec le meilleur modèle
y_train_pred = best_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE on training set with best model:", rmse_train)

predictions_df = pd.DataFrame({
    'ID': [os.path.splitext(os.path.basename(f))[0].replace("id_", "") for f in test_files],
    'energy': y_pred
})
predictions_df.to_csv('../results/predicted_energies_coulomb_gridsearch_random.csv', index=False)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.2, max_depth=8, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=  22.4s
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.2, max_depth=8, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=  24.8s
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.2, max_depth=8, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=  26.3s
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.2, max_depth=8, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.5, subsample=0.8; total time=  27.3s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=7, n_estimators=300, reg_alpha=1, reg_lambda=0.1, subsample=1.0; total time=  25.6s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, m

In [10]:
# Print les paramètres du meilleur modèle
print("Best model parameters:", best_model.get_params())

Best model parameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 1.0, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0.5, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.05, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 10, 'max_leaves': None, 'min_child_weight': 3, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 500, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': 1, 'reg_lambda': 1, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.5, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
