# Prediction of the energy with Coulomb matrices

In [18]:
# !pip install ase
# !pip install dscribe
from ase.build import molecule
from dscribe.descriptors import CoulombMatrix
from ase.io import read
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import os
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
#import linear model
from sklearn.linear_model import LinearRegression

In [19]:
path_to_test = '../data/atoms/test'
path_to_train = '../data/atoms/train'

In [20]:
# Read all the atoms files in the test directory
test_files = [os.path.join(path_to_test, f) for f in os.listdir(path_to_test) if f.endswith('.xyz')]
# Read all the atoms files in the train directory
train_files = [os.path.join(path_to_train, f) for f in os.listdir(path_to_train) if f.endswith('.xyz')]
# Read the atoms objects
test_atoms = [read(f) for f in test_files]
train_atoms = [read(f) for f in train_files]
# Create a DataFrame for test atoms
test_df = pd.DataFrame({
    'file': test_files,
    'atoms': test_atoms
})
# Create a DataFrame for train atoms
train_df = pd.DataFrame({
    'file': train_files,
    'atoms': train_atoms
})

In [22]:
# Compute the Coulomb matrix for each atom in the test set
def compute_coulomb_matrix(atoms, nb_max):
    cm = CoulombMatrix(
        n_atoms_max=nb_max,
        permutation="random",
        sigma=0.1,
    )
    matrix = cm.create(atoms)  # Create the Coulomb matrix for the given atoms
    return matrix

# Get the maximum number of atoms in the test set
nb_max = max(len(atoms) for atoms in test_atoms + train_atoms)

# Apply the function to compute Coulomb matrices for test atoms
test_df['coulomb_matrix'] = test_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Apply the function to compute Coulomb matrices for train atoms
train_df['coulomb_matrix'] = train_df['atoms'].apply(compute_coulomb_matrix, nb_max=nb_max)
# Save the DataFrames to CSV files
test_df.to_csv('../data/atoms/test_coulomb_matrices.csv', index=False)
train_df.to_csv('../data/atoms/train_coulomb_matrices.csv', index=False)
# Print the first few rows of the test DataFrame
print(test_df.head())
# Print the first few rows of the train DataFrame
print(train_df.head())

                             file  \
0  ../data/atoms/test/id_7475.xyz   
1  ../data/atoms/test/id_8126.xyz   
2  ../data/atoms/test/id_6986.xyz   
3  ../data/atoms/test/id_7252.xyz   
4  ../data/atoms/test/id_7830.xyz   

                                               atoms  \
0  (Atom('C', [1.361622, 2.59097, 1.656713], inde...   
1  (Atom('C', [-1.401807, 1.300455, 1.370504], in...   
2  (Atom('N', [-1.899187, -0.508295, 0.543062], i...   
3  (Atom('C', [-0.839082, -0.01128, 2.179564], in...   
4  (Atom('C', [1.228525, 0.356839, -1.70174], ind...   

                                      coulomb_matrix  
0  [36.85810519942594, 23.71796061787803, 14.2611...  
1  [53.3587073998281, 28.550156641732208, 28.7658...  
2  [53.3587073998281, 13.045690459915974, 28.3082...  
3  [53.3587073998281, 20.719548024459403, 30.4262...  
4  [73.51669471981023, 33.42958695343809, 33.6380...  
                              file  \
0   ../data/atoms/train/id_438.xyz   
1  ../data/atoms/train/id_4184.xyz

In [23]:
# Aplatir les matrices de Coulomb
X_train = [m.flatten() for m in train_df['coulomb_matrix']]
X_test = [m.flatten() for m in test_df['coulomb_matrix']]

# Extraire les IDs à partir des noms de fichiers
# Forcer les deux colonnes à être de type string
train_df['ID'] = train_df['file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0].replace("id_", ""))
train_energies = pd.read_csv('../data/energies/train.csv')
train_energies['id'] = train_energies['id'].astype(str)

# Renommer la colonne 'id' en 'ID' pour la fusion
train_energies.rename(columns={'id': 'ID'}, inplace=True)

# Fusionner correctement
train_df = train_df.merge(train_energies, on='ID')

# Extraire y_train
y_train = train_df['energy'].values

In [24]:
# Regression linéaire
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Prédictions sur le jeu de train
y_train_pred = linear_model.predict(X_train)

#Print la RMSE sur le jeu de train
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"RMSE on training set: {train_rmse}")

# Prédictions sur le jeu de test
y_test_pred = linear_model.predict(X_test)

predictions_df = pd.DataFrame({
    'ID': [os.path.splitext(os.path.basename(f))[0].replace("id_", "") for f in test_files],
    'energy': y_test_pred
})
predictions_df.to_csv('../results/predicted_energies_coulomblinear_random.csv', index=False)

RMSE on training set: 1.4256911894373376
