# Prédiction de l'énergie à partir des angles entre les atomes.

In [31]:
from ase.io import read
import os
import pandas as pd
import numpy as np
from ase.io import read
from itertools import combinations
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from itertools import combinations

# Path to the molecule files
path_to_test = "../data/atoms/test"
path_to_train = "../data/atoms/train"


In [32]:
# Read all the atoms files in the test directory
test_files = [os.path.join(path_to_test, f) for f in os.listdir(path_to_test) if f.endswith('.xyz')]
# Read all the atoms files in the train directory
train_files = [os.path.join(path_to_train, f) for f in os.listdir(path_to_train) if f.endswith('.xyz')]
# Read the atoms objects
test_atoms = [read(f) for f in test_files]
train_atoms = [read(f) for f in train_files]
# Create a DataFrame for test atoms
test_df = pd.DataFrame({
    'file': test_files,
    'atoms': test_atoms
})
# Create a DataFrame for train atoms
train_df = pd.DataFrame({
    'file': train_files,
    'atoms': train_atoms
})

In [40]:
def angle_between(a, b, c):
    """Calcule l'angle ABC (en degrés) entre trois positions atomiques a-b-c."""
    ba = a - b
    bc = c - b
    cos_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
    angle_rad = np.arccos(np.clip(cos_angle, -1.0, 1.0))
    return np.degrees(angle_rad)


# def extract_angles(atoms):
#     angles = []
#     pos = atoms.get_positions()
#     num_atoms = len(atoms)

#     # On parcourt tous les triplets (i, j, k) pour former des angles i-j-k
#     for i, j, k in combinations(range(num_atoms), 3):
#         angle = angle_between(pos[i], pos[j], pos[k])
#         angles.append(angle)
#     return np.array(angles)

from ase.neighborlist import NeighborList

def get_local_angles(atoms, cutoff=3.5):
    positions = atoms.get_positions()
    n_atoms = len(atoms)
    cutoffs = [cutoff] * n_atoms

    nl = NeighborList(cutoffs, self_interaction=False, bothways=True)
    nl.update(atoms)

    angles = []

    for j in range(n_atoms):  # atome central
        neighbors = nl.get_neighbors(j)[0]
        for i in range(len(neighbors)):
            for k in range(i+1, len(neighbors)):
                a = positions[neighbors[i]]
                b = positions[j]
                c = positions[neighbors[k]]
                angle = angle_between(a, b, c)
                angles.append(angle)
    return np.array(angles)

def get_angle_features(atoms):
    angles = get_local_angles(atoms)
    if len(angles) == 0:
        return np.zeros(4)
    return np.array([
        np.mean(angles),
        np.std(angles),
        np.min(angles),
        np.max(angles)
    ])

In [41]:
# Avant la fusion, supprime la colonne 'energy' de train_df si elle existe
if 'energy' in train_df.columns:
    train_df = train_df.drop(columns=['energy'])

# Charger les énergies depuis le CSV officiel
train_energies = pd.read_csv('../data/energies/train.csv')
train_energies['id'] = train_energies['id'].astype(str)

# Extraire l'ID à partir du nom de fichier
train_df['ID'] = train_df['file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0].replace('id_', ''))

# Fusionner les features avec les énergies
train_df = train_df.merge(train_energies, left_on='ID', right_on='id')

print(train_df.head())

                              file  \
0   ../data/atoms/train/id_438.xyz   
1  ../data/atoms/train/id_4184.xyz   
2  ../data/atoms/train/id_3188.xyz   
3  ../data/atoms/train/id_2474.xyz   
4  ../data/atoms/train/id_5056.xyz   

                                               atoms    ID  id_x  \
0  (Atom('C', [-1.657367, 0.622253, 1.29429], ind...   438   438   
1  (Atom('C', [-0.533633, -0.118721, 2.011296], i...  4184  4184   
2  (Atom('C', [-0.994168, -2.041154, -0.488546], ...  3188  3188   
3  (Atom('C', [-2.022245, 0.907156, -1.281658], i...  2474  2474   
4  (Atom('C', [-0.298504, -0.532232, -2.197911], ...  5056  5056   

                                            features  id_y     energy  
0  [58.80754767988597, 32.612595519313764, 1.9475...   438 -74.930323  
1  [61.87483630008167, 37.90891735156741, 1.01143...  4184 -70.779999  
2  [58.88943676716564, 36.06149113249634, 3.52061...  3188 -69.265664  
3  [64.55216174112691, 35.59530642879029, 0.58980...  2474 -97.212364  
4 

In [42]:
train_df['features'] = train_df['atoms'].apply(get_angle_features)
test_df['features'] = test_df['atoms'].apply(get_angle_features)

X_train = np.vstack(train_df['features'].values)
y_train = train_df['energy'].values

X_test = np.vstack(test_df['features'].values)
# y_test n'est pas utilisé pour la prédiction

In [43]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [44]:
print(y_pred)

[-97.19878286 -86.56834952 -85.21795504 ... -78.3627811  -68.66495994
 -80.86416366]


In [45]:
predictions_df = pd.DataFrame({
    'id': [os.path.splitext(os.path.basename(f))[0].replace("id_", "") for f in test_files],
    'energy': y_pred
})

# Sauvegarder en CSV
predictions_df.to_csv('../results/predicted_energies_angles.csv', index=False)