## Training energy-prediction models for all adsorbants

**Features**: Adsorbant, Surface

**Target**: The energy associated with both the slab and the adsorbant

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ase
from ase.db import connect
import sys
import os

In [2]:
# Attach Jacks formic acid data
formic_acid_data_path = "../Experiments/AgAuCuPdPt_data/1_databases/relaxed/"

single_element_dbs = []
HEA_dbs = []

for filename in os.listdir(formic_acid_data_path):
    if filename[-3:] == ".db":
        if filename[0:14] == "single_element":
            single_element_dbs.append(filename)
        if filename[0] == "H" or filename[0] == "C":
            HEA_dbs.append(filename)

print(f"Single element: \n{single_element_dbs}\n")
print(f"HEA: \n{HEA_dbs}")

Single element: 
['single_element_H_out.db', 'single_element_COOH_COH_adsorbed_out.db', 'single_element_COOH_H_CO_adsorbed_out.db', 'single_element_COOH_C_adsorbed_out.db', 'single_element_COOH_CO_adsorbed_out.db', 'single_element_CO_out.db', 'single_elements_COOH_CO_adsorbed_out.db', 'single_element_COOH_H_on_O_OO_adsorbed_out.db', 'single_element_slabs_out.db', 'single_elements_COOH_COH_adsorbed_out.db', 'single_element_COOH_H_CO_and_H2O_like_out.db', 'single_element_HCOOH_O_adsorbed_out.db', 'single_element_COOH_O_adsorbed_out.db', 'single_element_COOH_H_COH_adsorbed_out.db']

HEA: 
['HCOOH_O_adsorbed_out.db', 'H_out.db', 'COOH_H_on_O_OO_adsorbed_out.db', 'COOH_O_adsorbed_out.db', 'CO_out.db', 'COOH_CO_adsorbed_out.db', 'COOH_H_on_C_OO_adsorbed_out.db', 'COOH_C_adsorbed_out.db', 'COOH_COH_adsorbed_out.db']


In [3]:
# Make some kind of training loop, which trains and saves a model for each dataset
# I could start with an XGBoost model, but the 

for adsorbant_db in HEA_dbs:
    print(f"Adsorbant dataset: {adsorbant_db}")
    
    # Any preprocessing?
    
    # Train model here
    
    # Test model here and show performance
    
    # Save a parity plot
    
    # Save model 
    
    
# Maybe save them in a dictionary, such that one could just utilize a single E_models dictionary, that contains all models


Adsorbant dataset: HCOOH_O_adsorbed_out.db
Adsorbant dataset: H_out.db
Adsorbant dataset: COOH_H_on_O_OO_adsorbed_out.db
Adsorbant dataset: COOH_O_adsorbed_out.db
Adsorbant dataset: CO_out.db
Adsorbant dataset: COOH_CO_adsorbed_out.db
Adsorbant dataset: COOH_H_on_C_OO_adsorbed_out.db
Adsorbant dataset: COOH_C_adsorbed_out.db
Adsorbant dataset: COOH_COH_adsorbed_out.db


In [4]:
# I am interested in energy prediction models of the following adsorbates:
## *COOH ## On-top
##  *CO  ## On-top

## Using the Graph Convolutional Network that CMC worked on

### Step 1: Make graphs from all slabs

In [5]:
sys.path.append('../utils')

In [9]:
from features import db_to_graphs

In [10]:
db_to_graphs

<function features.db_to_graphs(surface_elements, adsorbate_elements, n_neighbors, fmax, db, ref_dict)>

In [57]:
project_name = 'Formic_Acid'

site_list = ['ontop']
ads_list = ['COOH','CO']

surface_elements = ['Ag','Au','Cu','Pd','Pt']
adsorbate_elements = ['C','C']

n_neighbours = 2
fmax = 0.1

db = connect("../Database_files/COOH_C_adsorbed_out.db")

ref_dict = {'ontop_COOH':connect(formic_acid_data_path + 'single_element_COOH_C_adsorbed_out.db').get(id = "5").energy,
        'ontop_CO':connect("../Database_files/single_element_CO_out.db").get(id = "5").energy,
        'slab':connect("../Database_files/slabs_out.db").get(id = "5").energy}



### Bugfixing

In [53]:
connect(formic_acid_data_path + 'single_element_COOH_C_adsorbed_out.db').get(id = "5").energy

-271.5086386023385

In [54]:
connect("../Database_files/single_element_CO_out.db").get(id = "5").energy

-263.30312496328355

In [55]:
connect("../Database_files/slabs_out.db").get(id = "5").energy

-143.3025635473736

In [43]:
import os.path
print(os.path.exists(formic_acid_data_path+"single_element_CO.db"))
print(os.path.isfile(formic_acid_data_path+"single_element_CO.db"))

True
True


In [31]:
connect(formic_acid_data_path + 'single_element_CO.db').get(id = "5")#.energy

KeyError: 'no match'

In [32]:
connect(formic_acid_data_path + 'single_element_CO.db').get(symbol = "Pt")#.energy

KeyError: 'no match'

In [34]:
connect(formic_acid_data_path + 'single_element_CO.db').select(symbol = "Pt").energy

AttributeError: 'generator' object has no attribute 'energy'

In [None]:

db_to_graphs(surface_elements, adsorbate_elements, n_neighbours, fmax, db, ref_dict)

In [20]:
connect(formic_acid_data_path + 'single_element_COOH_C_adsorbed_out.db').get(symbol = "Pt").energy

KeyError: 'no match'

In [21]:
connect(formic_acid_data_path + 'single_element_COOH_C_adsorbed_out.db').get(id = "5").energy

-271.5086386023385

In [19]:
connect(formic_acid_data_path + 'single_element_COOH_C_adsorbed_out.db').select(symbol="Pt")

<generator object Database.select at 0x1628d5a10>

### Step 2: Load adsorbate binding energies for each slab

### Step 3: Train model on graphs and adsorbate binding energies

## Using the XGBoost method from the PUK

In [7]:
import sys
sys.path.append('../scripts')
from Slab import expand_triangle, Slab, inside_triangle
from FeatureReader import OntopStandard111, FccStandard111
from ase.db import connect
from ase.visualize import view
import numpy as np

In [9]:
db_folder = "../Database_files/"
features_folder = "../csv_features/"

In [14]:
# Specify metals
metals = ['Ag', 'Au', 'Cu', 'Pd', 'Pt']
alloy = ''.join(metals)

# Specify name of databases
db_name_COOH = 'COOH_C_adsorbed_out.db'
db_name_CO = 'CO_out.db'

# Initiate feature readers
reader_COOH = OntopStandard111(metals)
reader_CO   = OntopStandard111(metals)

site_ids_COOH = [16, 17, 18]
site_ids_CO = [16, 17, 18]

# Initiate counters of rejected samples
rejected_COOH = 0
rejected_CO = 0

# Writer headers to files
with open(f'{features_folder}COOH_features.csv', 'w') as file_COOH:
    file_COOH.write(f'Features, G_ads (eV), slab db row, {db_name_COOH} row')

with open(f'{features_folder}CO_features.csv', 'w') as file_CO:
    file_CO.write(f'Features, G_ads (eV), slab db row, {db_name_CO} row')

# Load HEA(111) databases
with connect(f'{db_folder}{db_name_COOH}') as db_COOH,\
     connect(f'{db_folder}{db_name_CO}') as db_CO,\
     connect(f'{db_folder}slabs_out.db') as db_slab,\
     open('COOH_features.csv', 'a') as file_COOH,\
     open('CO_features.csv', 'a') as file_CO:

    # Iterate through slabs without adsorbates
    for row_slab in db_slab.select('energy', H=0, C=0, O=0):

        # Iterate through the two adsorbates
        for ads in ['COOH', 'CO']:

            # Set adsorbate-specific parameters
            if ads == 'COOH':
                db = db_COOH
                kw = {'C':1, 'O': 2, 'H': 1}
                db_name = db_name_COOH
                out_file = file_COOH

            elif ads == 'CO':
                db = db_O
                kw = {'C':1, 'O': 1, 'H': 0}
                db_name = db_name_CO
                out_file = file_CO

            # Set counter of matched slabs between the databases to zero
            n_matched = 0

            # Get the corresponding slab with adsorbate
            for row in db.select('energy', **kw, **row_slab.count_atoms()):

                # If symbols match up
                if row.symbols[:-len(ads)] == row_slab.symbols:

                    # Increment the counter of matched structures
                    n_matched += 1

                    # Get atoms object
                    atoms = db.get_atoms(row.id)

                    # Make slab instance
                    slab = Slab(atoms, ads=ads, ads_atom='O')

                    # If the adsorbate is *COOH
                    if ads == 'COOH':

                        # Get adsorption site elements as neighbors within a radius
                        site_elems, site = slab.get_adsorption_site(radius=2.6, hollow_radius=2.6)

                        # If the site does not consist of exactly one atom, then skip this sample
                        # as the *OH has moved too far away from an on-top site
                        try:
                            if len(site_elems) !=1:
                                rejected_COOH += 1
                                #slab.view()
                                continue
                        except TypeError:
                            print(site_elems, site)
                            print(row_slab.id, row.id)
                            slab.view()
                            exit()

                        # Get features of structure
                        features = reader_COOH.get_features(slab, radius=2.6)
                        

                    # Else, if the adsorbate is O*
                    elif ads == 'CO':

                        # Get hollow site planar corner coordinates
                        site_atoms_pos_orig = atoms.positions[site_ids_CO, :2]

                        # Get expanded triangle vertices
                        site_atoms_pos = expand_triangle(site_atoms_pos_orig, expansion=1.45)

                        # Get position of adsorbate atom (with atom index XXX 20 XXX)
                        ads_pos = atoms.positions[20][:2]

                        # If the oxygen is outside the expanded fcc triangle,
                        # then it is most likely in an hcp site, that is not
                        # being modeled
                        if not inside_triangle(ads_pos, site_atoms_pos):
                            rejected_CO += 1
                            continue

                        # Get features of structure
                        features = reader_O.get_features(slab, radius=2.6, site_ids=site_ids_CO)

                    # Get adsorption energy
                    E_ads = row.energy - row_slab.energy + ref[ads]

                    # Write output to file
                    features = ','.join(map(str, features))
                    out_file.write(f'\n{features},{E_ads:.6f},{row_slab.id},{row.id}')

            # Print a message if more than one slabs were matched. This probably means that
            # the same slab has accidentally been saved multiple to the database
            if n_matched > 1:
                print(f'[INFO] {n_matched} {ads} and slab matched for row {row_slab.id} in {db_name_slab}')

            # Print a message if no slabs were matched. This probably means that the DFT calculation
            # did not converge and was left out
            #elif n_matched == 0:
                #print(f'[INFO] No match found in {db_name} for row {row_slab.id} in {db_name_slab}')

# Print the number of rejected samples to screen
print('rejected COOH samples: ', rejected_COOH)
print('rejected CO samples: ', rejected_CO)

ValueError: Number of atoms in site is 0 which is not recognized

In [None]:
# I am not going to figure this bullshit out without Dr. Jekyll