In [13]:
import os
import pubchempy as pcp
from collections import Counter
import re

def get_molecule_name_from_dir(directory):
    """Extract the molecule name from a path string.

    The path is split on ``os.sep`` and the fourth component counted from
    the end is returned, i.e. for ``.../<molecule>/<a>/<b>/<file>`` the
    result is ``<molecule>``. Change the index below if your directory
    layout is different.

    Raises:
        IndexError: if the path has fewer than four components.
    """
    path_components = directory.split(os.sep)
    # The molecule folder sits three levels above the last path component.
    return path_components[-4]

def get_pubchem_formula(molecule_name):
    """Look up a molecule on PubChem by name and return its element counts.

    The name is passed to ``pcp.get_compounds`` with the ``'name'``
    namespace; the molecular formula of the first hit is then parsed with
    ``parse_pubchem_formula``.

    Returns:
        The parsed formula of the first matching compound, or ``None`` when
        the lookup returns no compounds.
    """
    matches = pcp.get_compounds(molecule_name, 'name')
    if not matches:
        return None
    # Only the first hit is considered.
    first_hit = matches[0]
    return parse_pubchem_formula(first_hit.molecular_formula)

def parse_pubchem_formula(formula):
    """Parse a molecular formula string into per-element atom counts.

    Each match of "capital letter, optional lowercase letters, optional
    digits" is treated as one ``(element, count)`` pair; a missing count
    means 1. Characters that do not fit this pattern are skipped by the
    regex.

    Counts are accumulated, so a symbol that appears more than once
    (e.g. ``"CH3SCH3"``) contributes the sum of all its occurrences rather
    than only the last one.

    Args:
        formula: Formula string such as ``"C2H6S"``.

    Returns:
        A ``Counter`` mapping element symbol to total atom count.
    """
    pattern = r'([A-Z][a-z]*)(\d*)'
    atom_counts = Counter()
    for element, count in re.findall(pattern, formula):
        # Add instead of assign: a repeated symbol must not overwrite the
        # count collected from its earlier occurrences.
        atom_counts[element] += int(count) if count else 1
    return atom_counts

def get_adsorbate_indices_from_vasp(filepath, total_adsorbate_atoms):
    """Return the 0-based indices of the trailing atoms in a POSCAR/CONTCAR.

    The total number of atoms is read from the per-element counts line in
    the file header, and the last ``total_adsorbate_atoms`` indices are
    returned. This assumes the adsorbate atoms are listed after all other
    atoms in the file.

    Args:
        filepath: Path to a VASP POSCAR/CONTCAR file.
        total_adsorbate_atoms: Number of atoms belonging to the adsorbate.

    Returns:
        List of 0-based atom indices, in increasing order.

    Raises:
        ValueError: if ``total_adsorbate_atoms`` is negative or larger than
            the number of atoms in the file, or if the counts line cannot
            be parsed as integers.
        IndexError: if the file is too short to contain the header.
    """
    with open(filepath, 'r') as file:
        lines = file.readlines()

    # VASP 5 layout: element symbols on the 6th line, per-element counts on
    # the 7th. VASP 4 omits the symbols line, so the counts are on the 6th.
    sixth_line_tokens = lines[5].split()
    if sixth_line_tokens and all(token.isdecimal() for token in sixth_line_tokens):
        atom_counts = [int(token) for token in sixth_line_tokens]
    else:
        atom_counts = list(map(int, lines[6].split()))

    total_atoms = sum(atom_counts)
    # Without this check an oversized count would silently produce negative
    # indices, which address the wrong atoms.
    if not 0 <= total_adsorbate_atoms <= total_atoms:
        raise ValueError(
            f"total_adsorbate_atoms={total_adsorbate_atoms} is outside the valid "
            f"range 0..{total_atoms} for {filepath}"
        )

    adsorbate_start_index = total_atoms - total_adsorbate_atoms
    return list(range(adsorbate_start_index, total_atoms))  # 0-based indexing

# Example usage
# NOTE: despite its name, `directory` holds the full path to the CONTCAR file.
# It is used both to derive the molecule name (fourth path component from the
# end, here "Dimethylsulfane") and as the file that is opened and parsed.
directory = "/BACKUP/database/surface_adsorbates/IrO2/thiols_thials_thioketones_thioethers/Dimethylsulfane/metal/conf_1/CONTCAR"

name = get_molecule_name_from_dir(directory)

# get_pubchem_formula returns None when PubChem has no match for the name;
# fail with an explicit message instead of an AttributeError on `.values()`.
adsorbate_formula = get_pubchem_formula(name)
if adsorbate_formula is None:
    raise ValueError(f"No PubChem compound found for molecule name {name!r}")
total_adsorbate_atoms = sum(adsorbate_formula.values())

adsorbate_indices = get_adsorbate_indices_from_vasp(directory, total_adsorbate_atoms)

print(f"Adsorbate indices: {adsorbate_indices}")

Adsorbate indices: [120, 121, 122, 123, 124, 125, 126, 127, 128]


In [None]:
import sys

# Add src folder to the sys.path so that project modules can be imported.
# The path is relative, so it depends on the kernel's working directory.
src_path = "../src"
# Only insert once: re-running this cell would otherwise prepend a duplicate
# entry to sys.path on every execution.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from oxides_ml.graph_test import atoms_to_pyg




