# In [None]:
import os
import pubchempy as pcp
from collections import Counter
from pathlib import Path
import re

def get_molecule_names_from_dirs(directories):
    """
    Derive a molecule name from each path by taking its parent directory name.

    Args:
        directories: Iterable of path strings (or path-like objects).

    Returns:
        List of parent-directory names, one per input path, in input order.
    """
    names = []
    for entry in directories:
        # For ".../<molecule>/CONTCAR" the parent's name is "<molecule>".
        containing_folder = Path(entry).parent
        names.append(containing_folder.name)
    return names

def raw_file_names(vasp_directory):
    """
    Collect the path of every CONTCAR file found beneath a directory.

    Args:
        vasp_directory: Root directory that is walked recursively.

    Returns:
        List of paths, one per directory that contains a file named
        "CONTCAR", in the order os.walk visits the directories.
    """
    target = "CONTCAR"
    return [
        os.path.join(folder, target)
        for folder, _subfolders, filenames in os.walk(vasp_directory)
        if target in filenames
    ]

def get_pubchem_formula(molecule_name):
    """
    Fetch molecular formula from PubChem using a molecule name.

    Looks the name up through pubchempy and parses the molecular formula of
    the first compound returned.

    Args:
        molecule_name: Compound name to search for.

    Returns:
        The element counts produced by parse_pubchem_formula, or None when
        the lookup returns no compounds or the first compound carries no
        molecular formula.
    """
    compounds = pcp.get_compounds(molecule_name, 'name')
    if not compounds:
        return None

    pubchem_formula = compounds[0].molecular_formula
    if pubchem_formula is None:
        # A record without a formula cannot be parsed (the regex parser would
        # raise TypeError on None); report it the same way as a failed lookup.
        return None
    return parse_pubchem_formula(pubchem_formula)

def parse_pubchem_formula(formula):
    """
    Parse a chemical formula string into element counts.

    Each element token is an uppercase letter followed by optional lowercase
    letters and an optional integer count; a missing count means 1. Counts of
    an element that appears more than once (e.g. "CH3COOH") are summed rather
    than overwritten. Characters that fit neither part of a token, such as a
    trailing charge sign, are ignored.

    Args:
        formula: Formula string, e.g. "C2H6O".

    Returns:
        collections.Counter mapping each element symbol to its total count.
    """
    pattern = r'([A-Z][a-z]*)(\d*)'
    atom_counts = Counter()
    for element, count in re.findall(pattern, formula):
        # Accumulate so repeated symbols add up instead of the last one winning.
        atom_counts[element] += int(count) if count else 1
    return atom_counts

def get_adsorbate_indices_from_vasp(filepath, total_adsorbate_atoms):
    """
    Extract adsorbate atomic indices from a VASP POSCAR/CONTCAR file.

    The adsorbate is taken to be the last ``total_adsorbate_atoms`` atoms of
    the structure. The total number of atoms is the sum of the integers on
    the seventh line of the file (the per-species atom counts).

    Args:
        filepath: Path to the POSCAR/CONTCAR file.
        total_adsorbate_atoms: Number of trailing atoms that belong to the
            adsorbate.

    Returns:
        List of consecutive indices covering the trailing atoms (index 0 is
        the first atom in the file). The list is empty when
        ``total_adsorbate_atoms`` is not positive or is larger than the
        number of atoms in the file.

    Raises:
        IndexError: If the file has fewer than seven lines.
        ValueError: If the seventh line contains a non-integer token.
    """
    with open(filepath, 'r') as file:
        lines = file.readlines()
    atom_counts = list(map(int, lines[6].split()))
    total_atoms = sum(atom_counts)

    if total_adsorbate_atoms > total_atoms:
        # The structure cannot contain that many adsorbate atoms. Without this
        # guard the start index would go negative and the range would yield
        # meaningless (negative and surplus) indices.
        return []

    adsorbate_start_index = total_atoms - total_adsorbate_atoms
    return list(range(adsorbate_start_index, total_atoms))

def _read_vasp_atom_symbols(filepath):
    """Return one element symbol per atom, in file order, from a POSCAR/CONTCAR."""
    with open(filepath, 'r') as file:
        lines = file.readlines()
    species = lines[5].split()
    counts = [int(token) for token in lines[6].split()]
    if len(species) != len(counts):
        raise ValueError(
            f"{filepath}: {len(species)} species but {len(counts)} atom counts"
        )

    symbols = []
    for name, count in zip(species, counts):
        # Keep only the leading element symbol so labels carrying a suffix
        # (e.g. "C_s") still compare equal to plain formula symbols.
        # NOTE(review): assumes species labels start with the element symbol.
        match = re.match(r'[A-Z][a-z]?', name)
        symbols.extend([match.group(0) if match else name] * count)
    return symbols


def compare_formulas(contcar_dirs, molecule_names):
    """
    Compare VASP POSCAR/CONTCAR formulas with PubChem formulas.

    For every (file, name) pair the PubChem formula of ``name`` is fetched
    and compared, element by element, with the composition of the last N
    atoms of the file, where N is the number of atoms in the PubChem formula.
    Names for which PubChem returns nothing are skipped: they are neither
    reported as matching nor as non-matching.

    Args:
        contcar_dirs: Paths to the POSCAR/CONTCAR files.
        molecule_names: Molecule names, index-aligned with ``contcar_dirs``.

    Returns:
        List of molecule names whose file composition differs from the
        PubChem formula, including files with fewer atoms than the formula
        and formulas with no atoms.

    Raises:
        IndexError: If a file has fewer than seven lines.
        ValueError: If a file's atom-count line is malformed or does not
            line up with its species line.
    """
    non_matching_molecules = []

    for contcar_dir, molecule_name in zip(contcar_dirs, molecule_names):
        pubchem_formula = get_pubchem_formula(molecule_name)

        if pubchem_formula is None:
            # Nothing to compare against.
            continue

        total_adsorbate_atoms = sum(pubchem_formula.values())
        atom_symbols = _read_vasp_atom_symbols(contcar_dir)

        if 0 < total_adsorbate_atoms <= len(atom_symbols):
            # The molecule is taken to be the trailing atoms of the structure.
            vasp_formula = Counter(atom_symbols[-total_adsorbate_atoms:])
        else:
            # Empty formula, or more atoms than the file holds: cannot match.
            vasp_formula = None

        if vasp_formula == pubchem_formula:
            print(f"Match found for {molecule_name}")
        else:
            non_matching_molecules.append(molecule_name)

    return non_matching_molecules

# ============================================================================
# CONFIGURATION - Point this at the root of your gas-phase VASP data.
# Either edit the placeholder path below or set the environment variable
# GAS_PHASE_DATA_DIR, which takes precedence when it is defined.
# ============================================================================

vasp_directory = os.environ.get(
    "GAS_PHASE_DATA_DIR",
    "/path/to/database/gas_phase"
)

print(f"Gas phase data directory: {vasp_directory}")

# Run the comparison and collect non-matching molecules.
# Despite its name, contcar_dirs holds the full path of each CONTCAR file;
# molecule_names holds the name of the directory containing each of those
# files, so the two lists are index-aligned.
contcar_dirs = raw_file_names(vasp_directory)
molecule_names = get_molecule_names_from_dirs(contcar_dirs)

non_matching_molecules = compare_formulas(contcar_dirs, molecule_names)

# Print the list of non-matching molecules, one name per line
print("\nNon-matching molecules:")
for molecule in non_matching_molecules:
    print(molecule)