# In [19]:
import os
import pubchempy as pcp
from collections import Counter
import re

def raw_file_names(vasp_directory):
    """Collect the path of every ``CONTCAR`` file below *vasp_directory*.

    The tree is traversed with ``os.walk``; every visited directory that
    directly contains a file named exactly ``CONTCAR`` contributes one path.

    Args:
        vasp_directory: Root directory to search recursively.

    Returns:
        A list of ``<directory>/CONTCAR`` paths in the order ``os.walk``
        visits the directories. Empty when nothing is found.
    """
    return [
        os.path.join(current_dir, "CONTCAR")
        for current_dir, _subdirs, filenames in os.walk(vasp_directory)
        if "CONTCAR" in filenames
    ]

def get_molecule_name_from_dir(directory):
    """Return the second-to-last ``os.sep``-separated component of a path.

    The molecule name is assumed to be that component, e.g. ``H2O`` in
    ``.../H2O/CONTCAR``.

    Args:
        directory: Path string whose components are separated by ``os.sep``.

    Returns:
        The path component immediately before the last one.

    Raises:
        IndexError: If the path has fewer than two components.
    """
    # Only the last two components matter, so split from the right.
    tail = directory.rsplit(os.sep, 2)
    return tail[-2]

def get_molecule_names_from_dirs(directories):
    return [get_molecule_name_from_dir(directory) for directory in directories]

def read_contcar(file_path):
    """Read the per-atom element list from a CONTCAR file.

    The sixth line of the file (index 5) is taken as whitespace-separated
    element symbols and the seventh line (index 6) as the matching integer
    atom counts.
    NOTE(review): this assumes the header carries a symbols line directly
    above the counts line -- confirm for the files in use.

    Args:
        file_path: Path of the CONTCAR file to read.

    Returns:
        A flat list in which each symbol is repeated ``count`` times, in file
        order. Symbols and counts are paired with ``zip``, so surplus entries
        on the longer line are ignored.

    Raises:
        IndexError: If the file has fewer than seven lines.
        ValueError: If a token on the counts line is not an integer.
    """
    with open(file_path, 'r') as handle:
        contents = list(handle)

    symbols = contents[5].split()
    counts = [int(token) for token in contents[6].split()]

    # Expand (symbol, n) pairs into n copies of each symbol.
    return [symbol for symbol, n in zip(symbols, counts) for _ in range(n)]

def generate_formula(atoms):
    """Tally how many times each element occurs in *atoms*.

    Args:
        atoms: Iterable of element symbols, one entry per atom.

    Returns:
        A ``Counter`` mapping each symbol to its number of occurrences.
    """
    formula = Counter()
    formula.update(atoms)
    return formula

def parse_pubchem_formula(formula):
    """Parse a molecular formula string into per-element atom counts.

    Every match of an element symbol (one uppercase letter followed by any
    lowercase letters) with an optional trailing integer is counted; a
    missing integer means one atom. Counts for a symbol that occurs more than
    once are summed, so ``"CH3COOH"`` yields C=2, H=4, O=2. Characters that
    are not part of such a match (for example charge signs) are ignored.

    Args:
        formula: Molecular formula string, e.g. ``"C2H6O"``.

    Returns:
        A ``Counter`` mapping each element symbol to its total atom count.
    """
    # Group 1 is the element symbol, group 2 its (possibly empty) count.
    pattern = r'([A-Z][a-z]*)(\d*)'

    atom_counts = Counter()
    for element, count in re.findall(pattern, formula):
        # Accumulate instead of assigning: a plain assignment would let a
        # later occurrence of the same symbol overwrite the earlier count.
        atom_counts[element] += int(count) if count else 1

    return atom_counts

def get_pubchem_formula(molecule_name):
    """Look up a molecule by name on PubChem and return its parsed formula.

    Args:
        molecule_name: Name passed to ``pcp.get_compounds`` with the
            ``'name'`` namespace.

    Returns:
        The result of ``parse_pubchem_formula`` applied to the
        ``molecular_formula`` of the first compound returned, or ``None``
        when the lookup returns no compounds.
    """
    matches = pcp.get_compounds(molecule_name, 'name')
    if not matches:
        return None

    # Only the first compound in the result list is considered.
    first_hit = matches[0]
    return parse_pubchem_formula(first_hit.molecular_formula)

def compare_formulas(contcar_dirs, molecule_names):
    """Find molecules whose CONTCAR composition differs from PubChem's.

    The two arguments are walked in lockstep with ``zip``. For each pair the
    element counts read from the CONTCAR file are compared with the formula
    returned by ``get_pubchem_formula`` for the molecule name.

    Args:
        contcar_dirs: Iterable of CONTCAR file paths.
        molecule_names: Iterable of molecule names, aligned with
            *contcar_dirs*.

    Returns:
        A list of the molecule names whose two formulas are not equal.
        Molecules with a falsy PubChem result (``None`` or an empty counter)
        are reported on stdout and left out of the list.
    """
    mismatches = []

    for contcar_path, molecule_name in zip(contcar_dirs, molecule_names):
        from_file = generate_formula(read_contcar(contcar_path))
        from_pubchem = get_pubchem_formula(molecule_name)

        # Nothing to compare against: report it and move on.
        if not from_pubchem:
            print(f"No match found for {molecule_name} in PubChem.")
            continue

        if from_file != from_pubchem:
            mismatches.append(molecule_name)

    return mismatches

# Driver: runs at module level with a hard-coded root directory.
# Steps: (1) find every CONTCAR file below the root, (2) derive a molecule
# name for each from its path, (3) compare each file's composition with the
# formula PubChem returns for that name.
vasp_directory = "/BACKUP/database/gas_phase"
contcar_dirs = raw_file_names(vasp_directory)
# One name per CONTCAR path, in the same order, so the two lists stay aligned.
molecule_names = get_molecule_names_from_dirs(contcar_dirs)

# Names whose CONTCAR element counts differ from the PubChem formula.
non_matching_molecules = compare_formulas(contcar_dirs, molecule_names)

# Print a header followed by one non-matching molecule name per line
# (nothing follows the header when every formula matched).
print("\nNon-matching molecules:")
for molecule in non_matching_molecules:
    print(molecule)



# Out: Non-matching molecules:
