In [1]:
from mendeleev import element
import numpy as np
import re
import pandas as pd
import ast

# Define the function to calculate weighted average descriptors
def calculate_descriptors(formula):
    # Parse the formula and count the elements and their stoichiometric ratios
    pattern = re.compile(r"([A-Z][a-z]*)(\d*)")
    elements = pattern.findall(formula)

    # Convert elements list into a dictionary with counts as integers
    element_counts = {}
    total_atoms = 0
    for symbol, count in elements:
        count = int(count) if count else 1
        element_counts[symbol] = count
        total_atoms += count  # Track total number of atoms in the formula

    # Retrieve each element's atomic properties
    atomic_numbers = []
    electronegativities = []
    atomic_radii = []
    atomic_masses = []
    valence_electrons = []

    # Collect weighted properties
    for symbol, count in element_counts.items():
        elem = element(symbol)
        fraction = count / total_atoms  # Fractional contribution of the element

        atomic_numbers.append(elem.atomic_number * fraction)
        electronegativities.append(elem.en_pauling * fraction if elem.en_pauling is not None else 0)
        atomic_radii.append(elem.atomic_radius * fraction if elem.atomic_radius is not None else 0)
        atomic_masses.append(elem.atomic_weight * fraction)
        valence_electrons.append(elem.nvalence() * fraction)

    # Calculate weighted average descriptors

    # Elemental Properties
    weighted_mean_electronegativity = np.sum(electronegativities)
    atomic_number_diff = max(element_counts.values()) - min(element_counts.values())
    weighted_avg_atomic_radius = np.sum(atomic_radii)
    weighted_avg_atomic_mass = np.sum(atomic_masses)
    weighted_mean_valence_electron_count = np.sum(valence_electrons)

    # Retrieve element-specific min and max properties
    min_atomic_radius = np.min([r for r in atomic_radii if r != 0])
    max_atomic_radius = np.max(atomic_radii)

    # Calculate electronegativity difference
    actual_electronegativities = [element(symbol).en_pauling for symbol in element_counts if element(symbol).en_pauling is not None]
    electronegativity_diff = max(actual_electronegativities) - min(actual_electronegativities) if actual_electronegativities else 0

    # Standard deviation of electronegativities (for non-zero values)
    std_electronegativity = np.std([en for en in electronegativities if en != 0])

    # Organize descriptors into a vector
    feature_vector = [
        weighted_mean_electronegativity,     # 1. Weighted Mean Electronegativity
        atomic_number_diff,                  # 2. Atomic Number Difference
        weighted_avg_atomic_radius,          # 3. Weighted Average Atomic Radius
        weighted_avg_atomic_mass,            # 4. Weighted Average Atomic Mass
        std_electronegativity,               # 5. Standard Deviation of Electronegativity
        min_atomic_radius,                   # 6. Minimum Atomic Radius
        max_atomic_radius,                   # 7. Maximum Atomic Radius
        electronegativity_diff,              # 8. Electronegativity Difference
        weighted_mean_valence_electron_count # 9. Weighted Mean Valence Electron Count
    ]

    return feature_vector

def get_properties(row):

    e_f = ast.literal_eval(row['formation_energy'])['value']
    
    
    return  e_f

In [2]:
df1 = pd.read_csv('./Data/1_MatDX/MatDX_nomad_EF_Binary.csv')
df2 = pd.read_csv('./Data/1_MatDX/MatDX_nomad_EF_Ternary.csv')

# Concatenate the DataFrames
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
import json
from tqdm import tqdm

target = {}
descriptors = {}

# Open the JSON file in append/update mode
output_file = 'descriptors_WA.json'
try:
    # Load existing data if the file already exists
    with open(output_file, 'r') as file:
        data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    # If the file doesn't exist or is empty, initialize an empty dictionary
    data = {}

# Loop through the DataFrame and process each row
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    try:
        formula = row['formula']
        e_f = get_properties(row)
        target[formula] = e_f
        descriptor_values = calculate_descriptors(formula)
        descriptors[formula] = descriptor_values

        # Store the descriptor and e_f in the desired format
        data[formula] = [descriptor_values, e_f]
        
        # Save to JSON file after each entry (or periodically for large datasets)
        with open(output_file, 'w') as file:
            json.dump(data, file, indent=4)
            
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        pass

# Optionally, print the final number of descriptors processed
print("Total descriptors processed:", len(descriptors.keys()))


Processing Rows:  68%|██████▊   | 13524/20000 [10:34:09<5:51:19,  3.26s/it]   