In [2]:

from mendeleev import element
import numpy as np
import re
import pandas as pd
import ast
from pymatgen.symmetry.groups import SpaceGroup

# Define the function to calculate descriptors
def calculate_descriptors(formula):
    # Parse the formula and count the elements and their stoichiometric ratios
    pattern = re.compile(r"([A-Z][a-z]*)(\d*)")
    elements = pattern.findall(formula)

    # Convert elements list into a dictionary with counts as integers
    element_counts = {}
    for symbol, count in elements:
        count = int(count) if count else 1
        element_counts[symbol] = count

    # Retrieve each element's atomic properties
    atomic_numbers = []
    electronegativities = []
    atomic_radii = []
    atomic_masses = []
    valence_electrons = []

    for symbol, count in element_counts.items():
        elem = element(symbol)
        atomic_numbers.extend([elem.atomic_number] * count)
        electronegativities.extend([elem.en_pauling] * count)
        atomic_radii.extend([elem.atomic_radius] * count)
        atomic_masses.extend([elem.atomic_weight] * count)
        valence_electrons.extend([elem.nvalence()] * count)

    # Calculate descriptors

    # Elemental Properties
    mean_electronegativity = np.mean(electronegativities)
    atomic_number_diff = max(atomic_numbers) - min(atomic_numbers)
    avg_atomic_radius = np.mean(atomic_radii)

    # Compositional Features
    avg_atomic_mass = np.mean(atomic_masses)

    # Statistical Measures of Elemental Properties
    std_electronegativity = np.std(electronegativities)
    min_atomic_radius = np.min(atomic_radii)
    max_atomic_radius = np.max(atomic_radii)

    # Heuristic Quantities
    electronegativity_diff = max(electronegativities) - min(electronegativities)
    mean_valence_electron_count = np.mean(valence_electrons)

    # Organize descriptors into a vector
    feature_vector = [
        mean_electronegativity,        # 1. Mean Electronegativity
        atomic_number_diff,            # 2. Atomic Number Difference
        avg_atomic_radius,             # 3. Average Atomic Radius
        avg_atomic_mass,               # 4. Average Atomic Mass
        std_electronegativity,         # 5. Standard Deviation of Electronegativity
        min_atomic_radius,             # 6. Minimum Atomic Radius
        max_atomic_radius,             # 7. Maximum Atomic Radius
        electronegativity_diff,        # 8. Electronegativity Difference
        mean_valence_electron_count    # 9. Mean Valence Electron Count
    ]

    return feature_vector



def get_properties(row):

    
    data = ast.literal_eval(row['structure'])
    
    lattice_data = data[0]['data']
    
    lat_a, lat_b, lat_c = lattice_data['a'], lattice_data['b'], lattice_data['c']
    
    volume = abs(np.dot(lat_a, np.cross(lat_b, lat_c))) * 1e30
    e_f = ast.literal_eval(row['formation_energy'])['value']
    
    
    sg = SpaceGroup(row['space_group']).int_number
    
    return (volume, len(lattice_data['atoms']), sg, e_f)

In [3]:
df1 = pd.read_csv('./Data/1_MatDX/MatDX_nomad_EF_Binary.csv')
df2 = pd.read_csv('./Data/1_MatDX/MatDX_nomad_EF_Ternary.csv')

# Concatenate the DataFrames
df = pd.concat([df1, df2], ignore_index=True)

In [5]:
import json
from tqdm import tqdm

target = {}
descriptors = {}

# Open the JSON file in append/update mode
output_file = 'descriptors.json'
try:
    # Load existing data if the file already exists
    with open(output_file, 'r') as file:
        data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    # If the file doesn't exist or is empty, initialize an empty dictionary
    data = {}

# Loop through the DataFrame and process each row
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    try:
        formula = row['formula']
        (_, _, _, e_f) = get_properties(row)
        target[formula] = e_f
        descriptor_values = calculate_descriptors(formula)
        descriptors[formula] = descriptor_values

        # Store the descriptor and e_f in the desired format
        data[formula] = [descriptor_values, e_f]
        
        # Save to JSON file after each entry (or periodically for large datasets)
        with open(output_file, 'w') as file:
            json.dump(data, file, indent=4)
            
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        pass

# Optionally, print the final number of descriptors processed
print("Total descriptors processed:", len(descriptors.keys()))


Processing Rows:   0%|          | 50/20000 [00:36<3:53:51,  1.42it/s]

Error processing row 49: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   1%|          | 127/20000 [01:30<4:07:22,  1.34it/s]

Error processing row 126: unsupported operand type(s) for +: 'NoneType' and 'float'


Processing Rows:   1%|          | 182/20000 [02:09<3:50:12,  1.43it/s]

Error processing row 181: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   2%|▏         | 378/20000 [04:28<3:41:26,  1.48it/s]

Error processing row 378: list indices must be integers or slices, not str
Error processing row 379: list indices must be integers or slices, not str
Error processing row 380: list indices must be integers or slices, not str
Error processing row 381: list indices must be integers or slices, not str
Error processing row 382: list indices must be integers or slices, not str
Error processing row 383: list indices must be integers or slices, not str
Error processing row 384: list indices must be integers or slices, not str
Error processing row 385: list indices must be integers or slices, not str
Error processing row 386: list indices must be integers or slices, not str
Error processing row 387: list indices must be integers or slices, not str
Error processing row 388: list indices must be integers or slices, not str
Error processing row 389: list indices must be integers or slices, not str
Error processing row 390: list indices must be integers or slices, not str
Error processing row 391:

Processing Rows:   2%|▏         | 440/20000 [04:29<16:17, 20.01it/s]  

Error processing row 437: unsupported operand type(s) for +: 'float' and 'NoneType'
Error processing row 438: list indices must be integers or slices, not str
Error processing row 439: list indices must be integers or slices, not str
Error processing row 440: list indices must be integers or slices, not str
Error processing row 441: list indices must be integers or slices, not str
Error processing row 442: list indices must be integers or slices, not str
Error processing row 443: list indices must be integers or slices, not str
Error processing row 444: list indices must be integers or slices, not str
Error processing row 445: list indices must be integers or slices, not str
Error processing row 446: list indices must be integers or slices, not str
Error processing row 447: list indices must be integers or slices, not str
Error processing row 448: list indices must be integers or slices, not str
Error processing row 449: list indices must be integers or slices, not str
Error processing

Processing Rows:   2%|▏         | 464/20000 [04:30<14:01, 23.20it/s]

Error processing row 465: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   2%|▏         | 467/20000 [04:32<28:16, 11.51it/s]

Error processing row 467: list indices must be integers or slices, not str
Error processing row 468: list indices must be integers or slices, not str
Error processing row 469: list indices must be integers or slices, not str
Error processing row 470: list indices must be integers or slices, not str
Error processing row 471: list indices must be integers or slices, not str


Processing Rows:   2%|▏         | 475/20000 [04:34<44:31,  7.31it/s]

Error processing row 475: list indices must be integers or slices, not str
Error processing row 476: list indices must be integers or slices, not str
Error processing row 477: list indices must be integers or slices, not str
Error processing row 478: list indices must be integers or slices, not str
Error processing row 479: list indices must be integers or slices, not str
Error processing row 480: list indices must be integers or slices, not str


Processing Rows:   2%|▏         | 482/20000 [04:35<41:00,  7.93it/s]

Error processing row 482: list indices must be integers or slices, not str
Error processing row 483: list indices must be integers or slices, not str
Error processing row 484: list indices must be integers or slices, not str
Error processing row 485: list indices must be integers or slices, not str
Error processing row 486: list indices must be integers or slices, not str
Error processing row 487: list indices must be integers or slices, not str
Error processing row 488: list indices must be integers or slices, not str
Error processing row 489: list indices must be integers or slices, not str


Processing Rows:   2%|▏         | 492/20000 [04:36<48:57,  6.64it/s]

Error processing row 492: list indices must be integers or slices, not str
Error processing row 493: list indices must be integers or slices, not str
Error processing row 494: list indices must be integers or slices, not str
Error processing row 495: list indices must be integers or slices, not str
Error processing row 496: list indices must be integers or slices, not str
Error processing row 497: list indices must be integers or slices, not str
Error processing row 498: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 505/20000 [04:41<2:18:05,  2.35it/s]

Error processing row 505: list indices must be integers or slices, not str
Error processing row 506: list indices must be integers or slices, not str
Error processing row 507: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 512/20000 [04:44<2:31:53,  2.14it/s]

Error processing row 512: list indices must be integers or slices, not str
Error processing row 513: list indices must be integers or slices, not str
Error processing row 514: list indices must be integers or slices, not str
Error processing row 515: list indices must be integers or slices, not str
Error processing row 516: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 522/20000 [04:47<2:36:20,  2.08it/s]

Error processing row 522: list indices must be integers or slices, not str
Error processing row 523: list indices must be integers or slices, not str
Error processing row 524: list indices must be integers or slices, not str
Error processing row 525: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 533/20000 [04:52<3:18:51,  1.63it/s]

Error processing row 533: list indices must be integers or slices, not str
Error processing row 534: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 539/20000 [04:55<3:08:30,  1.72it/s]

Error processing row 539: list indices must be integers or slices, not str
Error processing row 540: list indices must be integers or slices, not str
Error processing row 541: list indices must be integers or slices, not str
Error processing row 542: list indices must be integers or slices, not str
Error processing row 543: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 552/20000 [05:00<3:17:13,  1.64it/s]

Error processing row 552: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 559/20000 [05:05<3:29:27,  1.55it/s]

Error processing row 559: list indices must be integers or slices, not str
Error processing row 560: list indices must be integers or slices, not str
Error processing row 561: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 588/20000 [05:23<3:37:51,  1.49it/s]

Error processing row 588: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 597/20000 [05:28<3:49:21,  1.41it/s]

Error processing row 597: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 606/20000 [05:34<3:33:34,  1.51it/s]

Error processing row 606: list indices must be integers or slices, not str


Processing Rows:   3%|▎         | 657/20000 [06:10<3:45:54,  1.43it/s]

Error processing row 656: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   4%|▍         | 768/20000 [07:30<3:51:05,  1.39it/s]

Error processing row 767: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   4%|▍         | 788/20000 [07:43<3:41:52,  1.44it/s]

Error processing row 787: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   4%|▍         | 789/20000 [07:44<3:38:07,  1.47it/s]

Error processing row 788: unsupported operand type(s) for +: 'NoneType' and 'NoneType'


Processing Rows:   4%|▍         | 895/20000 [09:01<3:39:10,  1.45it/s]

Error processing row 895: list indices must be integers or slices, not str
Error processing row 896: list indices must be integers or slices, not str
Error processing row 897: list indices must be integers or slices, not str
Error processing row 898: list indices must be integers or slices, not str
Error processing row 899: list indices must be integers or slices, not str
Error processing row 900: list indices must be integers or slices, not str
Error processing row 901: list indices must be integers or slices, not str
Error processing row 902: list indices must be integers or slices, not str
Error processing row 903: list indices must be integers or slices, not str
Error processing row 904: list indices must be integers or slices, not str
Error processing row 905: list indices must be integers or slices, not str
Error processing row 906: list indices must be integers or slices, not str
Error processing row 907: list indices must be integers or slices, not str
Error processing row 908:

Processing Rows:   5%|▍         | 976/20000 [09:07<48:02,  6.60it/s]  

Error processing row 975: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   6%|▌         | 1145/20000 [11:03<3:26:28,  1.52it/s]

Error processing row 1144: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   6%|▋         | 1254/20000 [12:21<3:22:40,  1.54it/s]

Error processing row 1253: unsupported operand type(s) for +: 'NoneType' and 'float'


Processing Rows:   6%|▋         | 1265/20000 [12:28<3:36:47,  1.44it/s]

Error processing row 1264: unsupported operand type(s) for +: 'NoneType' and 'float'


Processing Rows:   7%|▋         | 1318/20000 [13:06<3:39:30,  1.42it/s]

Error processing row 1317: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   7%|▋         | 1352/20000 [13:31<3:49:11,  1.36it/s]

Error processing row 1351: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   7%|▋         | 1384/20000 [13:54<3:42:33,  1.39it/s]

Error processing row 1383: unsupported operand type(s) for +: 'NoneType' and 'NoneType'


Processing Rows:   7%|▋         | 1445/20000 [14:37<3:43:42,  1.38it/s]

Error processing row 1445: list indices must be integers or slices, not str


Processing Rows:   7%|▋         | 1469/20000 [14:54<3:48:57,  1.35it/s]

Error processing row 1468: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   7%|▋         | 1472/20000 [14:56<3:50:47,  1.34it/s]

Error processing row 1472: list indices must be integers or slices, not str


Processing Rows:   7%|▋         | 1490/20000 [15:09<3:34:58,  1.44it/s]

Error processing row 1490: list indices must be integers or slices, not str


Processing Rows:   8%|▊         | 1545/20000 [15:48<3:43:23,  1.38it/s]

Error processing row 1544: unsupported operand type(s) for +: 'float' and 'NoneType'


Processing Rows:   8%|▊         | 1698/20000 [17:41<3:57:11,  1.29it/s]

In [None]:
descriptors

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Assuming target_property and descriptors are defined
df_target = pd.Series(target_property, name='target')
df_descriptors = pd.DataFrame(descriptors).T

df = df_descriptors.join(df_target, how='inner')

# Split data into features (X) and target (y)
X = df.drop(columns='target')
y = df['target']

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# List of models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10),
    "Random Forest": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42),
    "Support Vector Regressor": SVR(kernel="linear"),
    "k-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5)
}

# Store results
results = {}

# Iterate over models
for model_name, model in models.items():
    # Optional: Use PCA for dimensionality reduction if needed
    pipeline = Pipeline([
        ('pca', PCA(n_components=5)),  # Adjust n_components based on explained variance
        ('regressor', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Calculate the R-squared score
    r2_test = r2_score(y_test, y_pred)
    r2_train = r2_score(y_train, pipeline.predict(X_train))
    
    # Store the results
    results[model_name] = r2
    print(f"{model_name} R-squared: {r2_test:.2f} (Test), {r2_train:.2f} (Train)")   

In [11]:
from mendeleev import element
import numpy as np
import re

# Function to parse compound formula (e.g., AxByCz)
def parse_formula(formula):
    pattern = re.compile(r"([A-Z][a-z]*)(\d*)")
    elements = pattern.findall(formula)
    return {el: int(num) if num else 1 for el, num in elements}

# Function to retrieve basic properties of each element
def get_element_properties(element_symbol):
    elem = element(element_symbol)
    return {
        'atomic_number': elem.atomic_number,
        'electronegativity': elem.en_pauling or 0.0,
        'atomic_radius': elem.atomic_radius or 0.0,
        'valence_electrons': elem.nvalence if elem.nvalence else 0
    }

# Descriptor function
def generate_descriptor(formula):
    composition = parse_formula(formula)
    elements = list(composition.keys())
    
    # Initialize dictionaries to store properties
    atomic_numbers = []
    electronegativities = []
    atomic_radii = []
    valence_electrons = []
    atomic_masses = []
    
    # Gather elemental properties
    for element_symbol, count in composition.items():
        props = get_element_properties(element_symbol)
        atomic_numbers.extend([props['atomic_number']] * count)
        electronegativities.extend([props['electronegativity']] * count)
        atomic_radii.extend([props['atomic_radius']] * count)
        valence_electrons.extend([props['valence_electrons']] * count)
        atomic_masses.extend([element(element_symbol).atomic_weight] * count)
    
    # Compositional features
    total_atoms = sum(composition.values())
    stoichiometric_ratios = [composition[el] / total_atoms for el in elements]
    average_atomic_mass = np.mean(atomic_masses)
    
    # Statistical measures for elemental properties
    stats = {
        'mean_atomic_number': np.mean(atomic_numbers),
        'std_atomic_number': np.std(atomic_numbers),
        'min_atomic_number': np.min(atomic_numbers),
        'max_atomic_number': np.max(atomic_numbers),
        'mean_electronegativity': np.mean(electronegativities),
        'std_electronegativity': np.std(electronegativities),
        'min_electronegativity': np.min(electronegativities),
        'max_electronegativity': np.max(electronegativities),
        'mean_atomic_radius': np.mean(atomic_radii),
        'std_atomic_radius': np.std(atomic_radii),
        'min_atomic_radius': np.min(atomic_radii),
        'max_atomic_radius': np.max(atomic_radii),
        'mean_valence_electrons': np.mean(valence_electrons),
        'std_valence_electrons': np.std(valence_electrons),
        'min_valence_electrons': np.min(valence_electrons),
        'max_valence_electrons': np.max(valence_electrons),
    }
    
    # Heuristic quantities
    # Differences in electronegativity between neighboring atoms
    electronegativity_diffs = []
    for i, el1 in enumerate(elements):
        for el2 in elements[i + 1:]:
            diff = abs(get_element_properties(el1)['electronegativity'] - get_element_properties(el2)['electronegativity'])
            electronegativity_diffs.append(diff)
    mean_electronegativity_diff = np.mean(electronegativity_diffs) if electronegativity_diffs else 0.0

    # Atomic packing fraction placeholder (assuming crystal structure details are unknown)
    # For a rough estimate, we'll use the approximate atomic volume based on atomic radii
    if atomic_radii:
        atomic_volumes = [(4 / 3) * np.pi * (r ** 3) for r in atomic_radii]
        packing_fraction = np.sum(atomic_volumes) / total_atoms  # A rough average
    else:
        packing_fraction = 0.0

    # Descriptor vector
    descriptor_vector = [
        *stoichiometric_ratios,               # Compositional stoichiometric ratios
        average_atomic_mass,                  # Average atomic mass
        mean_electronegativity_diff,          # Mean electronegativity difference
        packing_fraction,                     # Estimated atomic packing fraction
        *stats.values()                       # Statistical measures
    ]
    
    return descriptor_vector

# Example Usage
compound_formula = 'H2O3Ge2'
descriptor_vector = generate_descriptor(compound_formula)
print(f"Descriptor vector for {compound_formula}:")
print(descriptor_vector)


TypeError: unsupported operand type(s) for +: 'method' and 'method'