In [6]:
from mendeleev import element
import numpy as np
import re
import pandas as pd
import ast
from pymatgen.symmetry.groups import SpaceGroup

# Define the function to calculate descriptors
def calculate_descriptors(formula):
    # Parse the formula and count the elements and their stoichiometric ratios
    pattern = re.compile(r"([A-Z][a-z]*)(\d*)")
    elements = pattern.findall(formula)

    # Convert elements list into a dictionary with counts as integers
    element_counts = {}
    for symbol, count in elements:
        count = int(count) if count else 1
        element_counts[symbol] = count

    # Retrieve each element's atomic properties
    atomic_numbers = []
    electronegativities = []
    atomic_radii = []
    atomic_masses = []
    valence_electrons = []

    for symbol, count in element_counts.items():
        elem = element(symbol)
        atomic_numbers.extend([elem.atomic_number] * count)
        electronegativities.extend([elem.en_pauling] * count)
        atomic_radii.extend([elem.atomic_radius] * count)
        atomic_masses.extend([elem.atomic_weight] * count)
        valence_electrons.extend([elem.nvalence()] * count)

    # Calculate descriptors

    # Elemental Properties
    mean_electronegativity = np.mean(electronegativities)
    atomic_number_diff = max(atomic_numbers) - min(atomic_numbers)
    avg_atomic_radius = np.mean(atomic_radii)

    # Compositional Features
    avg_atomic_mass = np.mean(atomic_masses)

    # Statistical Measures of Elemental Properties
    std_electronegativity = np.std(electronegativities)
    min_atomic_radius = np.min(atomic_radii)
    max_atomic_radius = np.max(atomic_radii)

    # Heuristic Quantities
    electronegativity_diff = max(electronegativities) - min(electronegativities)
    mean_valence_electron_count = np.mean(valence_electrons)

    # Organize descriptors into a vector
    feature_vector = [
        mean_electronegativity,        # 1. Mean Electronegativity
        atomic_number_diff,            # 2. Atomic Number Difference
        avg_atomic_radius,             # 3. Average Atomic Radius
        avg_atomic_mass,               # 4. Average Atomic Mass
        std_electronegativity,         # 5. Standard Deviation of Electronegativity
        min_atomic_radius,             # 6. Minimum Atomic Radius
        max_atomic_radius,             # 7. Maximum Atomic Radius
        electronegativity_diff,        # 8. Electronegativity Difference
        mean_valence_electron_count    # 9. Mean Valence Electron Count
    ]

    return feature_vector



def get_properties(row):

    e_f = ast.literal_eval(row['formation_energy'])['value']
    
    
    return  e_f

In [7]:
df1 = pd.read_csv('./Data/1_MatDX/MatDX_nomad_EF_Binary.csv')
df2 = pd.read_csv('./Data/1_MatDX/MatDX_nomad_EF_Ternary.csv')

# Concatenate the DataFrames
df = pd.concat([df1, df2], ignore_index=True)

In [4]:
import json
from tqdm import tqdm

target = {}
descriptors = {}

# Open the JSON file in append/update mode
output_file = 'descriptors.json'
try:
    # Load existing data if the file already exists
    with open(output_file, 'r') as file:
        data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    # If the file doesn't exist or is empty, initialize an empty dictionary
    data = {}

# Loop through the DataFrame and process each row
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    try:
        formula = row['formula']
        e_f = get_properties(row)
        target[formula] = e_f
        descriptor_values = calculate_descriptors(formula)
        descriptors[formula] = descriptor_values

        # Store the descriptor and e_f in the desired format
        data[formula] = [descriptor_values, e_f]
        
        # Save to JSON file after each entry (or periodically for large datasets)
        with open(output_file, 'w') as file:
            json.dump(data, file, indent=4)
            
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        pass

# Optionally, print the final number of descriptors processed
print("Total descriptors processed:", len(descriptors.keys()))


Processing Rows:   0%|          | 2/20000 [00:05<14:21:35,  2.59s/it]


KeyboardInterrupt: 

In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Assuming target_property and descriptors are defined
df_target = pd.Series(target_property, name='target')
df_descriptors = pd.DataFrame(descriptors).T

df = df_descriptors.join(df_target, how='inner')

# Split data into features (X) and target (y)
X = df.drop(columns='target')
y = df['target']

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# List of models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10),
    "Random Forest": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42),
    "Support Vector Regressor": SVR(kernel="linear"),
    "k-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5)
}

# Store results
results = {}

# Iterate over models
for model_name, model in models.items():
    # Optional: Use PCA for dimensionality reduction if needed
    pipeline = Pipeline([
        ('pca', PCA(n_components=5)),  # Adjust n_components based on explained variance
        ('regressor', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Calculate the R-squared score
    r2_test = r2_score(y_test, y_pred)
    r2_train = r2_score(y_train, pipeline.predict(X_train))
    
    # Store the results
    results[model_name] = r2
    print(f"{model_name} R-squared: {r2_test:.2f} (Test), {r2_train:.2f} (Train)")   

NameError: name 'target_property' is not defined

In [11]:
from mendeleev import element
import numpy as np
import re

# Function to parse compound formula (e.g., AxByCz)
def parse_formula(formula):
    pattern = re.compile(r"([A-Z][a-z]*)(\d*)")
    elements = pattern.findall(formula)
    return {el: int(num) if num else 1 for el, num in elements}

# Function to retrieve basic properties of each element
def get_element_properties(element_symbol):
    elem = element(element_symbol)
    return {
        'atomic_number': elem.atomic_number,
        'electronegativity': elem.en_pauling or 0.0,
        'atomic_radius': elem.atomic_radius or 0.0,
        'valence_electrons': elem.nvalence if elem.nvalence else 0
    }

# Descriptor function
def generate_descriptor(formula):
    composition = parse_formula(formula)
    elements = list(composition.keys())
    
    # Initialize dictionaries to store properties
    atomic_numbers = []
    electronegativities = []
    atomic_radii = []
    valence_electrons = []
    atomic_masses = []
    
    # Gather elemental properties
    for element_symbol, count in composition.items():
        props = get_element_properties(element_symbol)
        atomic_numbers.extend([props['atomic_number']] * count)
        electronegativities.extend([props['electronegativity']] * count)
        atomic_radii.extend([props['atomic_radius']] * count)
        valence_electrons.extend([props['valence_electrons']] * count)
        atomic_masses.extend([element(element_symbol).atomic_weight] * count)
    
    # Compositional features
    total_atoms = sum(composition.values())
    stoichiometric_ratios = [composition[el] / total_atoms for el in elements]
    average_atomic_mass = np.mean(atomic_masses)
    
    # Statistical measures for elemental properties
    stats = {
        'mean_atomic_number': np.mean(atomic_numbers),
        'std_atomic_number': np.std(atomic_numbers),
        'min_atomic_number': np.min(atomic_numbers),
        'max_atomic_number': np.max(atomic_numbers),
        'mean_electronegativity': np.mean(electronegativities),
        'std_electronegativity': np.std(electronegativities),
        'min_electronegativity': np.min(electronegativities),
        'max_electronegativity': np.max(electronegativities),
        'mean_atomic_radius': np.mean(atomic_radii),
        'std_atomic_radius': np.std(atomic_radii),
        'min_atomic_radius': np.min(atomic_radii),
        'max_atomic_radius': np.max(atomic_radii),
        'mean_valence_electrons': np.mean(valence_electrons),
        'std_valence_electrons': np.std(valence_electrons),
        'min_valence_electrons': np.min(valence_electrons),
        'max_valence_electrons': np.max(valence_electrons),
    }
    
    # Heuristic quantities
    # Differences in electronegativity between neighboring atoms
    electronegativity_diffs = []
    for i, el1 in enumerate(elements):
        for el2 in elements[i + 1:]:
            diff = abs(get_element_properties(el1)['electronegativity'] - get_element_properties(el2)['electronegativity'])
            electronegativity_diffs.append(diff)
    mean_electronegativity_diff = np.mean(electronegativity_diffs) if electronegativity_diffs else 0.0

    # Atomic packing fraction placeholder (assuming crystal structure details are unknown)
    # For a rough estimate, we'll use the approximate atomic volume based on atomic radii
    if atomic_radii:
        atomic_volumes = [(4 / 3) * np.pi * (r ** 3) for r in atomic_radii]
        packing_fraction = np.sum(atomic_volumes) / total_atoms  # A rough average
    else:
        packing_fraction = 0.0

    # Descriptor vector
    descriptor_vector = [
        *stoichiometric_ratios,               # Compositional stoichiometric ratios
        average_atomic_mass,                  # Average atomic mass
        mean_electronegativity_diff,          # Mean electronegativity difference
        packing_fraction,                     # Estimated atomic packing fraction
        *stats.values()                       # Statistical measures
    ]
    
    return descriptor_vector

# Example Usage
compound_formula = 'H2O3Ge2'
descriptor_vector = generate_descriptor(compound_formula)
print(f"Descriptor vector for {compound_formula}:")
print(descriptor_vector)


TypeError: unsupported operand type(s) for +: 'method' and 'method'