In [15]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf

# Load data
with open('cleaned_data_3102025.json', 'r') as f:
    data = json.load(f)

# Periodic table elements
periodic_table = {
    "H", "Li", "Be", "B", "C", "N", "O", "F", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "K", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Rb", "Sr", "Y",
    "Zr", "Nb", "Mo", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Cs", "Ba", "La", "Ce", "Pr",
    "Nd", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt",
    "Au", "Hg", "Tl", "Pb", "Bi"
}


# Preprocess data
filtered_data = []
for entry in data:
    composition = entry.get("composition", {})
    filtered_composition = {elem: 1 for elem in composition if elem in periodic_table}
    if filtered_composition:  # Only keep entries with valid compositions
        entry["composition"] = filtered_composition
        filtered_data.append(entry)

# Remove entries with no composition
data = [entry for entry in filtered_data if entry["composition"]]

# Create dataframe
features = []
elements = sorted(list(periodic_table))  # Keep element order consistent
for entry in data:
    row = {
        "title": entry["title"],
        "b0": round(np.nan_to_num(float(entry["b0"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "d1": round(np.nan_to_num(float(entry["d1"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "d2": round(np.nan_to_num(float(entry["d2"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "ei": round(np.nan_to_num(float(entry["ei"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "n": round(np.nan_to_num(float(entry.get("n", 0)), nan=0, posinf=1e9, neginf=-1e9), 4),  # New feature
        "k": round(np.nan_to_num(float(entry.get("k", 0)), nan=0, posinf=1e9, neginf=-1e9), 4)   # New feature
    }
    for elem in elements:
        row[elem] = entry["composition"].get(elem, 0)
    features.append(row)

df = pd.DataFrame(features)
X = df[["b0", "d1", "d2", "ei", "n", "k"]].astype(np.float64)
X = X.clip(lower=-1e10, upper=1e10)  # Clip extreme values
Y = df[elements]

kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    print(f"============================== Fold {fold} Results ==============================")
    print("Title | Actual | Predicted")

    correct = 0
    for i, idx in enumerate(test_idx):
        actual_elements = [elem for elem, present in zip(elements, Y_test.iloc[i]) if present]
        predicted_elements = [elem for elem, present in zip(elements, Y_pred[i]) if present]
        correct += actual_elements == predicted_elements
        print(f"{df.iloc[idx]['title']} | Actual: {', '.join(actual_elements)} | Predicted: {', '.join(predicted_elements)}")

    accuracy = correct / len(test_idx) * 100
    print(f"Accuracy for Fold {fold}: {accuracy:.2f}%\n")

    if fold == 5:  # Save tree to PDF for better visibility
        pdf = matplotlib.backends.backend_pdf.PdfPages("decision_tree.pdf")
        fig = plt.figure(figsize=(300, 300))  # Large size for better readability
        plot_tree(model, feature_names=X.columns, class_names=elements, filled=True, fontsize=12)
        pdf.savefig(fig, bbox_inches="tight")
        pdf.close()
        plt.close(fig)


Title | Actual | Predicted
paper-139 | Actual: Hf, Nb, Ta, Ti, V, Zr | Predicted: Hf, Nb, Ta, Ti, V, Zr
paper-133 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-128 | Actual: Co, Cr, Cu, Fe, Ni, Ti | Predicted: Co, Cr, Cu, Fe, Ni, Ti
paper-34 | Actual: Al, C, Co, Cr, Fe, Mo, Nb, Ni, Ti, W | Predicted: Al, C, Co, Cr, Fe, Mo, Nb, Ni, Ti, W
paper-144 | Actual: Hf, Mo, N, Ta, Ti, Zr | Predicted: Hf, Mo, Ta, Ti, Zr
paper-212 | Actual: C, Mo, Nb, Ta, W | Predicted: C, Mo, Nb, Ta, W
paper-226 | Actual: Al, Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-221 | Actual: Al, Co, Cr, Fe, V | Predicted: Al, Co, Cr, Ni, Ti
paper-222 | Actual: C, Co, Cr, Fe, Mn, Nb | Predicted: 
paper-223 | Actual: Al, Cr, Cu, Fe, Ni, Si, Ti | Predicted: Cr, Fe, Mn, Ni, V
paper-234 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-250 | Actual: Co, Cr, Cu, Fe, Ni | Predicted: Al, Cr, Fe, Ni
Accuracy for Fold 1: 50.00%

Title | Actual | Predicted
paper-150 | Actual: Co, Cr

In [3]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf

# Load data
with open('cleaned_data_3102025.json', 'r') as f:
    data = json.load(f)

# Periodic table elements
periodic_table = {"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr"}

# Preprocess data
filtered_data = []
for entry in data:
    composition = entry.get("composition", {})
    filtered_composition = {elem: 1 for elem in composition if elem in periodic_table}
    if filtered_composition:  # Only keep entries with valid compositions
        entry["composition"] = filtered_composition
        filtered_data.append(entry)

# Remove entries with no composition
data = [entry for entry in filtered_data if entry["composition"]]

# Create dataframe
features = []
elements = sorted(list(periodic_table))  # Keep element order consistent
for entry in data:
    row = {
        "title": entry["title"],
        "b0": round(np.nan_to_num(float(entry["b0"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "d1": round(np.nan_to_num(float(entry["d1"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "d2": round(np.nan_to_num(float(entry["d2"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "ei": round(np.nan_to_num(float(entry["ei"]), nan=0, posinf=1e9, neginf=-1e9), 4),
        "n": round(np.nan_to_num(float(entry.get("n", 0)), nan=0, posinf=1e9, neginf=-1e9), 4),  # New feature
        "k": round(np.nan_to_num(float(entry.get("k", 0)), nan=0, posinf=1e9, neginf=-1e9), 4),   # New feature
        "b0_d1_ratio": round(float(entry["b0"])/(float(entry["d1"])+1e-9), 4)  # Feature interaction
    }
    for elem in elements:
        row[elem] = entry["composition"].get(elem, 0)
    features.append(row)

df = pd.DataFrame(features)
X = df[["b0", "d1", "d2", "ei", "n", "k"]].astype(np.float64)
X = X.clip(lower=-1e10, upper=1e10)  # Clip extreme values
Y = df[elements]

# Feature scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(128, 128, 64), (256, 128, 64)],  # Deeper network
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.0005],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.001, 0.0005]  # Lower learning rates for stability
}

best_model = MLPClassifier(max_iter=500, random_state=42)
grid_search = GridSearchCV(best_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, Y)
best_params = grid_search.best_params_

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    model = MLPClassifier(**best_params, max_iter=500, random_state=42)
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    print(f"============================== Fold {fold} Results ==============================")
    print("Title | Actual | Predicted")

    correct = 0
    for i, idx in enumerate(test_idx):
        actual_elements = [elem for elem, present in zip(elements, Y_test.iloc[i]) if present]
        predicted_elements = [elem for elem, present in zip(elements, Y_pred[i]) if present]
        correct += actual_elements == predicted_elements
        print(f"{df.iloc[idx]['title']} | Actual: {', '.join(actual_elements)} | Predicted: {', '.join(predicted_elements)}")

    accuracy = correct / len(test_idx) * 100
    print(f"Accuracy for Fold {fold}: {accuracy:.2f}%\n")




Title | Actual | Predicted
paper-139 | Actual: Hf, Nb, Ta, Ti, V, Zr | Predicted: Cr, Ti
paper-133 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-128 | Actual: Co, Cr, Cu, Fe, Ni, Ti | Predicted: Co, Cr, Cu, Mn, Ni
paper-34 | Actual: Al, C, Co, Cr, Fe, Mo, Nb, Ni, Ti, W | Predicted: Co, Cr, Fe, Mn, Ni
paper-144 | Actual: Hf, Mo, N, Ta, Ti, Zr | Predicted: Cr, Fe, Ti
paper-212 | Actual: C, Mo, Nb, Ta, W | Predicted: C, Mo, Nb, Ta, W
paper-226 | Actual: Al, Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Ni
paper-221 | Actual: Al, Co, Cr, Fe, V | Predicted: Nb, Ta, Ti
paper-222 | Actual: C, Co, Cr, Fe, Mn, Nb | Predicted: Al, Co, Cr, Fe, Ni
paper-223 | Actual: Al, Cr, Cu, Fe, Ni, Si, Ti | Predicted: Al, Co, Cr, Fe, Ni
paper-234 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-250 | Actual: Co, Cr, Cu, Fe, Ni | Predicted: Co, Cr, Fe, Ni, Ti
Accuracy for Fold 1: 16.67%





Title | Actual | Predicted
paper-150 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-127 | Actual: Co, Cr, Cu, Fe, Ni, Ti | Predicted: Co, Cr, Fe, Mn, Ni
paper-6 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni, Ti
paper-131 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Ni
paper-140 | Actual: Cr, Mo, Nb, Ta, Ti, Zr | Predicted: Co, Cr, Fe, Ni, Ti
paper-206 | Actual: Al, Co, Cr, Ni, Ti | Predicted: Al, Co, Cr, Fe, Ni
paper-213 | Actual: Co, Cr, Fe, Mn | Predicted: Al, Ti, Zr
paper-228 | Actual: Al, Cr, Mo, Ti, V, Zr | Predicted: Al, Co, Cr, Fe, Ni, Ti
paper-218 | Actual: Al, Co, Cr, Cu, Fe, Ni | Predicted: Cr, Fe, Ni
paper-224 | Actual: Al, Nb, Ta, Ti, Zr | Predicted: Al, Co, Cr, Fe, Ti
paper-265 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Co, Cr, Fe, Mn, Ni
Accuracy for Fold 2: 0.00%





Title | Actual | Predicted
paper-131 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Ni
paper-2 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-26 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-51 | Actual: Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Ni
paper-110 | Actual: Al, Mg, Si | Predicted: Cr, Ta, Ti, Zr
paper-127 | Actual: Co, Cr, Cu, Fe, Ni, Ti | Predicted: Co, Cr, Fe, Ni
paper-143 | Actual: Hf, Nb, Zr | Predicted: Co, Cr, Cu, Fe, Ni
paper-146 | Actual: Hf, Nb, Ta, Ti, Zr | Predicted: Al, Co, Cr, Fe, Ni, Ti
paper-230 | Actual: Al, Co, Cr, Cu, Fe, Ni | Predicted: Co, Cr, Fe, Ni
paper-239 | Actual: Al, Cr, Cu, Fe, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-240 | Actual: Al, Cr, Mo, Nb, V | Predicted: Al, Co, Cr, Fe, Ni
Accuracy for Fold 3: 9.09%





Title | Actual | Predicted
paper-138 | Actual: Cr, Mo, Nb, Ta, Ti, Zr | Predicted: Cr, Fe, Ni
paper-4 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Co, Cr, Fe, Mn, Ni
paper-32 | Actual: Al, C, Co, Cr, Fe, Mo, Nb, Ni, Ti, W | Predicted: Al, Co, Cr, Fe, Ni
paper-129 | Actual: Nb, Ta, Ti, Zr | Predicted: Co, Cr, Fe, Ni
paper-137 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Nb, Ti, Zr
paper-138 | Actual: Cr, Mo, Nb, Ta, Ti, Zr | Predicted: Cr, Fe, Ni
paper-201 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Al, Mo, Nb, Ta
paper-204 | Actual: Al, Co, Cr, Ni, Ti | Predicted: Nb, Ti, Zr
paper-210 | Actual: C, Mo, Nb, Ta, W | Predicted: C, Mo, Nb, Ta, W
paper-227 | Actual: Al, Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-259 | Actual: Al, Co, Cr, Fe, Mn, Ni | Predicted: Al, Cr, Ni
Accuracy for Fold 4: 9.09%





Title | Actual | Predicted
paper-144 | Actual: Hf, Mo, Ta, Ti, Zr | Predicted: Cr, Fe, Ti
paper-22 | Actual: Co, Cr, Cu, Fe, Mn, Si | Predicted: Co, Cr, Fe, Mn, Ni
paper-27 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-31 | Actual: Co, Cr, Fe, Mn, Si | Predicted: Co, Cr, Fe, Mn, Ni
paper-128 | Actual: Co, Cr, Cu, Fe, Ni, Ti | Predicted: Co, Cr, Cu, Mn, Ni
paper-141 | Actual: Co, Cr, Fe, Mn, Ni, Ti | Predicted: Al, Co, Cr, Fe, Ni, Ti
paper-214 | Actual: Al, Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-225 | Actual: Al, Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-217 | Actual: C, Cr, Cu, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Mn, Ni
paper-238 | Actual: Al, Co, Cu, Mn, Ni | Predicted: Cr, Ni
paper-245 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Al, Co, Cr, Fe, Mn, Ni
Accuracy for Fold 5: 9.09%





Title | Actual | Predicted
paper-152 | Actual: Co, Cr, Fe, Mo, Ni | Predicted: Co, Cr, Fe, Ni
paper-141 | Actual: Co, Cr, Fe, Mn, Ni, Ti | Predicted: Al, Co, Cr, Fe, Ni, Ti
paper-137 | Actual: Al, Hf, Nb, Ta, Ti, Zr | Predicted: Cr, Fe, Ti
paper-129 | Actual: Nb, Ta, Ti, Zr | Predicted: Nb, Ta, Ti, Zr
paper-5 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-8 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-134 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Co, Cr, Fe, Ni
paper-151 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Co, Cr, Fe
paper-219 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Al, Co, Cr, Fe, Mn, Ni
paper-235 | Actual: Cr, Cu, Ni, Ti, Zr | Predicted: Al, Co, Cr, Mo, Ni, Ti
paper-256 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Al, Co, Cr, Fe, Mn, Ni
Accuracy for Fold 6: 27.27%





Title | Actual | Predicted
paper-145 | Actual: Al, Hf, Nb, Ta, Ti, Zr | Predicted: Cr, Fe
paper-152 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Co, Cr, Fe, Ni
paper-202 | Actual: Al, Co, Cr, Cu, Fe, Ni | Predicted: Al, Mo, Nb
paper-208 | Actual: Cr, Fe, Mn, Ni, V | Predicted: Co, Cr, Fe, Ni
paper-233 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Al, Co, Cr, Fe, Mn, Ni
paper-242 | Actual: Al, Co, Cr, Fe, Ni | Predicted: Al, Nb, Ti, Zr
paper-248 | Actual: Be, Hf, Ti, Zr | Predicted: Cr, Ni, Ti
paper-253 | Actual: Co, Cr, Cu, Mn, Ni | Predicted: Cr, Ni
paper-255 | Actual: Al, Nb, Ta, Ti, Zr | Predicted: Cr
paper-260 | Actual: Al, Co, Cr, Cu, Fe, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-262 | Actual: Co, Cr, Cu, Fe, Ni | Predicted: C, Mg, Mo, Nb, Ta, Ti, W, Zr
Accuracy for Fold 7: 0.00%





Title | Actual | Predicted
paper-109 | Actual: C, Mg | Predicted: Mo, Nb, Ta, Ti, Zr
paper-139 | Actual: Hf, Nb, Ta, Ti, V, Zr | Predicted: Al, Cr, Ti, Zr
paper-150 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-205 | Actual: Al, Co, Cr, Ni, Ti | Predicted: Al, Co, Cr, Fe, Mo, Ni, Ti
paper-203 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-209 | Actual: Cr, Fe, Mn, Ni, V | Predicted: Co, Cr, Fe, Ni
paper-211 | Actual: C, Mo, Nb, Ta, W | Predicted: C, Mo, Nb, Ta, W
paper-229 | Actual: Al, Nb, Ti, V, Zr | Predicted: Al, Cr, Ti, Zr
paper-246 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Al, Ti, Zr
paper-258 | Actual: Mo, Nb, Ta, Ti, W | Predicted: Cu, Ni, Ti, Zr
paper-263 | Actual: Mo, Nb, Ta, Ti, Zr | Predicted: Al, Co, Cr, Fe, Ni
Accuracy for Fold 8: 18.18%





Title | Actual | Predicted
paper-153 | Actual: Co, Cr, Fe, Mo, Ni | Predicted: Al, Cr, Ta, Ti, Zr
paper-151 | Actual: Co, Cr, Fe, Mo, Ni | Predicted: Al, Co, Cr, Fe, Ti
paper-25 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-7 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Co, Cr, Fe, Mn, Ni
paper-33 | Actual: Al, C, Co, Cr, Fe, Mo, Nb, Ni, Ti, W | Predicted: Al, Co, Cr, Fe, Mn, Ni
paper-133 | Actual: Co, Cr, Fe, Mn, Ni | Predicted: Co, Cr, Fe, Mn, Ni
paper-200 | Actual: Al, Co, Cr, Fe, Ni, Ti | Predicted: Al, Nb, Ta, Ti, V, Zr
paper-231 | Actual: Al, Co, Cr, Cu, Fe, Ni | Predicted: Al, Co, Cr, Fe, Ni
paper-237 | Actual: Nb, Ti, V, Zr | Predicted: Al, Co, Cr, Fe, Ni
paper-252 | Actual: In, Mo, Ti, W, Zr | Predicted: Al, Co, Cr, Fe, Ni
paper-254 | Actual: Co, Cr, Fe, Mn | Predicted: Al, Co, Cr, Fe, Ni
Accuracy for Fold 9: 18.18%

Title | Actual | Predicted
paper-1 | Actual: Co, Cr, Cu, Fe, Mn, Si | Predicted: Co, Cr, Fe, Mn, Ni
paper-23 | Actual: Co, Cr, Cu, Fe, Mn, S

