In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load the JSON data
with open('allData_result.json', 'r') as file:
    data = json.load(file)

# Filter out entries without composition
filtered_data = [entry for entry in data if entry['composition'] is not None]

# Prepare features (X) and target (y)
X = []
y = []

for entry in filtered_data:
    features = [entry['b0'], entry['d1'], entry['d2'], entry['ei'], entry['ee']]
    X.append(features)

    composition = entry['composition']
    composition_values = [composition.get(element, 0) for element in set().union(*[d['composition'].keys() for d in filtered_data])]
    y.append(composition_values)

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the MLP model
model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate the Mean Squared Error and R-squared score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Calculate element-wise accuracy
element_accuracy = 1 - np.mean(np.abs(y_test - y_pred) / np.maximum(y_test, 1e-8), axis=0)
print("Element-wise Accuracy:")
for i, acc in enumerate(element_accuracy):
    print(f"Element {i+1}: {acc:.4f}")

# Overall accuracy
overall_accuracy = np.mean(element_accuracy)
print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

Mean Squared Error: 1544.9264367906496
R-squared Score: -38.302451338327494
Element-wise Accuracy:
Element 1: -653875100.0376
Element 2: -204953085.5696
Element 3: -3.0223
Element 4: -339382809.2256
Element 5: -726921699.4998
Element 6: -108743286.0658
Element 7: -32949485.0103
Element 8: -95568410.4502
Element 9: -26481368.6371
Element 10: -42210140.1532
Element 11: -58644731.5821
Element 12: -655191758.0805
Element 13: -59859077.7481
Element 14: -1101310697.2299
Element 15: -82158729.9511
Element 16: -44250917.8262
Element 17: -26584357.4424
Element 18: -39071471.4258
Element 19: -10.0597
Element 20: -51906004.6724
Element 21: -100999779.0691
Element 22: -191273196.8207
Element 23: -227212481.4841
Element 24: -73972850.0743
Element 25: -165917925.4072
Element 26: -730237045.8909
Element 27: -37546986.5875
Element 28: -740302272.0503
Element 29: -5.9010
Element 30: -45155904.1585
Element 31: -72506796.8854

Overall Accuracy: -217264141.5490




In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the JSON data
with open('allData_result.json', 'r') as file:
    data = json.load(file)

# Filter out entries without composition
filtered_data = [entry for entry in data if entry['composition'] is not None]

# Prepare features (X) and target (y)
X = []
y = []

for entry in filtered_data:
    features = [entry['b0'], entry['d1'], entry['d2'], entry['ei'], entry['ee']]
    X.append(features)

    composition = entry['composition']
    composition_values = [composition.get(element, 0) for element in set().union(*[d['composition'].keys() for d in filtered_data])]
    y.append(composition_values)

X = np.array(X)
y = np.array(y)

# Handle infinite and NaN values
X = np.nan_to_num(X, nan=0, posinf=0, neginf=0)
y = np.nan_to_num(y, nan=0, posinf=0, neginf=0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features and target
feature_scaler = RobustScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

target_scaler = RobustScaler()
y_train_scaled = target_scaler.fit_transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

# Create and train the MLP model
model = MLPRegressor(hidden_layer_sizes=(50, 25), max_iter=5000, alpha=0.01, random_state=42)
model.fit(X_train_scaled, y_train_scaled)

# Make predictions on the test set
y_pred_scaled = model.predict(X_test_scaled)
y_pred = target_scaler.inverse_transform(y_pred_scaled)

# Calculate the Mean Squared Error and R-squared score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Calculate element-wise accuracy
element_accuracy = 1 - np.mean(np.abs(y_test - y_pred) / (np.abs(y_test) + 1e-8), axis=0)
print("Element-wise Accuracy:")
for i, acc in enumerate(element_accuracy):
    print(f"Element {i+1}: {acc:.4f}")

# Overall accuracy
overall_accuracy = np.mean(element_accuracy)
print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

Mean Squared Error: 75.89587895061625
R-squared Score: -0.9624944133629785
Element-wise Accuracy:
Element 1: -456535956.7056
Element 2: -13273518.7344
Element 3: -2.8963
Element 4: -60282868.3702
Element 5: -1596081400.6737
Element 6: -21074369.1221
Element 7: -15910801.3077
Element 8: -18847465.1441
Element 9: -20359342.4998
Element 10: -21440968.9835
Element 11: -26941788.2145
Element 12: -432428861.2596
Element 13: -24007442.5234
Element 14: -404117454.5787
Element 15: -19411037.2243
Element 16: -9319499.2791
Element 17: -17817934.9171
Element 18: -9711728.9414
Element 19: -4.9176
Element 20: -20381881.2515
Element 21: -31087165.7582
Element 22: -169553117.0720
Element 23: -77903927.5932
Element 24: -14182117.8009
Element 25: -65547592.6010
Element 26: -700808068.0795
Element 27: -10614725.1815
Element 28: -713206711.4518
Element 29: -4.4077
Element 30: -26709311.7612
Element 31: -17464425.3253

Overall Accuracy: -161774886.9218


In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the JSON data
with open('allData_result.json', 'r') as file:
    data = json.load(file)

# Filter out entries without composition
filtered_data = [entry for entry in data if entry['composition'] is not None]

# Prepare features (X) and target (y)
X = []
y = []
composition_keys = set()

for entry in filtered_data:
    features = [entry['b0'], entry['d1'], entry['d2'], entry['ei'], entry['ee']]
    X.append(features)

    composition = entry['composition']
    composition_keys.update(composition.keys())
    y.append(composition)

# Convert to DataFrame for easier analysis
df = pd.DataFrame(X, columns=['b0', 'd1', 'd2', 'ei', 'ee'])
for key in composition_keys:
    df[key] = [entry['composition'].get(key, 0) for entry in filtered_data]

# Data exploration
print("Feature statistics:")
print(df[['b0', 'd1', 'd2', 'ei', 'ee']].describe())

print("\nComposition statistics:")
print(df[list(composition_keys)].describe())

# Correlation analysis
correlation = df.corr()
plt.figure(figsize=(12, 10))
plt.imshow(correlation, cmap='coolwarm', aspect='auto')
plt.colorbar()
plt.xticks(range(len(correlation.columns)), correlation.columns, rotation=90)
plt.yticks(range(len(correlation.columns)), correlation.columns)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.savefig("correlation_heatmap.png")
plt.close()

# Prepare data for modeling
X = df[['b0', 'd1', 'd2', 'ei', 'ee']].values
y = df[list(composition_keys)].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nMean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Calculate element-wise accuracy
element_accuracy = 1 - np.mean(np.abs(y_test - y_pred) / (np.abs(y_test) + 1e-8), axis=0)
print("\nElement-wise Accuracy:")
for i, acc in enumerate(element_accuracy):
    print(f"{list(composition_keys)[i]}: {acc:.4f}")

# Overall accuracy
overall_accuracy = np.mean(element_accuracy)
print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

# Feature importance
feature_importance = model.feature_importances_
for i, importance in enumerate(['b0', 'd1', 'd2', 'ei', 'ee']):
    print(f"{importance}: {feature_importance[i]:.4f}")

plt.figure(figsize=(10, 6))
plt.bar(['b0', 'd1', 'd2', 'ei', 'ee'], feature_importance)
plt.title("Feature Importance")
plt.tight_layout()
plt.savefig("feature_importance.png")
plt.close()

Feature statistics:
                d1         d2         ei            ee
count    23.000000  23.000000  23.000000  2.300000e+01
mean   1611.932293   0.252174   0.150000  1.000000e-03
std    1630.615123   0.101666   0.058387  6.651416e-19
min     190.979233   0.200000   0.100000  1.000000e-03
25%     626.734771   0.200000   0.100000  1.000000e-03
50%     839.622557   0.200000   0.150000  1.000000e-03
75%    2506.137990   0.200000   0.200000  1.000000e-03
max    5879.000000   0.450000   0.250000  1.000000e-03

Composition statistics:
              Nb         Cu         Cr         Si         Ti          B  \
count  23.000000  23.000000  23.000000  23.000000  23.000000  23.000000   
mean    1.460870   0.195652   7.443478   0.869565   4.669565   0.086957   
std     4.291825   0.516525   7.978541   1.937767  10.551624   0.288104   
min     0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
25%     0.000000   0.000000   1.000000   0.000000   0.000000   0.000000   
50%     0.

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the JSON data
with open('allData_result.json', 'r') as file:
    data = json.load(file)

# Filter out entries without composition
filtered_data = [entry for entry in data if entry['composition'] is not None]

# Prepare features (X) and target (y)
X = []
y = []
composition_keys = set()

for entry in filtered_data:
    features = [entry['b0'], entry['d1'], entry['d2'], entry['ei'], entry['ee']]
    X.append(features)

    composition = entry['composition']
    composition_keys.update(composition.keys())
    y.append(composition)

# Convert to DataFrame
df = pd.DataFrame(X, columns=['b0', 'd1', 'd2', 'ei', 'ee'])
for key in composition_keys:
    df[key] = [1 if entry['composition'].get(key, 0) > 0 else 0 for entry in filtered_data]

# Select top 10 most common elements
top_elements = df[list(composition_keys)].sum().sort_values(ascending=False).head(10).index.tolist()

# Prepare data for modeling
X = df[['b0', 'd1', 'd2', 'ei']].values  # Removing 'ee' as it's constant
y = df[top_elements].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Multi-output Random Forest Classifier
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test.flatten(), y_pred.flatten())
print(f"\nOverall Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=top_elements))

# Feature importance
feature_importance = np.mean([estimator.feature_importances_ for estimator in model.estimators_], axis=0)
for i, importance in enumerate(['b0', 'd1', 'd2', 'ei']):
    print(f"{importance}: {feature_importance[i]:.4f}")

plt.figure(figsize=(10, 6))
plt.bar(['b0', 'd1', 'd2', 'ei'], feature_importance)
plt.title("Feature Importance")
plt.tight_layout()
plt.savefig("feature_importance_classification.png")
plt.close()


Overall Accuracy: 0.7800

Classification Report:
              precision    recall  f1-score   support

          Co       1.00      0.80      0.89         5
          Cr       1.00      0.80      0.89         5
          Fe       1.00      0.80      0.89         5
          Ni       1.00      0.80      0.89         5
          Mn       0.50      0.67      0.57         3
           C       0.00      0.00      0.00         0
          Nb       0.00      0.00      0.00         0
          Ti       0.00      0.00      0.00         0
          Al       0.00      0.00      0.00         2
          Si       0.00      0.00      0.00         0

   micro avg       0.82      0.72      0.77        25
   macro avg       0.45      0.39      0.41        25
weighted avg       0.86      0.72      0.78        25
 samples avg       0.72      0.72      0.72        25



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


b0: 0.1107
d1: 0.5951
d2: 0.0884
ei: 0.2058


# Train the model

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the JSON data
with open('allData_result.json', 'r') as file:
    data = json.load(file)

# Filter out entries without composition
filtered_data = [entry for entry in data if entry['composition'] is not None]

# Prepare features (X) and target (y)
X = []
y = []
composition_keys = set()

for entry in filtered_data:
    features = [entry['b0'], entry['d1'], entry['d2'], entry['ei'], entry['ee']]
    X.append(features)

    composition = entry['composition']
    composition_keys.update(composition.keys())
    y.append(composition)

# Convert to DataFrame
df = pd.DataFrame(X, columns=['b0', 'd1', 'd2', 'ei', 'ee'])
for key in composition_keys:
    df[key] = [1 if entry['composition'].get(key, 0) > 0 else 0 for entry in filtered_data]

# Select top 10 most common elements
top_elements = df[list(composition_keys)].sum().sort_values(ascending=False).head(10).index.tolist()

# Prepare data for modeling
X = df[['b0', 'd1', 'd2', 'ei']].values  # Removing 'ee' as it's constant
y = df[top_elements].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Multi-output Random Forest Classifier
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test.flatten(), y_pred.flatten())
print(f"\nOverall Accuracy: {accuracy:.4f}")

# Handle undefined metrics by setting zero_division to 0
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=top_elements, zero_division=0))

# Analyze data distribution in training and test sets
train_counts = np.sum(y_train, axis=0)
test_counts = np.sum(y_test, axis=0)
element_distribution = pd.DataFrame({'Element': top_elements, 'Train Count': train_counts, 'Test Count': test_counts})
print("\nElement Distribution:")
print(element_distribution)

# Feature importance
feature_importance = np.mean([estimator.feature_importances_ for estimator in model.estimators_], axis=0)
for i, importance in enumerate(['b0', 'd1', 'd2', 'ei']):
    print(f"{importance}: {feature_importance[i]:.4f}")

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.bar(['b0', 'd1', 'd2', 'ei'], feature_importance)
plt.title("Feature Importance")
plt.tight_layout()
plt.savefig("feature_importance_classification.png")
plt.close()



Overall Accuracy: 0.7800

Classification Report:
              precision    recall  f1-score   support

          Co       1.00      0.80      0.89         5
          Cr       1.00      0.80      0.89         5
          Fe       1.00      0.80      0.89         5
          Ni       1.00      0.80      0.89         5
          Mn       0.50      0.67      0.57         3
           C       0.00      0.00      0.00         0
          Nb       0.00      0.00      0.00         0
          Ti       0.00      0.00      0.00         0
          Al       0.00      0.00      0.00         2
          Si       0.00      0.00      0.00         0

   micro avg       0.82      0.72      0.77        25
   macro avg       0.45      0.39      0.41        25
weighted avg       0.86      0.72      0.78        25
 samples avg       0.72      0.72      0.72        25


Element Distribution:
  Element  Train Count  Test Count
0      Co           14           5
1      Cr           14           5
2      Fe

# Guess the element

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the JSON data
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Prepare features (X) and target (y)
def prepare_data(data):
    X = []
    y = []
    composition_keys = set()
    for entry in data:
        if entry['composition'] is not None:
            features = [entry['b0'], entry['d1'], entry['d2'], entry['ei']]  # Removed 'ee' as it's constant
            X.append(features)
            composition = entry['composition']
            composition_keys.update(composition.keys())
            y.append(composition)
    return X, y, list(composition_keys)

# Convert data to DataFrame
def create_dataframe(X, y, composition_keys, feature_names):
    df = pd.DataFrame(X, columns=feature_names)
    for key in composition_keys:
        df[key] = [1 if entry.get(key, 0) > 0 else 0 for entry in y]
    return df

# Select top N most common elements
def select_top_elements(df, composition_keys, n=10):
    return df[list(composition_keys)].sum().sort_values(ascending=False).head(n).index.tolist()

# Train the model
def train_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
    model.fit(X_train_scaled, y_train)
    return model, scaler, X_test_scaled, y_test

# Evaluate the model
def evaluate_model(model, X_test_scaled, y_test, top_elements):
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test.flatten(), y_pred.flatten())
    print(f"\nOverall Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=top_elements, zero_division=0))

# Predict compositions for objects without them
def predict_compositions(model, scaler, data, feature_names, top_elements):
    for entry in data:
        if entry['composition'] is None:
            features = [[entry[feature] for feature in feature_names]]
            features_scaled = scaler.transform(features)
            predicted_composition = model.predict(features_scaled)[0]
            entry['composition'] = {element: int(pred) for element, pred in zip(top_elements, predicted_composition)}
    return data

# Save the updated data to a new JSON file
def save_data(data, output_file):
    with open(output_file, 'w') as file:
        json.dump(data, file, indent=2)

# Main function
def main():
    input_file = 'allData_result.json'
    output_file = 'allData_result_with_predictions.json'
    feature_names = ['b0', 'd1', 'd2', 'ei']

    data = load_data(input_file)
    X, y, composition_keys = prepare_data(data)
    df = create_dataframe(X, y, composition_keys, feature_names)
    top_elements = select_top_elements(df, composition_keys)

    X = df[feature_names].values
    y = df[top_elements].values

    model, scaler, X_test_scaled, y_test = train_model(X, y)
    evaluate_model(model, X_test_scaled, y_test, top_elements)

    updated_data = predict_compositions(model, scaler, data, feature_names, top_elements)
    save_data(updated_data, output_file)

    print(f"\nProcessing complete. Updated data saved to {output_file}")

    # Feature importance
    feature_importance = np.mean([estimator.feature_importances_ for estimator in model.estimators_], axis=0)
    for i, importance in enumerate(feature_names):
        print(f"{importance}: {feature_importance[i]:.4f}")

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.bar(feature_names, feature_importance)
    plt.title("Feature Importance")
    plt.tight_layout()
    plt.savefig("feature_importance_classification.png")
    plt.close()

if __name__ == "__main__":
    main()


Overall Accuracy: 0.7800

Classification Report:
              precision    recall  f1-score   support

          Co       1.00      0.80      0.89         5
          Cr       1.00      0.80      0.89         5
          Fe       1.00      0.80      0.89         5
          Ni       1.00      0.80      0.89         5
          Mn       0.50      0.67      0.57         3
           C       0.00      0.00      0.00         0
          Nb       0.00      0.00      0.00         0
          Ti       0.00      0.00      0.00         0
          Al       0.00      0.00      0.00         2
          Si       0.00      0.00      0.00         0

   micro avg       0.82      0.72      0.77        25
   macro avg       0.45      0.39      0.41        25
weighted avg       0.86      0.72      0.78        25
 samples avg       0.72      0.72      0.72        25


Processing complete. Updated data saved to allData_result_with_predictions.json
b0: 0.1107
d1: 0.5951
d2: 0.0884
ei: 0.2058


# Periodic table elements.

In [5]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# List of elements in the periodic table
PERIODIC_TABLE_ELEMENTS = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
    'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn',
    'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
    'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og'
]

# Load the JSON data
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Prepare features (X) and target (y)
def prepare_data(data):
    X = []
    y = []
    composition_keys = set()
    for entry in data:
        if entry['composition'] is not None:
            features = [entry['b0'], entry['d1'], entry['d2'], entry['ei']]  # Removed 'ee' as it's constant
            X.append(features)
            composition = {k: v for k, v in entry['composition'].items() if k in PERIODIC_TABLE_ELEMENTS}
            composition_keys.update(composition.keys())
            y.append(composition)
    return X, y, list(composition_keys)

# Convert data to DataFrame
def create_dataframe(X, y, composition_keys, feature_names):
    df = pd.DataFrame(X, columns=feature_names)
    for key in composition_keys:
        df[key] = [1 if entry.get(key, 0) > 0 else 0 for entry in y]
    return df

# Select top N most common elements
def select_top_elements(df, composition_keys, n=15):
    return df[list(composition_keys)].sum().sort_values(ascending=False).head(n).index.tolist()

# Train the model
def train_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
    model.fit(X_train_scaled, y_train)
    return model, scaler, X_test_scaled, y_test

# Evaluate the model
def evaluate_model(model, X_test_scaled, y_test, top_elements):
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test.flatten(), y_pred.flatten())
    print(f"\nOverall Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=top_elements, zero_division=0))

# Predict compositions for objects without them
def predict_compositions(model, scaler, data, feature_names, top_elements):
    for entry in data:
        if entry['composition'] is None:
            features = [[entry[feature] for feature in feature_names]]
            features_scaled = scaler.transform(features)
            predicted_composition = model.predict(features_scaled)[0]
            entry['composition'] = {
                element: int(pred) for element, pred in zip(top_elements, predicted_composition)
                if element in PERIODIC_TABLE_ELEMENTS and pred > 0
            }
    return data

# Save the updated data to a new JSON file
def save_data(data, output_file):
    with open(output_file, 'w') as file:
        json.dump(data, file, indent=2)

# Main function
def main():
    input_file = 'allData_result.json'
    output_file = 'allData_result_with_predictions.json'
    feature_names = ['b0', 'd1', 'd2', 'ei']

    data = load_data(input_file)
    X, y, composition_keys = prepare_data(data)
    df = create_dataframe(X, y, composition_keys, feature_names)
    top_elements = select_top_elements(df, composition_keys)

    X = df[feature_names].values
    y = df[top_elements].values

    model, scaler, X_test_scaled, y_test = train_model(X, y)
    evaluate_model(model, X_test_scaled, y_test, top_elements)

    updated_data = predict_compositions(model, scaler, data, feature_names, top_elements)
    save_data(updated_data, output_file)

    print(f"\nProcessing complete. Updated data saved to {output_file}")

    # Feature importance
    feature_importance = np.mean([estimator.feature_importances_ for estimator in model.estimators_], axis=0)
    for i, importance in enumerate(feature_names):
        print(f"{importance}: {feature_importance[i]:.4f}")

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.bar(feature_names, feature_importance)
    plt.title("Feature Importance")
    plt.tight_layout()
    plt.savefig("feature_importance_classification.png")
    plt.close()

if __name__ == "__main__":
    main()


Overall Accuracy: 0.7067

Classification Report:
              precision    recall  f1-score   support

          Co       1.00      0.60      0.75         5
          Cr       1.00      0.60      0.75         5
          Fe       1.00      0.60      0.75         5
          Mn       0.33      0.33      0.33         3
          Ni       1.00      0.60      0.75         5
           C       0.00      0.00      0.00         0
          Ti       0.00      0.00      0.00         0
          Nb       0.00      0.00      0.00         0
          Si       0.00      0.00      0.00         0
          Zr       0.00      0.00      0.00         0
          Al       0.00      0.00      0.00         2
          Cu       0.00      0.00      0.00         0
          Hf       0.00      0.00      0.00         0
          Ta       0.00      0.00      0.00         0
           B       0.00      0.00      0.00         0

   micro avg       0.57      0.52      0.54        25
   macro avg       0.29      0