# Remove entries that do not have composition and remove the element weights.

In [1]:
import json

# List of valid periodic table elements
periodic_table_elements = {
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S",
    "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
    "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
    "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
    "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os",
    "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
    "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg",
    "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
}

# Function to process the JSON data
def process_data(input_file, output_file):
    with open(input_file, 'r') as infile:
        data = json.load(infile)

    # Create a list to hold filtered data
    filtered_data = []

    # Process each entity
    for entity in data:
        composition = entity.get('composition')
        if composition:
            # Filter out elements not in the periodic table
            valid_elements = [elem for elem in composition.keys() if elem in periodic_table_elements]
            if valid_elements:
                # Update composition with only element names
                entity['composition'] = valid_elements
                filtered_data.append(entity)

    # Write the updated data to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(filtered_data, outfile, indent=4)

# Usage example
input_file = 'allData_result.json'
output_file = 'filtered_allData_result.json'
process_data(input_file, output_file)


# Train model

In [2]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load filtered JSON data
def load_data(input_file):
    with open(input_file, 'r') as infile:
        data = json.load(infile)
    return data

# Preprocessing the data
def preprocess_data(data, periodic_table_elements):
    compositions = []
    features = []

    # Extract features and compositions
    for entry in data:
        # Using lemma as features (just an example)
        features.append(entry['lemma'])
        # Create binary composition array (1 if element exists, 0 otherwise)
        composition = [1 if elem in entry['composition'] else 0 for elem in periodic_table_elements]
        compositions.append(composition)

    return features, compositions

# Encode the feature list into a one-hot encoding format
def encode_features(feature_list):
    # Create a DataFrame and use get_dummies for one-hot encoding
    df = pd.DataFrame(feature_list)
    df_encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    return df_encoded

# Train and evaluate model
def train_and_evaluate(X, y, periodic_table_elements):
    # Split the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Use Random Forest and SVM as models with MultiOutputClassifier for multi-label classification
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    svm_model = SVC(kernel='linear', class_weight='balanced')  # Adding class_weight='balanced' to handle imbalance

    classifiers = {
        'RandomForest': MultiOutputClassifier(rf_model),
        'SVM': MultiOutputClassifier(svm_model, n_jobs=-1)
    }

    # Train and cross-validate models
    for name, model in classifiers.items():
        try:
            model.fit(X_train, y_train)
        except ValueError as e:
            print(f"Skipping {name} due to error: {e}")
            continue

        # Cross-validation with 5 folds
        try:
            cross_val_scores = cross_val_score(model, X_train, y_train, cv=5)
            print(f"Cross-validation accuracy for {name}: {cross_val_scores.mean() * 100:.2f}%")
        except ValueError as e:
            print(f"Cross-validation failed for {name} due to error: {e}")
            continue

        # Make predictions on the test set
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Test accuracy for {name}: {accuracy * 100:.2f}%")

        # Print original and predicted composition for test set
        print("\nTest Results (Original vs Predicted):")
        for i in range(len(y_test)):
            original = [periodic_table_elements[idx] for idx, val in enumerate(y_test[i]) if val == 1]
            predicted = [periodic_table_elements[idx] for idx, val in enumerate(y_pred[i]) if val == 1]
            print(f"Original: {original} | Predicted: {predicted}")
        print("\n" + "="*50 + "\n")

# Main function to run the process
def main():
    input_file = 'filtered_allData_result.json'

    # List of valid periodic table elements (assuming the same as in previous code)
    periodic_table_elements = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S",
        "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
        "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
        "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
        "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os",
        "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
        "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg",
        "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
    ]

    # Load and preprocess data
    data = load_data(input_file)
    features, compositions = preprocess_data(data, periodic_table_elements)

    # Encode features into a numeric format
    X = encode_features(features)
    y = np.array(compositions)

    # Train and evaluate models
    train_and_evaluate(X, y, periodic_table_elements)

# Run the script
if __name__ == "__main__":
    main()


Cross-validation accuracy for RandomForest: 71.00%
Test accuracy for RandomForest: 50.00%

Test Results (Original vs Predicted):
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['H', 'Si', 'Cr', 'Mn', 'Fe', 'Co'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Cu', 'Zn'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Si', 'Cr', 'Mn', 'Fe', 'Co', 'Cu'] | Predicted: ['Si', 'Cr', 'Mn', 'Fe', 'Co', 'Cu']
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']


Skipping SVM due to error: The number of classes has to be greater than one; got 1 class


# Improve accuracy

In [3]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Load filtered JSON data
def load_data(input_file):
    with open(input_file, 'r') as infile:
        data = json.load(infile)
    return data

# Preprocessing the data
def preprocess_data(data, periodic_table_elements):
    compositions = []
    features = []

    # Extract features and compositions
    for entry in data:
        # Using lemma as features (just an example)
        features.append(entry['lemma'])
        # Create binary composition array (1 if element exists, 0 otherwise)
        composition = [1 if elem in entry['composition'] else 0 for elem in periodic_table_elements]
        compositions.append(composition)

    return features, compositions

# Encode the feature list into a one-hot encoding format
def encode_features(feature_list):
    # Create a DataFrame and use get_dummies for one-hot encoding
    df = pd.DataFrame(feature_list)
    df_encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    return df_encoded

# Function for Hyperparameter tuning using GridSearchCV for RandomForest
def tune_hyperparameters(X_train, y_train):
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=2, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    print("Best Hyperparameters:", grid_search.best_params_)
    return grid_search.best_estimator_

# Train and evaluate model
def train_and_evaluate(X, y, periodic_table_elements):
    # Split the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Feature Scaling (important for SVM, XGBoost, etc.)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Tune RandomForest Hyperparameters
    print("\nTuning RandomForest Hyperparameters...\n")
    rf_model = tune_hyperparameters(X_train_scaled, y_train)

    # Models to evaluate
    models = {
        'RandomForest': MultiOutputClassifier(rf_model),
        'XGBoost': MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
    }

    # Cross-validation with K-Folds (since StratifiedKFold doesn't support multi-label)
    kf = KFold(n_splits=5)

    for name, model in models.items():
        # Cross-validation
        print(f"\nTraining {name} model...\n")
        cross_val_scores = cross_val_score(model, X_train_scaled, y_train, cv=kf, n_jobs=-1)
        print(f"Cross-validation accuracy for {name}: {cross_val_scores.mean() * 100:.2f}%")

        # Train the model on full training set
        model.fit(X_train_scaled, y_train)

        # Make predictions on the test set
        y_pred = model.predict(X_test_scaled)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Test accuracy for {name}: {accuracy * 100:.2f}%")

        # Print original and predicted composition for test set
        print("\nTest Results (Original vs Predicted):")
        for i in range(len(y_test)):
            original = [periodic_table_elements[idx] for idx, val in enumerate(y_test[i]) if val == 1]
            predicted = [periodic_table_elements[idx] for idx, val in enumerate(y_pred[i]) if val == 1]
            print(f"Original: {original} | Predicted: {predicted}")
        print("\n" + "="*50 + "\n")

# Main function to run the process
def main():
    input_file = 'filtered_allData_result.json'

    # List of valid periodic table elements (assuming the same as in previous code)
    periodic_table_elements = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S",
        "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
        "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
        "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
        "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os",
        "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
        "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg",
        "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
    ]

    # Load and preprocess data
    data = load_data(input_file)
    features, compositions = preprocess_data(data, periodic_table_elements)

    # Encode features into a numeric format
    X = encode_features(features)
    y = np.array(compositions)

    # Train and evaluate models
    train_and_evaluate(X, y, periodic_table_elements)

# Run the script
if __name__ == "__main__":
    main()



Tuning RandomForest Hyperparameters...

Fitting 2 folds for each of 54 candidates, totalling 108 fits
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Training RandomForest model...

Cross-validation accuracy for RandomForest: 71.00%
Test accuracy for RandomForest: 33.33%

Test Results (Original vs Predicted):
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['H', 'Si', 'Cr', 'Mn', 'Fe', 'Co'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']



Training XGBoost model...

Cross-validation accuracy for XGBoost: 32.00%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test accuracy for XGBoost: 33.33%

Test Results (Original vs Predicted):
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['H', 'Si', 'Cr', 'Mn', 'Fe', 'Co'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']




#MLP

In [4]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load filtered JSON data
def load_data(input_file):
    with open(input_file, 'r') as infile:
        data = json.load(infile)
    return data

# Preprocessing the data
def preprocess_data(data, periodic_table_elements):
    compositions = []
    features = []

    # Extract features and compositions
    for entry in data:
        features.append(entry['lemma'])
        composition = [1 if elem in entry['composition'] else 0 for elem in periodic_table_elements]
        compositions.append(composition)

    return features, compositions

# Encode the feature list into a one-hot encoding format
def encode_features(feature_list):
    df = pd.DataFrame(feature_list)
    df_encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    return df_encoded

# Define and compile the MLP model
def create_mlp(input_dim, output_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(output_dim, activation='sigmoid')  # Sigmoid for multi-label classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate MLP model with overall accuracy calculation
def train_and_evaluate_mlp(X_train, X_test, y_train, y_test, periodic_table_elements):
    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    model = create_mlp(input_dim, output_dim)

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32,
                        callbacks=[early_stopping], verbose=2)

    # Evaluate on test data
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"\nMLP Test Accuracy per label: {test_accuracy * 100:.2f}%")

    # Predict on test set
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)

    # Calculate and print overall accuracy
    overall_accuracy = np.mean([np.array_equal(y_test[i], y_pred[i]) for i in range(len(y_test))])
    print(f"\nOverall MLP Accuracy: {overall_accuracy * 100:.2f}%")

    # Print original and predicted composition for test set
    print("\nTest Results (Original vs Predicted):")
    for i in range(len(y_test)):
        original = [periodic_table_elements[idx] for idx, val in enumerate(y_test[i]) if val == 1]
        predicted = [periodic_table_elements[idx] for idx, val in enumerate(y_pred[i]) if val == 1]
        print(f"Original: {original} | Predicted: {predicted}")
    print("\n" + "="*50 + "\n")

# Main function to run the process
def main():
    input_file = 'filtered_allData_result.json'

    # List of valid periodic table elements
    periodic_table_elements = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S",
        "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
        "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
        "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
        "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os",
        "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
        "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg",
        "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
    ]

    # Load and preprocess data
    data = load_data(input_file)
    features, compositions = preprocess_data(data, periodic_table_elements)

    # Encode features into a numeric format
    X = encode_features(features)
    y = np.array(compositions)

    # Split the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate MLP model
    train_and_evaluate_mlp(X_train, X_test, y_train, y_test, periodic_table_elements)

# Run the script
if __name__ == "__main__":
    main()


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1/1 - 2s - 2s/step - accuracy: 0.0000e+00 - loss: 0.6943 - val_accuracy: 0.0000e+00 - val_loss: 0.6806
Epoch 2/100
1/1 - 0s - 319ms/step - accuracy: 0.0000e+00 - loss: 0.6809 - val_accuracy: 0.0000e+00 - val_loss: 0.6719
Epoch 3/100
1/1 - 0s - 77ms/step - accuracy: 0.0000e+00 - loss: 0.6689 - val_accuracy: 0.0000e+00 - val_loss: 0.6633
Epoch 4/100
1/1 - 0s - 56ms/step - accuracy: 0.0000e+00 - loss: 0.6579 - val_accuracy: 0.0000e+00 - val_loss: 0.6542
Epoch 5/100
1/1 - 0s - 76ms/step - accuracy: 0.0000e+00 - loss: 0.6468 - val_accuracy: 0.0000e+00 - val_loss: 0.6447
Epoch 6/100
1/1 - 0s - 64ms/step - accuracy: 0.0000e+00 - loss: 0.6351 - val_accuracy: 0.0000e+00 - val_loss: 0.6346
Epoch 7/100
1/1 - 0s - 58ms/step - accuracy: 0.0000e+00 - loss: 0.6228 - val_accuracy: 0.0000e+00 - val_loss: 0.6238
Epoch 8/100
1/1 - 0s - 55ms/step - accuracy: 0.0000e+00 - loss: 0.6097 - val_accuracy: 0.0000e+00 - val_loss: 0.6123
Epoch 9/100
1/1 - 0s - 59ms/step - accuracy: 0.0000e+00 - loss: 0.5957 - val_

In [5]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Load filtered JSON data
def load_data(input_file):
    with open(input_file, 'r') as infile:
        data = json.load(infile)
    return data

# Preprocessing the data
def preprocess_data(data, periodic_table_elements):
    compositions = []
    features = []

    # Extract features and compositions
    for entry in data:
        features.append(entry['lemma'])
        composition = [1 if elem in entry['composition'] else 0 for elem in periodic_table_elements]
        compositions.append(composition)

    return features, compositions

# Encode the feature list into a one-hot encoding format
def encode_features(feature_list):
    df = pd.DataFrame(feature_list)
    df_encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    return df_encoded

# Define and compile the enhanced MLP model
def create_mlp(input_dim, output_dim):
    model = Sequential([
        Dense(256, input_dim=input_dim, activation='relu'),
        Dropout(0.3),  # Dropout layer to prevent overfitting
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(output_dim, activation='sigmoid')  # Sigmoid for multi-label classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate MLP model with overall accuracy calculation
def train_and_evaluate_mlp(X_train, X_test, y_train, y_test, periodic_table_elements):
    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    model = create_mlp(input_dim, output_dim)

    # Early stopping and adaptive learning rate reduction
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

    # Train the model
    history = model.fit(X_train, y_train, validation_split=0.2, epochs=150, batch_size=64,
                        callbacks=[early_stopping, reduce_lr], verbose=2)

    # Evaluate on test data
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"\nMLP Test Accuracy per label: {test_accuracy * 100:.2f}%")

    # Predict on test set
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)

    # Calculate and print overall accuracy
    overall_accuracy = np.mean([np.array_equal(y_test[i], y_pred[i]) for i in range(len(y_test))])
    print(f"\nOverall MLP Accuracy: {overall_accuracy * 100:.2f}%")

    # Print original and predicted composition for test set
    print("\nTest Results (Original vs Predicted):")
    for i in range(len(y_test)):
        original = [periodic_table_elements[idx] for idx, val in enumerate(y_test[i]) if val == 1]
        predicted = [periodic_table_elements[idx] for idx, val in enumerate(y_pred[i]) if val == 1]
        print(f"Original: {original} | Predicted: {predicted}")
    print("\n" + "="*50 + "\n")

# Main function to run the process
def main():
    input_file = 'filtered_allData_result.json'

    # List of valid periodic table elements
    periodic_table_elements = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S",
        "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
        "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
        "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
        "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os",
        "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
        "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg",
        "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
    ]

    # Load and preprocess data
    data = load_data(input_file)
    features, compositions = preprocess_data(data, periodic_table_elements)

    # Encode features into a numeric format
    X = encode_features(features)
    y = np.array(compositions)

    # Split the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate MLP model
    train_and_evaluate_mlp(X_train, X_test, y_train, y_test, periodic_table_elements)

# Run the script
if __name__ == "__main__":
    main()


Epoch 1/150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1/1 - 2s - 2s/step - accuracy: 0.0000e+00 - loss: 0.7342 - val_accuracy: 0.0000e+00 - val_loss: 0.6928 - learning_rate: 0.0010
Epoch 2/150
1/1 - 0s - 119ms/step - accuracy: 0.0000e+00 - loss: 0.7048 - val_accuracy: 0.0000e+00 - val_loss: 0.6765 - learning_rate: 0.0010
Epoch 3/150
1/1 - 0s - 55ms/step - accuracy: 0.0000e+00 - loss: 0.6862 - val_accuracy: 0.0000e+00 - val_loss: 0.6616 - learning_rate: 0.0010
Epoch 4/150
1/1 - 0s - 58ms/step - accuracy: 0.0000e+00 - loss: 0.6673 - val_accuracy: 0.2000 - val_loss: 0.6468 - learning_rate: 0.0010
Epoch 5/150
1/1 - 0s - 57ms/step - accuracy: 0.0000e+00 - loss: 0.6476 - val_accuracy: 0.2000 - val_loss: 0.6311 - learning_rate: 0.0010
Epoch 6/150
1/1 - 0s - 51ms/step - accuracy: 0.0000e+00 - loss: 0.6324 - val_accuracy: 0.2000 - val_loss: 0.6146 - learning_rate: 0.0010
Epoch 7/150
1/1 - 0s - 54ms/step - accuracy: 0.0625 - loss: 0.6175 - val_accuracy: 0.2000 - val_loss: 0.5962 - learning_rate: 0.0010
Epoch 8/150
1/1 - 0s - 56ms/step - accuracy: 0

In [6]:
import json
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.multiclass")

# Load filtered JSON data
def load_data(input_file):
    with open(input_file, 'r') as infile:
        data = json.load(infile)
    return data

# Preprocessing the data
def preprocess_data(data, periodic_table_elements):
    compositions = []
    features = []

    # Extract features and compositions
    for entry in data:
        features.append(entry['lemma'])
        composition = [1 if elem in entry['composition'] else 0 for elem in periodic_table_elements]
        compositions.append(composition)

    return features, compositions

# Encode the feature list into a one-hot encoding format
def encode_features(feature_list):
    df = pd.DataFrame(feature_list)
    df_encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    return df_encoded

# Train and evaluate models with OneVsRestClassifier and Hamming Loss
def train_and_evaluate_ensemble(X_train, X_test, y_train, y_test, periodic_table_elements):
    # Define individual models
    rf_model = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42))
    xgb_model = OneVsRestClassifier(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, max_depth=10, learning_rate=0.05, random_state=42))

    models = {'RandomForest': rf_model, 'XGBoost': xgb_model}

    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name} model...\n")
        model.fit(X_train, y_train)

        # Predict on test set
        y_pred = model.predict(X_test)

        # Calculate per-label and overall accuracy, and Hamming Loss
        test_accuracy = np.mean([accuracy_score(y_test[:, i], y_pred[:, i]) for i in range(y_test.shape[1])])
        overall_accuracy = np.mean([np.array_equal(y_test[i], y_pred[i]) for i in range(len(y_test))])
        hamming_loss_score = hamming_loss(y_test, y_pred)

        print(f"\n{name} Test Accuracy per label: {test_accuracy * 100:.2f}%")
        print(f"Overall {name} Accuracy: {overall_accuracy * 100:.2f}%")
        print(f"{name} Hamming Loss: {hamming_loss_score:.4f}")

        # Print original and predicted composition for test set
        print("\nTest Results (Original vs Predicted):")
        for i in range(len(y_test)):
            original = [periodic_table_elements[idx] for idx, val in enumerate(y_test[i]) if val == 1]
            predicted = [periodic_table_elements[idx] for idx, val in enumerate(y_pred[i]) if val == 1]
            print(f"Original: {original} | Predicted: {predicted}")
        print("\n" + "="*50 + "\n")

# Main function to run the process
def main():
    input_file = 'filtered_allData_result.json'  # Update the path as necessary

    # List of valid periodic table elements
    periodic_table_elements = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S",
        "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
        "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
        "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
        "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os",
        "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
        "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg",
        "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
    ]

    # Load and preprocess data
    data = load_data(input_file)
    features, compositions = preprocess_data(data, periodic_table_elements)

    # Encode features into a numeric format
    X = encode_features(features)
    y = np.array(compositions)

    # Split the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate ensemble model
    train_and_evaluate_ensemble(X_train, X_test, y_train, y_test, periodic_table_elements)

# Run the script
if __name__ == "__main__":
    main()



Training RandomForest model...


RandomForest Test Accuracy per label: 98.16%
Overall RandomForest Accuracy: 50.00%
RandomForest Hamming Loss: 0.0184

Test Results (Original vs Predicted):
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['H', 'Si', 'Cr', 'Mn', 'Fe', 'Co'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Cu', 'Zn'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Si', 'Cr', 'Mn', 'Fe', 'Co', 'Cu'] | Predicted: ['Si', 'Cr', 'Mn', 'Fe', 'Co', 'Cu']
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']



Training XGBoost model...



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




XGBoost Test Accuracy per label: 97.18%
Overall XGBoost Accuracy: 33.33%
XGBoost Hamming Loss: 0.0282

Test Results (Original vs Predicted):
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['H', 'Si', 'Cr', 'Mn', 'Fe', 'Co'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Al', 'Ti', 'Cr', 'Fe', 'Cu', 'Zn'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']
Original: ['Si', 'Cr', 'Mn', 'Fe', 'Co', 'Cu'] | Predicted: ['C']
Original: ['Cr', 'Mn', 'Fe', 'Co', 'Ni'] | Predicted: ['Cr', 'Mn', 'Fe', 'Co', 'Ni']


