In [1]:
# Load self-defined functions.
# NOTE(review): %run executes tools.ipynb and leaves whatever it defines in this
# notebook's namespace, so those names appear here without an explicit import.
# Which of them later cells rely on cannot be told from this cell -- TODO list them.
%run tools.ipynb

# Import libraries.
from KmdPlus import StatsDescriptor, formula_to_composition 
import pandas as pd
import numpy as np
from statistics import median
from scipy.spatial import distance_matrix
from pymatgen.core.composition import Composition
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import time
import pickle
import math
import os
from keras import backend as K
from keras.layers import Layer
from keras import regularizers
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, BatchNormalization, Activation
from tensorflow.keras.layers import Input, Dropout
import random
import optuna
# from mp_api import MPRester
from matminer.featurizers.site import CrystalNNFingerprint  
from matminer.featurizers.structure import SiteStatsFingerprint
import itertools
from collections import Counter
import copy

# Loading MLdata (run CSPML_Creating_MLdata.ipynb for creating CSPML_MLdata.xz).
# The file is read as a raw pickle stream: no decompression step is applied here,
# despite the ".xz" suffix.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load a
# file that you generated yourself with the notebook named above.
# Later cells index MLdata[i]["train_set" | "val_set" | "test_set"]["X" | "y"]
# for i in range(len(MLdata)).
with open("data_set/CSPML_MLdata.xz", "rb") as f:
    MLdata = pickle.load(f)

In [2]:
# Data for the hyper-parameter optimization.
# Index of the MLdata entry whose train/validation sets drive the search.
i = 0

# Features are used as stored; labels are one-hot encoded for the softmax output.
train_X = MLdata[i]["train_set"]["X"]
train_y = to_categorical(MLdata[i]["train_set"]["y"])

val_X = MLdata[i]["val_set"]["X"]
val_y = to_categorical(MLdata[i]["val_set"]["y"])

test_X = MLdata[i]["test_set"]["X"]
test_y = to_categorical(MLdata[i]["test_set"]["y"])

# Function for making model structure (classifier).
def Make_MLP_model(Layer_num, Unit_num, Drop_out, Input_shape):
    """Build an uncompiled fully connected classifier with a 2-way softmax output.

    Each hidden block is Dense(Unit_num) -> BatchNormalization -> ReLU.  Every
    hidden block except the last one is followed by Dropout(Drop_out).  The
    network always contains at least two hidden blocks, i.e. it has
    max(Layer_num, 2) of them.

    Args:
        Layer_num: Requested number of hidden blocks.
        Unit_num: Width of every hidden Dense layer.
        Drop_out: Dropout rate applied after all but the last hidden block.
        Input_shape: Number of input features (the input tensor is 1-D per sample).

    Returns:
        A Keras ``Model`` mapping the input to two softmax probabilities.
    """
    # Reset Keras' global state before building, as the notebook builds many models in a row.
    K.clear_session()

    def hidden_block(tensor, with_dropout):
        # Dense -> BatchNorm -> ReLU, optionally followed by dropout.
        tensor = Dense(Unit_num)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Activation('relu')(tensor)
        if with_dropout:
            tensor = Dropout(Drop_out)(tensor)
        return tensor

    inputs = Input(shape=(Input_shape,))

    # First block plus the (Layer_num - 2) intermediate blocks all use dropout.
    hidden = hidden_block(inputs, with_dropout=True)
    for _ in range(Layer_num-2):
        hidden = hidden_block(hidden, with_dropout=True)

    # The last hidden block feeds the classifier head directly (no dropout).
    hidden = hidden_block(hidden, with_dropout=False)

    # Classifier head: two logits turned into class probabilities.
    logits = Dense(2)(hidden)
    probabilities = Activation('softmax')(logits)
    return Model(inputs, probabilities)

In [3]:
###### Hyper parameter optimization #####

# Wall-clock reference for the timing report printed once the study has finished.
start = time.time()

# Settings that are held fixed (not searched) during the optimization.
n_trials = 50    # number of optuna trials
epochs = 1000    # upper bound on epochs per fit; early stopping usually ends training sooner
verbose = 0      # verbosity handed to Keras fit and callbacks

# Seed the Python, NumPy and TensorFlow RNGs once for the whole search.
tf.keras.utils.set_random_seed(1122)

# Define optuna function.
def objective(trial):
    """Optuna objective: train one MLP and return its validation accuracy.

    Reads ``train_X``, ``train_y``, ``val_X``, ``val_y``, ``epochs`` and
    ``verbose`` from the notebook namespace at call time.  The best epoch of
    every trial is checkpointed to ``model.hdf5`` in the working directory, so
    the file is overwritten by each trial.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        Fraction of validation samples whose positive-class probability,
        thresholded at 0.5, matches the true label (higher is better).
    """

    # Set region of hyperparameters.
    # `step` is passed by keyword: Optuna deprecates passing it positionally to suggest_int.
    Drop_out = trial.suggest_float('Drop_out', 0, 0.2, step=0.1)
    Unit_num = trial.suggest_int('Unit_num', 200, 1000, step=200)
    Layer_num = trial.suggest_int('Layer_num', 2, 4, step=1)
    patience = trial.suggest_int('patience', 50, 100, step=25)
    batch_size = trial.suggest_categorical('batch_size', [1024, 2048])

    # Create model.
    model = Make_MLP_model(Layer_num, Unit_num, Drop_out, train_X.shape[1])
    model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=['accuracy'])

    # Set callback.
    # Both callbacks watch validation accuracy: one stops training after `patience`
    # epochs without improvement, the other keeps the best epoch on disk.
    early_stopping = EarlyStopping(monitor='val_accuracy', patience = patience, verbose = verbose, mode='max')
    mcp_save = ModelCheckpoint('model.hdf5',verbose = verbose, save_best_only=True, monitor='val_accuracy', mode='max')

    # Run model (the returned training history is not needed here).
    model.fit(train_X,train_y,batch_size=batch_size,epochs=epochs,verbose=verbose,validation_data=(val_X, val_y),
              callbacks=[early_stopping, mcp_save])

    # Load model: restore the best checkpoint rather than the last epoch.
    model.load_weights('model.hdf5')

    # Evaluation of model.
    # Column 1 of the softmax output / one-hot labels is the positive class.
    y_pred = model.predict(val_X, verbose=0)[:,1]
    y_true = val_y[:,1]
    acc = 1 - np.sum(np.abs(y_true - (y_pred >= 0.5)))/len(y_pred)

    return acc

# Run optuna for hyper-parameter optimization.
tpe_sampler = optuna.samplers.TPESampler(seed=42) # Set seed.
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=n_trials)

# Report how long the whole search took.
end = time.time()
print(f"Calculation time: {end-start} s")

# Save optuna results.
with open("data_set/CSPML_study.xz", "wb") as study_file:
    pickle.dump(study, study_file)

[32m[I 2023-12-10 17:36:52,705][0m A new study created in memory with name: no-name-3f9537a3-90a4-43ee-9713-aa28e4aa4c3a[0m
2023-12-10 17:36:52.721111: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-10 17:36:52.721337: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Max


2023-12-10 17:36:53.326847: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-12-10 17:36:53.828234: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 17:36:58.336916: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 17:39:28.420430: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
[32m[I 2023-12-10 17:39:29,387][0m Trial 0 finished with value: 0.8467614533965244 and parameters: {'Drop_out': 0.1, 'Unit_num': 1000, 'Layer_num': 4, 'patience': 75, 'batch_size': 1024}. Best is trial 0 with value: 0.8467614533965244.[0m
2023-12-10 17:39:30.111846: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 17:39:

2023-12-10 17:58:37.502088: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
[32m[I 2023-12-10 17:58:38,336][0m Trial 11 finished with value: 0.8493568043331077 and parameters: {'Drop_out': 0.2, 'Unit_num': 600, 'Layer_num': 3, 'patience': 50, 'batch_size': 1024}. Best is trial 11 with value: 0.8493568043331077.[0m
2023-12-10 17:58:38.879015: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 17:58:41.438060: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:00:02.662756: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
[32m[I 2023-12-10 18:00:03,499][0m Trial 12 finished with value: 0.8472128187767998 and parameters: {'Drop_out': 0.2, 'Unit_num': 600, 

2023-12-10 18:16:45.454000: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:18:09.107402: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
[32m[I 2023-12-10 18:18:10,006][0m Trial 23 finished with value: 0.8467614533965244 and parameters: {'Drop_out': 0.2, 'Unit_num': 600, 'Layer_num': 3, 'patience': 50, 'batch_size': 1024}. Best is trial 19 with value: 0.8513879485443466.[0m
2023-12-10 18:18:10.563621: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:18:14.370840: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:19:28.956466: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_typ

2023-12-10 18:37:21.203419: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:37:25.176324: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:39:39.164623: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
[32m[I 2023-12-10 18:39:40,041][0m Trial 35 finished with value: 0.8452945159106297 and parameters: {'Drop_out': 0.2, 'Unit_num': 400, 'Layer_num': 3, 'patience': 75, 'batch_size': 1024}. Best is trial 19 with value: 0.8513879485443466.[0m
2023-12-10 18:39:40.639201: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:39:43.124171: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_typ

[32m[I 2023-12-10 18:54:39,685][0m Trial 46 finished with value: 0.8494696456781765 and parameters: {'Drop_out': 0.2, 'Unit_num': 600, 'Layer_num': 3, 'patience': 50, 'batch_size': 1024}. Best is trial 19 with value: 0.8513879485443466.[0m
2023-12-10 18:54:40.163026: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:54:43.542490: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 18:55:35.135610: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
[32m[I 2023-12-10 18:55:35,925][0m Trial 47 finished with value: 0.8415707515233581 and parameters: {'Drop_out': 0.2, 'Unit_num': 600, 'Layer_num': 2, 'patience': 50, 'batch_size': 1024}. Best is trial 19 with value: 0.8513879485443466.[0m
2023-12-10 18:55:36.584850: I tensorflow/core/gra

Calculation time: 4877.7916169166565 s


In [4]:
# Tabulate the optuna trials (value, timings, sampled parameters, state) for inspection.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_Drop_out,params_Layer_num,params_Unit_num,params_batch_size,params_patience,state
0,0,0.846761,2023-12-10 17:36:52.707104,2023-12-10 17:39:29.387660,0 days 00:02:36.680556,0.1,4,1000,1024,75,COMPLETE
1,1,0.837508,2023-12-10 17:39:29.388345,2023-12-10 17:41:54.041421,0 days 00:02:24.653076,0.0,3,1000,2048,100,COMPLETE
2,2,0.846084,2023-12-10 17:41:54.042102,2023-12-10 17:42:36.303301,0 days 00:00:42.261199,0.2,2,400,2048,50,COMPLETE
3,3,0.844166,2023-12-10 17:42:36.303902,2023-12-10 17:43:44.035139,0 days 00:01:07.731237,0.1,3,400,2048,50,COMPLETE
4,4,0.844166,2023-12-10 17:43:44.035739,2023-12-10 17:45:05.431841,0 days 00:01:21.396102,0.1,2,800,1024,75,COMPLETE
5,5,0.843602,2023-12-10 17:45:05.432439,2023-12-10 17:47:48.519930,0 days 00:02:43.087491,0.1,2,200,1024,100,COMPLETE
6,6,0.828594,2023-12-10 17:47:48.520524,2023-12-10 17:48:40.971746,0 days 00:00:52.451222,0.0,4,200,2048,75,COMPLETE
7,7,0.844166,2023-12-10 17:48:40.972336,2023-12-10 17:49:33.425907,0 days 00:00:52.453571,0.0,2,1000,2048,75,COMPLETE
8,8,0.844505,2023-12-10 17:49:33.426518,2023-12-10 17:53:30.756443,0 days 00:03:57.329925,0.1,4,200,1024,100,COMPLETE
9,9,0.841571,2023-12-10 17:53:30.757051,2023-12-10 17:54:22.942967,0 days 00:00:52.185916,0.1,2,1000,2048,50,COMPLETE


In [5]:
### Training ensemble models with the chosen parameters.

# Pull the winning configuration out of the optuna study.
best_params = study.best_trial.params
Drop_out = best_params['Drop_out']
Unit_num = best_params['Unit_num']
Layer_num = best_params['Layer_num']
patience = best_params['patience']
batch_size = best_params['batch_size']

# Echo the configuration that every ensemble member is trained with.
for label, value in (("Layer_num", Layer_num),
                     ("Unit_num", Unit_num),
                     ("Drop_out", Drop_out),
                     ("patience", patience),
                     ("batch_size", batch_size)):
    print(f"{label}: {value}")

start = time.time()

# Set seed for whole loops.
tf.keras.utils.set_random_seed(1122)

# One trained model per entry of MLdata, in the same order.
models = []

for i in range(len(MLdata)):
    # Get data (labels are one-hot encoded for the 2-way softmax output).
    split = MLdata[i]
    train_X = split["train_set"]["X"]
    train_y = to_categorical(split["train_set"]["y"])
    val_X = split["val_set"]["X"]
    val_y = to_categorical(split["val_set"]["y"])
    test_X = split["test_set"]["X"]
    test_y = to_categorical(split["test_set"]["y"])

    # Create model.
    model = Make_MLP_model(Layer_num, Unit_num, Drop_out, train_X.shape[1])
    model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=['accuracy'])

    # Set callback: stop on stalled validation accuracy, keep the best epoch on disk.
    early_stopping = EarlyStopping(monitor='val_accuracy', patience = patience, verbose = verbose, mode='max')
    mcp_save = ModelCheckpoint('model.hdf5',verbose = verbose, save_best_only=True, monitor='val_accuracy', mode='max')
    fit_callbacks = [early_stopping, mcp_save]

    # Run model.
    H = model.fit(train_X,train_y,
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=verbose,
                  validation_data=(val_X, val_y),
                  callbacks=fit_callbacks)

    # Load model: restore the best checkpoint rather than the last epoch.
    model.load_weights('model.hdf5')

    # Evaluation of model: validation accuracy at a 0.5 threshold on the
    # positive-class probability (column 1).
    y_pred = model.predict(val_X, verbose=0)[:,1]
    y_true = val_y[:,1]
    acc = 1 - np.sum(np.abs(y_true - (y_pred >= 0.5)))/len(y_pred)
    print(acc)

    models.append(model)

end = time.time()
print(f"Calculation time: {end-start} s")

# Save models.
with open("data_set/CSPML_models.xz", "wb") as models_file:
    pickle.dump(models, models_file)

Layer_num: 3
Unit_num: 400
Drop_out: 0.2
patience: 50
batch_size: 1024


2023-12-10 19:16:27.242745: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:16:32.788346: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:17:58.221345: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


0.8461972466711803


2023-12-10 19:17:59.898576: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:18:05.138838: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:19:30.672033: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


0.866984505363528


2023-12-10 19:19:32.380630: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:19:37.868924: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:21:08.601515: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


0.8672270842929526


2023-12-10 19:21:10.375350: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:21:15.937503: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:22:52.157661: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


0.8543735790355391


2023-12-10 19:22:53.888135: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:22:59.497223: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:24:41.468894: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


0.8579929053667468
Calculation time: 496.0056710243225 s
INFO:tensorflow:Assets written to: ram://2d9f1e72-62dd-43db-ab64-0ac98a0c3eee/assets
INFO:tensorflow:Assets written to: ram://d25683a6-30d3-469d-a74e-49487d369e8d/assets
INFO:tensorflow:Assets written to: ram://7a9515e6-d479-4f25-a173-3b3735e4e96e/assets
INFO:tensorflow:Assets written to: ram://4c92d5a8-75ff-47cc-9e6b-c738a790fa92/assets
INFO:tensorflow:Assets written to: ram://61cd304e-e24a-4f73-857b-b7595e1232a6/assets


In [7]:
# For evaluating model performance.
from sklearn.metrics import average_precision_score, accuracy_score

# exist_ok=True keeps this cell re-runnable: without it a second run raises FileExistsError.
os.makedirs("ML_metrics", exist_ok=True)

# Calculate per-class Metrics.

def _save_bar_plot(values, title, ylabel, path, log=False):
    """Draw `values` as a bar chart over their positions and save the figure to `path`."""
    fig = plt.figure(figsize=(16,4))
    plt.bar(np.arange(len(values)), values, log=log)
    plt.title(title)
    plt.xlabel("Ratio labels")
    plt.ylabel(ylabel)
    fig.tight_layout()
    plt.savefig(path, dpi = 100)
    # Close explicitly so figures do not accumulate across the loop.
    plt.close(fig)


list_APs = []
list_accs = []

for i in range(len(MLdata)):
    # Get ratio labels.
    ratio_class = copy.deepcopy(MLdata[i]["test_set"]["y_class"]) # Test class label (ith).
    # Get counters and unique labels, ordered from most to least frequent.
    count = Counter(ratio_class).most_common()
    ratio_class_unique = np.array([label for label, _ in count], dtype = "object")
    freqs = np.array([n_samples for _, n_samples in count])
    # Get test set.
    test_X, test_y = MLdata[i]["test_set"]["X"], MLdata[i]["test_set"]["y"]
    # Calculate per class metrics.
    APs = []
    accs = []
    for j in range(len(ratio_class_unique)):
        # Boolean mask selecting the test samples of the j-th ratio label.
        ix = (ratio_class == ratio_class_unique[j])
        y_pred = models[i].predict(test_X[ix], verbose=0)[:,1]
        y_true = test_y[ix]
        # Accuracy is well defined even when len(y_true) = 1 or
        # len(np.unique(y_true)) = 1.
        accs.append(accuracy_score(y_true, (y_pred >= 0.5)*1))
        # AP is only recorded when both labels occur in the class; the original
        # author considered it meaningless otherwise, so such classes are skipped.
        if len(np.unique(y_true)) >= 2:
            APs.append(average_precision_score(y_true, y_pred))

    # NOTE: APs can be shorter than accs because single-label classes are skipped above.
    APs = np.array(APs)
    accs = np.array(accs)

    # Save plots.
    _save_bar_plot(APs,
                   f"Per class average precisions for {i+1}th test set; MAP = {np.mean(APs):.3f}",
                   "Average precisions",
                   f"ML_metrics/Per class average precisions for {i+1}th test set.png")

    _save_bar_plot(accs,
                   f"Per class accuracies for {i+1}th test set; MACC={np.mean(accs):.3f}",
                   "Accuracies",
                   f"ML_metrics/Per class accuracies for {i+1}th test set.png")

    # freqs has one entry per ratio label, i.e. the same length as accs.
    _save_bar_plot(freqs,
                   f"Distribution of the ratio labels for {i+1}th test set",
                   "Number of ratio labels",
                   f"ML_metrics/Distribution of the ratio labels for {i+1}th test set.png",
                   log=True)

    list_APs.append(APs)
    list_accs.append(accs)

# Get MAPs and MACCs (mean of the per-class values, one number per test set).
MAPs = np.array([np.mean(class_APs) for class_APs in list_APs])
MACCs = np.array([np.mean(class_accs) for class_accs in list_accs])

# Not per class APs and Accs.
APs = []
accs = []

for i in range(len(MLdata)):
    # Get test set (all samples of the i-th test set at once).
    test_X = MLdata[i]["test_set"]["X"]
    test_y = MLdata[i]["test_set"]["y"]
    # Make prediction: column 1 of the softmax output is the positive-class probability.
    y_true = test_y
    y_pred = models[i].predict(test_X, verbose=0)[:,1]
    # AP is computed from the raw scores, accuracy from labels thresholded at 0.5.
    hard_labels = (y_pred >= 0.5)*1
    APs.append(average_precision_score(y_true, y_pred))
    accs.append(accuracy_score(y_true, hard_labels))

APs, accs = np.array(APs), np.array(accs)

2023-12-10 19:57:33.164871: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:57:42.626749: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-12-10 19:57:51.547809: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [8]:
# Summarize results.
from tabulate import tabulate

# Order must match the table headers below.
metrics = (MAPs, MACCs, APs, accs)
metric_mean = np.array([np.mean(values) for values in metrics])
metric_std = np.array([np.std(values) for values in metrics])

# Row 0: mean over the len(MLdata) test sets; row 1: standard deviation over them.
summary = np.round(np.array([metric_mean, metric_std]), 3)

print(tabulate(summary, headers = ["MAP", "MACC", "AP", "ACC"], showindex= ["Mean", "Std"]))

        MAP    MACC     AP    ACC
----  -----  ------  -----  -----
Mean  0.941   0.89   0.913  0.864
Std   0.007   0.007  0.009  0.011
