# Prerequisites

## Modules

In [None]:
# Modules are available in conda environment with name: icet
# conda activate icet
#test
import ase
from ase.io import read as ASEread
from ase.io.vasp import write_vasp
from ase.db import connect
from ase.cell import Cell
from ase.neighborlist import NewPrimitiveNeighborList
from ase.build import make_supercell

import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import colormaps
import glob
import sys
import os
import random
import shutil

import icet
from icet import ClusterSpace, StructureContainer, ClusterExpansion
from trainstation import CrossValidationEstimator
from icet.tools import enumerate_structures
from icet.tools.structure_generation import generate_sqs_by_enumeration

try:
    import seaborn as sns
    sns.set_context('notebook')
except ImportError:
    print('sad')
    
import subprocess

import datetime
import time as pytime

## Misc Functions

In [None]:
# Stop message
def jupyter_stop(ErrorMessage="User-defined stop via jupyter_stop() function"):
    """
    Abort execution by raising ``SystemExit`` carrying ``ErrorMessage``.

    Behaves like exit(); intended for testing and for guarding cells that
    would otherwise overwrite data that has already been generated.
    """
    stop_signal = SystemExit(ErrorMessage)
    raise stop_signal

## via finished CE

In [None]:

# Load a finished cluster expansion from the current working directory,
# predict the mixing energy of all enumerated structures with up to six
# primitive cells and plot the result together with the convex hull.
filename_cluster_expansion = 'mixing_energy_anchorTS_lasso.ce'
path_cluster_expansion = os.path.join(os.path.abspath(os.getcwd()), filename_cluster_expansion)
print(path_cluster_expansion)
ce = icet.ClusterExpansion.read(path_cluster_expansion)
species = ['Li', 'X']
data = {'concentration': [], 'mixing_energy': []}
structures = []
cluster_space = ce.get_cluster_space_copy()
chemical_symbols = cluster_space.chemical_symbols
print(cluster_space.primitive_structure)
primitive_structure = cluster_space.primitive_structure
for structure in enumerate_structures(structure=primitive_structure,
                                      sizes=range(1, 7),
                                      chemical_symbols=chemical_symbols):
    concentration = structure.symbols.count('Li') / len(structure)
    data['concentration'].append(concentration)
    data['mixing_energy'].append(ce.predict(structure))
    structures.append(structure)
print(f'Predicted energies for {len(structures)} structures')


# The hull is built from the raw CE predictions; it is scaled by 1e3 only
# when plotted below.
hull = icet.tools.ConvexHull(data['concentration'], data['mixing_energy'])

# Scale the *energies* (not the concentrations) by 1e3 so that the scattered
# points use the same units as the hull line and the axis label, and so that
# the concentrations stay inside the [0, 1] x-range.
data['mixing_energy'] = 1e3 * np.array(data['mixing_energy'])

fig, ax = plt.subplots(figsize=(4, 3))
ax.set_xlabel(r'Li concentration')
ax.set_ylabel(r'Mixing energy (meV/atom)')
ax.set_xlim([0, 1])
ax.set_ylim([-69, 15])
ax.scatter(data['concentration'], data['mixing_energy'],
           marker='x')
ax.plot(hull.concentrations, 1e3 * hull.energies, '-o', color='green')
plt.savefig(filename_cluster_expansion.replace('.ce', '_predicted.png'), bbox_inches='tight')

### with average of rnd start structures

In [None]:
# Load the energies of the randomized start structures from CSV.
# The path is given relative to the home directory, so '~' is expanded first.
filename_energy_of_rnd_structures = '~/CathodeSimulationResults/energy_of_rnd_start-structures/kmc_Cathode_energy_of_randomized_structures.csv'
path_energy_of_rnd_structures = os.path.expanduser(filename_energy_of_rnd_structures)
data_rnd = pd.read_csv(path_energy_of_rnd_structures)

# Column names may contain tab characters; strip them.
data_rnd.columns = data_rnd.columns.str.replace('\t', '')

# Derive the concentration as 1 - vacancy_content/100 when possible.
if 'vacancy_content' not in data_rnd.columns:
    print("Warning: 'vacancy_content' column not found in data_rnd")
    # Show what is available to help locate the right column
    print("Available columns:", data_rnd.columns.tolist())
else:
    data_rnd['concentration'] = 1 - (data_rnd['vacancy_content'] / 100)
    print("Added concentration column based on vacancy content")

print(data_rnd.head())
print(data_rnd.columns.tolist())

In [None]:
# Mean and standard deviation of the energy for each concentration.
# `agg` needs aggregation *functions* ('mean', 'std'); passing column names
# raises. The resulting columns are called 'mean' and 'std'.
energy_column = 'energy_avg [meV/atom]'
std_column = 'std_dev[meV/atom]'
rows_by_concentration = data_rnd.groupby('concentration')
grouped = rows_by_concentration[energy_column].agg(['mean', 'std']).reset_index()

# With a single row per concentration the sample standard deviation is NaN.
# NOTE(review): if the CSV already provides a per-row standard deviation
# column, use it to fill those gaps — confirm that this column exists and
# has the intended meaning.
if std_column in data_rnd.columns:
    precomputed_std = rows_by_concentration[std_column].mean().reset_index(drop=True)
    grouped['std'] = grouped['std'].fillna(precomputed_std)

fig, ax = plt.subplots(figsize=(8, 6))
ax.errorbar(grouped['concentration'], grouped['mean'], yerr=grouped['std'], fmt='o', label='Randomized structures (mean ± std)', capsize=4)

# Plot convex hull if available
if 'hull' in locals():
    ax.plot(hull.concentrations, 1e3 * hull.energies, '-o', color='green', label='Convex Hull')

ax.set_xlabel('Concentration')
ax.set_ylabel('Mixing energy (meV/atom)')
ax.legend()
plt.tight_layout()
plt.show()

## CE Functions

In [None]:
# Basic setups

def get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions):
    """
    Construct the cluster space and structure container for the given
    cutoffs and return the fit data along with the mapped structures.

    Every structure in ``atoms_ref_list`` is mapped onto ``prim`` (oxygen is
    passed as inert species) and added to a ``StructureContainer`` together
    with the matching entry of ``energy_list``; the matching entry of
    ``outcar_list`` is stored as user tag. Structures for which mapping or
    adding raises ``ValueError`` are reported and skipped, so the returned
    data can be shorter than the input lists.

    Parameters
    ----------
    prim : primitive structure the cluster space is built on
    chemical_symbols : allowed species per site, passed to ``ClusterSpace``
    cutoffs : cluster cutoffs, passed to ``ClusterSpace``
    energy_list : target values stored under the key 'Total Energy'
    atoms_ref_list : structures to map onto ``prim``
    outcar_list : labels (paths) of the structures, used as user tags and
        in progress/error messages
    position_tolerance, symprec : passed to ``ClusterSpace``
    tol_positions : passed to ``icet.tools.map_structure_to_reference``

    Returns
    -------
    tuple
        ``(sc.get_fit_data(key='Total Energy'), mapped_structures)``
    """
    # Print a progress line about every 10 % of the structures. Clamp to 1:
    # with fewer than ten structures the step would be 0 and the modulo
    # below would raise ZeroDivisionError.
    stepsize = max(1, int(0.1 * len(outcar_list)))

    # Structures that were mapped and added successfully
    mapped_structures = []

    # Set up the cluster space
    cs = ClusterSpace(structure=prim,
                      cutoffs=cutoffs,
                      chemical_symbols=chemical_symbols,
                      position_tolerance=position_tolerance,
                      symprec=symprec)

    # Set up the StructureContainer with the previously generated ClusterSpace
    sc = StructureContainer(cluster_space=cs)

    # Fill the StructureContainer
    for i, (outcar, E, at_ref) in enumerate(zip(outcar_list, energy_list, atoms_ref_list)):

        # progress update
        if i % stepsize == 0:
            print(f"Computing structure {i} of {len(outcar_list)} ({i/len(outcar_list):.1%})   {datetime.datetime.now()}")

        # Map the structure onto the primitive cell and add it to the
        # container with the energy of the properly relaxed system
        try:
            mapped_atoms, info = icet.tools.map_structure_to_reference(structure=at_ref,
                                                                       reference=prim,
                                                                       inert_species=["O"],
                                                                       tol_positions=tol_positions,
                                                                       suppress_warnings=False,
                                                                       assume_no_cell_relaxation=False)

            sc.add_structure(structure=mapped_atoms,
                             properties={'Total Energy': E},
                             user_tag=outcar,
                             sanity_check=True,
                             )

            # Record the structure only after it really entered the
            # container, so this list stays aligned with the fit data rows.
            mapped_structures.append(mapped_atoms)

        except ValueError as err:
            print(f"Mapping Error with {outcar}")
            print("Note: Possibly a different structure was used for the mapping!")
            print("Original Error Message:")
            print(err, "\n")

    print(f"len(cs) = {len(cs)}")

    return sc.get_fit_data(key='Total Energy'), mapped_structures

def get_mapped_structures(prim, atoms_ref_list, tol_positions):
    """
    Map every structure in ``atoms_ref_list`` onto ``prim``.

    Oxygen is passed as inert species to
    ``icet.tools.map_structure_to_reference``. Structures whose mapping
    raises ``ValueError`` are reported and skipped.

    Parameters
    ----------
    prim : reference (primitive) structure
    atoms_ref_list : structures to map
    tol_positions : passed to ``icet.tools.map_structure_to_reference``

    Returns
    -------
    list
        The successfully mapped structures, in input order.
    """
    mapped_structures = []
    for index, at_ref in enumerate(atoms_ref_list):
        try:
            mapped_atoms, info = icet.tools.map_structure_to_reference(structure=at_ref,
                                                                       reference=prim,
                                                                       inert_species=["O"],
                                                                       tol_positions=tol_positions,
                                                                       suppress_warnings=False,
                                                                       assume_no_cell_relaxation=False)
            mapped_structures.append(mapped_atoms)
        except ValueError as err:
            # No file path is available in this function, so identify the
            # offending structure by its position in the input list.
            print(f"Mapping Error with structure {index} of atoms_ref_list")
            print("Note: Possibly a different structure was used for the mapping!")
            print("Original Error Message:")
            print(err, "\n")
    return mapped_structures
    

def get_A_y(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions):
    """Alias of ``get_fit_data``: forwards all arguments and returns its result unchanged."""
    result = get_fit_data(
        prim=prim,
        chemical_symbols=chemical_symbols,
        cutoffs=cutoffs,
        energy_list=energy_list,
        atoms_ref_list=atoms_ref_list,
        outcar_list=outcar_list,
        position_tolerance=position_tolerance,
        symprec=symprec,
        tol_positions=tol_positions,
    )
    return result



def get_row(cve, alpha=None):
    """
    Collect the fit metrics of a trained estimator in a dictionary.

    Parameters
    ----------
    cve : object providing ``rmse_validation``, ``rmse_train``,
        ``model.BIC``, ``n_parameters`` and ``n_nonzero_parameters``
    alpha : optional value stored under the key 'alpha'; omitted from the
        row when ``None``

    Returns
    -------
    dict
        One entry per metric, plus 'alpha' if it was given.
    """
    row = {
        'rmse_validation': cve.rmse_validation,
        'rmse_train': cve.rmse_train,
        'BIC': cve.model.BIC,
        'n_parameters': cve.n_parameters,
        'n_nonzero_parameters': cve.n_nonzero_parameters,
    }

    # Identity test: `alpha != None` is evaluated element-wise for arrays,
    # which makes the `if` raise for array-like alpha.
    if alpha is not None:
        row['alpha'] = alpha

    return row


def train_ce(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions, fit_method):
    """
    Train a cluster expansion and return the fit metrics of the model.

    The fit data are obtained from ``get_fit_data``; the remaining
    arguments are forwarded to it unchanged. A ``CrossValidationEstimator``
    (shuffle-split, 10 splits) is then validated and trained.

    prim: primitive structure the CE lives on
    chemical_symbols: possible species on the different sites of prim
    cutoffs: cutoffs for the 2-body, 3-body, ... terms
    energy_list: target energies of the training structures
    atoms_ref_list: structures to use for training/testing
    outcar_list: paths (strings) belonging to the structures
    fit_method: name of the fit method. Two names get special treatment:
        'ardr-lambda'   -> fit_method='ardr', threshold_lambda=1000
        'ardr-lineScan' -> fit_method='ardr', line_scan=True
        any other value is passed on as ``fit_method`` as is
        (e.g. 'rfe', 'lasso', 'least-squares').

    Returns the dictionary produced by ``get_row``.
    """
    (A, y), _mapped = get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions)

    # Translate the notebook-specific method names into estimator options
    if fit_method == 'ardr-lambda':
        method_options = {'fit_method': 'ardr', 'threshold_lambda': 1000}
    elif fit_method == 'ardr-lineScan':
        method_options = {'fit_method': 'ardr', 'line_scan': True}
    else:
        method_options = {'fit_method': fit_method}

    cve = CrossValidationEstimator((A, y), validation_method='shuffle-split', n_splits=10, **method_options)
    cve.validate()
    cve.train()

    return get_row(cve)

def prevent_overwrite(file_name,add=''):
    """
    Return a path based on ``file_name`` that does not exist yet.

    ``file_name + add`` is returned if nothing exists at that path.
    Otherwise a counter is appended to ``file_name``: it starts at 1 (or at
    ``int(add) + 1`` when ``add`` is given) and is increased until an
    unused name is found.
    """
    suffix = add
    while os.path.exists(file_name + suffix):
        counter = 1 if suffix == '' else int(suffix) + 1
        suffix = str(counter)
    return file_name + suffix

## Reorder Atoms

In [None]:
# S only in this list to 'trick' the structure enumeration
# S as extra Nickel
# Chemical symbol -> atomic number for the species handled in this notebook
atomic_label2number = {
    "Li": 3,
    "O": 8,
    "S": 16,
    "Ni": 28,
}

# Reverse lookup (atomic number -> chemical symbol), derived from the map above
atomic_number2label = {number: label for label, number in atomic_label2number.items()}


def order_atoms(atoms, order=("Li", "Ni", "O")):
    """
    Return a copy of ``atoms`` with the atoms grouped by species.

    The positions and atomic numbers of the copy are rewritten so that all
    atoms of ``order[0]`` come first, then those of ``order[1]`` and so on;
    within one species the original sequence is kept.

    Parameters
    ----------
    atoms : structure providing ``get_positions``, ``get_atomic_numbers``,
        ``copy``, ``set_positions`` and ``set_atomic_numbers``
    order : sequence of chemical symbols defining the new species order

    Returns
    -------
    The reordered copy; ``atoms`` itself is not modified.

    Raises
    ------
    KeyError
        If ``atoms`` contains a species that is unknown to
        ``atomic_number2label`` or that is not listed in ``order``.
    """
    # get old positions and atomic numbers
    old_positions = atoms.get_positions()
    old_atomic_number = atoms.get_atomic_numbers()

    # one (initially empty) list of positions per requested species
    atomic_pos_dict = {sym: [] for sym in order}

    # sort the positions into the per-species lists
    for num, pos in zip(old_atomic_number, old_positions):
        label = atomic_number2label[num]
        if label not in atomic_pos_dict:
            # Same exception type as the plain dictionary lookup would
            # raise, but with a message that names the actual problem.
            raise KeyError(f"species '{label}' is present in the structure but missing from order={list(order)}")
        atomic_pos_dict[label].append(pos)

    # put together the new ordered positions and atomic numbers
    new_positions = []
    new_atomic_numbers = []
    for sym in order:
        new_positions.extend(atomic_pos_dict[sym])
        new_atomic_numbers.extend([atomic_label2number[sym]] * len(atomic_pos_dict[sym]))

    # copy original atoms object and modify the copy
    copy_atoms = atoms.copy()
    copy_atoms.set_positions(new_positions)
    copy_atoms.set_atomic_numbers(new_atomic_numbers)

    return copy_atoms


# Collect data

In [None]:
### Reference energies of LiNiO2 and NiO2, normalised to two oxygen atoms
### (equivalently per Ni as long as there is no extra Ni)
LiNiO2 = ASEread("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0001_finished_approved/run_final_approved/OUTCAR")
n_oxygen_LiNiO2 = LiNiO2.get_chemical_symbols().count("O")
E_ref_LiNiO2_per_O2 = LiNiO2.get_potential_energy() / n_oxygen_LiNiO2 * 2

NiO2   = ASEread("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0003_finished_approved/run_final_approved/OUTCAR")
n_oxygen_NiO2 = NiO2.get_chemical_symbols().count("O")
E_ref_NiO2_per_O2 = NiO2.get_potential_energy() / n_oxygen_NiO2 * 2

# start from an empty data dictionary
data = {}

## Own enumerated structures based on P21/c

In [None]:
# Li distribution without transition states
atoms_for_training_from_own_enumerated_structures = []
H_o_M_for_training_from_own_enumerated_structures = []

# All OUTCARs of interest, in sorted order
outcars_for_training_from_own_enumerated_structures= sorted(glob.glob("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0*_finished_approved/run_final_approved/OUTCAR"))

for outcar in outcars_for_training_from_own_enumerated_structures:

    # read every step; only the last one is evaluated
    atoms = ASEread(outcar, index=":")
    last_step = atoms[-1]
    last_step_symbols = last_step.get_chemical_symbols()

    # total heat of mixing relative to the LiNiO2 / NiO2 references
    Li_count = last_step_symbols.count("Li")
    O_count  = last_step_symbols.count("O")
    H_o_M    = last_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2

    # To make the mapping easier, store the originally generated structure
    # (two directory levels above the OUTCAR) instead of the relaxed one
    enumerated_folder = "/".join(outcar.split("/")[:-2])
    ref = enumerated_folder + "/POSCAR_enumerated"
    atoms_for_training_from_own_enumerated_structures.append(ASEread(ref))

    # store the heat of mixing per atom
    H_o_M_for_training_from_own_enumerated_structures.append( H_o_M / len(last_step) )

## Markus low energy CE data

In [None]:
# Li distribution without transition states
# Markus' low-energy structures, re-relaxed here. Every data folder holds a
# README file with the original path; folders without a run_final
# directory are ignored.

atoms_for_training_from_Markus_low_energy_structures   = []
H_o_M_for_training_from_Markus_low_energy_structures   = []
outcars_for_training_from_Markus_low_energy_structures = []

READMEs = glob.glob("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/03_Markus_approved_low_energy_data/*/README_original_path_from_Markus")

# The two structures that caused problems use the original (rotated) POSCAR
problematic_tags = ("re-relax_Markus038_finished", "re-relax_Markus115_finished")

for README in READMEs:

    if not os.path.isdir(README.replace("README_original_path_from_Markus","run_final")):
        continue

    # Pick the transformed structure that allows a correct mapping onto our
    # primitive structure later on
    if problematic_tags[0] in README or problematic_tags[1] in README:
        # the first README line holds the original path
        with open(README, "r") as f:
            line = f.readlines()[0]
        transformed_contcar = "/".join(line.split("/")[0:-1]) + "/run01/POSCAR_rotated.vasp"
    else:
        transformed_contcar = README.replace("README_original_path_from_Markus","run_final/CONTCAR_rotated.vasp")
    atoms_transformed_contcar = ASEread(transformed_contcar)
    atoms_for_training_from_Markus_low_energy_structures.append(atoms_transformed_contcar)

    # The energy is taken from the OUTCAR of the relaxation
    outcar = README.replace("README_original_path_from_Markus","run_final/OUTCAR")
    outcars_for_training_from_Markus_low_energy_structures.append(outcar)
    atoms = ASEread(outcar, index=":")
    last_step = atoms[-1]
    last_step_symbols = last_step.get_chemical_symbols()

    # heat of mixing, stored per atom
    Li_count = last_step_symbols.count("Li")
    O_count  = last_step_symbols.count("O")
    H_o_M    = last_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2
    H_o_M_for_training_from_Markus_low_energy_structures.append( H_o_M / len(last_step) )
        

## NEB initial and final images (without Ni_Li)

In [None]:
# Li distribution without transition states
atoms_for_training_NEB_initial_and_final_images = []
H_o_M_for_training_NEB_initial_and_final_images = []

neb_outcar_patterns = [
    # the ordered ones from 0250, 0500 and 0750 first
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0250/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0500/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0750/image*/02_scan/*final/OUTCAR",
    # ... followed by the ones from the random structures
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/01_initial_structure/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/02_odh/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/03_tsh/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/04_double_tsh/image*/02_scan/*final/OUTCAR",
]

# Collect the matches pattern by pattern, keeping the order given above
outcars_for_training_NEB_initial_and_final_images = []
for neb_outcar_pattern in neb_outcar_patterns:
    outcars_for_training_NEB_initial_and_final_images += glob.glob(neb_outcar_pattern, recursive=True)

for OUTCAR in outcars_for_training_NEB_initial_and_final_images:

    # read every step; only the last one is evaluated
    atoms = ASEread(OUTCAR, index=":")
    last_step = atoms[-1]
    last_step_symbols = last_step.get_chemical_symbols()

    # heat of mixing relative to the LiNiO2 / NiO2 references
    Li_count = last_step_symbols.count("Li")
    O_count  = last_step_symbols.count("O")
    H_o_M    = last_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2

    # store the structure and its heat of mixing per atom
    atoms_for_training_NEB_initial_and_final_images.append(last_step)
    H_o_M_for_training_NEB_initial_and_final_images.append( H_o_M / len(last_step) )

## NEB transition states

In [None]:
# Li transition states

atoms_for_training_NEB_transition_states = []
H_o_M_for_training_NEB_transition_states = []
# Holds only the paths that end up in the two lists above, so that all three
# lists stay index-aligned when they are combined or zipped with each other.
paths_for_training_NEB_transition_states = []

# Candidate run_final folders: the ones generated manually (0250, 0500, 0750) ...
candidate_NEB_paths  = glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*/NEB_*/run_final*")

# ... and the random ones
candidate_NEB_paths += glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*random*/02_odh/NEB*/run_final*")
candidate_NEB_paths += glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*random*/03_tsh/NEB*/run_final*")
candidate_NEB_paths += glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*random*/04_double_tsh/NEB*/run_final*")

intermediate_images = ["01", "02", "03", "04", "05"]

# Iterate over all run_final folders
for path in candidate_NEB_paths:
    # folder that contains the run_final folder
    source_folder = '/'.join(path.split('/')[0:-1])

    # Energy profile along the path: initial image, last step of every
    # readable intermediate image, final image
    energies = []
    energies.append(ASEread(source_folder + "/OUTCAR_initial_image").get_potential_energy())
    could_not_read_counter = 0
    for i in intermediate_images:
        try:
            energies.append(ASEread(f"{path}/{i}/OUTCAR").get_potential_energy())
        except Exception:
            # Unreadable images are only counted and reported. `Exception`
            # rather than a bare except, so Ctrl-C still stops the loop.
            could_not_read_counter += 1
            print(f"{could_not_read_counter}. \t Could not read {path}/{i}/OUTCAR")
    if could_not_read_counter == len(intermediate_images):
        print(f"Ignore {path}\n  ---> could not read any OUTCARs!")
        continue
    energies.append(ASEread(source_folder + "/OUTCAR_final_image").get_potential_energy())

    # For "proper" paths the energy maximum lies *between* the initial and
    # the final image. The final image is the last list entry; its index is
    # smaller than 6 whenever intermediate images could not be read.
    index_highest_energy = energies.index(max(energies))
    if index_highest_energy == 0 or index_highest_energy == len(energies) - 1:
        print(f"Ignore {path}\n  ---> image {index_highest_energy} has highest energy!")
        continue

    # Interpolated middle point of the initially created, straight odh-type
    # path, used as ideal position for the CE training
    ideal_TS_structure_file = glob.glob(source_folder + "/anchor_trans_image.vasp")
    if not ideal_TS_structure_file:
        print(f"Ignore {path}\n  ---> no anchor_trans_image.vasp found!")
        continue
    ideal_TS_atoms = ASEread(ideal_TS_structure_file[0])

    atoms_for_training_NEB_transition_states.append(ideal_TS_atoms)
    paths_for_training_NEB_transition_states.append(path)

    # Compute heat of mixing per lattice site and append to list
    Li_count = ideal_TS_atoms.get_chemical_symbols().count("Li") + 1 # +1 for the jumping Li
    O_count  = ideal_TS_atoms.get_chemical_symbols().count("O")
    H_o_M    = max(energies) - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2
    # total number of lattice sites:
    # O_count   for Oxygen
    # O_count/2 for Li-Sites
    # O_count/2 for Ni-Sites
    # = 2*O_count --> Lattice Sites
    H_o_M_for_training_NEB_transition_states.append( H_o_M / (2*O_count) )

## Combine data

In [None]:
# Concatenate the four data sets (enumerated, Markus low energy, NEB end
# points, NEB transition states) - always in this order.

# structures
train_structures = [
    *atoms_for_training_from_own_enumerated_structures,
    *atoms_for_training_from_Markus_low_energy_structures,
    *atoms_for_training_NEB_initial_and_final_images,
    *atoms_for_training_NEB_transition_states,
]

# energies
train_H_o_M = [
    *H_o_M_for_training_from_own_enumerated_structures,
    *H_o_M_for_training_from_Markus_low_energy_structures,
    *H_o_M_for_training_NEB_initial_and_final_images,
    *H_o_M_for_training_NEB_transition_states,
]

# paths, kept to be able to retrieve problematic files
file_location = [
    *outcars_for_training_from_own_enumerated_structures,
    *outcars_for_training_from_Markus_low_energy_structures,
    *outcars_for_training_NEB_initial_and_final_images,
    *paths_for_training_NEB_transition_states,
]

In [None]:
# Store stuff for later use

from icet.tools import ConvexHull

data = {'concentration': [], 'reference_energy': [], 'hull_energy': [], 'file_location': []}

# Go through all the data. Locations that cannot be read or evaluated are
# reported and skipped, so `data` can end up shorter than the input lists.
for h_o_m, location in zip(train_H_o_M, file_location):

    try:

        atoms = ASEread(location, index=":")

        # Count Li and O in the last step
        Li_count = atoms[-1].get_chemical_symbols().count("Li")
        O_count  = atoms[-1].get_chemical_symbols().count("O")
        # Li concentration: number of Li per O2 unit
        data['concentration'].append(Li_count/(O_count/2))

        # Add the original energy to the dict; the factor of 1e3 serves to
        # convert from eV/atom to meV/atom
        data['reference_energy'].append(1e3 * h_o_m)

        # keep the file location to allow parsing
        data['file_location'].append(location)

    # Catch errors in case something goes wrong
    except Exception as err:
        # report the single offending location, not the whole list
        print(f"Problems with {location}")
        print(f"Original Error Message:\n {err}\n")



In [None]:
# Go through all the data once more, this time reading each location in
# VASP structure format and counting one additional Li.
# NOTE(review): this appends to the same `data` lists as the loop in the
# previous cell; any location readable by both loops would be stored twice
# - confirm this is intended.
for h_o_m, location in zip(train_H_o_M, file_location):

    try:
        atoms = ase.io.read(location, format='vasp')
        symbols_at_location = atoms.get_chemical_symbols()

        Li_count = symbols_at_location.count("Li") + 1
        O_count  = symbols_at_location.count("O")

        # Li concentration: number of Li per O2 unit
        data['concentration'].append(Li_count/(O_count/2))

        # the factor of 1e3 converts from eV/atom to meV/atom
        data['reference_energy'].append(1e3 * h_o_m)

        # keep the file location to allow parsing
        data['file_location'].append(location)

    except Exception as err:
        # report the problem and carry on with the next location
        print(f"Problems with {location}")
        print(f"Original Error Message:\n {err}\n")

# Convex hull of the reference energies, evaluated at every stored concentration
hull = ConvexHull(data['concentration'], data['reference_energy'])
data['hull_energy'] = [hull.get_energy_at_convex_hull(concentration)
                       for concentration in data['concentration']]
print(len(data['concentration']), len(data['hull_energy']))

# Fitting of just the Li sublattice

In [None]:
# Primitive R-3m model of LiNiO2 with transition states
path_prim_TS = "/nfshome/sadowski/work/LiNiO2_Sabrina/37_CE_for_Li_diffusion/00_LNO_R-3m.vasp"
prim_TS = ASEread(path_prim_TS)

# Show the structure and its species for a quick sanity check
print(prim_TS)
print(prim_TS.get_chemical_symbols())

In [None]:
# Allowed chemical symbols, one list per site of prim_TS
# NOTE(review): 'Ti' presumably acts as a placeholder species for the
# transition-state (jumping) Li - confirm.
chemical_symbols_TS= [['Li', 'X', 'Ti'],   # Li sublattice may contain: Li, vacancies (=X) and 'Ti'
                      ['Ni'],       # Ni sublattice will not be changed
                      ['O'],        # O  sublattice will not be changed
                      ['O']]

# Optimizing CE

## finding cutoffs
List possible cutoffs and define standard variables.

In [None]:
# Tolerances handed to ClusterSpace when the cluster space is built
position_tolerance = 0.01
symprec = 0.01
# Tolerance handed to icet.tools.map_structure_to_reference when structures
# are mapped onto the primitive cell
tol_positions=0.05

# Fit methods to loop over; the commented-out names are further candidates
fit_methods = ['lasso'] # 'elasticnet', 'ardr-lambda', 'ardr-lineScan', 'bayesian-ridge', 'omp', 'ridge']

## comparing fitting algorithms and getting Hull distances

In [None]:
import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_output():
    """Context manager that sends ``sys.stdout`` to ``os.devnull`` inside the ``with`` body.

    The previous ``sys.stdout`` is restored on exit, also when the body raises.
    """
    previous_stdout = sys.stdout
    sink = open(os.devnull, 'w')
    sys.stdout = sink
    try:
        yield
    finally:
        # restore first, then release the sink
        sys.stdout = previous_stdout
        sink.close()

In [None]:
# Per fit method: absolute distance of the CE prediction to the reference
# hull, the loaded cluster expansion, and the CE predictions themselves
hull_distances = {}
ce_lib = {}
predict = {}

for fit_method in fit_methods:
    # load the stored cluster expansion of this fit method
    ce_lib[fit_method] = ClusterExpansion.read('/nfshome/winkelmann/ARL/tmp/mixing_energy_anchorTS_%s.ce' % fit_method)
    cutoffs = [8.09,8.09,8.09]

    print('start %s \t %s' % (fit_method, pytime.strftime("%H:%M:%S", pytime.localtime())))
    # map the training structures onto the primitive cell
    # (wrap in suppress_output() to silence the progress output)
    (A, y), mapped_structures = get_fit_data(prim=prim_TS,
                                             chemical_symbols=chemical_symbols_TS,
                                             cutoffs=cutoffs,
                                             energy_list=train_H_o_M,
                                             atoms_ref_list=train_structures,
                                             outcar_list=file_location,
                                             position_tolerance=position_tolerance,
                                             symprec=symprec,
                                             tol_positions=tol_positions)

    print(len(mapped_structures), len(data['hull_energy']))
    print('predicting HOM %s \t %s' % (fit_method, pytime.strftime("%H:%M:%S", pytime.localtime())))
    # predictions are scaled by 1e3
    active_ce = ce_lib[fit_method]
    predict[fit_method] = [1e3 * active_ce.predict(mapped_structure)
                           for mapped_structure in mapped_structures]

    # absolute distance between prediction and hull energy
    hull_distances[fit_method] = np.absolute(np.subtract(predict[fit_method], data['hull_energy']))
    print('end %s \t %s' % (fit_method, pytime.strftime("%H:%M:%S", pytime.localtime())))

In [None]:
# Store the CE predictions (one column per fit method) and the reference
# data as CSV files in the current working directory
df_predict = pd.DataFrame(predict)
df_data = pd.DataFrame(data)
df_predict.to_csv('predict.csv')
df_data.to_csv('data.csv')

## Plotting of the convex hull and all data points

In [None]:
### plotting and comparing results
# jupyter_stop('if Ce arent rerunning no sense in plotting them')
# %%capture
sorted_concentration_indices = np.argsort(data['concentration'])

matplotlib.rcParams.update({'font.size': 22})

# Raw string: in a normal string literal '\f' and '\t' are the form-feed and
# tab characters, which destroys the LaTeX commands. \mathrm is used for the
# upright unit text.
energy_unit_label = r'$\frac{\mathrm{meV}}{\mathrm{atom}}$'

for key, value in predict.items():

    # Figure 1: heat of mixing (prediction & reference) and the convex hull.
    # Everything is drawn on the axes created here, not on a left-over `ax`
    # from an earlier cell.
    fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10,10), dpi=300)
    ax1.set_title('heat of mixing (%s)' % key)
    ax1.set_xlabel('x in Li$_x$')
    ax1.set_ylabel('Mixing energy ' + energy_unit_label)

    ax1.plot(np.array(data['concentration'])[sorted_concentration_indices],
             np.array(data['hull_energy'])[sorted_concentration_indices], label='convex hull of reference')
    ax1.scatter(data['concentration'], data['reference_energy'], marker='o', label='reference')
    ax1.scatter(data['concentration'], value, marker='x', label=key)
    ax1.legend()

    # save through the figure object so the right figure ends up in the file
    fig.savefig('/nfshome/winkelmann/ARL/save/anchorTS_data/extra_plots/convexHull_anchorTS_%s.png' %(key))
    plt.show()

    # Figure 2: reference vs. prediction, colour coded by the distance to the hull
    fig, ax2 = plt.subplots(nrows=1, ncols=1, figsize=(10,10), dpi=300)
    ax2.set_title('prediction error plot (%s)' % key)
    ax2.set_xlabel('Prediction ' + energy_unit_label)
    ax2.set_ylabel('reference ' + energy_unit_label)

    img1 = ax2.scatter(value, data['reference_energy'], c=hull_distances[key], cmap='jet')
    cb = fig.colorbar(img1)
    cb.set_label('distance to hull (ref)')

    # Reference diagonal y = x: use one common range for both coordinates,
    # otherwise the line only connects the corners of the axes.
    x_limits = ax2.get_xlim()
    y_limits = ax2.get_ylim()
    diagonal_range = (min(x_limits[0], y_limits[0]), max(x_limits[1], y_limits[1]))
    ax2.plot(diagonal_range, diagonal_range, ls='--', color='black')
    fig.savefig('/nfshome/winkelmann/ARL/save/anchorTS_data/extra_plots/error_anchorTS_%s.png' %(key))

## via Convex Hull

In [None]:
# reading existing CE's

predict_no_opt = {}
hull_distances = {'opt_cve':{}, 'opt':{}, 'no_opt':{}, 'reference':[]}
ce_lib = {}

for fit_method in fit_methods:
    ce_lib[fit_method] = ClusterExpansion.read('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_%s.ce' % fit_method)

# absolute distance of every reference energy to the reference hull
hull_distances['reference'] = np.absolute(np.subtract(data['hull_energy'], data['reference_energy']))

# Weight = inverse distance to the hull. Points on the hull (distance 0)
# cannot be inverted; they get a weight slightly above the largest finite
# weight, so that they count most. Rebinding a loop variable, as done
# before, never changed the list and left these weights at 0.
# NOTE(review): a tiny but non-zero distance still produces a huge weight;
# consider a tolerance if that turns out to dominate the fit.
reference_distances = np.asarray(hull_distances['reference'], dtype=float)
is_on_hull = reference_distances == 0
weights = np.zeros(reference_distances.shape, dtype=float)
weights[~is_on_hull] = 1.0 / reference_distances[~is_on_hull]
max_weight = weights.max() if weights.size else 0.0
# factor 1.1: the exact number makes no noticeable difference, it only has
# to be the highest weight. If every point lies on the hull, use 1.0.
weights[is_on_hull] = max_weight * 1.1 if max_weight > 0 else 1.0

In [None]:
jupyter_stop('dont rerun all CE creation if not necessary')
# Fit one hull-distance-weighted cluster expansion per fit method with a plain
# trainstation Optimizer, store it on disk and log RMSEs, summed hull distance
# and timings to a comma/tab separated summary file.
from trainstation import Optimizer

opt = {}
ce_opt_lib = {}
predict_opt = {}
save_file = prevent_overwrite('/nfshome/winkelmann/ARL/tmp/compare_fitting_algorithms_optimized_titanTS')
file_format = '%14s,\t%20s,\t%20s,\t%20s,\t%16s,\t%16s' + os.linesep
# Context managers guarantee the summary file is closed even if a write fails.
with open(save_file, 'w') as file:
    file.write(file_format % ('fit_method', 'RMSE_validation', 'RMSE_train', 'hull_distance', 'comp_time', 'use_time'))

for fit_method in fit_methods:
    start_time = pytime.time()
    cutoffs = best_cutoffs[fit_method]

    # Build the fit matrix A and target vector y, then scale every row by its weight.
    print(fit_method)
    (A, y), mapped_structures = get_fit_data(prim=prim_without_TS, chemical_symbols=chemical_symbols_without_TS, cutoffs=cutoffs,
                                             energy_list=train_H_o_M,
                                             atoms_ref_list=train_structures,
                                             outcar_list=file_location,
                                             position_tolerance=position_tolerance, symprec=symprec, tol_positions=tol_positions)

    A_weighted = np.multiply(A, weights.reshape(-1, 1))
    y_weighted = np.multiply(y, weights)

    # 'ardr-lambda' and 'ardr-lineScan' are notebook-local names for two ARDR
    # configurations; every other name is passed to the Optimizer unchanged.
    if fit_method == 'ardr-lambda':
        opt[fit_method] = Optimizer((A_weighted, y_weighted), fit_method='ardr', threshold_lambda=1000)
    elif fit_method == 'ardr-lineScan':
        opt[fit_method] = Optimizer((A_weighted, y_weighted), fit_method='ardr', line_scan=True)
    else:
        opt[fit_method] = Optimizer((A_weighted, y_weighted), fit_method=fit_method)
    opt[fit_method].train()

    # Set up the cluster space matching the cutoffs used for the fit data.
    cs = ClusterSpace(structure=prim_without_TS, cutoffs=cutoffs, chemical_symbols=chemical_symbols_without_TS, position_tolerance=position_tolerance, symprec=symprec)

    ce_opt_lib[fit_method] = ClusterExpansion(cluster_space=cs, parameters=opt[fit_method].parameters, metadata=opt[fit_method].summary)
    ce_opt_lib[fit_method].write('/nfshome/winkelmann/ARL/tmp/mixing_energy_opt_no_TS_%s.ce' % fit_method)
    comp_time = pytime.time() - start_time  # wall time for fit data + training + writing

    # Time the prediction of all mapped structures; predictions are scaled by
    # 1e3 (presumably eV -> meV, matching the meV/atom plot labels).
    start_time = pytime.time()
    predict_opt[fit_method] = [1e3 * ce_opt_lib[fit_method].predict(mapped_structure)
                               for mapped_structure in mapped_structures]
    use_time = pytime.time() - start_time

    # Distance of every prediction to the reference hull energy.
    hull_distances['opt'][fit_method] = np.absolute(np.subtract(data['hull_energy'], predict_opt[fit_method]))

    with open(save_file, 'a') as file:
        file.write(file_format % (fit_method, opt[fit_method].rmse_test, opt[fit_method].rmse_train, sum(hull_distances['opt'][fit_method]), comp_time, use_time))

# Final row: summed hull distance of the reference data itself, for comparison.
with open(save_file, 'a') as file:
    file.write(file_format % ('reference', '', '', sum(hull_distances['reference']), '', ''))

In [None]:
# Read the previously written optimized CEs and recompute their predictions,
# then compute the predictions of the unweighted CEs for comparison.

ce_opt_lib = {}
predict_opt = {}
mapped_structures = get_mapped_structures(prim=prim_without_TS, atoms_ref_list=train_structures, tol_positions=tol_positions)
for fit_method in fit_methods:
    ce_opt_lib[fit_method] = ClusterExpansion.read('/nfshome/winkelmann/ARL/tmp/mixing_energy_opt_no_TS_%s.ce' % fit_method)
    # Predictions are scaled by 1e3 (presumably eV -> meV, matching the plot labels).
    predict_opt[fit_method] = [1e3 * ce_opt_lib[fit_method].predict(mapped_structure)
                               for mapped_structure in mapped_structures]

    # Distance of every prediction to the reference hull energy.
    hull_distances['opt'][fit_method] = np.absolute(np.subtract(data['hull_energy'], predict_opt[fit_method]))

# Sort by concentration for hull plotting.
sorted_concentration_indices = np.argsort(data['concentration'])

# Predictions of the normal (unweighted) CEs for later comparison.
# The result dict is reset once, before the loop: resetting it inside the loop
# discarded the entries of every fit method except the last one.
hull_distances['no_opt'] = {}
for fit_method in fit_methods:
    predict_no_opt[fit_method] = [1e3 * ce_lib[fit_method].predict(mapped_structure)
                                  for mapped_structure in mapped_structures]

    # Distance of every prediction to the reference hull energy.
    hull_distances['no_opt'][fit_method] = np.absolute(np.subtract(predict_no_opt[fit_method], data['hull_energy']))
    
    

In [None]:
# Count how many structures exist per concentration and report every
# concentration that is represented by fewer than 10 structures.

# Distinct concentrations in ascending order (first occurrence kept per value).
concentration_list = sorted(dict.fromkeys(data['concentration']))

# Histogram: concentration -> number of structures with that concentration.
concentration_eval = dict.fromkeys(concentration_list, 0)
for concentration in data['concentration']:
    concentration_eval[concentration] = concentration_eval[concentration] + 1

for concentration, count in concentration_eval.items():
    if not count >= 10:
        print(concentration, count)

In [None]:
%%capture

# One PDF page per fit method: left panel shows the heat of mixing (reference,
# unweighted CE and weighted CE) together with the reference hull, right panel
# is a prediction-vs-reference plot coloured by the distance to the hull.
# The context manager finalises the PDF even if plotting raises.
with PdfPages('/nfshome/winkelmann/ARL/tmp/compare_fitting_algorithms_opt_titanTS.pdf') as pdf:
    for key, value in predict_opt.items():
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 9))

        # ax1 = heat of mixing (prediction & reference) & hull
        ax1.set_title('heat of mixing (%s)' % key)
        ax1.set_xlabel('x in Li$_x$')
        ax1.set_ylabel('Mixing energy (meV/atom)')

        ax1.plot(np.array(data['concentration'])[sorted_concentration_indices],
                 np.array(data['hull_energy'])[sorted_concentration_indices], label='convex hull of reference')
        ax1.scatter(data['concentration'], data['reference_energy'], marker='o', label='reference')
        ax1.scatter(data['concentration'], predict_no_opt[key], marker='x', label=key)
        ax1.scatter(data['concentration'], value, marker='+', label='%s_opt' % key)
        ax1.legend()

        # ax2 = reference vs prediction (colour coded by distance to hull)
        ax2.set_title('prediction error plot (%s)' % key)
        ax2.set_xlabel('Prediction (meV/atom)')
        ax2.set_ylabel('reference (meV/atom)')

        img1 = ax2.scatter(value, data['reference_energy'], c=hull_distances['opt'][key], marker='+', cmap='jet')
        cb = fig.colorbar(img1)
        cb.set_label('distance to hull (opt)')

        # Reference line y = x. Joining (xmin, ymin) with (xmax, ymax) is only
        # the true diagonal when both axes share the same limits, so a common
        # range is used for both coordinates.
        x_low, x_high = ax2.get_xlim()
        y_low, y_high = ax2.get_ylim()
        diagonal = (min(x_low, y_low), max(x_high, y_high))
        ax2.plot(diagonal, diagonal, ls='--', color='black')

        pdf.savefig(figure=fig, bbox_inches='tight')
        # Release the figure once it is stored; otherwise one figure per fit
        # method stays open while the loop runs.
        plt.close(fig)

## with CrossValidationEstimator instead of simple Optimizer

In [None]:
# Same hull-distance-weighted fit as with the plain Optimizer, but using a
# CrossValidationEstimator. One CE per fit method is written to disk, and
# RMSEs, summed hull distance and timings are logged to a summary file.
opt_cve = {}
ce_opt_cve_lib = {}
predict_opt_cve = {}
save_file = prevent_overwrite('/nfshome/winkelmann/ARL/tmp/compare_fitting_algorithms_optimized_cve_titanTS')
file_format = '%14s,\t%20s,\t%20s,\t%20s,\t%16s,\t%16s' + os.linesep
# Context managers guarantee the summary file is closed even if a write fails.
with open(save_file, 'w') as file:
    file.write(file_format % ('fit_method', 'RMSE_validation', 'RMSE_train', 'hull_distance', 'comp_time', 'use_time'))

for fit_method in fit_methods:
    start_time = pytime.time()
    cutoffs = best_cutoffs[fit_method]

    # Build the fit matrix A and target vector y, then scale every row by its weight.
    (A, y), mapped_structures = get_fit_data(prim=prim_without_TS, chemical_symbols=chemical_symbols_without_TS, cutoffs=cutoffs,
                                             energy_list=train_H_o_M,
                                             atoms_ref_list=train_structures,
                                             outcar_list=file_location,
                                             position_tolerance=position_tolerance, symprec=symprec, tol_positions=tol_positions)

    A_weighted = np.multiply(A, weights.reshape(-1, 1))
    y_weighted = np.multiply(y, weights)

    # 'ardr-lambda' and 'ardr-lineScan' are notebook-local names for two ARDR
    # configurations; every other name is passed to the estimator unchanged.
    if fit_method == 'ardr-lambda':
        opt_cve[fit_method] = CrossValidationEstimator((A_weighted, y_weighted), fit_method='ardr', threshold_lambda=1000)
    elif fit_method == 'ardr-lineScan':
        opt_cve[fit_method] = CrossValidationEstimator((A_weighted, y_weighted), fit_method='ardr', line_scan=True)
    else:
        opt_cve[fit_method] = CrossValidationEstimator((A_weighted, y_weighted), fit_method=fit_method)
    opt_cve[fit_method].validate()
    opt_cve[fit_method].train()

    # Set up the cluster space matching the cutoffs used for the fit data.
    cs = ClusterSpace(structure=prim_without_TS, cutoffs=cutoffs, chemical_symbols=chemical_symbols_without_TS, position_tolerance=position_tolerance, symprec=symprec)

    ce_opt_cve_lib[fit_method] = ClusterExpansion(cluster_space=cs, parameters=opt_cve[fit_method].parameters, metadata=opt_cve[fit_method].summary)
    ce_opt_cve_lib[fit_method].write('/nfshome/winkelmann/ARL/tmp/mixing_energy_opt_cve_no_TS_%s.ce' % fit_method)
    # Elapsed time is "now - start"; the reversed subtraction logged negative durations.
    comp_time = pytime.time() - start_time

    # Predict with the cross-validated CE built above. Using ce_opt_lib here
    # made these predictions identical to the plain-Optimizer ones.
    # Predictions are scaled by 1e3 (presumably eV -> meV, matching the plot labels).
    start_time = pytime.time()
    predict_opt_cve[fit_method] = [1e3 * ce_opt_cve_lib[fit_method].predict(mapped_structure)
                                   for mapped_structure in mapped_structures]
    use_time = pytime.time() - start_time

    # Distance of every prediction to the reference hull energy.
    hull_distances['opt_cve'][fit_method] = np.absolute(np.subtract(predict_opt_cve[fit_method], data['hull_energy']))

    with open(save_file, 'a') as file:
        file.write(file_format % (fit_method, opt_cve[fit_method].rmse_validation, opt_cve[fit_method].rmse_train, sum(hull_distances['opt_cve'][fit_method]), comp_time, use_time))

# Final row: summed hull distance of the reference data itself, for comparison.
with open(save_file, 'a') as file:
    file.write(file_format % ('reference', '', '', sum(hull_distances['reference']), '', ''))

In [None]:
%%capture
# One PDF page per fit method comparing the plain-Optimizer CE ("opt") with the
# cross-validated CE ("opt_cve"): heat of mixing on the left, prediction vs
# reference on the right. The context manager finalises the PDF even if
# plotting raises.
with PdfPages('/nfshome/winkelmann/ARL/tmp/compare_fitting_algorithms_opt_cve_titanTS.pdf') as pdf:
    for key, value in predict_opt_cve.items():
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 9))

        # ax1 = heat of mixing (predictions & reference) & hull
        ax1.set_title('heat of mixing (%s)' % key)
        ax1.set_xlabel('x in Li$_x$')
        ax1.set_ylabel('Mixing energy (meV/atom)')

        ax1.plot(np.array(data['concentration'])[sorted_concentration_indices],
                 np.array(data['hull_energy'])[sorted_concentration_indices], label='convex hull of reference')
        ax1.scatter(data['concentration'], data['reference_energy'], marker='o', label='reference')
        # Legend labels now name the data actually plotted: predict_opt was
        # labelled as the plain method, and the opt_cve data was drawn a second
        # time under the '_opt' label.
        ax1.scatter(data['concentration'], predict_opt[key], marker='x', label='%s_opt' % key)
        ax1.scatter(data['concentration'], value, marker='+', label='%s_opt_cve' % key)
        ax1.legend()

        # ax2 = reference vs prediction (colour coded by distance to hull)
        ax2.set_title('prediction error plot (%s)' % key)
        ax2.set_xlabel('Prediction (meV/atom)')
        ax2.set_ylabel('reference (meV/atom)')

        img1 = ax2.scatter(value, data['reference_energy'], marker='x', c=hull_distances['opt_cve'][key], cmap='jet')
        cb1 = fig.colorbar(img1)
        cb1.set_label('distance to hull (opt_cve)')
        img2 = ax2.scatter(predict_opt[key], data['reference_energy'], marker='+', c=hull_distances['opt'][key], cmap='jet')
        cb2 = fig.colorbar(img2)
        cb2.set_label('distance to hull (opt)')

        # Reference line y = x. Joining (xmin, ymin) with (xmax, ymax) is only
        # the true diagonal when both axes share the same limits, so a common
        # range is used for both coordinates.
        x_low, x_high = ax2.get_xlim()
        y_low, y_high = ax2.get_ylim()
        diagonal = (min(x_low, y_low), max(x_high, y_high))
        ax2.plot(diagonal, diagonal, ls='--', color='black')

        pdf.savefig(figure=fig, bbox_inches='tight')
        # Release the figure once it is stored; otherwise one figure per fit
        # method stays open while the loop runs.
        plt.close(fig)

In [None]:
%%capture

# One PDF page per fit method with four heat-of-mixing panels that add one
# prediction series at a time: reference only -> + unweighted CE ->
# + weighted CE (opt) -> + cross-validated weighted CE (opt_cve).


def _plot_heat_of_mixing(ax, key, predictions):
    """Draw the reference hull, the reference energies and extra series on ax.

    predictions is an iterable of (values, marker, label) tuples; each one is
    scattered against data['concentration'].
    """
    ax.set_title('heat of mixing (%s)' % key)
    ax.set_xlabel('x in Li$_x$')
    ax.set_ylabel('Mixing energy (meV/atom)')

    ax.plot(np.array(data['concentration'])[sorted_concentration_indices],
            np.array(data['hull_energy'])[sorted_concentration_indices], label='convex hull of reference')
    ax.scatter(data['concentration'], data['reference_energy'], marker='o', label='reference')
    for values, marker, label in predictions:
        ax.scatter(data['concentration'], values, marker=marker, label=label)
    ax.legend()


# The context manager finalises the PDF even if plotting raises.
with PdfPages('/nfshome/winkelmann/ARL/tmp/optimization_evolution.pdf') as pdf:
    for key, value in predict_opt_cve.items():
        fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(36, 9))

        # Series per panel, left to right. Markers are kept as they were.
        # NOTE(review): in the last panel the unweighted and opt series share
        # marker 'x' -- confirm whether that is intended.
        panels = (
            (),
            ((predict_no_opt[key], 'x', key),),
            ((predict_no_opt[key], 'x', key),
             (predict_opt[key], '+', '%s_opt' % key)),
            ((predict_no_opt[key], 'x', key),
             (predict_opt[key], 'x', '%s_opt' % key),
             (value, '+', '%s_opt_cve' % key)),
        )
        for ax, predictions in zip(axes, panels):
            _plot_heat_of_mixing(ax, key, predictions)

        pdf.savefig(figure=fig, bbox_inches='tight')
        # Release the figure once it is stored; otherwise one figure per fit
        # method stays open while the loop runs.
        plt.close(fig)

In [None]:
# Per fit method, print the summed absolute difference between the predictions
# of the cross-validated CE and those of the plain-Optimizer CE.
for fit_method in fit_methods:
    abs_difference = np.absolute(np.subtract(predict_opt_cve[fit_method], predict_opt[fit_method]))
    print(':\t'.join((fit_method, str(sum(abs_difference)))))