# Prerequisites

## Modules

In [None]:
# Modules are available in conda environment with name: icet
# conda activate icet

import ase
from ase.io import read as ASEread
from ase.io.vasp import write_vasp
from ase.db import connect
from ase.cell import Cell
from ase.neighborlist import NewPrimitiveNeighborList
from ase.build import make_supercell

import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import colormaps
import glob
import sys
import os
import random
import shutil

import icet
from icet import ClusterSpace, StructureContainer, ClusterExpansion
from trainstation import CrossValidationEstimator
from icet.tools import enumerate_structures
from icet.tools.structure_generation import generate_sqs_by_enumeration

try:
    import seaborn as sns
    sns.set_context('notebook')
except ImportError:
    print('sad')
    
import subprocess

import datetime

## Misc Functions

In [None]:
# Stop message
def jupyter_stop(ErrorMessage="User-defined stop via jupyter_stop() function"):
    """
    Abort execution on purpose, similar to exit().

    A SystemExit carrying ErrorMessage is raised. Intended for testing and as a
    guard that keeps already generated data from being overwritten.
    """
    stop_signal = SystemExit(ErrorMessage)
    raise stop_signal

## CE Functions

## Reorder Atoms

In [None]:
# "S" is only in these tables to 'trick' the structure enumeration:
# it is used as a stand-in for extra nickel.
atomic_label2number = {"Li" :  3,
                       "O"  :  8,
                       "S"  : 16,
                       "Ni" : 28}

atomic_number2label = { 3 : "Li",
                        8 :  "O",
                       16 :  "S",
                       28 :  "Ni"}


def order_atoms(atoms, order=("Li", "Ni", "O")):
    """
    Return a copy of `atoms` with the atoms grouped by species.

    The species blocks appear in the sequence given by `order`; inside one
    block the atoms keep their original relative order. The input object is
    not modified.

    Parameters
    ----------
    atoms : object providing get_positions(), get_atomic_numbers(), copy(),
        set_positions() and set_atomic_numbers() (e.g. an ASE Atoms object)
    order : sequence of chemical symbols defining the block sequence

    Returns
    -------
    The modified copy of `atoms`.

    Raises
    ------
    KeyError
        If an atomic number is missing in atomic_number2label, or if a species
        present in `atoms` is not listed in `order`.

    Note: only positions and atomic numbers are rewritten on the copy; anything
    else the copy carries is left exactly as atoms.copy() returned it.
    """
    # one (initially empty) bucket of positions per requested species
    positions_by_symbol = {sym: [] for sym in order}

    # sort every position into the bucket of its species
    for num, pos in zip(atoms.get_atomic_numbers(), atoms.get_positions()):
        label = atomic_number2label[num]
        if label not in positions_by_symbol:
            raise KeyError(
                f"species {label!r} is present in the structure but missing in order={list(order)}"
            )
        positions_by_symbol[label].append(pos)

    # concatenate the buckets in the requested sequence
    new_positions = []
    new_atomic_numbers = []
    for sym in order:
        bucket = positions_by_symbol[sym]
        new_positions.extend(bucket)
        new_atomic_numbers.extend([atomic_label2number[sym]] * len(bucket))

    # work on a copy so that the caller's object stays untouched
    copy_atoms = atoms.copy()
    copy_atoms.set_positions(new_positions)
    copy_atoms.set_atomic_numbers(new_atomic_numbers)

    return copy_atoms


# Collect data

In [None]:
### Reference energies of LiNiO2 and NiO2, normalised to two O atoms
### (equivalent to "per Ni" as long as there is no extra Ni)
LiNiO2 = ASEread("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0001_finished_approved/run_final_approved/OUTCAR")
NiO2   = ASEread("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0003_finished_approved/run_final_approved/OUTCAR")

# number of O atoms in each reference cell
O_count_LiNiO2 = LiNiO2.get_chemical_symbols().count("O")
O_count_NiO2   = NiO2.get_chemical_symbols().count("O")

E_ref_LiNiO2_per_O2 = LiNiO2.get_potential_energy() / O_count_LiNiO2 * 2
E_ref_NiO2_per_O2   = NiO2.get_potential_energy() / O_count_NiO2 * 2

## Own enumerated structures based on P21/c

In [None]:
# Li distribution without transition states
atoms_for_training_from_own_enumerated_structures = []
H_o_M_for_training_from_own_enumerated_structures = []

# All OUTCARs of interest, sorted by path
outcars_for_training_from_own_enumerated_structures= sorted(glob.glob("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0*_finished_approved/run_final_approved/OUTCAR"))

for outcar in outcars_for_training_from_own_enumerated_structures:

    # every step is read from the OUTCAR, but only the last one is evaluated
    atoms = ASEread(outcar, index=":")
    last_step = atoms[-1]
    symbols = last_step.get_chemical_symbols()

    # total heat of mixing relative to the LiNiO2 / NiO2 references
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = last_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2

    # To make the mapping easier, store the originally generated structure
    # (two directory levels above the OUTCAR) instead of the relaxed one ...
    ref = outcar.rsplit("/", 2)[0] + "/POSCAR_enumerated"
    atoms_for_training_from_own_enumerated_structures.append(ASEread(ref))

    # ... together with the heat of mixing per atom
    H_o_M_for_training_from_own_enumerated_structures.append( H_o_M / len(last_step) )



## Markus low energy CE data

In [None]:
# Basic setups

def get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions):
    """
    Construct cluster space and structure container for the given cutoffs
    and return the fit data along with the mapped structures.

    Parameters
    ----------
    prim : primitive structure; handed to ClusterSpace and used as the
        reference for the mapping
    chemical_symbols : allowed species per site, passed to ClusterSpace
    cutoffs : cutoffs passed to ClusterSpace
    energy_list : target values, stored per structure under 'Total Energy'
    atoms_ref_list : structures that are mapped onto prim
    outcar_list : identifier (file path) of each structure; used as user_tag
        and in the printed messages
    position_tolerance, symprec : passed to ClusterSpace
    tol_positions : passed to icet.tools.map_structure_to_reference

    The three lists are iterated in lockstep with zip(), i.e. up to the length
    of the shortest one.

    Returns
    -------
    (fit_data, mapped_structures)
        fit_data is what StructureContainer.get_fit_data(key='Total Energy')
        returns; mapped_structures holds the mapped structures that were added
        to the container, in input order.

    Structures for which the mapping or the insertion into the container raises
    a ValueError are reported on stdout and skipped.
    """
    # Print a progress line about every 10 % of the structures. The step must be
    # at least 1: with fewer than 10 structures int(0.1*n) is 0 and the modulo
    # below would raise ZeroDivisionError.
    stepsize = max(1, int(0.1*len(outcar_list)))

    # Collect the mapped structures
    mapped_structures = []

    # Set up ClusterSpace
    cs = ClusterSpace(structure=prim,
                      cutoffs=cutoffs,
                      chemical_symbols=chemical_symbols,
                      position_tolerance=position_tolerance,
                      symprec=symprec)

    # Set up StructureContainer with the previously generated ClusterSpace
    sc = StructureContainer(cluster_space=cs)

    # Fill the StructureContainer
    for i, (outcar, E, at_ref) in enumerate(zip(outcar_list, energy_list, atoms_ref_list)):

        # print update of training
        if i % stepsize == 0:
            print(f"Computing structure {i} of {len(outcar_list)} ({i/len(outcar_list):.1%})   {datetime.datetime.now()}")

        # Map the reference structure to the primitive cell and add it to the
        # container with the energy E supplied by the caller (the OUTCAR itself
        # is not read here).
        try:
            mapped_atoms, info = icet.tools.map_structure_to_reference(structure=at_ref,
                                                             reference=prim,
                                                             inert_species=["O"],
                                                             tol_positions=tol_positions,
                                                             suppress_warnings=False,
                                                             assume_no_cell_relaxation=False)

            sc.add_structure(structure=mapped_atoms,
                     properties={'Total Energy': E},
                     user_tag = outcar,
                     sanity_check=True,
                     )

            # Record the structure only after it has been added, so that
            # mapped_structures holds exactly the structures in the container.
            mapped_structures.append(mapped_atoms)

        except ValueError as err:
            print(f"Mapping Error with {outcar}")
            print("Note: Possibly a different structure was used for the mapping!")
            print("Original Error Message:")
            print(err , "\n")

    print(f"len(cs) = {len(cs)}")

    return sc.get_fit_data(key='Total Energy'), mapped_structures


def get_A_y(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions):
    """Alias of get_fit_data: forwards all arguments unchanged and returns its result."""
    fit_result = get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions)
    return fit_result



def get_row(cve, alpha=None):
    """
    Collect the fit metrics of a cross-validation estimator in a dict.

    Parameters
    ----------
    cve : object exposing rmse_validation, rmse_train, model.BIC,
        n_parameters and n_nonzero_parameters
    alpha : optional value stored under 'alpha'; the key is left out when
        alpha is None

    Returns
    -------
    dict with the keys 'rmse_validation', 'rmse_train', 'BIC', 'n_parameters',
    'n_nonzero_parameters' and, if given, 'alpha'.
    """
    row = {
        'rmse_validation': cve.rmse_validation,
        'rmse_train': cve.rmse_train,
        'BIC': cve.model.BIC,
        'n_parameters': cve.n_parameters,
        'n_nonzero_parameters': cve.n_nonzero_parameters,
    }

    # identity test: `!= None` would compare element-wise for array-like alpha
    if alpha is not None:
        row['alpha'] = alpha

    return row


def train_ce(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions, fit_method):
    """
    Fit a cluster expansion for one set of cutoffs and return its fit metrics.

    prim: ase atoms object, the primitive structure that the CE lives on
    chemical_symbols: list of the possible atom types on the different sites of prim
    cutoffs: cutoffs for the 2-body, 3-body, ... terms
    energy_list: target values of the structures
    atoms_ref_list: list of all the atoms objects to use for training/testing
    outcar_list: list with paths (strings) of the corresponding atoms objects
    position_tolerance, symprec, tol_positions: forwarded to get_fit_data
    fit_method: name of the fit method. Two names are translated here:
        'ardr-lambda'   -> fit_method='ardr', threshold_lambda=1000
        'ardr-lineScan' -> fit_method='ardr', line_scan=True
      every other name (e.g. 'rfe', 'lasso', 'least-squares') is handed to the
      CrossValidationEstimator as it is.

    Validation always uses 'shuffle-split' with 10 splits.
    Returns the dict produced by get_row() for the trained estimator.
    """
    fit_data, _ = get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions)
    A, y = fit_data

    # translate the two ARDR flavours into estimator options
    estimator_method, extra_options = fit_method, {}
    if fit_method == 'ardr-lambda':
        estimator_method, extra_options = 'ardr', {'threshold_lambda': 1000}
    elif fit_method == 'ardr-lineScan':
        estimator_method, extra_options = 'ardr', {'line_scan': True}

    cve = CrossValidationEstimator((A, y), fit_method=estimator_method, **extra_options, validation_method='shuffle-split', n_splits=10)
    cve.validate()
    cve.train()

    return get_row(cve)

def prevent_overwrite(file_name,add=''):
    """
    Return a path based on file_name that does not exist yet.

    file_name + add is tried first. While the candidate exists, the suffix is
    replaced by the next integer ('' -> '1' -> '2' -> ...), always appended to
    the unchanged file_name.

    Parameters
    ----------
    file_name : base path
    add : initial suffix; must be a decimal integer string (or '') as soon as
        file_name + add exists, otherwise int() raises ValueError

    Returns
    -------
    The first candidate path for which os.path.exists() is False.
    """
    # Iterative search: a recursive version fails with RecursionError once
    # roughly a thousand numbered files are present.
    suffix = add
    while os.path.exists(file_name + suffix):
        suffix = '1' if suffix == '' else str(int(suffix) + 1)
    return file_name + suffix

In [None]:
# Li distribution without transition states
# Markus' low-energy structures, re-relaxed here: the structure comes from the
# rotated CONTCAR/POSCAR, the energy from the final OUTCAR of the re-relaxation.
# The README files mark the folders that belong to this data set.

atoms_for_training_from_Markus_low_energy_structures   = []
H_o_M_for_training_from_Markus_low_energy_structures   = []
outcars_for_training_from_Markus_low_energy_structures = []

READMEs = glob.glob("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/03_Markus_approved_low_energy_data/*/README_original_path_from_Markus")

for README in READMEs:

    # only folders that contain a run_final directory are used
    if not os.path.isdir(README.replace("README_original_path_from_Markus","run_final")):
        continue

    # Structure used for the mapping onto our prim structure later.
    # For the 2 structures that made problems, the original (rotated) POSCAR is
    # taken from the location given in the first line of the README.
    is_problem_case = "re-relax_Markus038_finished" in README or "re-relax_Markus115_finished" in README
    if is_problem_case:
        with open(README, "r") as f:
            line = f.readlines()[0]
        transformed_contcar = "/".join(line.split("/")[:-1]) + "/run01/POSCAR_rotated.vasp"
    else:
        transformed_contcar = README.replace("README_original_path_from_Markus","run_final/CONTCAR_rotated.vasp")
    atoms_transformed_contcar = ASEread(transformed_contcar)
    atoms_for_training_from_Markus_low_energy_structures.append(atoms_transformed_contcar)

    # OUTCAR of the relaxation provides the energy
    outcar = README.replace("README_original_path_from_Markus","run_final/OUTCAR")
    outcars_for_training_from_Markus_low_energy_structures.append(outcar)
    atoms = ASEread(outcar, index=":")
    last_step = atoms[-1]
    symbols = last_step.get_chemical_symbols()

    # heat of mixing, stored per atom
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = last_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2
    H_o_M_for_training_from_Markus_low_energy_structures.append( H_o_M / len(last_step) )
        

## NEB initial and final images (without Ni_Li)

In [None]:
# Li distribution without transition states
atoms_for_training_NEB_initial_and_final_images = []
H_o_M_for_training_NEB_initial_and_final_images = []

neb_endpoint_patterns = (
    # the ordered ones from 0250, 0500 and 0750 first
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0250/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0500/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0750/image*/02_scan/*final/OUTCAR",
    # and the ones from the random structures
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/01_initial_structure/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/02_odh/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/03_tsh/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/04_double_tsh/image*/02_scan/*final/OUTCAR",
)

# gather the OUTCAR paths pattern by pattern, keeping the order above
outcars_for_training_NEB_initial_and_final_images = []
for pattern in neb_endpoint_patterns:
    outcars_for_training_NEB_initial_and_final_images += glob.glob(pattern, recursive=True)

for OUTCAR in outcars_for_training_NEB_initial_and_final_images:

    # every step is read, only the last one is used
    atoms = ASEread(OUTCAR, index=":")
    last_step = atoms[-1]
    symbols = last_step.get_chemical_symbols()

    # heat of mixing relative to the LiNiO2 / NiO2 references
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = last_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2

    # store the structure and its heat of mixing per atom
    atoms_for_training_NEB_initial_and_final_images.append(last_step)
    H_o_M_for_training_NEB_initial_and_final_images.append( H_o_M / len(last_step) )

## NEB transition states

In [None]:
# Li transition states
fig, ax = plt.subplots()

atoms_for_training_NEB_transition_states = []
H_o_M_for_training_NEB_transition_states = []

neb_run_patterns = (
    # the ones generated manually (0250, 0500, 0750)
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0250/NEB_*_finished/run_final",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0500/NEB_*_finished/run_final",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0750/NEB_*_finished/run_final",
    # the random ones
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/02_odh/NEB_*/run_final",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/03_tsh/NEB_*/run_final",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/04_double_tsh/NEB_*/run_final",
)

paths_for_training_NEB_transition_states = []
for pattern in neb_run_patterns:
    paths_for_training_NEB_transition_states += glob.glob(pattern)

# NOTE(review): paths that are ignored below stay in
# paths_for_training_NEB_transition_states, so this list can be longer than the
# two training lists filled in the loop.
for path in paths_for_training_NEB_transition_states:

    # Energy profile along the path: relaxed initial image, last step of the
    # five optimized intermediate images, relaxed final image
    energies = [ASEread(path.replace("run_final", "OUTCAR_initial_image")).get_potential_energy()]
    energies.extend(ASEread(f"{path}/{image}/OUTCAR").get_potential_energy() for image in ("01", "02", "03", "04", "05"))
    energies.append(ASEread(path.replace("run_final", "OUTCAR_final_image")).get_potential_energy())
    ax.plot(list(range(7)), np.array(energies)-energies[0])

    # A "proper" path has its energy maximum !between! the initial and the
    # final image; paths where this is not the case are ignored
    highest_energy = max(energies)
    index_highest_energy = energies.index(highest_energy)

    if index_highest_energy in (0, 6):
        print(f"Ignore {path}\n  ---> image {index_highest_energy} has highest energy!")
        continue

    # Ideal position for the CE training: middle point of the initially
    # created, linearly interpolated path
    ideal_TS_structure_file = glob.glob(path.replace("run_final", "run01*/03/POSCAR_orig_linear_interpolation"))
    if not ideal_TS_structure_file:                # For ODH-type jumps there is no POSCAR_orig_linear_interpolation
        ideal_TS_structure_file = glob.glob(path.replace("run_final", "run01*/03/POSCAR"))
    ideal_TS_atoms = ASEread(ideal_TS_structure_file[0])

    atoms_for_training_NEB_transition_states.append(ideal_TS_atoms)

    # heat of mixing per atom, based on the highest energy along the path
    symbols = ideal_TS_atoms.get_chemical_symbols()
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = highest_energy - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2
    H_o_M_for_training_NEB_transition_states.append( H_o_M / len(ideal_TS_atoms) )
        
    


## Combine data

In [None]:
# All training structures in one list (transition states are left out) ...
train_structures = [
    *atoms_for_training_from_own_enumerated_structures,
    *atoms_for_training_from_Markus_low_energy_structures,
    *atoms_for_training_NEB_initial_and_final_images,
    # *atoms_for_training_NEB_transition_states,
]

# ... the matching heats of mixing, in the same order ...
train_H_o_M = [
    *H_o_M_for_training_from_own_enumerated_structures,
    *H_o_M_for_training_from_Markus_low_energy_structures,
    *H_o_M_for_training_NEB_initial_and_final_images,
    # *H_o_M_for_training_NEB_transition_states,
]

# ... and the file paths, kept to be able to retrieve problematic files
file_location = [
    *outcars_for_training_from_own_enumerated_structures,
    *outcars_for_training_from_Markus_low_energy_structures,
    *outcars_for_training_NEB_initial_and_final_images,
    # *paths_for_training_NEB_transition_states,
]

# Fitting of just the Li sublattice

In [None]:
# Primitive LiNiO2 model in R-3m symmetry (without transition states)
prim_without_TS = ASEread("/nfshome/sadowski/work/LiNiO2_Sabrina/37_CE_for_Li_diffusion/00_LNO_R-3m.vasp")

# print the structure and then its chemical symbols
for summary in (prim_without_TS, prim_without_TS.get_chemical_symbols()):
    print(summary)

In [None]:
# Allowed species, one entry per site
chemical_symbols_without_TS = [
    ['Li', 'X'],  # Li sublattice: Li and vacancies (=X), later also Ni
    ['Ni'],       # Ni sublattice is not changed
    ['O'],        # O  sublattice is not changed
    ['O'],
]

<font color='red'>  deleted first fitting with CVE</font>

<font color='red'> deleted training of ardr_lambda </font>
just left the setting of the Clusterspace

<font color='red'> deleted code cell for evaluating the model on the training structures </font>
kept initialization of data[] (without predicted_energy)

<font color='red'> deleted code cell for plotting the ardr_lambda fitting vs reference data with respect to the different data sets </font>

# Optimizing CE

## finding cutoffs
List possible cutoffs and define standard variables

In [None]:
# Candidate cutoffs: 37 values, derived from the neighbour table further below
c2_vals = [
     2.84,  2.85,  4.93,  5.00,  5.69,  5.75,  6.42,  7.53,  7.57,  8.09,
     8.53,  8.57,  9.47,  9.58,  9.85,  9.89, 10.00, 10.26, 10.29, 10.39,
    11.10, 11.14, 11.38, 11.50, 11.76, 11.84, 12.40, 12.42, 12.51, 12.75,
    12.83, 13.03, 13.14, 13.66, 13.74, 13.96, 14.15,
]

# tolerances used for the cluster space and the structure mapping
position_tolerance = 0.01
symprec = 0.01
tol_positions=0.05

# fit methods that are compared, and the container for the scan results
fit_methods = [
    'ardr-lambda',
    'ardr-lineScan',
    'rfe',
    'lasso',
    'least-squares',
    'bayesian-ridge',
    'elasticnet',
    'omp',
    'ridge',
    'split-bregman',
]
records = {}

### pair cutoff

In [None]:
# Guard: the scan below takes a while, so the cell stops here unless this line
# is disabled on purpose.
jupyter_stop('dont rerun cutoff-2-scan if not necessary - needs some time')

# Scan the pair cutoff for every fit method and log each result to a text file.
df2 = {}

save_file = prevent_overwrite('/nfshome/winkelmann/ARL/tmp/cutoff_2_noTS_fit')
file_format = '%14s,\t%8s,\t%15s,\t%15s,\t%15s,\t%5s,\t%5s' + os.linesep

# header line
with open(save_file,'w') as file:
    file.write(file_format % ('fit_method', 'cutoff2', 'validation', 'train', 'BIC', 'number', 'nonzero'))

for fit_method in fit_methods:
    records[fit_method]= []
    for c2 in c2_vals:
        cutoffs = [c2]
        row = train_ce(prim=prim_without_TS,
                       chemical_symbols=chemical_symbols_without_TS,
                       cutoffs=cutoffs,
                       energy_list = train_H_o_M,
                       atoms_ref_list = train_structures,
                       outcar_list = file_location,
                       position_tolerance = position_tolerance,
                       symprec = symprec,
                       tol_positions = tol_positions,
                       fit_method = fit_method)
        records[fit_method].append({'c2': c2, **row})

        # the file is reopened for every row, so each result is written to
        # disk as soon as it has been computed
        with open(save_file,'a') as file:
            file.write(file_format % (fit_method, c2, row['rmse_validation'], row['rmse_train'], row['BIC'], row['n_parameters'], row['n_nonzero_parameters']))

    # one table per fit method, built from that method's rows only
    # (not from the whole `records` dict)
    df2[fit_method]= pd.DataFrame(records[fit_method])
print(save_file)

#Total number of Li-Li Bonds as function of cutoff (intra layer/inter layer):
#                       same Layer          next layer x2   second next layer x2
# 2.84    0  ( 0/ 0)    
# 2.85    6  ( 6/ 0)    first  (6)
# 4.93   12  (12/ 0)    second (6)
# 5.00   18  (12/ 6)                        first  (3) 
# 5.69   24  (18/ 6)    third  (6)
# 5.75   30  (18/12)                        second (3) 
# 6.42   42  (18/24)                        third  (6) 
# 7.53   54  (30/24)    fourth (12) 
# 7.57   66  (30/36)                        fourth (6) 
# 8.09   72  (30/42)                        fifth  (3) 
# 8.53   78  (36/42)    fifth  (6)
# 8.57   90  (36/54)                        sixth  (6) 
# 9.47   96  (36/60)                        seventh(3)
# 9.58  102  (36/60/6)                                      first  (3)
# 9.85  108  (42/60/6)  sixth  (6)          
# 9.89  120  (42/72/6)                      eighth (6)
#10.00  126  (42/72/12)                                     second (3)
#10.26  138  (54/72/12) seventh(12)
#10.29  150  (54/84/12)                     ninth  (6)
#10.39  162  (54/84/24)                                     third  (6)
#11.10  174  (54/96/24)                     tenth(6)
#11.14  186  (54/96/36)                                     fourth(6)
#11.38  192  (60/96/36) eighth(6)
#11.50  198  (60/96/42)                                     fifth(3)
#11.76  210  (60/108/42)                    ninth(6)
#11.84  222  (60/108/54)                                    sixth(6)
#12.40  234  (72/108/54) ninth(12)
#12.42  252  (72/126/54)                    tenth(9)
#12.51  258  (72/126/60)                                    seventh(3)
#12.75  270  (72/138/60)                    eleventh(6)
#12.83  282  (72/138/72)                                    eighth(6)
#13.03  294  (84/138/72) tenth(12)
#13.14  306  (84/138/84)                                    ninth(6)
#13.66  318  (84/150/84)                    twelfth(6)
#13.74  330  (84/150/96)                                    tenth(6)
#13.96  336  (84/156/96)                    thirteenth(3)
#14.15  338  (84/156/96/2)                                                  first(1)
# up to a cutoff of 12 the count stays at 222 neighbours

### triplet cutoff

In [None]:
# Read the saved results of the pair-cutoff scan and pick, for every fit
# method, the cutoff with the lowest validation RMSE.

best_cutoffs2 = {}
df2 = {}

# column names, in the order of the comma-separated fields after fit_method
result_columns = ['c2', 'rmse_validation', 'rmse_train', 'BIC', 'n_parameters', 'n_nonzero_parameters']

for fit_method in fit_methods:
    df2[fit_method] = {column: [] for column in result_columns}

# the context manager closes the file again after reading
with open('/nfshome/winkelmann/ARL/save/cutoff_2_noTS_fitting_data','r') as file:
    file.readline()              # skip the header line
    lines = file.readlines()

for line in lines:
    # blank lines (e.g. at the end of the file) carry no data
    if not line.strip():
        continue
    line = line.replace(' ', '')
    values = line.split(',')
    # values[0] is the fit method, values[1..6] are the numeric columns
    for position, column in enumerate(result_columns, start=1):
        df2[values[0]][column].append(float(values[position]))

for fit_method in fit_methods:
    df2[fit_method] = pd.DataFrame(df2[fit_method])
    best_cutoffs2[fit_method] = df2[fit_method].c2[df2[fit_method].rmse_validation.idxmin()]

for key,cutoff in best_cutoffs2.items():
    print("%s: %s" %(key, cutoff))
max_cutoff2 = max(best_cutoffs2.values())
print('Max pair cutoff: %s' % max_cutoff2)

In [None]:
# Guard: the scan below takes a while
jupyter_stop('dont rerun cutoff-3-scan if not necessary - needs some time')

# Triplet cutoffs to scan: the first 10 candidates, i.e. up to 8.09 = c2_vals[9]
# (originally meant as c2_vals[0: biggest min(rmse_validation) cutoff2])
c3_vals = c2_vals[:10]
df3 = {}

save_file = prevent_overwrite('/nfshome/winkelmann/ARL/tmp/cutoff_noTS_3_fit')
file_format = '%14s,\t%8s,\t%15s,\t%15s,\t%15s,\t%5s,\t%5s' + os.linesep

# header line
with open(save_file,'w') as file:
    file.write(file_format % ('fit_method', 'cutoff3', 'validation', 'train', 'BIC', 'number', 'nonzero'))

for fit_method in fit_methods:
    records[fit_method]= []
    for c3 in c3_vals:
        # pair cutoff fixed to the best one of this fit method
        cutoffs = [best_cutoffs2[fit_method], c3]
        row = train_ce(
            prim=prim_without_TS,
            chemical_symbols=chemical_symbols_without_TS,
            cutoffs=cutoffs,
            energy_list=train_H_o_M,
            atoms_ref_list=train_structures,
            outcar_list=file_location,
            position_tolerance=position_tolerance,
            symprec=symprec,
            tol_positions=tol_positions,
            fit_method=fit_method,
        )
        records[fit_method].append({'c3': c3, **row})

        # append the result of this run to the log file
        log_entry = file_format % (fit_method, c3, row['rmse_validation'], row['rmse_train'], row['BIC'], row['n_parameters'], row['n_nonzero_parameters'])
        with open(save_file,'a') as file:
            file.write(log_entry)
    df3[fit_method]= pd.DataFrame(records[fit_method])
print(save_file)

### quartet cutoff

In [None]:
# Read the saved results of the triplet-cutoff scan and pick, for every fit
# method, the cutoff with the lowest validation RMSE.

best_cutoffs3 = {}
df3 = {}

# column names, in the order of the comma-separated fields after fit_method
result_columns = ['c3', 'rmse_validation', 'rmse_train', 'BIC', 'n_parameters', 'n_nonzero_parameters']

for fit_method in fit_methods:
    df3[fit_method] = {column: [] for column in result_columns}

# the context manager closes the file again after reading
with open('/nfshome/winkelmann/ARL/save/cutoff_3_noTS_fitting_data','r') as file:
    file.readline()              # skip the header line
    lines = file.readlines()

for line in lines:
    # blank lines (e.g. at the end of the file) carry no data
    if not line.strip():
        continue
    line = line.replace(' ', '')
    values = line.split(',')
    # lines with an extra (8th) field are only echoed, not used
    if len(values) > 7:
        print(values[7] + ',' + str(values[0:7]))
        continue
    # values[0] is the fit method, values[1..6] are the numeric columns
    for position, column in enumerate(result_columns, start=1):
        df3[values[0]][column].append(float(values[position]))

for fit_method in fit_methods:
    df3[fit_method] = pd.DataFrame(df3[fit_method])
    best_cutoffs3[fit_method] = df3[fit_method].c3[df3[fit_method].rmse_validation.idxmin()]

for key,cutoff in best_cutoffs3.items():
    print("%s: %s" %(key, cutoff))
max_cutoff3 = max(best_cutoffs3.values())
print('Max triplet cutoff: %s' % max_cutoff3)

In [None]:
# Guard: the scan below takes a while
jupyter_stop('dont rerun cutoff-4-scan if not necessary - needs some time')

# Quartet cutoffs to scan: all candidates up to the largest best triplet
# cutoff, plus the two following candidates
last_c4_position = c2_vals.index(max_cutoff3) + 3
c4_vals = c2_vals[:last_c4_position]
df4 = {}

save_file = prevent_overwrite('/nfshome/winkelmann/ARL/tmp/cutoff_4_noTS_fit')
file_format = '%14s,\t%8s,\t%15s,\t%15s,\t%15s,\t%5s,\t%5s' + os.linesep

# header line
with open(save_file,'w') as file:
    file.write(file_format % ('fit_method', 'cutoff4', 'validation', 'train', 'BIC', 'number', 'nonzero'))

for fit_method in fit_methods:
    records[fit_method]= []
    for c4 in c4_vals:
        # pair and triplet cutoffs fixed to the best ones of this fit method
        cutoffs = [best_cutoffs2[fit_method], best_cutoffs3[fit_method], c4]
        row = train_ce(
            prim=prim_without_TS,
            chemical_symbols=chemical_symbols_without_TS,
            cutoffs=cutoffs,
            energy_list=train_H_o_M,
            atoms_ref_list=train_structures,
            outcar_list=file_location,
            position_tolerance=position_tolerance,
            symprec=symprec,
            tol_positions=tol_positions,
            fit_method=fit_method,
        )
        records[fit_method].append({'c4': c4, **row})

        # append the result of this run to the log file
        log_entry = file_format % (fit_method, c4, row['rmse_validation'], row['rmse_train'], row['BIC'], row['n_parameters'], row['n_nonzero_parameters'])
        with open(save_file,'a') as file:
            file.write(log_entry)
    df4[fit_method]= pd.DataFrame(records[fit_method])
print(save_file)

In [None]:
# Read the saved results of the quartet-cutoff scan and pick, for every fit
# method, the cutoff with the lowest validation RMSE.

best_cutoffs4 = {}
df4 = {}

# column names, in the order of the comma-separated fields after fit_method
result_columns = ['c4', 'rmse_validation', 'rmse_train', 'BIC', 'n_parameters', 'n_nonzero_parameters']

for fit_method in fit_methods:
    df4[fit_method] = {column: [] for column in result_columns}

# the context manager closes the file again after reading
with open('/nfshome/winkelmann/ARL/save/cutoff_4_noTS_fitting_data','r') as file:
    file.readline()              # skip the header line
    lines = file.readlines()

for line in lines:
    # blank lines (e.g. at the end of the file) carry no data
    if not line.strip():
        continue
    line = line.replace(' ', '')
    values = line.split(',')
    # lines with an extra (8th) field are only echoed, not used
    if len(values) > 7:
        print(values[7] + ',' + str(values[0:7]))
        continue
    # values[0] is the fit method, values[1..6] are the numeric columns
    for position, column in enumerate(result_columns, start=1):
        df4[values[0]][column].append(float(values[position]))

for fit_method in fit_methods:
    df4[fit_method] = pd.DataFrame(df4[fit_method])
    best_cutoffs4[fit_method] = df4[fit_method].c4[df4[fit_method].rmse_validation.idxmin()]

for key,cutoff in best_cutoffs4.items():
    print("%s: %s" %(key, cutoff))

In [None]:
# Write the best pair/triplet/quartet cutoff of every fit method to a new file

save_file = prevent_overwrite('/nfshome/winkelmann/ARL/tmp/best_cutoffs_noTS')
file_format = '%14s,\t%8s,\t%8s,\t%8s' + os.linesep

# the context manager closes the file even if a lookup below fails
with open(save_file,'w') as file:
    file.write(file_format % ('fit_method', 'cutoff2', 'cutoff3', 'cutoff4'))
    for fit_method in fit_methods:
        file.write(file_format % (fit_method, best_cutoffs2[fit_method], best_cutoffs3[fit_method], best_cutoffs4[fit_method]))


## comparing fitting algorithms

In [None]:
# Containers for later use ('predicted_energy' is not filled in this cell)
data = {'concentration': [], 'reference_energy': [], 'predicted_energy': [], 'file_location': []}

# NOTE(review): mapped_structures_without_TS is defined outside this cell. If it
# is shorter than file_location (e.g. because some structures could not be
# mapped), zip() pairs structures with the wrong energies/paths - verify.
for outcar, mapped_structure, h_o_m, location in zip(file_location, mapped_structures_without_TS, train_H_o_M, file_location):

    try:
        symbols = mapped_structure.get_chemical_symbols()

        # Li concentration: number of Li per two O atoms
        li_concentration = symbols.count("Li")/(symbols.count("O")/2)
        data['concentration'].append(li_concentration)

        # reference energy; the factor of 1e3 converts from eV/atom to meV/atom
        data['reference_energy'].append(1e3 * h_o_m)

        # keep the file location to allow parsing
        data['file_location'].append(location)

    # report the file and go on in case something goes wrong
    except Exception as err:
        print(f"Problems with {outcar}")
        print(f"Original Error Message:\n {err}\n")

In [None]:
# ClusterSpace on the primitive cell.
# NOTE(review): `cutoffs` is whatever was assigned last in an earlier cell -
# confirm it holds the intended values before running this.
cs = ClusterSpace(
    structure=prim_without_TS,
    chemical_symbols=chemical_symbols_without_TS,
    cutoffs=cutoffs,
    symprec=symprec,
    position_tolerance=position_tolerance,
)

In [None]:
import time as pytime

# Per-fit-method bookkeeping, keyed by a short method label:
# wall time of the fit, the resulting ClusterExpansion, and its RMSE scores.
comp_time, ce_fittings, rmse = {}, {}, {}

In [None]:
# Cross-validate and train a CE with fit_method='ardr' and a fixed threshold_lambda,
# store its RMSE scores, write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

lambda_values = 1000  # factor for standard deviation?
cve = CrossValidationEstimator(
    (A_without_TS, y_without_TS),
    fit_method='ardr',
    threshold_lambda=lambda_values,
)
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_ardr_lambda = pd.DataFrame([records])
print(df_ardr_lambda)
print(records)
rmse['ardr_lambda'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_ardr_lambda = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_ardr_lambda.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_ardr_lambda.ce')
ce_fittings['ardr_lambda'] = ce_ardr_lambda

# Elapsed time covers everything above, including printing and the file write.
comp_time['ardr_lambda'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='ardr' and line_scan=True,
# store its RMSE scores, write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator(
    (A_without_TS, y_without_TS),
    fit_method='ardr',
    line_scan=True,
)
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_ardr_lineScan = pd.DataFrame([records])
print(df_ardr_lineScan)
print(records)
rmse['ardr_lineScan'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_ardr_lineScan = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_ardr_lineScan.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_ardr_lineScan.ce')
ce_fittings['ardr_lineScan'] = ce_ardr_lineScan

# Elapsed time covers everything above, including printing and the file write.
comp_time['ardr_lineScan'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='rfe', store its RMSE scores,
# write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='rfe')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_rfe = pd.DataFrame([records])  # strange error when not giving a list
print(df_rfe)
print(records)
rmse['rfe'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_rfe = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_rfe.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_rfe.ce')
ce_fittings['rfe'] = ce_rfe

# Elapsed time covers everything above, including printing and the file write.
comp_time['rfe'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='lasso', store its RMSE scores,
# write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='lasso')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_lasso = pd.DataFrame([records])  # strange error when not giving a list
print(df_lasso)
print(records)
rmse['lasso'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_lasso = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_lasso.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_lasso.ce')
ce_fittings['lasso'] = ce_lasso

# Elapsed time covers everything above, including printing and the file write.
comp_time['lasso'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='least-squares', store its RMSE
# scores, write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='least-squares')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_least_squares = pd.DataFrame([records])  # strange error when not giving a list
print(df_least_squares)
print(records)
rmse['least_squares'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_least_squares = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_least_squares.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_least_squares.ce')
ce_fittings['least_squares'] = ce_least_squares

# Elapsed time covers everything above, including printing and the file write.
comp_time['least_squares'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='bayesian-ridge', store its RMSE
# scores, write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='bayesian-ridge')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_bayesian_ridge = pd.DataFrame([records])  # strange error when not giving a list
print(df_bayesian_ridge)
print(records)
rmse['bayesian_ridge'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_bayesian_ridge = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_bayesian_ridge.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_bayesian_ridge.ce')
ce_fittings['bayesian_ridge'] = ce_bayesian_ridge

# Elapsed time covers everything above, including printing and the file write.
comp_time['bayesian_ridge'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='elasticnet', store its RMSE
# scores, write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='elasticnet')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_elasticnet = pd.DataFrame([records])  # strange error when not giving a list
print(df_elasticnet)
print(records)
rmse['elasticnet'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_elasticnet = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_elasticnet.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_elasticnet.ce')
ce_fittings['elasticnet'] = ce_elasticnet

# Elapsed time covers everything above, including printing and the file write.
comp_time['elasticnet'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='omp', store its RMSE scores,
# write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='omp')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_omp = pd.DataFrame([records])  # strange error when not giving a list
print(df_omp)
print(records)
rmse['omp'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_omp = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_omp.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_omp.ce')
ce_fittings['omp'] = ce_omp

# Elapsed time covers everything above, including printing and the file write.
comp_time['omp'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='ridge', store its RMSE scores,
# write the CE to disk and record the elapsed wall time.
# Open question: how does ridge regression behave here without explicit
# regularization parameters?
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='ridge')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_ridge = pd.DataFrame([records])  # strange error when not giving a list
print(df_ridge)
print(records)
rmse['ridge'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_ridge = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_ridge.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_ridge.ce')
ce_fittings['ridge'] = ce_ridge

# Elapsed time covers everything above, including printing and the file write.
comp_time['ridge'] = pytime.time() - start_time

In [None]:
# Cross-validate and train a CE with fit_method='split-bregman', store its RMSE
# scores, write the CE to disk and record the elapsed wall time.
start_time = pytime.time()

cve = CrossValidationEstimator((A_without_TS, y_without_TS), fit_method='split-bregman')
cve.validate()
cve.train()

# get_row() is defined elsewhere in this notebook; its result is used as a mapping
# that provides at least 'rmse_validation' and 'rmse_train'.
records = get_row(cve)
df_split_bregman = pd.DataFrame([records])  # strange error when not giving a list
print(df_split_bregman)
print(records)
rmse['split_bregman'] = dict(validation=records['rmse_validation'], train=records['rmse_train'])

ce_split_bregman = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
ce_split_bregman.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_split_bregman.ce')
ce_fittings['split_bregman'] = ce_split_bregman

# Elapsed time covers everything above, including printing and the file write.
comp_time['split_bregman'] = pytime.time() - start_time

## plotting and comparing results

In [None]:
# For every fit method that has a recorded computation time, predict the energy
# of all mapped training structures with the corresponding cluster expansion.
# The factor 1e3 matches the scaling applied to the reference energies.
data_predict_opt = {}
for key in comp_time:
    data_predict_opt[key] = [
        1e3 * ce_fittings[key].predict(mapped_structure)
        for outcar, mapped_structure, h_o_m, location
        in zip(file_location, mapped_structures_without_TS, train_H_o_M, file_location)
    ]

In [None]:
# Compare every fit method against the convex hull of the reference data and
# append one summary line per method to a log file.
from icet.tools import ConvexHull

# Convex hull of the reference energies and its value at every data point
hull = ConvexHull(data['concentration'], data['reference_energy'])
data['hull_energy'] = [hull.get_energy_at_convex_hull(concentration)
                       for concentration in data['concentration']]

# Absolute distance between each method's predictions and the reference hull
hull_distances = {}

# Opened in append mode so that earlier runs stay in the log; the context manager
# closes the file even if a lookup for one of the methods fails.
with open('/nfshome/winkelmann/ARL/tmp/comparing_optimization', 'a') as file:
    for key, value in data_predict_opt.items():
        hull_distances[key] = np.absolute(np.subtract(data['hull_energy'], value))
        # Sum the distances of *this* method (the original referenced an undefined name)
        write_string = '%14s \t RMSE validation: %22s \t train: %22s \t hull: %s \t time: %s' % (
            key, rmse[key]['validation'], rmse[key]['train'],
            hull_distances[key].sum(), comp_time[key])
        # '\n' is translated to the platform line ending by the text-mode file
        file.write(write_string + '\n')
        print(write_string)

In [None]:
# One PDF page per fit method: heat of mixing vs. concentration (left) and a
# reference-vs-prediction plot colour coded by the distance to the hull (right).
sorted_concentration_indices = np.argsort(data['concentration'])
sorted_concentrations = np.array(data['concentration'])[sorted_concentration_indices]
# Use the hull energies stored in data['hull_energy'] by the hull-distance cell
sorted_hull_energies = np.array(data['hull_energy'])[sorted_concentration_indices]

# The context manager finalises the PDF even if plotting one of the pages fails
with PdfPages('/nfshome/winkelmann/ARL/tmp/comparing_optimization.pdf') as pdf:
    for key, value in data_predict_opt.items():
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 9))

        # ax1 = heat of mixing (prediction & reference) & hull
        ax1.set_title('heat of mixing (%s)' % key)
        ax1.set_xlabel('x in Li$_x$')
        ax1.set_ylabel('Mixing energy (meV/atom)')

        ax1.plot(sorted_concentrations, sorted_hull_energies, label='convex hull of reference')
        ax1.scatter(data['concentration'], data['reference_energy'], marker='o', label='reference')
        ax1.scatter(data['concentration'], value, marker='x', label=key)
        ax1.legend()  # the labels above are only shown once the legend is drawn

        # ax2 = reference vs prediction (colour coded by the distance to the hull)
        ax2.set_title('prediction error plot (%s)' % key)
        # x holds the reference energies and y the predictions, so label accordingly
        ax2.set_xlabel('reference (meV/atom)')
        ax2.set_ylabel('Prediction (meV/atom)')

        img = ax2.scatter(data['reference_energy'], value, c=hull_distances[key], cmap='coolwarm')
        cb = fig.colorbar(img, ax=ax2)  # attach the colorbar to the panel it describes
        cb.set_label('distance to hull (predicted)')

        # True y = x diagonal spanning both axis ranges, for reference
        lower = min(ax2.get_xlim()[0], ax2.get_ylim()[0])
        upper = max(ax2.get_xlim()[1], ax2.get_ylim()[1])
        ax2.plot([lower, upper], [lower, upper], ls='--', color='black')

        pdf.savefig(figure=fig, bbox_inches='tight')

## via Convex Hull

In [None]:
# Optimizing the CE: weight every training structure by the inverse of its distance
# to the convex hull of the reference data, then fit a CE on the weighted system.
from icet.tools import ConvexHull
from trainstation import Optimizer

cluster_vectors = A_without_TS.copy()
# Shallow copy: the lists are shared with `data`, except 'predicted_energy',
# which is replaced by a fresh list right below.
data_opt = data.copy()
data_opt['predicted_energy'] = []
hull = ConvexHull(data['concentration'], data['reference_energy'])
weight = np.ones(len(data_opt['concentration']))

# Calculating the weighting factor - by distance to the convex hull
for i, (concentration, reference_energy) in enumerate(
        zip(data_opt['concentration'], data_opt['reference_energy'])):
    hull_energy = hull.get_energy_at_convex_hull(concentration)
    deviation = abs(reference_energy - hull_energy)
    # NOTE(review): exact comparison; a tiny non-zero deviation yields a huge weight
    if deviation != 0:
        weight[i] = 1 / deviation
    else:
        weight[i] = 0  # placeholder, replaced below by the highest weight

# Structures lying on the hull get a weight slightly above all others.
# Assign through a boolean mask: rebinding a loop variable would not modify the array.
on_hull = weight == 0
max_weight = weight.max() if weight.size else 0.0
if max_weight == 0:
    # every structure lies on the hull (or there is no data): fall back to equal weights
    weight[:] = 1.0
else:
    weight[on_hull] = max_weight * 1.1

# Calculate weighted cluster vectors and energies.
# Requires one weight per row of A_without_TS / entry of y_without_TS.
cluster_vectors_weighted = np.multiply(cluster_vectors, weight.reshape(-1, 1))
reference_energy_weighted = np.multiply(y_without_TS, weight)
opt = Optimizer((cluster_vectors_weighted, reference_energy_weighted), fit_method='ridge')
opt.train()

# Create the optimized CE from it and write *that* CE to disk
ce_opt = ClusterExpansion(cluster_space=cs, parameters=opt.parameters, metadata=opt.summary)
ce_opt.write('/nfshome/winkelmann/ARL/tmp/mixing_energy_no_TS_opt_convexHull.ce')

In [None]:
# Append the prediction of the hull-weighted CE for every mapped training
# structure; the factor 1e3 matches the scaling of the reference energies.
data_opt['predicted_energy'].extend(
    1e3 * ce_opt.predict(mapped_structure)
    for outcar, mapped_structure, h_o_m, location
    in zip(file_location, mapped_structures_without_TS, train_H_o_M, file_location)
)

In [None]:
# Plot the hull-weighted (optimised) CE against the training data, the
# unoptimised CE prediction and the convex hull of the reference data.

fig, ((ax5, ax6), (ax7, ax8)) = plt.subplots(2, 2, figsize=(18, 18))

# Hull energy at every data point, sorted by concentration for a clean line plot
hull_energy = [hull.get_energy_at_convex_hull(c) for c in data['concentration']]
sorted_concentration_indices = np.argsort(data['concentration'])
hull_x = np.array(data['concentration'])[sorted_concentration_indices]
hull_y = np.array(hull_energy)[sorted_concentration_indices]

# The unoptimised predictions can only be drawn when there is one per data point;
# scatter() raises on mismatching sizes, so that series is skipped otherwise.
has_unoptimized = len(data['predicted_energy']) == len(data['concentration'])
if not has_unoptimized:
    print("data['predicted_energy'] does not match data['concentration'] in length; "
          "skipping the unoptimized CE prediction series.")

# Axis setup shared by all four panels
for ax in (ax5, ax6, ax7, ax8):
    ax.set_xlabel(r'x in Li$_x$NiO2')
    ax.set_ylabel(r'Mixing energy (meV/atom)')
    ax.set_xlim([0, 1])
    ax.set_ylim([-40, 30])

## ax5 = Optimised vs. training Data
ax5.scatter(data['concentration'], data['reference_energy'],
            marker='o', color='blue', label='reference')
ax5.scatter(data_opt['concentration'], data_opt['predicted_energy'],
            marker='+', color='green', label='CE_opt prediction')
ax5.legend()
ax5.set_title("Optimised vs. training Data")

## ax6 = Optimised vs. unoptimized
if has_unoptimized:
    ax6.scatter(data['concentration'], data['predicted_energy'],
                marker='x', color='orange', label='CE prediction')
ax6.scatter(data_opt['concentration'], data_opt['predicted_energy'],
            marker='+', color='green', label='CE_opt prediction')
ax6.legend()
ax6.set_title("Optimised vs. unoptimized")

## ax7 = Optimised vs. convex_Hull
ax7.plot(hull_x, hull_y, label='Convex_Hull')
ax7.scatter(data_opt['concentration'], data_opt['predicted_energy'],
            marker='+', color='green', label='CE_opt prediction')
ax7.legend()
ax7.set_title("Optimised vs. convex_Hull")

## ax8 = Full heat of mixing plot comparison
ax8.scatter(data['concentration'], data['reference_energy'],
            marker='o', color='blue', label='reference')
if has_unoptimized:
    ax8.scatter(data['concentration'], data['predicted_energy'],
                marker='x', color='orange', label='CE prediction')
ax8.scatter(data_opt['concentration'], data_opt['predicted_energy'],
            marker='+', color='green', label='CE_opt prediction')
ax8.plot(hull_x, hull_y, label='Convex_Hull')
ax8.legend()
ax8.set_title("Full CE plot - comparison")

plt.savefig('/nfshome/winkelmann/ARL/tmp/optimization_comparison_no_TS_opt_convexHull.png',
            bbox_inches='tight')

# First fitting for testing purposes of python implementation (no Ni_Li, but with transition states)

In [None]:
# Add the NEB transition-state data to the training set.
# extend() adds the entries one by one; append() would insert each whole
# collection as a single nested element, which the later cells cannot handle
# (they treat every entry as one structure / one energy / one path string).
# NOTE: re-running this cell adds the transition states a second time.

# combine the structures
train_structures.extend(atoms_for_training_NEB_transition_states)

# ... and energies
train_H_o_M.extend(H_o_M_for_training_NEB_transition_states)

# to be able to retrieve problematic files, keep the paths
file_location.extend(paths_for_training_NEB_transition_states)

In [None]:
# Allowed species per sublattice of the primitive cell (one entry per site).
chemical_symbols = [
    ['Li', 'X'],  # Li sublattice: Li and vacancies (=X), later also Ni
    ['Ni'],       # Ni sublattice is kept fixed
    ['O'],        # O sublattice is kept fixed
    ['O'],
    ['Li', 'X'],  # transition state sites
    ['Li', 'X'],
    ['Li', 'X'],
]

In [None]:
# Fit settings: one cutoff per cluster order plus the tolerances passed to get_A_y
cutoffs = [8.57, 7.53, 6.42]
position_tolerance = 0.01
symprec = 0.01
tol_positions = 0.05

# get_A_y() is defined elsewhere in this notebook; it returns the fit data (A, y)
# together with the mapped structures.
fit_data, mapped_structures = get_A_y(
    prim=prim,
    chemical_symbols=chemical_symbols,
    cutoffs=cutoffs,
    energy_list=train_H_o_M,
    atoms_ref_list=train_structures,
    outcar_list=file_location,
    position_tolerance=position_tolerance,
    symprec=symprec,
    tol_positions=tol_positions,
)
A, y = fit_data

# Markus:  8.5966 Å, 8.1068 Å and 6.4169 Å

# Concerning pair cutoff within one Li layer
# Neighbor       distance           how many within layer      summed total per particle
#                                                               including Li layers above/below
# 1st neighbor at 2.8429                N= 6                         N =  6 (only intralayer bonds)
# 2nd neighbor at 4.92404               N= 6                         N = 12 (only intralayer bonds)
# 3rd neighbor at 5.68579 (=2*2.8429)   N= 6                         N = 24 (first bonds to other layers. 3 bonds to layer above and 3 to layer below)  
# 4th neighbor at 7.5216                N=12                         N = 54 
# 5th neighbor at 8.52869 (=3*2.8429)   N= 6                         N = 78 

# Note: 4th cutoff here is just slightly smaller than 4th interlayer cutoff.
# Note: 5th cutoff here is just slightly smaller than 6th interlayer cutoff.


# Concerning cutoff for interlayer bonds:
# 1st interlayerbonds     at  4.99272   N= 3 (to both above and below)   Total interlayer summed up= 6
# 2nd interlayerbonds     at  5.74537   N= 3 (to both above and below)   Total interlayer summed up= 12
# 3rd interlayerbonds     at  6.41025   N= 6 (to both above and below)   Total interlayer summed up= 24
# 4th interlayerbonds     at  7.56673   N= 6 (to both above and below)   Total interlayer summed up= 36
# 5th interlayerbonds     at  8.08316   N= 3 (to both above and below)   Total interlayer summed up= 42
# 6th interlayerbonds     at  8.56852   N= 6 (to both above and below)   Total interlayer summed up= 54

#Total number of Li-Li Bonds as function of cutoff (inter/intra):
# 2.84    0  ( 0/ 0)
# 2.85    6  ( 6/ 0)
# 4.93   12  (12/ 0)
# 5.00   18  (12/ 6)
# 5.69   24  (18/ 6)
# 5.75   30  (18/12)  
# 6.42   42  (18/24)  3rd    3rd
# 7.53   54  (30/24)  2nd
# 7.57   66  (30/36)
# 8.09   72  (30/42)         2nd
# 8.53   78  (36/42)
# 8.57   90  (36/54)  1st    1st


In [None]:
# Cross-validate and train with fit_method='ardr' for every threshold_lambda in the
# list, collect one record per value, then build and store the CE of the last fit.
lambda_values = [1000]
records = []
for lam in lambda_values:
    cve = CrossValidationEstimator(
        (A, y),
        fit_method='ardr',
        threshold_lambda=lam,
    )
    cve.validate()
    cve.train()

    # get_row() is defined elsewhere in this notebook
    row = get_row(cve)
    row['threshold_lambda'] = lam
    records.append(row)

df_ardr = pd.DataFrame(records)
print(row)  # record of the last scanned value

# Cluster space matching the settings that were used to build (A, y)
cs = ClusterSpace(
    structure=prim,
    cutoffs=cutoffs,
    chemical_symbols=chemical_symbols,
    position_tolerance=position_tolerance,
    symprec=symprec,
)

# `cve` still refers to the estimator of the last loop iteration
ce = ClusterExpansion(cluster_space=cs, parameters=cve.parameters, metadata=cve.summary)
print(ce)
ce.write('/nfshome/winkelmann/ARL/tmp/mixing_energy.ce')
    

In [None]:
# Read the previously outputted CE and set up a dictionary with data to be plotted later
ce = ClusterExpansion.read('/nfshome/winkelmann/ARL/tmp/mixing_energy.ce')
write_vasp("/nfshome/winkelmann/ARL/tmp/real_prim.vasp", ce.primitive_structure)

# Store stuff for use later
data = {'concentration': [], 'reference_energy': [], 'predicted_energy': [], 'file_location': []}

# Go through all the data (file_location serves both as error label and as stored path)
for mapped_structure, h_o_m, location in zip(mapped_structures, train_H_o_M, file_location):

    # Compute every value first and append only afterwards, so that a failure for
    # one structure cannot leave the lists in `data` with different lengths.
    try:
        symbols = mapped_structure.get_chemical_symbols()

        # Li concentration x = N_Li / (N_O / 2)
        concentration = symbols.count("Li") / (symbols.count("O") / 2)

        # The factor of 1e3 serves to convert from eV/atom to meV/atom
        reference_energy = 1e3 * h_o_m

        # use the mapped structure to predict the energy
        predicted_energy = 1e3 * ce.predict(mapped_structure)

    # Best effort: report the problematic entry and carry on with the next one
    except Exception as err:
        print(f"Problems with {location}")
        print(f"Original Error Message:\n {err}\n")
        continue

    data['concentration'].append(concentration)
    data['reference_energy'].append(reference_energy)
    data['predicted_energy'].append(predicted_energy)
    # keep the file location to allow parsing
    data['file_location'].append(location)
        

In [None]:
# Retrieve the energy barriers and the corresponding predictions
barrier_concentrations  = []
ref_frontjump_barriers  = []
ref_backjump_barriers   = []
pred_frontjump_barriers = []
pred_backjump_barriers  = []

# Sub-folders of the optimized intermediate NEB images inside each run_final folder
neb_image_dirs = ["01", "02", "03", "04", "05"]

# Iterate over all run_final folders
for path in paths_for_training_NEB_transition_states:

    # Energies along the path: initial and final energies come from the corresponding
    # relaxed structures, the rest from the last steps of the intermediate images.
    initial_image = ASEread(path.replace("run_final", "OUTCAR_initial_image"))
    final_image = ASEread(path.replace("run_final", "OUTCAR_final_image"))

    energies = [initial_image.get_potential_energy()]
    energies.extend(ASEread(f"{path}/{i}/OUTCAR").get_potential_energy() for i in neb_image_dirs)
    energies.append(final_image.get_potential_energy())

    # For "proper" paths the energy maximum lies !between! the initial and the final
    # image... ignore those where this is not the case.
    highest_energy = max(energies)
    index_highest_energy = energies.index(highest_energy)
    if index_highest_energy in (0, len(energies) - 1):
        print(f"Ignore {path}\n  ---> image {index_highest_energy} has highest energy!")
        continue

    # Ideal TS structure: interpolated middle point of the initially created, straight
    # path, used as ideal position for the CE training.
    ideal_TS_structure_file = glob.glob(path.replace("run_final", "run01*/03/POSCAR_orig_linear_interpolation"))
    if len(ideal_TS_structure_file) == 0:                # For ODH-type jumps there is no POSCAR_orig_linear_interpolation
        ideal_TS_structure_file = glob.glob(path.replace("run_final", "run01*/03/POSCAR"))
    if len(ideal_TS_structure_file) == 0:
        # Fail with a message that names the path instead of a bare IndexError
        raise FileNotFoundError(f"No ideal transition-state POSCAR found for {path}")
    ideal_TS_atoms = ASEread(ideal_TS_structure_file[0])

    # Map the initial, final and TS state onto the primitive reference cell
    mapped_states = []
    for structure in (initial_image, final_image, ideal_TS_atoms):
        mapped_atoms, info = icet.tools.map_structure_to_reference(structure=structure,
                                                                   reference=prim,
                                                                   inert_species=["O"],
                                                                   tol_positions=0.05,
                                                                   suppress_warnings=False,
                                                                   assume_no_cell_relaxation=False)
        mapped_states.append(mapped_atoms)
    initial_mapped_atoms, final_mapped_atoms, TS_mapped_atoms = mapped_states

    # From the predicted <<<heat of mixing>>> get the energies and from those the barriers.
    # All three states use the atom counts of the ideal TS structure.
    N_atoms = len(ideal_TS_atoms)
    Li_count = ideal_TS_atoms.get_chemical_symbols().count("Li")
    O_count  = ideal_TS_atoms.get_chemical_symbols().count("O")

    pred_E_init  = ce.predict(initial_mapped_atoms) * N_atoms + Li_count * E_ref_LiNiO2_per_O2 + (O_count/2-Li_count) * E_ref_NiO2_per_O2
    pred_E_final = ce.predict(final_mapped_atoms)   * N_atoms + Li_count * E_ref_LiNiO2_per_O2 + (O_count/2-Li_count) * E_ref_NiO2_per_O2
    pred_E_TS    = ce.predict(TS_mapped_atoms)      * N_atoms + Li_count * E_ref_LiNiO2_per_O2 + (O_count/2-Li_count) * E_ref_NiO2_per_O2

    # Append everything in one place so that the five result lists always stay aligned
    ref_frontjump_barriers.append(highest_energy - energies[0])
    ref_backjump_barriers.append(highest_energy - energies[-1])
    pred_frontjump_barriers.append(pred_E_TS - pred_E_init)
    pred_backjump_barriers.append(pred_E_TS - pred_E_final)
    barrier_concentrations.append(Li_count/(O_count/2))

In [None]:
# Overview figure: reference vs. CE-predicted mixing energies for the full data set
# and for several subsets (selected by substrings of the file location), plus the
# reference and predicted NEB barriers.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(18, 25))


def _plot_mixing_energy_panel(ax, title, patterns=None):
    """Scatter reference and predicted mixing energies on `ax`.

    ax       -- matplotlib axes to draw on
    title    -- panel title
    patterns -- substrings of data["file_location"]; an entry is plotted when at
                least one of them occurs in its location. None plots all entries.
    """
    ax.set_xlabel(r'x in Li$_x$NiO2')
    ax.set_ylabel(r'Mixing energy (meV/atom)')
    ax.set_xlim([0, 1])
    ax.set_ylim([-40, 30])

    if patterns is None:
        concentration = data['concentration']
        reference_energy = data['reference_energy']
        predicted_energy = data['predicted_energy']
    else:
        # Take only the ones we are interested in here
        concentration, reference_energy, predicted_energy = [], [], []
        for c, ref, pred, location in zip(data["concentration"], data["reference_energy"],
                                          data["predicted_energy"], data["file_location"]):
            if any(pattern in location for pattern in patterns):
                concentration.append(c)
                reference_energy.append(ref)
                predicted_energy.append(pred)

    ax.scatter(concentration, reference_energy,
               marker='o', label='reference')
    ax.scatter(concentration, predicted_energy,
               marker='x', label='CE prediction')
    ax.legend()
    ax.set_title(title)


### ax1 = Full heat of mixing plot
_plot_mixing_energy_panel(ax1, "Full CE plot")

### ax2 = Heat of mixing plot of my own structures
_plot_mixing_energy_panel(ax2, "Marcel's enumerated data",
                          ["02_enumerate_P21c_0-4fu"])

### ax3 = Heat of mixing plot of Markus's re-relaxed structures
_plot_mixing_energy_panel(ax3, "Markus' re-relaxed structures",
                          ["re-relax_Markus"])

### ax4 = Heat of mixing plot of the initial and final NEB images
# "04_double_tsh" is spelled as in the NEB filter below (the original
# "04_doube_tsh" could never match that folder name).
_plot_mixing_energy_panel(ax4, "Initial and final NEB images",
                          ["01_initial_structure",
                           "02_odh/image",
                           "03_tsh/image",
                           "04_double_tsh/image",
                           "0250/image",
                           "0500/image",
                           "0750/image"])

### ax5 = Heat of mixing plot of the NEB transition states
_plot_mixing_energy_panel(ax5, "NEB transition states",
                          ["/02_odh/NEB_",
                           "/03_tsh/NEB_",
                           "/04_double_tsh/NEB_",
                           "/0250/NEB_",
                           "/0500/NEB_",
                           "/0750/NEB_"])

### ax6 = Check the real barriers
ax6.set_xlabel(r'x in Li$_x$NiO2')
ax6.set_ylabel(r'Barriers in eV')
ax6.set_xlim([0, 1])

# Front jump
ax6.scatter(barrier_concentrations, ref_frontjump_barriers,
            marker='o', label='reference frontjump')
ax6.scatter(barrier_concentrations, pred_frontjump_barriers,
            marker='x', label='CE prediction frontjump')

# Back jump
ax6.scatter(barrier_concentrations, ref_backjump_barriers,
            marker='o', label='reference backjump')
ax6.scatter(barrier_concentrations, pred_backjump_barriers,
            marker='x', label='CE prediction backjump')

ax6.legend()
# Distinct title: this panel shows barriers, not mixing energies like ax5
ax6.set_title("NEB energy barriers")

plt.savefig('/nfshome/winkelmann/ARL/tmp/mixing_energy_comparison.png', bbox_inches='tight')

In [None]:
# Parity plot: CE-predicted against reference heat of mixing, with a slope-1
# line from -30 to 29 as a guide to the eye.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(data['reference_energy'], data['predicted_energy'],
           marker='o', label='reference')

diagonal = range(-30, 30)
ax.plot(diagonal, diagonal, label='slope 1', color="red")

ax.set_xlabel(r'reference heat of mixing [meV/atom]')
ax.set_ylabel(r'predicted heat of mixing [meV/atom]')
ax.legend()

plt.savefig('/nfshome/winkelmann/ARL/tmp/parity_plot.png', bbox_inches='tight')

In [None]:
#data['file_location']

In [None]:
# Display how many entries file_location currently holds
len(file_location)

In [None]:
#READMEs

In [None]:
# Scratch check: join path components with "/"
"/".join(["hello","new","world"])

In [None]:
# Scratch value: sample path string for the split test in the next cell
a = "/hello/new/world"

In [None]:
# Scratch check: split the sample path at "/" (a leading "/" yields an empty first item)
a.split("/")

In [None]:
#paths_for_training_NEB_transition_states

In [None]:
#file_location

In [None]:
# Display the file locations stored alongside the plotted data
data["file_location"]