# Prerequisites

## Modules

In [1]:
# Modules are available in conda environment with name: icet
# conda activate icet

import ase
from ase.io import read as ASEread
from ase.io.vasp import write_vasp
from ase.db import connect
from ase.cell import Cell
from ase.neighborlist import NewPrimitiveNeighborList
from ase.build import make_supercell

import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import colormaps
import glob
import sys
import os
import random
import shutil

import icet
from icet import ClusterSpace, StructureContainer, ClusterExpansion
from trainstation import CrossValidationEstimator
from icet.tools import enumerate_structures
from icet.tools.structure_generation import generate_sqs_by_enumeration

try:
    import seaborn as sns
    sns.set_context('notebook')
except ImportError:
    print('sad')
    
import subprocess

import datetime
import warnings

## Misc Functions

In [2]:
# Stop message
def jupyter_stop(ErrorMessage="User-defined stop via jupyter_stop() function"):
    """
    Abort execution by raising ``SystemExit``, comparable to calling exit().

    Meant for testing and for guarding cells that would otherwise overwrite
    data that has already been generated.

    ErrorMessage: text carried by the raised ``SystemExit``.
    """
    stop_signal = SystemExit(ErrorMessage)
    raise stop_signal

## CE Functions

## Reorder Atoms

In [3]:
# S only in this list to 'trick' the structure enumeration
# S as extra Nickel
atomic_label2number = {"Li" :  3,
                       "O"  :  8,
                       "S"  : 16,
                       "Ni" : 28}

# Inverse lookup, derived from the table above so the two can never disagree
atomic_number2label = {number: label for label, number in atomic_label2number.items()}


def order_atoms(atoms, order=("Li", "Ni", "O")):
    """
    Return a copy of ``atoms`` in which the atoms are grouped by species.

    The species blocks appear in the sequence given by ``order``; inside one
    block the atoms keep the relative order they had in ``atoms``. Only the
    positions and the atomic numbers of the copy are rewritten, everything
    else is whatever ``atoms.copy()`` carries over.

    atoms: object offering get_positions(), get_atomic_numbers(), copy(),
           set_positions() and set_atomic_numbers() (e.g. an ase Atoms object)
    order: sequence of chemical symbols; each must be a key of atomic_label2number

    Raises KeyError if the structure contains an atomic number that is not in
    atomic_number2label, or a species that is not listed in ``order``.
    """
    # one bucket of positions per requested species
    positions_by_symbol = {sym: [] for sym in order}

    for num, pos in zip(atoms.get_atomic_numbers(), atoms.get_positions()):
        sym = atomic_number2label[num]
        if sym not in positions_by_symbol:
            raise KeyError(f"Species '{sym}' occurs in the structure but is missing from order={list(order)}")
        positions_by_symbol[sym].append(pos)

    # put together the new ordered positions and atomic numbers
    new_positions = []
    new_atomic_numbers = []
    for sym in order:
        bucket = positions_by_symbol[sym]
        new_positions.extend(bucket)
        new_atomic_numbers.extend([atomic_label2number[sym]] * len(bucket))

    # work on a copy so the caller's object stays untouched
    copy_atoms = atoms.copy()
    copy_atoms.set_positions(new_positions)
    copy_atoms.set_atomic_numbers(new_atomic_numbers)

    return copy_atoms


In [4]:
# Basic setups

def get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions):
    """
    Construct cluster space and structure container for the given cutoffs
    and return the fit matrix along with the target energies.

    prim: primitive structure the ClusterSpace is built on; also the mapping reference
    chemical_symbols: allowed species per site of prim, passed to ClusterSpace
    cutoffs: cluster cutoffs, passed to ClusterSpace
    energy_list: target value per structure, stored under the key 'Total Energy'
    atoms_ref_list: structures that are mapped onto prim and added to the container
    outcar_list: one label (path string) per structure, used as user_tag and in error output
    position_tolerance, symprec: passed to ClusterSpace
    tol_positions: passed to icet.tools.map_structure_to_reference

    The three lists are walked in lockstep with zip(), i.e. iteration stops at
    the shortest one.

    Returns (fit_data, mapped_structures): fit_data is the result of
    StructureContainer.get_fit_data(key='Total Energy'); mapped_structures holds
    the mapped structures that were successfully added to the container.
    """
    # Set up Clusterspace
    cs = ClusterSpace(structure=prim,
                      cutoffs=cutoffs,
                      chemical_symbols=chemical_symbols,
                      position_tolerance=position_tolerance,
                      symprec=symprec)

    # Set up StructureContainer with the previously generated ClusterSpace
    sc = StructureContainer(cluster_space=cs)

    # Collect the mapped structures
    mapped_structures = []

    # Fill the StructureContainer
    for outcar, E, at_ref in zip(outcar_list, energy_list, atoms_ref_list):

        # Map the enumerated structure to the primitive cell, add it to cluster space with the energy of the properly relaxed system
        try:
            mapped_atoms, _info = icet.tools.map_structure_to_reference(structure=at_ref,
                                                                        reference=prim,
                                                                        inert_species=["O"],
                                                                        tol_positions=tol_positions,
                                                                        suppress_warnings=False,
                                                                        assume_no_cell_relaxation=False)

            sc.add_structure(structure=mapped_atoms,
                             properties={'Total Energy': E},
                             user_tag=outcar,
                             sanity_check=True,
                             )

        except ValueError as err:
            # best effort: report the offending structure and carry on with the rest
            print(f"Mapping Error with {outcar}")
            print("Note: Possibly a different structure was used for the mapping!")
            print("Original Error Message:")
            print(err , "\n")
            continue

        # recorded only after add_structure succeeded, so this list matches the container content
        mapped_structures.append(mapped_atoms)

    print(f"len(cs) = {len(cs)}")

    return sc.get_fit_data(key='Total Energy'), mapped_structures


def get_A_y(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions):
    """
    Alias of get_fit_data(): forwards every argument unchanged and hands back
    exactly what get_fit_data() returns.
    """
    fit_result = get_fit_data(prim=prim,
                              chemical_symbols=chemical_symbols,
                              cutoffs=cutoffs,
                              energy_list=energy_list,
                              atoms_ref_list=atoms_ref_list,
                              outcar_list=outcar_list,
                              position_tolerance=position_tolerance,
                              symprec=symprec,
                              tol_positions=tol_positions)
    return fit_result



def get_row(cve, alpha=None):
    """
    Collect the fit metrics of a cross-validation estimator in a plain dict.

    cve: object exposing rmse_validation, rmse_train, model.BIC, n_parameters
         and n_nonzero_parameters
    alpha: optional value stored under the key 'alpha'; the key is left out
           when alpha is None

    Returns a dict with the keys 'rmse_validation', 'rmse_train', 'BIC',
    'n_parameters', 'n_nonzero_parameters' and, if given, 'alpha'.
    """
    row = {
        'rmse_validation': cve.rmse_validation,
        'rmse_train': cve.rmse_train,
        'BIC': cve.model.BIC,
        'n_parameters': cve.n_parameters,
        'n_nonzero_parameters': cve.n_nonzero_parameters,
    }

    # identity test: "alpha != None" would invoke alpha's own comparison operator
    if alpha is not None:
        row['alpha'] = alpha

    return row


def train_ce(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions, fit_method):
    """
    Train a cluster expansion with the given cutoffs and return the trained,
    cross-validated estimator.

    prim: ase atoms object, the primitive structure that the CE lives on
    chemical_symbols: list of the possible atom types on the different sites of prim
    cutoffs: cutoffs for the 2-body, 3-body, ... terms
    energy_list: target value per training structure
    atoms_ref_list: list of all the atoms objects to use for training/testing
    outcar_list: list with paths (strings) of the corresponding atoms objects
    position_tolerance, symprec, tol_positions: forwarded to get_fit_data()
    fit_method: 'ardr-lambda' selects fit_method='ardr' with threshold_lambda=1000,
        'ardr-lineScan' selects fit_method='ardr' with line_scan=True, any other
        value (e.g. 'rfe', 'lasso', 'least-squares') is passed on unchanged

    Returns the CrossValidationEstimator after validate() and train(); use
    get_row() to turn it into a dict of fit metrics.
    """
    (A, y), _mapped_structures = get_fit_data(prim, chemical_symbols, cutoffs, energy_list, atoms_ref_list, outcar_list, position_tolerance, symprec, tol_positions)
    n_splits = 10

    # translate the notebook's method labels into estimator keyword arguments
    if fit_method == 'ardr-lambda':
        method_options = {'fit_method': 'ardr', 'threshold_lambda': 1000}
    elif fit_method == 'ardr-lineScan':
        method_options = {'fit_method': 'ardr', 'line_scan': True}
    else:
        method_options = {'fit_method': fit_method}

    cve = CrossValidationEstimator((A, y), validation_method='shuffle-split', n_splits=n_splits, **method_options)
    cve.validate()
    cve.train()

    return cve

def prevent_override(path):
    """
    Return a path that does not exist yet, derived from ``path``.

    If ``path`` is free it is returned as is. Otherwise "_1", "_2", ... is
    inserted between file name and extension and the first free candidate is
    returned. Nothing is created on disk.
    """
    if os.path.exists(path):
        stem, suffix = os.path.splitext(path)
        index = 1
        candidate = f"{stem}_{index}{suffix}"
        # advance the running number until the candidate is unused
        while os.path.exists(candidate):
            index += 1
            candidate = f"{stem}_{index}{suffix}"
        return candidate
    return path

# Collect data

In [5]:
### Reference energies of LiNiO2 and NiO2, each normalised to one O2 unit
def _energy_per_O2(structure):
    """Potential energy of ``structure`` divided by its number of O atoms, times 2."""
    # equals "per Ni" as long as there is no extra Ni
    return structure.get_potential_energy() / structure.get_chemical_symbols().count("O") * 2


LiNiO2 = ASEread("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0001_finished_approved/run_final_approved/OUTCAR")
E_ref_LiNiO2_per_O2 = _energy_per_O2(LiNiO2)

NiO2   = ASEread("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0003_finished_approved/run_final_approved/OUTCAR")
E_ref_NiO2_per_O2   = _energy_per_O2(NiO2)

## Own enumerated structures based on P21/c

In [None]:
# Li distributions without transition states
atoms_for_training_from_own_enumerated_structures = []
H_o_M_for_training_from_own_enumerated_structures = []

# All OUTCARs of interest, in sorted order
outcars_for_training_from_own_enumerated_structures= sorted(glob.glob("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/02_enumerate_P21c_0-4fu/0*_finished_approved/run_final_approved/OUTCAR"))

for outcar in outcars_for_training_from_own_enumerated_structures:

    # every step of the run is read; only the last one is evaluated
    atoms = ASEread(outcar, index=":")
    final_step = atoms[-1]
    symbols = final_step.get_chemical_symbols()

    # total heat of mixing relative to the LiNiO2 / NiO2 reference energies
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = final_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2 - Li_count) * E_ref_NiO2_per_O2

    # To make the mapping easier, store the originally generated structure
    # (POSCAR_enumerated, two directory levels above the OUTCAR) instead of the relaxed one
    path_parts = outcar.split("/")
    ref = "/".join(path_parts[:-2]) + "/POSCAR_enumerated"
    atoms_for_training_from_own_enumerated_structures.append(ASEread(ref))

    # total number of lattice sites:
    # O_count   for Oxygen
    # O_count/2 for Li-Sites
    # O_count/2 for Ni-Sites
    # = 2*O_count --> Lattice Sites
    H_o_M_for_training_from_own_enumerated_structures.append( H_o_M / (2*O_count) )

## Markus low energy CE data

In [7]:
# The code that used to be here took Markus' relaxed structures and rotated them around the Cartesian z axis to match the primitive structure we are using.
# It does not need to be run again.
# It was removed so that the notebook can run through to the optimization.

In [8]:
# Li distributions without transition states
# Original note: compare Markus' last-step energy and volume with the ones from my own re-relaxation;
# the README files are used for this.

atoms_for_training_from_Markus_low_energy_structures   = []
H_o_M_for_training_from_Markus_low_energy_structures   = []
outcars_for_training_from_Markus_low_energy_structures = []

READMEs = glob.glob("/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/CE_database_Marcel/03_Markus_approved_low_energy_data/*/README_original_path_from_Markus")

for README in READMEs:

    # folders without a run_final directory contribute nothing
    if not os.path.isdir(README.replace("README_original_path_from_Markus","run_final")):
        continue

    # Take the transformed CONTCAR so that the structure can be mapped onto our prim structure later.
    # Only for the 2 structures that caused problems, take the original (rotated) POSCAR instead;
    # its folder is derived from the first line of the README.
    uses_original_poscar = ("re-relax_Markus038_finished" in README
                            or "re-relax_Markus115_finished" in README)
    if uses_original_poscar:
        with open(README, "r") as f:
            line = f.readlines()[0]
        transformed_contcar = "/".join(line.split("/")[0:-1]) + "/run01/POSCAR_rotated.vasp" 
    else:
        transformed_contcar = README.replace("README_original_path_from_Markus","run_final/CONTCAR_rotated.vasp")
    atoms_transformed_contcar = ASEread(transformed_contcar)
    atoms_for_training_from_Markus_low_energy_structures.append(atoms_transformed_contcar)

    # the energy comes from the OUTCAR of the relaxation
    outcar = README.replace("README_original_path_from_Markus","run_final/OUTCAR")
    outcars_for_training_from_Markus_low_energy_structures.append(outcar)
    atoms = ASEread(outcar, index=":")
    final_step = atoms[-1]
    symbols = final_step.get_chemical_symbols()

    # heat of mixing, total ...
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = final_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2 - Li_count) * E_ref_NiO2_per_O2

    # ... and per lattice site. Total number of lattice sites:
    # O_count   for Oxygen
    # O_count/2 for Li-Sites
    # O_count/2 for Ni-Sites
    # = 2*O_count --> Lattice Sites
    H_o_M_for_training_from_Markus_low_energy_structures.append( H_o_M / (2*O_count) )
        

## NEB initial and final images (without Ni_Li)

In [9]:
# Li distributions without transition states
atoms_for_training_NEB_initial_and_final_images = []
H_o_M_for_training_NEB_initial_and_final_images = []

outcar_patterns_NEB_initial_and_final_images = [
    # the ordered ones from 0250, 0500 and 0750 first
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0250/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0500/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0750/image*/02_scan/*final/OUTCAR",
    # and the ones from the random structures
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/01_initial_structure/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/02_odh/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/03_tsh/image*/02_scan/*final/OUTCAR",
    "/nfshome/sadowski/work/LiNiO2_data_base_Sabrina/DFT_database/NEBs_Marcel/0*random*/04_double_tsh/image*/02_scan/*final/OUTCAR",
]

# expand the patterns one after another, keeping the order of the list above
outcars_for_training_NEB_initial_and_final_images = []
for outcar_pattern in outcar_patterns_NEB_initial_and_final_images:
    outcars_for_training_NEB_initial_and_final_images += glob.glob(outcar_pattern, recursive=True)

for OUTCAR in outcars_for_training_NEB_initial_and_final_images:

    # every step of the run is read; only the last one is evaluated
    atoms = ASEread(OUTCAR, index=":") 
    final_step = atoms[-1]
    symbols = final_step.get_chemical_symbols()

    # heat of mixing, total ...
    Li_count = symbols.count("Li")
    O_count  = symbols.count("O")
    H_o_M    = final_step.get_potential_energy() - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2 - Li_count) * E_ref_NiO2_per_O2

    # the relaxed last step itself serves as training structure
    atoms_for_training_NEB_initial_and_final_images.append(final_step)

    # ... and per lattice site. Total number of lattice sites:
    # O_count   for Oxygen
    # O_count/2 for Li-Sites
    # O_count/2 for Ni-Sites
    # = 2*O_count --> Lattice Sites
    H_o_M_for_training_NEB_initial_and_final_images.append( H_o_M / (2*O_count) )

## NEB transition states

In [None]:
# Li transition states (highest-energy point of each NEB path)

atoms_for_training_NEB_transition_states = []
H_o_M_for_training_NEB_transition_states = []

# Candidate run_final folders: the ones generated manually (0250, 0500, 0750) ...
candidate_paths_NEB_transition_states  = glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*/NEB_*/run_final*")

# ... and the random ones
candidate_paths_NEB_transition_states += glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*random*/02_odh/NEB*/run_final*")
candidate_paths_NEB_transition_states += glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*random*/03_tsh/NEB*/run_final*")
candidate_paths_NEB_transition_states += glob.glob("/nfshome/winkelmann/ARL/NEBs_Marcel/*random*/04_double_tsh/NEB*/run_final*")

# Only the folders that actually contribute a structure end up here, so that this list
# stays index-aligned with the atoms_... and H_o_M_... lists above
paths_for_training_NEB_transition_states = []

# Iterate over all run_final folders
for path in candidate_paths_NEB_transition_states:
    # get the folder path
    source_folder = '/'.join(path.split('/')[0:-1])
    # Check the energy along the path. Use initial and final energies from the corresponding relaxed structures + the last steps of the
    # optimized intermediate images
    energies = []
    energies.append(ASEread(source_folder + "/OUTCAR_initial_image").get_potential_energy())
    could_not_read_counter = 0
    for i in ["01", "02", "03", "04", "05"]:
        try:
            energies.append(ASEread(f"{path}/{i}/OUTCAR").get_potential_energy())
        except Exception:
            # best effort: a missing/broken image is reported and left out
            # (Exception instead of a bare except, so Ctrl-C still interrupts the loop)
            could_not_read_counter += 1
            print(f"{could_not_read_counter}. \t Could not read {path}/{i}/OUTCAR")
    if could_not_read_counter == 5:
        print(f"Ignore {path}\n  ---> could not read any OUTCARs!")
        continue
    energies.append(ASEread(source_folder + "/OUTCAR_final_image").get_potential_energy())

    # For "proper" paths the energy maximum lies !between! initial and final image... ignore those where this is not the case.
    # The final image is the last entry of energies; its index is below 6 whenever an intermediate OUTCAR could not be read.
    index_highest_energy = energies.index(max(energies))

    if index_highest_energy in (0, len(energies) - 1):
        print(f"Ignore {path}\n  ---> image {index_highest_energy} has highest energy!")
        continue

    # get the interpolated middle points of the initially created, straight odh-type path to be used as ideal position for the CE training
    ideal_TS_structure_file = glob.glob(source_folder + "/anchor_trans_image.vasp")
    ideal_TS_atoms = ASEread(ideal_TS_structure_file[0])

    atoms_for_training_NEB_transition_states.append(ideal_TS_atoms)
    paths_for_training_NEB_transition_states.append(path)

    # Compute heat of mixing per lattice site and append to list
    Li_count = ideal_TS_atoms.get_chemical_symbols().count("Li") + 1 # +1 for the jumping Li
    O_count  = ideal_TS_atoms.get_chemical_symbols().count("O")
    H_o_M    = max(energies) - Li_count * E_ref_LiNiO2_per_O2 - (O_count/2-Li_count) * E_ref_NiO2_per_O2
    # total number of lattice sites:
    # O_count   for Oxygen
    # O_count/2 for Li-Sites
    # O_count/2 for Ni-Sites
    # = 2*O_count --> Lattice Sites
    H_o_M_for_training_NEB_transition_states.append( H_o_M / (2*O_count) ) # todo: re-check whether normalising per lattice site (and not per atom) is right


1. 	 Could not read /nfshome/winkelmann/ARL/NEBs_Marcel/0375_random02_seed_85/02_odh/NEB_initial-image01/run_final_decided_to_be_converged_patchworked/01/OUTCAR
2. 	 Could not read /nfshome/winkelmann/ARL/NEBs_Marcel/0375_random02_seed_85/02_odh/NEB_initial-image01/run_final_decided_to_be_converged_patchworked/04/OUTCAR
1. 	 Could not read /nfshome/winkelmann/ARL/NEBs_Marcel/0625_random01_seed_178/04_double_tsh/NEB_initial-image02/run_final_patchworked_decided_to_be_converged/05/OUTCAR
1. 	 Could not read /nfshome/winkelmann/ARL/NEBs_Marcel/0625_random02_seed_258/04_double_tsh/NEB_initial-image04/run_final_patchworked/05/OUTCAR


## Combine data

In [None]:
# combine the structures 
train_structures = ( atoms_for_training_from_own_enumerated_structures 
                    + atoms_for_training_from_Markus_low_energy_structures
                    + atoms_for_training_NEB_initial_and_final_images 
                    + atoms_for_training_NEB_transition_states 
             )

# ... and energies
train_H_o_M      = ( H_o_M_for_training_from_own_enumerated_structures 
                     + H_o_M_for_training_from_Markus_low_energy_structures
                     + H_o_M_for_training_NEB_initial_and_final_images 
                     + H_o_M_for_training_NEB_transition_states 
             )

# to be able to retrieve problematic files, keep the paths
file_location = ( outcars_for_training_from_own_enumerated_structures 
                + outcars_for_training_from_Markus_low_energy_structures 
                + outcars_for_training_NEB_initial_and_final_images 
                + paths_for_training_NEB_transition_states
                )

# The three lists are consumed in lockstep later on; differing lengths mean that
# labels no longer belong to the structure/energy at the same index.
if not (len(train_structures) == len(train_H_o_M) == len(file_location)):
    warnings.warn(f"Training lists differ in length: {len(train_structures)} structures, "
                  f"{len(train_H_o_M)} energies, {len(file_location)} file locations "
                  "- file locations may not match their structures")

# calc and print how many non-TS and TS structures are used for training
no_nonTS_structures = ( len(outcars_for_training_from_own_enumerated_structures)
                        + len(outcars_for_training_from_Markus_low_energy_structures)
                        + len(outcars_for_training_NEB_initial_and_final_images) )
# count the structures that were actually collected, not the folders that were scanned
no_TS_structures = len(atoms_for_training_NEB_transition_states)

print('# nonTS-Structures:\t', no_nonTS_structures)
print('# TS-Structures:\t', no_TS_structures)


# nonTS-Structures:	 882
# TS-Structures:	 135


# Fitting of just the Li sublattice

without TS


<font color='red'> Deleted the training of ardr_lambda; </font>
only the setup of the ClusterSpace was kept.

<font color='red'> Deleted the code cell that evaluated the model on the training structures; </font>
the initialization of data[] was kept (without predicted_energy).

<font color='red'> Deleted the code cell that plotted the ardr_lambda fit against the reference data for the different data sets. </font>

# First fitting for testing purposes of python implementation (no Ni_Li, but with transition states)

In [12]:
# Primitive LiNiO2 cell in R-3m symmetry; the cluster expansion with transition states is set up on it
prim_TS = ASEread("/nfshome/sadowski/work/LiNiO2_Sabrina/37_CE_for_Li_diffusion/00_LNO_R-3m.vasp")

# show the structure and the species of its sites
print(prim_TS, prim_TS.get_chemical_symbols(), sep="\n")

Atoms(symbols='LiNiO2', pbc=True, cell=[[2.8428983688, 0.0, 0.0], [-1.4214491844, 2.4620222078, 0.0], [1.42145, 0.82067, 4.71521]])
['Li', 'Ni', 'O', 'O']


In [13]:
# Allowed species per site of prim_TS, one entry per site
chemical_symbols_TS = [
    # Li sublattice: Li and vacancies (=X), later also Ni
    # NOTE(review): 'Ti' is a third label on this sublattice - presumably the
    # stand-in species for the transition-state Li; confirm
    ['Li', 'X', 'Ti'],
    # Ni sublattice will not be changed
    ['Ni'],
    # the two O sublattices will not be changed
    ['O'],
    ['O'],
]

Idea: pickle the data here (i.e. store it as binary data).

# Optimizing CE

## finding cutoffs
List the possible cutoffs and define the standard variables.

In [14]:
# read results cutoffs_noTS: one row per fit method with its three cutoffs
best_cutoffs_noTS = {}

# the context manager closes the file even if reading fails
with open('/nfshome/winkelmann/ARL/save/best_cutoffs_noTS.csv','r') as file:
    file.readline()  # skip the header line
    lines = file.readlines()

for line in lines:
    line = line.replace(' ', '')
    # tolerate blank lines (e.g. a trailing newline at the end of the file)
    if not line.strip():
        continue
    values = line.split(',')
    # float() ignores the tab / newline characters left around the numbers
    best_cutoffs_noTS[values[0]] = [float(values[1]), float(values[2]), float(values[3])]
    print('%14s:\t%s' % (values[0], best_cutoffs_noTS[values[0]]))

   ardr-lambda:	[11.38, 6.42, 6.42]
 ardr-lineScan:	[12.83, 6.42, 5.69]
           rfe:	[11.84, 6.42, 5.75]
         lasso:	[12.75, 6.42, 5.69]
 least-squares:	[11.84, 6.42, 5.69]
bayesian-ridge:	[11.84, 6.42, 5.69]
    elasticnet:	[12.4, 6.42, 5.75]
           omp:	[11.5, 6.42, 5.69]
         ridge:	[11.84, 6.42, 5.69]
 split-bregman:	[11.84, 6.42, 5.69]


In [15]:
import time as pytime

# Cutoff values to scan.
# Goes up to 8.09 at most - the biggest cutoff for which the transition state does not see itself.
cutoff_vals = [
    2.84,
    2.85,
]

# tolerances handed to the cluster-space construction and the structure mapping
position_tolerance = 0.01
symprec = 0.01
tol_positions = 0.05

# every fit method label known to this notebook ...
fit_methods_all = [
    'ardr-lambda', 'rfe', 'lasso', 'least-squares', 'bayesian-ridge',
    'omp', 'ridge', 'split-bregman', 'elasticnet', 'ardr-lineScan',
]
# ... and the ones that are scanned
fit_methods = [
    'least-squares', 'bayesian-ridge', 'omp', 'ridge',
    'split-bregman', 'elasticnet', 'ardr-lineScan', 'lasso',
]

# scan results, keyed by fit method
records = {}

# todo: look at the mapped TS structures in ovito
# todo: try weighting the TS structures higher --> not working yet:
#        weights = np.ones(len(y))
#        for structure, weight in zip(train_structures, weights):
#            if structure in atoms_for_training_NEB_transition_states:
#                weight = 10

In [16]:
# Output files of the cutoff scans, named once here so the paths are not repeated in several places
save_file_pair_cutoffs = (
    '/nfshome/winkelmann/ARL/tmp/cutoffs_2_anchor_fit'
)
save_file_triplet_cutoffs = (
    '/nfshome/winkelmann/ARL/tmp/cutoffs_3_anchor_fit'
)
save_file_quartet_cutoffs = (
    '/nfshome/winkelmann/ARL/tmp/cutoffs_4_anchor_fit'
)
save_file_best_cutoffs = (
    '/nfshome/winkelmann/ARL/tmp/best_cutoffs_anchor_fit'
)

### pair cutoff

In [None]:
# Scan the pair cutoff: for every fit method train one CE per value in cutoff_vals
# and log the fit metrics as one line of the save file.
save_file_pair_cutoffs = prevent_override(save_file_pair_cutoffs)
# '\n' instead of os.linesep: the file is opened in text mode, which translates newlines itself
file_format = '%14s,\t%20s,\t%13s,\t%15s,\t%15s,\t%15s,\t%7s,\t%7s,\t%7s\n'
with open(save_file_pair_cutoffs, 'w') as file:
    file.write(file_format % ('fit_method', 'cutoffs' , 'cutoff2', 'validation', 'train', 'BIC', 'number', 'nonzero','time'))
for fit_method in fit_methods:
    records[fit_method] = []
    c2_vals = cutoff_vals
    for c2 in c2_vals:
        start_time = pytime.time()
        # work on a copy so that the stored noTS cutoffs are not overwritten by the scan
        cutoffs = best_cutoffs_noTS[fit_method].copy()
        cutoffs[0] = c2
        cve = train_ce(prim=prim_TS,
                       chemical_symbols=chemical_symbols_TS,
                       cutoffs=cutoffs,
                       energy_list=train_H_o_M,
                       atoms_ref_list=train_structures,
                       outcar_list=file_location,
                       position_tolerance=position_tolerance,
                       symprec=symprec,
                       tol_positions=tol_positions,
                       fit_method=fit_method)
        # train_ce returns the estimator itself (not a mapping); get_row turns it into a dict of metrics
        records[fit_method].append({'c2': c2, **get_row(cve)})
        # append right away so that finished results survive an interrupted scan
        with open(save_file_pair_cutoffs, 'a') as file:
            file.write(file_format % (fit_method, cutoffs, c2, cve.rmse_validation, cve.rmse_train, cve.model.BIC, cve.n_parameters, cve.n_nonzero_parameters, pytime.time()-start_time))

#Total number of Li-Li Bonds as function of cutoff (intra layer/inter layer):
#                       same Layer          next layer x2   second next layer x2
# 2.84    0  ( 0/ 0)    
# 2.85    6  ( 6/ 0)    first  (6)
# 4.93   12  (12/ 0)    second (6)
# 5.00   18  (12/ 6)                        first  (3) 
# 5.69   24  (18/ 6)    third  (6)
# 5.75   30  (18/12)                        second (3) 
# 6.42   42  (18/24)                        third  (6) 
# 7.53   54  (30/24)    fourth (12) 
# 7.57   66  (30/36)                        fourth (6) 
# 8.09   72  (30/42)                        fifth  (3) 
# 8.53   78  (36/42)    fifth  (6)
# 8.57   90  (36/54)                        sixth  (6) 
# 9.47   96  (36/60)                        seventh(3)
# 9.58  102  (36/60/6)                                      first  (3)
# 9.85  108  (42/60/6)  sixth  (6)          
# 9.89  120  (42/72/6)                      eighth (6)
#10.00  126  (42/72/12)                                     second (3)
#10.26  138  (54/72/12) seventh(12)
#10.29  150  (54/84/12)                     ninth  (6)
#10.39  162  (54/84/24)                                     third  (6)
#11.10  174  (54/96/24)                     tenth(6)
#11.14  186  (54/96/36)                                     fourth(6)
#11.38  192  (60/96/36) eighth(6)
#11.50  198  (60/96/42)                                     fifth(3)
#11.76  210  (60/108/42)                    ninth(6)
#11.84  222  (60/108/54)                                    sixth(6)
#12.40  234  (72/108/54) ninth(12)
#12.42  252  (72/126/54)                    tenth(9)
#12.51  258  (72/126/60)                                    seventh(3)
#12.75  270  (72/138/60)                    eleventh(6)
#12.83  282  (72/138/72)                                    eighth(6)
#13.03  294  (84/138/72) tenth(12)
#13.14  306  (84/138/84)                                    ninth(6)
#13.66  318  (84/150/84)                    twelfth(6)
#13.74  330  (84/150/96)                                    tenth(6)
#13.96  336  (84/156/96)                    thirteenth(3)
#14.15  338  (84/156/96/2)                                                  first(1)


### triplet cutoff

In [17]:
# read results cutoff2: pick, per fit method, the pair cutoff with the lowest validation RMSE

best_cutoffs2 = {}
df2 = {}

for fit_method in fit_methods_all:
    df2[fit_method] = {'c2': [], 'rmse_validation': [], 'rmse_train': [], 'BIC': [], 'n_parameters': [],
                       'n_nonzero_parameters': []}
with open(save_file_pair_cutoffs, 'r') as file:
    file.readline()  # skip the header line
    lines = file.readlines()
for line in lines:
    line = line.replace(' ', '')
    # tolerate blank lines
    if not line.strip():
        continue
    values = line.split(',')
    # The bracketed cutoff list contains commas itself and therefore occupies the
    # fields 1-3; the scalar columns start at field 4.
    df2[values[0]]['c2'].append(float(values[4]))
    df2[values[0]]['rmse_validation'].append(float(values[5]))
    df2[values[0]]['rmse_train'].append(float(values[6]))
    df2[values[0]]['BIC'].append(float(values[7]))
    df2[values[0]]['n_parameters'].append(float(values[8]))
    df2[values[0]]['n_nonzero_parameters'].append(float(values[9]))

for fit_method in fit_methods_all:
    df2[fit_method] = pd.DataFrame(df2[fit_method])
    # methods that were not scanned have no rows; idxmin() would raise on the empty column
    if df2[fit_method].empty:
        print('No pair cutoff results for %s - skipped' % fit_method)
        continue
    best_cutoffs2[fit_method] = df2[fit_method].c2[df2[fit_method].rmse_validation.idxmin()]

save_file_best_cutoffs = prevent_override(save_file_best_cutoffs)
# '\n' instead of os.linesep: the file is opened in text mode, which translates newlines itself
file_format = '%14s,\t%13s\n'
with open(save_file_best_cutoffs, 'w') as file:
    file.write(file_format % ('fit_method', 'cutoff2'))
    for key, cutoff in best_cutoffs2.items():
        file.write(file_format % (key, cutoff))
max_cutoff2 = max(best_cutoffs2.values())
print('Max pair cutoff: %s' % max_cutoff2)

Max pair cutoff: 8.09


In [None]:
# Scan the triplet cutoff with the pair cutoff fixed to the best value found before.
# c3_vals = cutoff_vals[0: biggest min(rmse_validation) cutoff2]
# changed to 10 since everything above 8.09=cutoff_vals[9]
save_file_triplet_cutoffs = prevent_override(save_file_triplet_cutoffs)
# '\n' instead of os.linesep: the file is opened in text mode, which translates newlines itself
file_format = '%14s,\t%20s,\t%13s,\t%15s,\t%15s,\t%15s,\t%7s,\t%7s,\t%7s\n'
with open(save_file_triplet_cutoffs, 'w') as file:
    file.write(file_format % ('fit_method','cutoffs', 'cutoff3', 'validation', 'train', 'BIC', 'number', 'nonzero','time'))
c3_vals = cutoff_vals
for fit_method in fit_methods:
    records[fit_method] = []
    #c3_opt_noTS_index = cutoff_vals.index(best_cutoffs_noTS[fit_method][1])
    for c3 in c3_vals:
        start_time = pytime.time()
        # copy, so that the stored noTS cutoffs stay untouched
        cutoffs = best_cutoffs_noTS[fit_method].copy()
        cutoffs[0] = best_cutoffs2[fit_method] # best pair cutoff found for this fit method
        cutoffs[1] = c3
        # keep the quartet cutoff from exceeding the triplet cutoff
        if c3 < cutoffs[2]:
            cutoffs[2] = c3
        cve = train_ce(prim=prim_TS,
                       chemical_symbols=chemical_symbols_TS,
                       cutoffs=cutoffs,
                       energy_list=train_H_o_M,
                       atoms_ref_list=train_structures,
                       outcar_list=file_location,
                       position_tolerance=position_tolerance,
                       symprec=symprec,
                       tol_positions=tol_positions,
                       fit_method=fit_method)
        records[fit_method].append({
            'c3': c3,
            'rmse_validation': cve.rmse_validation,
            'rmse_train': cve.rmse_train,
            'BIC': cve.model.BIC,
            'n_parameters': cve.n_parameters,
            'n_nonzero_parameters': cve.n_nonzero_parameters
        })

        # append right away so that finished results survive an interrupted scan
        with open(save_file_triplet_cutoffs, 'a') as file:
            file.write(file_format % (fit_method, cutoffs, c3, cve.rmse_validation, cve.rmse_train, cve.model.BIC, cve.n_parameters, cve.n_nonzero_parameters, pytime.time()-start_time))

### quartet cutoff

In [18]:


# Cutoff values to scan.
# Goes up to 8.09 - the biggest cutoff for which the transition state does not see itself.
cutoff_vals = [
    2.84, 2.85, 4.93, 5.00, 5.69,
    5.75, 6.42, 7.53, 7.57, 8.09,
]

# tolerances handed to the cluster-space construction and the structure mapping
position_tolerance = 0.01
symprec = 0.01
tol_positions = 0.05

# fit methods to scan; not included here:
# 'ardr-lambda', 'rfe', 'omp', 'ridge', 'split-bregman', 'elasticnet', 'ardr-lineScan', 'lasso'
fit_methods = [
    'least-squares',
    'bayesian-ridge',
]

# save files of the triplet and quartet scans, set back to their base names
save_file_triplet_cutoffs = (
    '/nfshome/winkelmann/ARL/tmp/cutoffs_3_anchor_fit'
)
save_file_quartet_cutoffs = (
    '/nfshome/winkelmann/ARL/tmp/cutoffs_4_anchor_fit'
)

In [19]:
# read results cutoff3: pick, per fit method, the triplet cutoff with the lowest validation RMSE

best_cutoffs3 = {}
df3 = {}

for fit_method in fit_methods_all:
    df3[fit_method] = {'c3': [], 'rmse_validation': [], 'rmse_train': [], 'BIC': [], 'n_parameters': [],
                       'n_nonzero_parameters': []}
with open(save_file_triplet_cutoffs,'r') as file:
    file.readline()  # skip the header line
    lines = file.readlines()

for line in lines:
    line = line.replace(' ', '')
    # tolerate blank lines
    if not line.strip():
        continue
    values = line.split(',')
    # The bracketed cutoff list contains commas itself and therefore occupies the
    # fields 1-3; the scalar columns start at field 4.
    df3[values[0]]['c3'].append(float(values[4]))
    df3[values[0]]['rmse_validation'].append(float(values[5]))
    df3[values[0]]['rmse_train'].append(float(values[6]))
    df3[values[0]]['BIC'].append(float(values[7]))
    df3[values[0]]['n_parameters'].append(float(values[8]))
    df3[values[0]]['n_nonzero_parameters'].append(float(values[9]))

for fit_method in fit_methods_all:
    df3[fit_method] = pd.DataFrame(df3[fit_method])
    # methods that were not scanned have no rows; idxmin() would raise on the empty column
    if df3[fit_method].empty:
        print('No triplet cutoff results for %s - skipped' % fit_method)
        continue
    best_cutoffs3[fit_method] = df3[fit_method].c3[df3[fit_method].rmse_validation.idxmin()]

# extend the best-cutoffs save file by the triplet column
with open(save_file_best_cutoffs, 'r') as file:
    file.readline()  # skip the header line
    lines = file.readlines()
# a header-only file has no data row to inspect
line0 = lines[0].replace(' ', '').split(',') if lines else []
# more than two columns: the file already holds a triplet column, so write to a fresh file instead
if(len(line0) > 2):
    save_file_best_cutoffs = prevent_override(save_file_best_cutoffs)
# '\n' instead of os.linesep: the file is opened in text mode, which translates newlines itself
file_format = '%14s,\t%13s,\t%13s\n'
with open(save_file_best_cutoffs, 'w') as file:
    file.write(file_format % ('fit_method', 'cutoff2', 'cutoff3'))
    for key,cutoff in best_cutoffs3.items():
        # nan marks a method for which no pair cutoff is known
        file.write(file_format %(key,best_cutoffs2.get(key, float('nan')), cutoff))
max_cutoff3 = max(best_cutoffs3.values())
print('Max triplet cutoff: %s' % max_cutoff3)

Max triplet cutoff: 8.09


In [None]:
# Quartet-cutoff scan: for every fit method, train with the best cutoff2 and
# cutoff3 plus each candidate cutoff4, and append one result row per
# successful fit to the quartet scan file.
# Per the original note, c4_vals was meant to be cutoff_vals sliced up to the
# largest best cutoff3; the full list is used because 8.09 (= cutoff_vals[9])
# is already its last entry.
save_file_quartet_cutoffs = prevent_override(save_file_quartet_cutoffs)
file_format = '%14s,\t%20s,\t%13s,\t%15s,\t%15s,\t%15s,\t%7s,\t%7s,\t%7s' + os.linesep
# Context managers guarantee the handle is closed even if formatting or
# writing a row raises.
with open(save_file_quartet_cutoffs, 'w') as scan_file:
    scan_file.write(file_format % ('fit_method','cutoffs', 'cutoff4', 'validation', 'train', 'BIC', 'number', 'nonzero','time'))
for fit_method in fit_methods:
    print(f'\33[46mfit_method: {fit_method}\33[0m')
    records[fit_method] = []
    c4_vals = cutoff_vals
    for c4 in c4_vals:
        start_time = pytime.time()
        # Best cutoff2 and cutoff3 for this fit method, then the cutoff4
        # candidate under test.
        cutoffs = [best_cutoffs2[fit_method], best_cutoffs3[fit_method], c4]
        # Stop at the first cutoff4 that exceeds the chosen cutoff3.
        if c4 > cutoffs[1]:
            print(f'stopped {fit_method} since c3 limit was reached at {c4}')
            break
        print(f'{fit_method} with cutoffs: {cutoffs} started at {datetime.datetime.now()}')
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('once')
                cve = train_ce(prim=prim_TS,
                               chemical_symbols=chemical_symbols_TS,
                               cutoffs=cutoffs,
                               energy_list=train_H_o_M,
                               atoms_ref_list=train_structures,
                               outcar_list=file_location,
                               position_tolerance=position_tolerance,
                               symprec=symprec,
                               tol_positions=tol_positions,
                               fit_method=fit_method)
        except Exception as e:
            # Deliberate best-effort: report the failure and go on with the
            # next cutoff. \33[<style>m is the ANSI escape code for color and
            # style.
            print(f'aborted due to \33[41m Error:\33[0m \n \t \33[45m{e} \33[0m')
            continue
        records[fit_method].append({
            'c4': c4,
            'rmse_validation': cve.rmse_validation,
            'rmse_train': cve.rmse_train,
            'BIC': cve.model.BIC,
            'n_parameters': cve.n_parameters,
            'n_nonzero_parameters': cve.n_nonzero_parameters
        })

        # Append immediately so finished fits survive a later interruption.
        with open(save_file_quartet_cutoffs, 'a') as scan_file:
            scan_file.write(file_format % (fit_method, cutoffs, c4, cve.rmse_validation, cve.rmse_train, cve.model.BIC, cve.n_parameters, cve.n_nonzero_parameters, pytime.time()-start_time))

In [20]:
# Read the quartet-cutoff scan results, pick per fit method the cutoff4 with
# the lowest validation RMSE, and write the overall best cutoffs (cutoff2,
# cutoff3, cutoff4 and the matching RMSE) to the best-cutoffs summary file.

best_cutoffs4 = {}
df4 = {}

for fit_method in fit_methods_all:
    df4[fit_method] = {'c4': [], 'rmse_validation': [], 'rmse_train': [], 'BIC': [], 'n_parameters': [],
                       'n_nonzero_parameters': []}

# Context manager so the handle is closed again (it used to stay open for
# the rest of the session).
with open(save_file_quartet_cutoffs, 'r') as scan_file:
    scan_file.readline()  # skip the header row
    lines = scan_file.readlines()

for line in lines:
    if not line.strip():
        # Tolerate blank/trailing lines instead of failing with a KeyError.
        continue
    values = line.replace(' ', '').split(',')
    # The 'cutoffs' column is a printed three-element list and therefore
    # contains commas itself: it occupies indices 1-3, so the scan data
    # starts at index 4.
    method_records = df4[values[0]]
    method_records['c4'].append(float(values[4]))
    method_records['rmse_validation'].append(float(values[5]))
    method_records['rmse_train'].append(float(values[6]))
    method_records['BIC'].append(float(values[7]))
    method_records['n_parameters'].append(float(values[8]))
    method_records['n_nonzero_parameters'].append(float(values[9]))

for fit_method in fit_methods_all:
    df4[fit_method] = pd.DataFrame(df4[fit_method])
    # Row label of the smallest validation RMSE selects the best cutoff4.
    best_row = df4[fit_method].rmse_validation.idxmin()
    best_cutoffs4[fit_method] = df4[fit_method].c4[best_row]

with open(save_file_best_cutoffs, 'r') as best_file:
    best_file.readline()  # skip the header row
    lines = best_file.readlines()
# First data row split into columns; empty if the file holds only a header.
line0 = lines[0].replace(' ', '').split(',') if lines else []
# More than three columns means the file already has a fourth column; in
# that case ask prevent_override (defined elsewhere in the notebook) for the
# path to write to instead of reusing the current one.
if len(line0) > 3:
    save_file_best_cutoffs = prevent_override(save_file_best_cutoffs)
file_format = '%14s,\t%13s,\t%13s,\t%13s,\t%13s' + os.linesep
with open(save_file_best_cutoffs, 'w') as best_file:
    best_file.write(file_format % ('fit_method', 'cutoff2', 'cutoff3', 'cutoff4', 'RMSE'))
    for key, cutoff in best_cutoffs4.items():
        best_file.write(file_format % (key, best_cutoffs2[key], best_cutoffs3[key], cutoff, df4[key].rmse_validation.min()))
max_cutoff4 = max(best_cutoffs4.values())
print('Max quartet cutoff: %s' % max_cutoff4)

Max quartet cutoff: 8.09


### creating Cluster_Expansions

In [21]:
# Train one final model per fit method with its best cutoff2/cutoff3/cutoff4,
# write the resulting cluster expansion to disk, and report which fit method
# reached the lowest validation RMSE.
records = {}
rmses = []
for fit_method in fit_methods:
    records[fit_method] = []
    start_time = pytime.time()
    # Best cutoffs per cluster order, as selected by the preceding scans.
    cutoffs = [
        best_cutoffs2[fit_method],
        best_cutoffs3[fit_method],
        best_cutoffs4[fit_method],
    ]
    # Settings shared by the training call and the cluster space below.
    shared_settings = dict(cutoffs=cutoffs,
                           chemical_symbols=chemical_symbols_TS,
                           position_tolerance=position_tolerance,
                           symprec=symprec)
    cve = train_ce(prim=prim_TS,
                   energy_list=train_H_o_M,
                   atoms_ref_list=train_structures,
                   outcar_list=file_location,
                   tol_positions=tol_positions,
                   fit_method=fit_method,
                   **shared_settings)
    records[fit_method].append({'c4': cve})
    rmses.append(cve.rmse_validation)

    # Build the cluster space with the same settings and attach the fitted
    # parameters to it.
    cs = ClusterSpace(structure=prim_TS, **shared_settings)
    ce = ClusterExpansion(cluster_space=cs,
                          parameters=cve.parameters,
                          metadata=cve.summary)
    ce.write(f'/nfshome/winkelmann/ARL/tmp/mixing_energy_anchorTS_{fit_method}.ce')
    print(f'wrote {fit_method} with RMSE: {cve.rmse_validation}')

# rmses is filled in fit_methods order, so the index of the minimum maps
# straight back to the method name.
best_rmse = min(rmses)
print(f'\n best RMSE: {best_rmse} with {fit_methods[rmses.index(best_rmse)]}')

Condition number is large, 1.019489856321668e+18
Condition number is large, 3.531758669050027e+17
Condition number is large, 1.4867479338960443e+18
Condition number is large, 6.722971955944794e+17


len(cs) = 172


Condition number is large, 3.178305390695976e+17
Condition number is large, 9.0036720130702e+17
Condition number is large, 4.41053433205858e+17
Condition number is large, 2.6448065997812707e+17
Condition number is large, 2.9209849663759443e+17
Condition number is large, 1.1851314888419638e+18
Condition number is large, 2.8767688818192067e+18


wrote least-squares with RMSE: 0.0022913781869000633


Condition number is large, 7.642396684174648e+17


len(cs) = 224


Condition number is large, 6.361726041015754e+17
Condition number is large, 1.085613296204174e+19
Condition number is large, 1.0728763209445961e+18
Condition number is large, 1.07054192065575e+18
Condition number is large, 9.683149129396754e+17
Condition number is large, 2.441862125007248e+18
Condition number is large, 1.7404695756362191e+18
Condition number is large, 3.743853648737062e+17
Condition number is large, 9.026153451655617e+17
Condition number is large, 4.696940876929178e+17


wrote bayesian-ridge with RMSE: 0.002282041488416971

 best RMSE: 0.002282041488416971 with bayesian-ridge


In [None]:
# Train and export a cluster expansion for one fit method only, using
# hard-coded cutoffs instead of the scan results.
fit_method = 'ardr-lambda'
rmses = []
records[fit_method] = {}
start_time = pytime.time()
# Hand-picked cutoffs, ordered C2, C3, C4.
cutoffs = [8.09, 6.42, 6.42]
# Settings shared by the training call and the cluster space below.
shared_settings = dict(cutoffs=cutoffs,
                       chemical_symbols=chemical_symbols_TS,
                       position_tolerance=position_tolerance,
                       symprec=symprec)
cve = train_ce(prim=prim_TS,
               energy_list=train_H_o_M,
               atoms_ref_list=train_structures,
               outcar_list=file_location,
               tol_positions=tol_positions,
               fit_method=fit_method,
               **shared_settings)
rmses.append(cve.rmse_validation)

# Build the cluster space with the same settings and attach the fitted
# parameters to it.
cs = ClusterSpace(structure=prim_TS, **shared_settings)
ce = ClusterExpansion(cluster_space=cs,
                      parameters=cve.parameters,
                      metadata=cve.summary)
ce.write(f'/nfshome/winkelmann/ARL/tmp/mixing_energy_anchorTS_{fit_method}.ce')
print(f'wrote {fit_method} with RMSE: {rmses[0]}')

Possibly also stale leftover code?

In [None]:
# Quartet-cutoff scan with a hard-coded pair cutoff of 8.09 and the two
# triplet cutoffs 8.09 and 6.42; one result row per fit is appended to the
# 'cutoff_4_jochen_fit' file.
# Per the original note, c4_vals was meant to be cutoff_vals sliced up to the
# largest best cutoff3; the full list is used and larger values are skipped
# inside the loop.
# NOTE(review): this cell rebinds save_file_pair_cutoffs although it stores
# quartet-scan results -- confirm that no later cell relies on the old value.

save_file_pair_cutoffs = prevent_override('/nfshome/winkelmann/ARL/tmp/cutoff_4_jochen_fit')
file_format = '%14s,\t%13s,\t%13s,\t%15s,\t%15s,\t%15s,\t%7s,\t%7s,\t%7s' + os.linesep
# Context managers guarantee the handle is closed even if formatting or
# writing a row raises.
with open(save_file_pair_cutoffs, 'w') as scan_file:
    scan_file.write(file_format % ('fit_method','cutoff3', 'cutoff4', 'validation', 'train', 'BIC', 'number', 'nonzero','time'))
c4_vals = cutoff_vals
for fit_method in fit_methods:
    # Results are grouped by the triplet cutoff, stored as a string key.
    records[fit_method] = {'8.09':[],
                           '6.42':[]}
    for key in records[fit_method]:
        c3_cutoff = float(key)
        for c4 in c4_vals:
            # Only quartet cutoffs up to the current triplet cutoff are fitted.
            if c4 > c3_cutoff:
                continue
            start_time = pytime.time()
            cutoffs = [8.09, c3_cutoff, c4]
            cve = train_ce(prim=prim_TS,
                           chemical_symbols=chemical_symbols_TS,
                           cutoffs=cutoffs,
                           energy_list=train_H_o_M,
                           atoms_ref_list=train_structures,
                           outcar_list=file_location,
                           position_tolerance=position_tolerance,
                           symprec=symprec,
                           tol_positions=tol_positions,
                           fit_method=fit_method)
            # The train_ce result is read through attributes, as in the other
            # scan cells of this notebook; the earlier dict-style access
            # (cve['...'], **cve) did not match that usage.
            records[fit_method][key].append({
                'key': c3_cutoff,
                'rmse_validation': cve.rmse_validation,
                'rmse_train': cve.rmse_train,
                'BIC': cve.model.BIC,
                'n_parameters': cve.n_parameters,
                'n_nonzero_parameters': cve.n_nonzero_parameters
            })

            # Append immediately so finished fits survive a later interruption.
            with open(save_file_pair_cutoffs, 'a') as scan_file:
                scan_file.write(file_format % (fit_method, key, c4, cve.rmse_validation, cve.rmse_train, cve.model.BIC, cve.n_parameters, cve.n_nonzero_parameters, pytime.time()-start_time))