In [1]:
%load_ext autoreload
%autoreload 2
from Ge_calculation import GAP, MD_run
from Ge_analysis import *
import pickle
from ase.io.cfg import read_cfg
import numpy as np
import os
from ase import build
from ase.atoms import Atoms
from quippy.potential import Potential
from quippy.descriptors import Descriptor
from ase.io.proteindatabank import read_proteindatabank, write_proteindatabank
from ase.io import read
from copy import deepcopy
from ase.io.castep import read_castep_castep, read_param
from ase.io import read
from ase.io.extxyz import read_xyz, write_xyz
from ase.data import atomic_masses
from ase.neighborlist import neighbor_list
from ase.data import covalent_radii
import pandas as pd
from ase.lattice import hexagonal, tetragonal, orthorhombic
from ase.constraints import StrainFilter, UnitCellFilter, ExpCellFilter, FixAtoms
from ase.optimize import BFGS
import pymatgen.ext.matproj as mp
import pymatgen.core.periodic_table as pt
import pymatgen.io.ase as pase
from sklearn import decomposition
from pyuoi.decomposition import CUR
import matplotlib.pyplot as plt

In [119]:
def reduce_DB(xyz, gen_red=1, ctypes=[None], outfile=None, size_dep_prob={}, seed=None):
    '''Write a randomly thinned and/or filtered copy of an extended-XYZ database.

    Intended for robustness stress-testing: how small can the database get?
    Three filters are applied in this order:

    1. general thinning - every configuration that does not contain exactly
       one atom is kept with probability ``gen_red``;
    2. size-dependent thinning - every remaining configuration is kept with
       probability ``size_dep_prob[str(n_atoms)]`` (sizes not listed count as 1);
    3. config_type filter - a configuration is kept if its
       ``info['config_type']`` is in ``ctypes``, or contains ``'isol'``,
       or if ``None`` is in ``ctypes`` (meaning "keep every type").

    xyz: str, path of the database to read
    gen_red: float, probability of keeping a multi-atom configuration in step 1
    ctypes: list of str, config_types to keep in new DB. Leave [None] for all
    outfile: str, optional custom name for new DB file; defaults to the
        input name with its extension replaced by '_red.xyz'
    size_dep_prob: dict, maps str(number of atoms) -> keep probability.
        The dict passed in is not modified
    seed: int, optional seed for a private random stream so that a reduction
        can be reproduced. With None the global NumPy random state is used

    Returns (keeps, popped): lists of the configurations written to outfile
    and of those removed. The printed "final fraction" is by atom count,
    not by number of configurations.
    '''
    # One uniform draw in [0, 1) per configuration and per thinning stage
    r = np.random.rand if seed is None else np.random.RandomState(seed).rand
    ats = list(read_xyz(xyz, index=slice(0, None)))

    # Work on a private copy: filling in the missing sizes must neither leak
    # into the shared default dict nor alter the caller's dict
    probs = dict(size_dep_prob)
    for n in np.unique([len(a) for a in ats]):
        probs.setdefault(str(n), 1)

    ini = sum(len(a) for a in ats)
    keeps = []
    popped = []

    # Iterate backwards so that pop(i) does not shift indices still to be visited
    for i in range(len(ats) - 1, -1, -1):
        if r() > gen_red and len(ats[i]) != 1:
            popped.append(ats.pop(i))

    for i in range(len(ats) - 1, -1, -1):
        if r() > probs[str(len(ats[i]))]:
            popped.append(ats.pop(i))

    keep_all = None in ctypes
    for val in ats:
        # .get: a configuration without a config_type label must not abort the run
        ctype = val.info.get('config_type')
        if keep_all or ctype in ctypes or (isinstance(ctype, str) and 'isol' in ctype):
            keeps.append(val)
        else:
            popped.append(val)

    if outfile is None:
        # splitext strips only the final extension and returns a str
        outfile = os.path.splitext(xyz)[0] + '_red.xyz'

    write_xyz(outfile, keeps)

    print('total {0} popped\n'.format(len(popped)))
    kept_atoms = sum(len(a) for a in keeps)
    # Guard against an empty input database (ini == 0)
    fraction = 100 * kept_atoms / ini if ini else 0.0
    print('final fraction = {:2.2f}%'.format(fraction))
    if keeps:
        print_DB_stats(keeps)

    return keeps, popped

In [47]:
def _size_rows(configs):
    '''Return (total atom count, formatted table rows) for a list of configurations.

    One row per distinct configuration size: size, number of configurations of
    that size, and the share of all atoms (in %) held by that size.
    '''
    hist = [len(c) for c in configs]
    tot = sum(hist)
    sizes, freq = np.unique(hist, return_counts=True)
    rows = ['{:<12d}{:<12d}{:<11.1f}'.format(size, count, 100*size*count/tot)
            for size, count in zip(sizes, freq)]
    return tot, rows


def print_DB_stats(atoms, by_config_type=True):
    '''Print a size histogram of a database, optionally split by config_type.

    atoms: list of configurations (objects supporting len() and carrying an
        .info dict), or a list of such lists, which is flattened first
    by_config_type: bool, also print one table per distinct info['config_type'];
        configurations without that key are left out of the per-type tables

    The "percentage" column is the share of atoms, not of configurations.
    An empty database prints a short notice instead of raising.
    '''
    print('Size statistics:\n'+'-'*36)
    if len(atoms) == 0:
        # atoms[0] below would raise IndexError; there is nothing to tabulate
        print('(empty database)')
        return
    if isinstance(atoms[0], list):
        atoms = flatten(atoms)

    print(('{:<12s}'*3).format('size', 'freq', 'percentage'))
    _, rows = _size_rows(atoms)
    for row in rows:
        print(row)

    if by_config_type:
        # dicts keep insertion order, so types print in order of first appearance
        groups = {}
        for cfg in atoms:
            if 'config_type' in cfg.info:
                groups.setdefault(cfg.info['config_type'], []).append(cfg)

        print('\nBy config types:\n'+'-'*36)
        for label, members in groups.items():
            tot, rows = _size_rows(members)
            print('{:<16s} {} atoms'.format(label, tot))
            for row in rows:
                print(row)
            print('-'*36+'\n')

In [49]:
# Work inside the reduction-testing directory so the relative .xyz names used
# in this and later cells resolve there.
# NOTE(review): both absolute paths are specific to one machine; a configurable
# base directory would make the notebook portable.
os.chdir('/home/joe/Documents/Ge_od/Potentials/Si_myDB_reduction_testing')
# GAP comes from the local Ge_calculation module; presumably it loads the
# training/validation sets and the existing potential file - confirm against
# that module.
my = GAP('Si_myDB_64_125_216_train.xyz',
         val_file='Si_myDB_64_125_216_validate.xyz',
         pot='/home/joe/Documents/Ge_od/Potentials/Si_myDB_216/Si_myDB_liqamo_loose.xml')

Training set structure count: 281
Validation set structure count: 70
Reading xyz file (may take a while)
Read configs, now fixing virials
Config labels: ['amorph', 'inter', 'hiT_amorph', 'liq', 'hiT_liq']


In [48]:
# Baseline: size / config_type composition of the full training set before any reduction
print_DB_stats(list(read_xyz('Si_myDB_64_125_216_train.xyz', index=slice(0,None))))

Size statistics:
------------------------------------
size        freq        percentage  
1           1           0.0        
64          200         48.4       
125         40          18.9       
216         40          32.7       

By config types:
------------------------------------
isol             1 atoms
1           1           100.0      
------------------------------------

amorph           5288 atoms
64          40          48.4       
125         8           18.9       
216         8           32.7       
------------------------------------

inter            5288 atoms
64          40          48.4       
125         8           18.9       
216         8           32.7       
------------------------------------

hiT_amorph       5288 atoms
64          40          48.4       
125         8           18.9       
216         8           32.7       
------------------------------------

liq              5288 atoms
64          40          48.4       
125         8           1

Trying various reductions in DB size to see where failure occurs
* use loose gap_fit default_sigma
* automatic run of 64-atom quench at the end

Ideas:
* amorphous only
* liquid only
* random thinning to ~75%, 50%, 25% of the original DB (`gen_red` is the fraction kept)
* 64-atom cells only


In [52]:
# Show the config_type labels available for use in reduce_DB's ctypes argument
my.config_labels


['amorph', 'inter', 'hiT_amorph', 'liq', 'hiT_liq']

In [63]:
# Liquid-only database: keep 'liq', 'inter' and 'hiT_liq' (reduce_DB also keeps
# any config_type containing 'isol'); lk = kept, lp = removed configurations.
# NOTE(review): the recorded output includes a 'config_types removed' line that
# the reduce_DB shown in this notebook does not print - the output looks stale.
lk, lp = reduce_DB('Si_myDB_64_125_216_train.xyz', ctypes=['liq', 'inter', 'hiT_liq'],
                   outfile='Si_myDB_64_125_216_t_liq.xyz')


total 112 popped

config_types removed: ['amorph', 'hiT_amorph']


In [64]:
# Amorphous-only database: keep 'amorph' and 'hiT_amorph' (reduce_DB also keeps
# any config_type containing 'isol'); ak = kept, ap = removed configurations.
ak, ap = reduce_DB('Si_myDB_64_125_216_train.xyz', ctypes=['amorph', 'hiT_amorph'],
                   outfile='Si_myDB_64_125_216_t_amo.xyz')



total 168 popped

config_types removed: ['inter', 'liq', 'hiT_liq']


In [112]:
# Random thinning: each multi-atom configuration is kept with probability 0.75.
# No random seed is set in the visible cells, so the exact selection (and the
# printed statistics) will differ from run to run.
r75k, r75p = reduce_DB('Si_myDB_64_125_216_train.xyz', gen_red=0.75,
                   outfile='Si_myDB_64_125_216_t_r75.xyz')

total 56 popped

final fraction = 76.21%
Size statistics:
------------------------------------
size        freq        percentage  
1           1           0.0        
64          166         52.7       
125         33          20.5       
216         25          26.8       

By config types:
------------------------------------
isol             1 atoms
1           1           100.0      
------------------------------------

amorph           4067 atoms
64          33          51.9       
125         7           21.5       
216         5           26.6       
------------------------------------

inter            4070 atoms
64          35          55.0       
125         6           18.4       
216         5           26.5       
------------------------------------

hiT_amorph       4222 atoms
64          34          51.5       
125         6           17.8       
216         6           30.7       
------------------------------------

liq              3939 atoms
64          31      

In [113]:
# Random thinning: each multi-atom configuration is kept with probability 0.5
# (unseeded, so the selection is not reproducible).
r50k, r50p = reduce_DB('Si_myDB_64_125_216_train.xyz', gen_red=0.5,
                   outfile='Si_myDB_64_125_216_t_r50.xyz')

total 141 popped

final fraction = 49.07%
Size statistics:
------------------------------------
size        freq        percentage  
1           1           0.0        
64          99          48.8       
125         22          21.2       
216         18          30.0       

By config types:
------------------------------------
isol             1 atoms
1           1           100.0      
------------------------------------

amorph           2367 atoms
64          21          56.8       
125         3           15.8       
216         3           27.4       
------------------------------------

hiT_amorph       2766 atoms
64          18          41.6       
125         6           27.1       
216         4           31.2       
------------------------------------

inter            2391 atoms
64          18          48.2       
125         3           15.7       
216         4           36.1       
------------------------------------

hiT_liq          3049 atoms
64          21     

In [116]:
# Random thinning: each multi-atom configuration is kept with probability 0.25
# (unseeded, so the selection is not reproducible).
r25k, r25p = reduce_DB('Si_myDB_64_125_216_train.xyz', gen_red=0.25,
                   outfile='Si_myDB_64_125_216_t_r25.xyz')


total 220 popped

final fraction = 19.48%
Size statistics:
------------------------------------
size        freq        percentage  
1           1           0.0        
64          46          57.2       
125         9           21.8       
216         5           21.0       

By config types:
------------------------------------
isol             1 atoms
1           1           100.0      
------------------------------------

amorph           765 atoms
64          10          83.7       
125         1           16.3       
------------------------------------

hiT_amorph       1167 atoms
64          9           49.4       
125         3           32.1       
216         1           18.5       
------------------------------------

inter            1255 atoms
64          7           35.7       
125         3           29.9       
216         2           34.4       
------------------------------------

hiT_liq          765 atoms
64          10          83.7       
125         1        

In [121]:
# 64-atom cells only: a keep probability of 0 for the 125- and 216-atom sizes
# removes those cells; sizes not listed keep the default probability of 1.
cell64k, cell64p = reduce_DB('Si_myDB_64_125_216_train.xyz', size_dep_prob={'125':0, '216':0},
                             outfile='Si_myDB_64_125_216_64at.xyz')

total 80 popped

final fraction = 48.41%
Size statistics:
------------------------------------
size        freq        percentage  
1           1           0.0        
64          200         100.0      

By config types:
------------------------------------
isol             1 atoms
1           1           100.0      
------------------------------------

amorph           2560 atoms
64          40          100.0      
------------------------------------

inter            2560 atoms
64          40          100.0      
------------------------------------

hiT_amorph       2560 atoms
64          40          100.0      
------------------------------------

liq              2560 atoms
64          40          100.0      
------------------------------------

hiT_liq          2560 atoms
64          40          100.0      
------------------------------------

