In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd

In [3]:
from pyace.preparedata import WEIGHTS_ENERGY_COLUMN, WEIGHTS_FORCES_COLUMN, normalize_energy_forces_weights

# 1. Loading reference dataframe

## 1.1 Load custom pickled dataframe

In [64]:
# Load the reference dataset from a gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# NOTE(review): the path below looks like a placeholder -- point it at your own dataset.
df=pd.read_pickle("/some/fit/data/Al-Li/data.pkl.gz")

In [66]:
df.shape

(1487, 9)

In [67]:
df.head()

Unnamed: 0,ase_atoms,name,energy,energy_corrected,forces,NUMBER_OF_ATOMS,pbc,energy_corrected_per_atom,tp_atoms
0,"(Atom('Li', [-0.014446006840260196, -0.0049246...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/LiAl...,-94.550144,-87.717829,"[[0.11097906, 0.05236876, 0.50914217], [-0.284...",32,True,-2.741182,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,"(Atom('Al', [0.0013193089365453575, 0.02689491...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/Al_f...,-116.333506,-107.673578,"[[-0.00163197, -0.13034093, 0.0503274], [-0.11...",32,True,-3.364799,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,"(Atom('Li', [-1.1745012357699007, 3.5237344009...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/Li2A...,-126.763253,-117.428585,"[[-0.04736383, 0.07383792, 0.1456961], [-0.108...",48,True,-2.446429,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,"(Atom('Li', [0.08259027073574945, 0.0310304786...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/LiAl...,-92.792396,-85.96008,"[[-0.11177808, 0.14575526, -0.62374615], [-0.2...",32,True,-2.686253,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,"(Atom('Li', [0.08554239130188349, 0.0321396372...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/LiAl...,-93.870466,-87.038151,"[[-0.06677056, 0.09980793, -0.46699359], [-0.2...",32,True,-2.719942,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


# 2. Custom energy/forces weights and all other columns

## 2.1 Set custom weights for all structures

Example: energy weights are inversely proportional to the number of atoms

In [72]:
df[WEIGHTS_ENERGY_COLUMN] = 1./df["NUMBER_OF_ATOMS"]

Force weights are inversely proportional to the number of atoms

In [73]:
def generate_force_weights(row):
    """Build uniform per-atom force weights for a single structure.

    Parameters
    ----------
    row : mapping-like (e.g. one dataframe row)
        Must provide a "NUMBER_OF_ATOMS" entry convertible to ``int``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n`` whose entries all equal ``1/n``,
        where ``n`` is the number of atoms.
    """
    atom_count = int(row["NUMBER_OF_ATOMS"])
    # Start from all-ones and scale in place so every atom gets 1/n.
    uniform = np.ones(atom_count)
    uniform /= atom_count
    return uniform

In [74]:
df[WEIGHTS_FORCES_COLUMN] =  df.apply(generate_force_weights, axis=1)

Normalize the weights

In [75]:
normalize_energy_forces_weights(df);

Store dataframe with weights to pickle gzip file

In [19]:
df.to_pickle("df_weights.pkl.gz", protocol=4)

## 2.2 Put more weights on the elastic-matrix related structures

Identify the elastic-matrix calculations by name (i.e. the name contains the substring "elast")

In [13]:
# Plain-substring match on "elast" (no regex). na=False maps missing names to
# False, so the result is always a valid boolean mask for df[...] / df.loc[...].
elastic_mask = df["name"].str.contains("elast", regex=False, na=False)

In [14]:
df[elastic_mask].head()

Unnamed: 0,ase_atoms,name,energy,energy_corrected,forces,NUMBER_OF_ATOMS,pbc,energy_corrected_per_atom,tp_atoms,w_energy,w_forces
6,"(Atom('Li', [0.0, 0.0, 0.0], index=0), Atom('L...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/Li_b...,-3.825339,-3.512545,"[[0.0, -0.0, 0.0], [-0.0, 0.0, -0.0]]",2,True,-1.756272,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.00472,"[0.0003362474781439139, 0.0003362474781439139]"
9,"(Atom('Li', [-1.2512191635970822, 3.1353098274...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/Li2A...,-16.165183,-14.998349,"[[0.00031314, -0.00087023, -0.0], [-0.00031314...",6,True,-2.499725,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.001573,"[0.00011208249271463797, 0.0001120824927146379..."
15,"(Atom('Li', [-1.2512195060957696, 3.1459557238...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/Li2A...,-16.165225,-14.998391,"[[2.185e-05, 1.926e-05, 0.00136299], [-2.185e-...",6,True,-2.499732,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.001573,"[0.00011208249271463797, 0.0001120824927146379..."
16,"(Atom('Li', [4.564045464545241, 0.090695628928...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/LiAl...,-68.981918,-63.857682,"[[0.00029379, 0.00040004, 0.00185958], [-0.000...",24,True,-2.660737,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.000393,"[2.8020623178659493e-05, 2.8020623178659493e-0..."
19,"(Atom('Li', [0.0, 0.0, 0.0], index=0), Atom('L...",/home/users/lysogy36/tools/VASP/Al-Li/DFT/LiAl...,-12.057534,-11.203494,"[[-0.0, -0.0, -0.00272016], [0.0, 0.0, 0.00272...",4,True,-2.800874,"{'_ind_i': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.00236,"[0.00016812373907195696, 0.0001681237390719569..."


Increase the weights on the "elastic"-related structures by a factor of 10

In [15]:
df.loc[elastic_mask,WEIGHTS_ENERGY_COLUMN]*=10

df.loc[elastic_mask,WEIGHTS_FORCES_COLUMN]*=10

(Optional) Renormalize the weights so that they sum to 1. `pacemaker` will do this anyway.

In [16]:
normalize_energy_forces_weights(df);

Store dataframe with weights to pickle gzip file

In [None]:
df.to_pickle("df_weights_elastic_x10.pkl.gz", protocol=4)

## 2.3 Increase force weights on Li atom types

In [29]:
def increased_Li_force_weights(row, symbol="Li", factor=5):
    """Per-atom force weights with one chemical species up-weighted.

    Every atom starts with weight 1; atoms whose chemical symbol equals
    ``symbol`` have their weight multiplied by ``factor``. With the default
    arguments this gives Li atoms a weight of 5 and all others a weight of 1.

    Parameters
    ----------
    row : mapping-like (e.g. one dataframe row)
        Must provide an "ase_atoms" entry supporting ``len()`` and
        ``get_chemical_symbols()``.
    symbol : str, optional
        Chemical symbol of the species to up-weight (default ``"Li"``).
    factor : float, optional
        Multiplier applied to the selected atoms (default ``5``).

    Returns
    -------
    numpy.ndarray
        Float array with one weight per atom.

    Notes
    -----
    Extra keyword arguments can be forwarded through ``DataFrame.apply``,
    e.g. ``df.apply(increased_Li_force_weights, axis=1, factor=10)``.
    """
    atoms = row["ase_atoms"]
    # generate uniform weights, i.e. 1
    weights = np.ones(len(atoms))
    if len(atoms) == 0:
        # Nothing to re-weight; also avoids comparing an empty array to a string.
        return weights
    symbols = np.asarray(atoms.get_chemical_symbols())
    # scale the weights of the selected species only
    weights[symbols == symbol] *= factor
    return weights

In [17]:
row=df.loc[0]

In [31]:
increased_Li_force_weights(row)

array([5., 5., 1., 1., 5., 5., 1., 1., 5., 5., 1., 1., 5., 5., 1., 1., 5.,
       5., 1., 1., 5., 5., 1., 1., 5., 5., 1., 1., 5., 5., 1., 1.])

In [32]:
df[WEIGHTS_FORCES_COLUMN] =  df.apply(increased_Li_force_weights, axis=1)

In [34]:
normalize_energy_forces_weights(df);

In [None]:
df.to_pickle("df_weights_Li_x5.pkl.gz", protocol=4)

## 2.4 Usage

**input.yaml:**

```
...

data:
    filename: df_weights.pkl.gz

...
``` 

If you want to use the dataframe but ignore the provided weights:

**input.yaml:**

```
...

data:
    filename: df_weights.pkl.gz
    ignore_weights: True

...
``` 

# 3. Custom energy/forces weights only

If the dataset above is too large, you probably don't want to keep many copies of it that differ only in the two weight columns. Instead, you can save just the weight columns to a separate file and supply it via `ExternalWeightingPolicy`.

NOTE! Make sure that the original dataframe `df` has a **UNIQUE** index `df.index`. You can check this, for example, with:

In [76]:
# Fail early, with an explicit message, if any index label occurs more than once.
assert df.index.is_unique, "df.index contains duplicated labels"

## 3.1 Randomly select 10% of the data and increase weights for 'elast' structures

The original dataset has 1487 structures

In [87]:
df.shape

(1487, 11)

Take a random 10% sample and keep only the "name", WEIGHTS_ENERGY_COLUMN and WEIGHTS_FORCES_COLUMN columns

In [88]:
weights_only = df.sample(frac = 0.1, random_state=42)[["name",WEIGHTS_ENERGY_COLUMN, WEIGHTS_FORCES_COLUMN]]

In [89]:
weights_only.shape

(149, 3)

In [90]:
# Plain-substring match on "elast" (no regex). na=False maps missing names to
# False, so the mask is always strictly boolean and safe to use with .loc.
elast_mask = weights_only["name"].str.contains("elast", regex=False, na=False)

In [92]:
# Number of selected structures (each True counts as 1)
elast_mask.sum()

25

25 out of 149 structures belong to "elastic matrix" calculations. Increase the weights on them by a factor of 5

In [94]:
weights_only.loc[elast_mask, WEIGHTS_ENERGY_COLUMN]*=5
weights_only.loc[elast_mask, WEIGHTS_FORCES_COLUMN]*=5

In [95]:
weights_only.to_pickle("custom_weights_only.pkl.gz", protocol=4)

## 3.2 Usage

**input.yaml:**

```
...
# load data as usual
data:
   filename: /some/fit/data/Al-Li/data.pkl.gz

fit:
    # use ExternalWeightingPolicy weighting scheme
    weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}

...
``` 