In [1]:
import dostools
import importlib
import numpy as np
import pickle
import torch
import sys
import matplotlib.pyplot as plt
import copy
from tqdm import tqdm
import matplotlib
import time
torch.set_default_dtype(torch.float64) 


In [2]:
import dostools.datasets.data as data
import dostools.utils.utils as utils

n_structures = 1039
np.random.seed(0)
n_train = int(0.8 * n_structures)
train_index = np.arange(n_structures)
np.random.shuffle(train_index)
test_index = train_index[n_train:]
train_index = train_index[:n_train]

with torch.no_grad():
    structures = data.load_structures(":")
    n_structures = len(structures) #total number of structures
    for structure in structures:#implement periodicity
        structure.wrap(eps = 1e-12) 
    n_atoms = np.zeros(n_structures, dtype = int) #stores number of atoms in each structures
    for i in range(n_structures):
        n_atoms[i] = len(structures[i])

    #eigenergies, emin, emax = dostools.src.datasets.data.load_eigenenergies(unpack = True, n_structures = len(structures))
    xdos = torch.tensor(data.load_xdos())
    ldos = torch.tensor(data.load_ldos())
    ldos *= 2

    print ("ldos shape is {}".format(ldos.shape))
    mean_dos_per_atom = ldos[train_index].mean(axis = 0) #only calculated for train set to prevent data leakage
    print ("mean dos shape is {}".format(mean_dos_per_atom.shape))
    
    
    y_pw = ldos - mean_dos_per_atom
    y_lcdf = torch.cumsum(y_pw, dim = 1)
    _, pc_vectors = utils.build_pc(ldos[train_index], mean_dos_per_atom[None,:], n_pc = 10)
    y_pc = utils.build_coeffs(ldos - mean_dos_per_atom[None,:], pc_vectors)
    Silicon = data.load_features()
    kMM = data.load_kMM()

ldos shape is torch.Size([1039, 778])
mean dos shape is torch.Size([778])
Variance covered with 10 PCs is = 0.9871211778950163


In [3]:
#Testing linalg solver
Features = torch.rand(1000,1000)
Target = torch.sum(Features, dim = 1)

In [8]:
HYPER_PARAMETERS = {
    "cutoff": 4.0,#6.0,#4.0,
    "max_radial": 8,#12,#8,
    "max_angular": 6,#9,#6,
    "atomic_gaussian_width": 0.45,
    "center_atom_weight": 1.0,
    "radial_basis":{
        "Gto":{}
    },
    "cutoff_function":{
        "Step":{}, #maybe 
    },
    "radial_scaling":{
        "Willatt2018":{
        'exponent': 5,
        'rate' : 1,
        'scale' : 3.,
        },
    },
}


In [27]:
from dostools.datasets import dataset
importlib.reload(dataset)
Silicon = dataset.TensorFeatures(structures, HYPERS = HYPER_PARAMETERS)

  0%|          | 0/999 [00:00<?, ?it/s]

In [33]:
import rascaline
calculator = rascaline.SoapPowerSpectrum(**HYPER_PARAMETERS)
descriptors = calculator.compute(structures)
descriptors.keys_to_samples("species_center")
descriptors.keys_to_properties(["species_neighbor_1", "species_neighbor_2"])

TensorMap with 1 blocks
keys: ['species_center']
             14

In [84]:
from skcosmo.feature_selection import FPS

n_refs_list = [50]
features2 = []
for i in n_refs_list:
    n_refs = i
    n_atoms = descriptors.block(0).values.shape[0]
    n_structures = np.unique(descriptors.block(0).samples["structure"])
    feature = torch.zeros(len(n_structures), n_refs)
    atom_descriptors = torch.tensor(descriptors.block(0).values)
    atom_descriptors = torch.nn.functional.normalize(atom_descriptors, dim = 1)
    selector = FPS(n_to_select = n_refs,
               progress_bar = True,
               score_threshold = 1e-12,
               full = False,
               initialize = 0
              )
    selector.fit(atom_descriptors.T)
    references = selector.transform(atom_descriptors.T).T
    atomkernel_descriptors = torch.pow(atom_descriptors @ references.T, 2)
    for structure_i in n_structures:
        a_i = descriptors.block(0).samples["structure"] == structure_i
        feature[structure_i, :] = torch.sum(atomkernel_descriptors[a_i, :], axis = 0)/np.sum(a_i)
    
    features2.append(feature)
    

  0%|          | 0/49 [00:00<?, ?it/s]

In [43]:
features[0]

tensor([[1.0000, 0.6910, 0.8774,  ..., 0.8087, 0.8686, 0.6336],
        [0.9954, 0.6765, 0.8903,  ..., 0.8480, 0.8828, 0.6544],
        [0.9974, 0.6843, 0.8924,  ..., 0.8378, 0.8890, 0.6556],
        ...,
        [0.9579, 0.7939, 0.8952,  ..., 0.8066, 0.8550, 0.6117],
        [0.9567, 0.7964, 0.8965,  ..., 0.8048, 0.8549, 0.6107],
        [0.9563, 0.8012, 0.8983,  ..., 0.8070, 0.8558, 0.6106]])

In [86]:
main = features2[0]

Features = main[train_index]#Silicon.Features['structure_avekerneldescriptors'][train_index]
test_features = main[test_index]#Silicon.Features['structure_avekerneldescriptors'][test_index]
Target = ldos[train_index]
test_target = ldos[test_index]
m = Features.shape[1]
regularization = 1e-3
reg = regularization * torch.eye(m)
reg[-1, -1] = 0
A = torch.vstack([Features, reg])
b = torch.vstack([Target, torch.zeros(m,Target.shape[1])])
weights = torch.linalg.lstsq(A, b, rcond=1e-10).solution
pred = Features @ weights 
test_pred = test_features @ weights

loss_dos = loss.t_get_rmse(pred, Target, xdos, perc = True)
test_loss_dos = loss.t_get_rmse(test_pred, test_target, xdos, perc = True)
print ("Regularization: {}".format(regularization))
print ("The train error is {:.4} for n_refs = {}".format(loss_dos,m))
print ("The test error is {:.4} for n_refs = {}".format(test_loss_dos, m))

Regularization: 0.001
The train error is 14.64 for n_refs = 50
The test error is 14.4 for n_refs = 50


In [None]:
#Lets go for 200

reg: 0, 1e-10, 1e-5, 1e-3, 1e-1, 1
1000: 2.69, 67.15| 2.69, 67.15|  5.073, 17.79|  10.22, 12.92| 25.84, 25.4| 38.38, 38.05
500: 2.65, 76.07| 2.41, 92.05|5.24, 16.04 |10.64, 13.0 |27.4, 26.89 |39.95, 39.4
300: 3.56, 41.12| 3.56, 41.12| 5.801, 14.5| 11.06, 13.0| 28.61, 28.13| 41.16, 40.57
200: 5.284, 23.26| 5.284, 23.26| 6.324, 13.24 | 11.36, 12.98| 29.6, 29.14| 42.6, 42.1
100: 8.168 14.63| 8.168 14.63| 8.188 14.28| 12.37, 13.19| 31.35, 31.0| 43.8, 43.09
10: 34.33, 34.21 |34.33, 34.21 |34.33, 34.21 | 34.33, 34.21| 37.03, 36.8| 49.93, 49.63


In [31]:
8.5 - 43.01

-34.51

In [None]:
2.956 - 37.61

In [21]:
weights.shape

torch.Size([100, 778])

In [127]:
regularization = 100
n_col = Features.shape[1]
reg = regularization * torch.eye(n_col)
reg[n_col-1, n_col-1] = 0
reg_features = Features.T @ Features + reg


# matinv = torch.linalg.inv(Features.T @ Features)
# weight = matinv @ Features.T @ Target

weight2 = torch.linalg.lstsq(((Features.T @ Features)+reg),Features.T @ Target, driver = "gelsd", rcond = 1e-15).solution
preds1 = Features @ weight2
# preds2 = Features @ weight

print ("error1 : {:.4}".format(torch.mean(Target - preds1)**2))
# print ("error2 : {:.4}".format(torch.mean(Target - preds2)**2))

error1 : 1.261e-24


In [126]:
weight2

tensor([1.1406e-01, 9.8752e-02, 7.7868e-02,  ..., 1.1185e-01, 8.9080e-02,
        8.1009e+02])

In [113]:
weight

tensor([152238.9036, -81631.2050,  93461.0305,  ..., 135140.0196,
        -71803.1538, -79685.7966])

In [92]:
torch.mean(Target - preds2)**2

tensor(32.8394)

In [93]:
shifted_weights

tensor([731.6263])