In [1]:
import nshutils as nu

# One-time nshutils setup, run before anything is displayed.
# NOTE(review): presumably switches on nshutils' pretty/rich rendering for
# later output — confirm against the nshutils documentation.
nu.pretty()

In [14]:
import datasets

# Load the "train" split of the `nimashoghi/mptrj` dataset.
dataset = datasets.load_dataset("nimashoghi/mptrj", split="train")
# Ask for the "numpy" output format; the return value is not used, so this
# relies on `set_format` reconfiguring `dataset` in place.
dataset.set_format("numpy")
# Trailing expression: show the dataset's repr as the cell output.
dataset

In [15]:
# Peek at the first entry of the "composition" column.
dataset["composition"][0]

In [16]:
import torch

# Sanity check on the first row: histogram its "numbers" values into at least
# 120 bins and display that next to the stored "composition" entry so the two
# can be compared by eye.
# NOTE(review): presumably "numbers" holds atomic numbers and "composition" is
# the matching per-element count vector — confirm against the dataset card.
nu.display(torch.bincount(torch.from_numpy(dataset["numbers"][0]), minlength=120))
nu.display(dataset["composition"][0])

In [17]:
from collections.abc import Callable

import numpy as np
from sklearn.linear_model import LinearRegression, Ridge


def get_species_ref_energy_by_linear_comb(
    compositions: np.ndarray,
    energies: np.ndarray,
    model_cls: Callable[[], Ridge | LinearRegression],
):
    """
    Total energy as y, composition as c_i,
    solve linear regression of y = c_i*X
    sklearn LinearRegression as solver

    x should be one-hot-indexed
    give num_chem_species if possible
    """

    c = compositions
    y = energies
    num_chem_species = c.shape[1]

    # tweak to fine tune training from many-element to small element
    zero_indices = np.all(c == 0, axis=0)
    c_reduced = c[:, ~zero_indices]
    full_coeff = np.zeros(num_chem_species)
    coef_reduced = model_cls().fit(c_reduced, y).coef_
    full_coeff[~zero_indices] = coef_reduced

    return full_coeff


# Pull each column out of the dataset once and reuse it for both fits instead
# of indexing `dataset` again for every call.
mptrj_compositions = dataset["composition"]
mptrj_energies = dataset["energy"]

# Plain least-squares fit with no intercept term.
ref_linear = get_species_ref_energy_by_linear_comb(
    mptrj_compositions,
    mptrj_energies,
    lambda: LinearRegression(fit_intercept=False),
)
nu.display(ref_linear)

# Same fit with Ridge regularisation (alpha=0.1), again without an intercept.
ref_ridge = get_species_ref_energy_by_linear_comb(
    mptrj_compositions,
    mptrj_energies,
    lambda: Ridge(fit_intercept=False, alpha=0.1),
)
nu.display(ref_ridge)

In [18]:
# Read the saved reference vector from disk once and reuse it below (the
# original expression loaded the same file twice).
ref_saved = np.load("mptrj_linref.npy")

# Stack everything into one table, one row per quantity, for side-by-side
# inspection.
nu.display(
    np.vstack(
        [
            ref_linear,  # row 0: LinearRegression fit
            ref_ridge,  # row 1: Ridge fit
            ref_saved,  # row 2: contents of mptrj_linref.npy
            np.abs(ref_linear - ref_ridge),  # row 3: |linear - ridge|
            np.abs(ref_linear - ref_saved),  # row 4: |linear - saved|
        ]
    )
)

In [6]:
# Display the array stored in "mptrj_linref.npy" on its own.
# NOTE(review): this cell's execution count (6) is lower than that of the
# cells above it, so its saved output comes from an earlier point in the
# session — re-run top to bottom on a fresh kernel before trusting it.
np.load("mptrj_linref.npy")