In [None]:
import numpy as np
import torch
import time
import os

from sklearn import (linear_model, model_selection, preprocessing,
                     pipeline)
from scipy.spatial.distance import pdist

from kymatio.torch import HarmonicScattering3D

from kymatio.scattering3d.backend.torch_backend \
    import TorchBackend3D

from kymatio.scattering3d.utils \
    import generate_weighted_sum_of_gaussians

from kymatio.datasets import fetch_qm7
from kymatio.caching import get_cache_dir

In [None]:
qm7_url = "https://qmml.org/Datasets/gdb7-12.zip"


def fetch_qm7(align=True, cache=True):
    """Fetch the GDB7-12 (QM7) dataset, optionally aligned and cached.

    Parameters
    ----------
    align : bool, optional
        When True, the positions are passed through
        ``_pca_align_positions`` (in place) before being returned.
    cache : bool, optional
        When True, ``qm7_aligned.npz`` / ``qm7.npz`` in the ``qm7`` cache
        directory are reused if present and (re)written otherwise.

    Returns
    -------
    dict
        Mapping of array names to arrays. This function itself reads the
        ``'positions'`` and ``'charges'`` entries; any other keys come from
        ``read_xyz`` or the cached ``.npz`` file.

    Notes
    -----
    NOTE(review): ``_pca_align_positions``, ``_download`` and ``read_xyz``
    are neither defined nor imported in the visible part of this notebook.
    They must be in scope for every path except a cache hit on the aligned
    file -- confirm where they come from.
    """
    cache_path = get_cache_dir("qm7")
    aligned_filename = os.path.join(cache_path, "qm7_aligned.npz")
    unaligned_filename = os.path.join(cache_path, "qm7.npz")

    if cache:
        if align and os.path.exists(aligned_filename):
            with np.load(aligned_filename) as f:
                return dict(**f)

        # Load the unaligned cache if it exists, aligning if required.
        if os.path.exists(unaligned_filename):
            # Materialise the arrays first: an NpzFile re-reads a member on
            # every key access, so aligning ``f['positions']`` in place would
            # modify a temporary and the unaligned data would be saved and
            # returned instead.
            with np.load(unaligned_filename) as f:
                qm7 = dict(**f)
            if align:
                _pca_align_positions(qm7['positions'], qm7['charges'],
                                     inplace=True)
                np.savez(aligned_filename, **qm7)
            return qm7

    qm7_file = os.path.join(cache_path, "gdb7-12/dsgdb7ae.xyz")
    if not os.path.exists(qm7_file):
        qm7_zipfile = os.path.join(cache_path, "gdb7-12.zip")
        if not os.path.exists(qm7_zipfile):
            _download(qm7_url, qm7_zipfile)
        # Extract whenever the .xyz file is missing, including the case where
        # the archive was already downloaded but never unpacked.
        import zipfile
        with zipfile.ZipFile(qm7_zipfile, "r") as zipref:
            zipref.extractall(cache_path)

    qm7 = read_xyz(qm7_file)
    if cache:
        np.savez(unaligned_filename, **qm7)

    if align:
        _pca_align_positions(qm7['positions'], qm7['charges'], inplace=True)
        if cache:
            np.savez(aligned_filename, **qm7)

    return qm7

In [None]:
# Restrict the full QM7 arrays to the molecules listed in the 1k-subset file.

qm7 = fetch_qm7(align=True, cache=True)
pos_full = qm7['positions']
full_charges_complete = qm7['charges']

cache_path = get_cache_dir("qm7")
subset_path = os.path.join(cache_path, 'gdb7-12', 'dsgdb7ae_subset1k.txt')
with open(subset_path, 'r') as file:
    # Only the first line is used: a comma-separated list of molecule indices.
    first_line = file.readline().strip()

string_indexes = first_line.split(',')

# Convert to ints and drop repeated indices while keeping first-seen order.
# The list starts empty: seeding it with len(string_indexes) would add the
# *count* of entries as a bogus molecule index. Empty tokens (for example
# from a trailing comma) are skipped.
indexes = list(dict.fromkeys(
    int(token.strip()) for token in string_indexes if token.strip()))

length = len(indexes)

# Select positions and charges with the SAME molecule index so that row j of
# ``pos`` and row j of ``full_charges`` describe the same molecule. float64
# matches what np.empty-based preallocation would have produced.
pos = np.asarray(pos_full, dtype=np.float64)[indexes]
full_charges = np.asarray(full_charges_complete, dtype=np.float64)[indexes]

n_molecules = pos.shape[0]
print(n_molecules)

In [None]:
# Derive a per-atom "valence" charge from the full charge:
#   charge <= 2        -> kept as is
#   2  < charge <= 10  -> charge - 2
#   10 < charge <= 18  -> charge - 10
# Entries with a charge above 18 contribute 0.
valence_charges = full_charges * (full_charges <= 2)

for shell_start, shell_end in ((2, 10), (10, 18)):
    mask = np.logical_and(full_charges > shell_start,
                          full_charges <= shell_end)
    valence_charges += (full_charges - shell_start) * mask


In [None]:

overlapping_precision = 1e-1
sigma = 2.0
min_dist = np.inf

# Smallest pairwise atom distance over all molecules. A molecule whose
# closest pair is at distance exactly 0 is ignored, because a zero min_dist
# would turn the rescaling below into a division by zero.
for i in range(n_molecules):
    # Atoms are the entries with a non-zero charge; they are taken to occupy
    # the first n_atoms rows of pos[i].
    n_atoms = np.count_nonzero(full_charges[i])
    closest = pdist(pos[i, :n_atoms, :]).min()
    if closest != 0 and closest < min_dist:
        min_dist = closest

# Rescale positions so that the smallest distance found becomes delta.
delta = sigma * np.sqrt(-8 * np.log(overlapping_precision))
pos = pos * delta / min_dist


In [None]:
M, N, O = 192, 128, 96

# Integer coordinate grid of shape (3, M, N, O), zero-centred along each
# spatial axis and then passed through ifftshift.
# NOTE(review): ifftshift is called without `axes`, so it also shifts the
# leading component axis of the mgrid output -- confirm this is intended.
axis_slices = tuple(slice(-n // 2, -n // 2 + n) for n in (M, N, O))
grid = np.fft.ifftshift(np.mgrid[axis_slices])

# Scattering hyper-parameters.
J = 2
L = 3
integral_powers = [0.5, 1.0, 2.0, 3.0]

scattering = HarmonicScattering3D(
    J=J,
    shape=(M, N, O),
    L=L,
    sigma_0=sigma,
    integral_powers=integral_powers,
)

In [None]:
# Pick the compute device and move the scattering object onto it.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
scattering.to(device)

# Molecules are processed in fixed-size batches; the last one may be short.
batch_size = 8
n_batches = int(np.ceil(n_molecules / batch_size))

# Accumulators for the computed coefficients.
order_0 = []
orders_1_and_2 = []

device_label = "GPU" if use_cuda else "CPU"
print('Computing solid harmonic scattering coefficients of '
      '{} molecules from the QM7 database on {}'.format(
        n_molecules, device_label))
print('sigma: {}, L: {}, J: {}, integral powers: {}'.format(
        sigma, L, J, integral_powers))


In [None]:
# Smoke test: run only the first two batches so that shape or stacking errors
# surface before the long run over every batch.
#
# The accumulators are rebuilt from scratch here, so re-running this cell
# does not append the same rows twice.
this_time = None
last_time = None
head_order_0, head_orders_1_and_2 = [], []

# min(...) avoids processing an empty batch when there are fewer than two.
for i in range(min(2, n_batches)):
    this_time = time.time()
    if last_time is not None:
        dt = this_time - last_time
        remaining = int((n_batches - i - 1) * dt)
        print("Iteration {} ETA: [{:02}:{:02}:{:02}]".format(
                    i + 1, remaining // 3600,
                    (remaining // 60) % 60,
                    remaining % 60))
    else:
        print("Iteration {} ETA: {}".format(i + 1, '-'))
    last_time = this_time

    t_0 = time.time()

    # Extract the current batch.
    start = i * batch_size
    end = min(start + batch_size, n_molecules)

    pos_batch = pos[start:end]
    full_batch = full_charges[start:end]
    val_batch = valence_charges[start:end]

    # Density maps for the full and the valence charges, moved to the
    # compute device as float32.
    full_density_batch = torch.from_numpy(
        generate_weighted_sum_of_gaussians(grid, pos_batch, full_batch, sigma)
    ).to(device).float()
    val_density_batch = torch.from_numpy(
        generate_weighted_sum_of_gaussians(grid, pos_batch, val_batch, sigma)
    ).to(device).float()

    # The third channel is the difference between the two densities.
    core_density_batch = full_density_batch - val_density_batch

    densities = (full_density_batch, val_density_batch, core_density_batch)

    # Stack the full / valence / core results along a new last axis and move
    # them to the CPU so the accumulators always live on one device.
    batch_order_0 = torch.stack(
        [TorchBackend3D.compute_integrals(density, integral_powers)
         for density in densities], dim=-1).detach().cpu()
    batch_orders_1_and_2 = torch.stack(
        [scattering(density) for density in densities],
        dim=-1).detach().cpu()

    # Concatenate real tensors only. Wrapping an existing tensor in
    # torch.Tensor(...) goes through the legacy constructor, which is not
    # meant for tensors that live on the GPU.
    head_order_0.append(batch_order_0)
    head_orders_1_and_2.append(batch_orders_1_and_2)
    order_0 = torch.cat(head_order_0, 0)
    orders_1_and_2 = torch.cat(head_orders_1_and_2, 0)

    t_f = time.time()

    print("--- %s seconds ---" % (t_f - t_0))
    print("order 0 size is", order_0.size(), "orders 1 and 2 size is", orders_1_and_2.size())

In [None]:
# Process the remaining batches; batches 0 and 1 are expected to be in the
# accumulators already (they are computed by the two-batch trial run).
first_batch = 2
n_done = min(first_batch * batch_size, n_molecules)

if (not torch.is_tensor(order_0) or not torch.is_tensor(orders_1_and_2)
        or order_0.shape[0] < n_done or orders_1_and_2.shape[0] < n_done):
    raise RuntimeError(
        "order_0 / orders_1_and_2 do not hold the first {} molecules; "
        "run the first-two-batch cell before this one".format(n_done))

# Keep exactly the rows of the first batches, on the CPU. This makes the
# cell safe to re-run: rows appended by an earlier execution are discarded
# instead of being duplicated.
order_0 = order_0[:n_done].detach().cpu()
orders_1_and_2 = orders_1_and_2[:n_done].detach().cpu()

# Start the ETA clock fresh so time spent between cells is not counted.
last_time = None

for i in range(first_batch, n_batches):
    this_time = time.time()
    if last_time is not None:
        dt = this_time - last_time
        remaining = int((n_batches - i - 1) * dt)
        print("Iteration {} ETA: [{:02}:{:02}:{:02}]".format(
                    i + 1, remaining // 3600,
                    (remaining // 60) % 60,
                    remaining % 60))
    else:
        print("Iteration {} ETA: {}".format(i + 1, '-'))
    last_time = this_time

    t_0 = time.time()

    # Extract the current batch.
    start = i * batch_size
    end = min(start + batch_size, n_molecules)

    pos_batch = pos[start:end]
    full_batch = full_charges[start:end]
    val_batch = valence_charges[start:end]

    # Density maps for the full and the valence charges, moved to the
    # compute device as float32.
    full_density_batch = torch.from_numpy(
        generate_weighted_sum_of_gaussians(grid, pos_batch, full_batch, sigma)
    ).to(device).float()
    val_density_batch = torch.from_numpy(
        generate_weighted_sum_of_gaussians(grid, pos_batch, val_batch, sigma)
    ).to(device).float()

    # The third channel is the difference between the two densities.
    core_density_batch = full_density_batch - val_density_batch

    densities = (full_density_batch, val_density_batch, core_density_batch)

    # Stack the full / valence / core results along a new last axis and move
    # them to the CPU so they can be concatenated with the accumulators.
    batch_order_0 = torch.stack(
        [TorchBackend3D.compute_integrals(density, integral_powers)
         for density in densities], dim=-1).detach().cpu()
    batch_orders_1_and_2 = torch.stack(
        [scattering(density) for density in densities],
        dim=-1).detach().cpu()

    # Concatenate real tensors only. Wrapping an existing tensor in
    # torch.Tensor(...) goes through the legacy constructor, which is not
    # meant for tensors that live on the GPU.
    order_0 = torch.cat((order_0, batch_order_0), 0)
    orders_1_and_2 = torch.cat((orders_1_and_2, batch_orders_1_and_2), 0)

    t_f = time.time()

    print("--- %s seconds ---" % (t_f - t_0))
    print("order 0 size is", order_0.size(), "orders 1 and 2 size is", orders_1_and_2.size())

In [None]:
# Persist both coefficient sets with torch.save. The files are written in
# PyTorch's serialized format (not CSV text, despite the extension) and are
# meant to be read back with torch.load.
for coefficients, path in (
        (order_0, "order_coefficients.csv"),
        (orders_1_and_2, "orders_1_and_2_coefficients.csv")):
    torch.save(coefficients, path)

In [None]:
# Flatten the coefficients to one row per molecule, cache them as .npy files
# and build the feature matrix used by the regression.

# Bring the accumulators to host NumPy arrays first: np.save, np.concatenate
# and the later NaN masking need real ndarrays, and a CUDA tensor cannot be
# converted implicitly. The checks make the cell safe to re-run.
if torch.is_tensor(order_0):
    order_0 = order_0.detach().cpu().numpy()
if torch.is_tensor(orders_1_and_2):
    orders_1_and_2 = orders_1_and_2.detach().cpu().numpy()
order_0 = np.asarray(order_0)
orders_1_and_2 = np.asarray(orders_1_and_2)

# reshape((n_molecules, -1)) would silently mix molecules if rows were
# missing or duplicated, so verify the row count explicitly.
for label, coefficients in (("order_0", order_0),
                            ("orders_1_and_2", orders_1_and_2)):
    if coefficients.shape[0] != n_molecules:
        raise ValueError(
            "{} has {} rows but n_molecules is {}; recompute the "
            "scattering coefficients".format(
                label, coefficients.shape[0], n_molecules))

order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))

basename = 'qm7_L_{}_J_{}_sigma_{}_MNO_{}_powers_{}.npy'.format(
        L, J, sigma, (M, N, O), integral_powers)

cache_dir = get_cache_dir("qm7/experiments")

filename = os.path.join(cache_dir, 'order_0_' + basename)
np.save(filename, order_0)

# The trailing underscore keeps this prefix consistent with 'order_0_'.
filename = os.path.join(cache_dir, 'orders_1_and_2_' + basename)
np.save(filename, orders_1_and_2)

scattering_coef = np.concatenate([order_0, orders_1_and_2], axis=1)

In [None]:
# Written with torch.save, so the file holds PyTorch's serialized format
# rather than plain text; read it back with torch.load.
torch.save(obj=scattering_coef, f="scattering_coefficients.txt")

In [None]:

qm7 = fetch_qm7()
target_full = qm7['energies']

# Energies of the selected molecules, in the same order as `indexes`.
target = np.asarray(target_full, dtype=np.float64)[indexes]
if target.shape[0] != n_molecules:
    raise ValueError(
        "target has {} entries but n_molecules is {}".format(
            target.shape[0], n_molecules))

n_folds = 5

# Seeded generator so the folds (and therefore the reported errors) are the
# same on every run.
cv_seed = 0
rng = np.random.RandomState(cv_seed)

# array_split tolerates a molecule count that is not a multiple of n_folds;
# when it is a multiple, every fold has the same size.
P = np.array_split(rng.permutation(n_molecules), n_folds)

# Each fold is a (train_indices, test_indices) pair.
cross_val_folds = []

for i_fold in range(n_folds):
    train_indices = np.concatenate(
        [P[k] for k in range(n_folds) if k != i_fold], axis=0)
    cross_val_folds.append((train_indices, P[i_fold]))

# Replace NaN features with 0 so the regression can run on every molecule.
scattering_coef[np.isnan(scattering_coef)] = 0


In [None]:
# Sweep the ridge penalty over 1e-1 ... 1e-9 and report the cross-validated
# error for each value.
alphas = 10.0 ** (-np.arange(1, 10))
for alpha in alphas:
    # Standardise the features, then fit a ridge regression.
    scaler = preprocessing.StandardScaler()
    ridge = linear_model.Ridge(alpha=alpha)
    regressor = pipeline.make_pipeline(scaler, ridge)

    # Out-of-fold predictions using the precomputed (train, test) splits.
    target_prediction = model_selection.cross_val_predict(regressor,
            X=scattering_coef, y=target, cv=cross_val_folds)

    # Mean absolute error and root-mean-square error against the targets.
    # Expected MAE to be < 3.5 kcal/mol.
    MAE = np.mean(np.abs(target_prediction - target))
    RMSE = np.sqrt(np.mean((target_prediction - target) ** 2))

    print('Ridge regression, alpha: {}, MAE: {}, RMSE: {}'.format(
        alpha, MAE, RMSE))
