In [None]:
%%capture install
try:
  import imlms
  print('Already installed')
except:
  %pip install git+https://github.com/Mads-PeterVC/imlms

In [None]:
print(install.stdout.splitlines()[-1])

# Descriptors

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from imlms.potentials.load_carbon_data import get_carbon_cluster_data


### Convenience Functions

I am going to put a few functions in this section for convenience as they will be
reused across the rest of the examples and exercises.

In [None]:
class NeuralNetworkModel(torch.nn.Module):
    """Small fully connected network mapping a descriptor vector to one number.

    The layer stack is ``Linear -> SiLU -> Linear -> SiLU -> Linear``: two
    hidden layers of width ``hidden_dim`` followed by a single output unit.
    """

    def __init__(self, vector_dim, hidden_dim=32):
        """Create the layers.

        Args:
            vector_dim: Length of the input descriptor vector.
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(vector_dim, hidden_dim),  # input  -> hidden
            torch.nn.SiLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),  # hidden -> hidden
            torch.nn.SiLU(),
            torch.nn.Linear(hidden_dim, 1),           # hidden -> scalar
        )

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        output = self.net(x)
        return output

In [None]:
def train_test_split(X, E, test_size=0.5, seed=None):
    """Randomly split paired samples into a training and a test subset.

    Args:
        X: Inputs; must support ``len`` and indexing with an integer index
            array.
        E: Targets aligned with ``X`` along the first axis.
        test_size: Fraction of samples put in the test subset, which receives
            ``int(len(X) * test_size)`` samples.
        seed: Optional seed for a reproducible split. When ``None`` the global
            NumPy random state is used, exactly as before this parameter
            existed.

    Returns:
        Tuple ``(X_train, E_train, X_test, E_test)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]`` or if ``X`` and
            ``E`` have different lengths.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f'test_size must be in [0, 1], got {test_size}')

    n = len(X)
    if len(E) != n:
        raise ValueError(f'X and E must have the same length, got {n} and {len(E)}')

    if seed is None:
        indices = np.random.permutation(n)
    else:
        # A dedicated generator keeps the split reproducible without touching
        # the global random state.
        indices = np.random.default_rng(seed).permutation(n)

    split = int(n * test_size)
    # The first `split` shuffled indices form the test set, the rest the training set.
    train_indices, test_indices = indices[split:], indices[:split]
    return X[train_indices], E[train_indices], X[test_indices], E[test_indices]

In [None]:
def training_loop(model, X_train, y_train, epochs=200, lr=0.01):
    """Fit ``model`` to ``(X_train, y_train)`` with full-batch Adam.

    Every epoch performs one optimizer step on the squared error summed over
    the whole training set.

    Args:
        model: ``torch.nn.Module`` that produces one value per training
            example when called on ``X_train``.
        X_train: Tensor of inputs with one example per row.
        y_train: Tensor of targets with as many elements as ``model(X_train)``.
        epochs: Number of optimizer steps.
        lr: Learning rate passed to Adam.

    Returns:
        Tensor of length ``epochs`` holding, for each epoch, the summed
        squared error divided by the number of training examples, measured
        before that epoch's parameter update.

    Raises:
        ValueError: If the training set is empty.
    """
    n_samples = len(X_train)
    if n_samples == 0:
        raise ValueError('training_loop requires at least one training example')

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # reduction='sum' gives the same total as adding up one squared error per
    # example, which is what the reported per-example average is based on.
    loss_fn = torch.nn.MSELoss(reduction='sum')

    loss_per_epoch = torch.zeros(epochs)

    for epoch in tqdm(range(epochs)):  # One optimizer step per epoch
        optimizer.zero_grad()          # Zero the gradients
        # Call the module (not .forward) so registered hooks run, and evaluate
        # the whole training set in one batched pass instead of a Python loop.
        E = model(X_train)
        # Give the targets the prediction's shape so the two are compared
        # element by element rather than broadcast against each other.
        loss = loss_fn(E, y_train.reshape(E.shape))
        loss.backward()                # Compute the gradient
        optimizer.step()               # Update the parameters
        loss_per_epoch[epoch] = loss.item() / n_samples  # Store the loss

    return loss_per_epoch

### Example: Cartesian Coordinates

In [None]:
X, E = get_carbon_cluster_data(n=6) # n is the number of atoms in the cluster

We can try to use the Cartesian coordinates as descriptors. To do so, we will 
take the $(N, 3)$ matrix of coordinates and flatten it into a vector.

In [None]:
# Random 50/50 split of the dataset (a fixed alternative would be to slice,
# e.g. X[0:25] / X[25:]).
X_train, E_train, X_test, E_test = train_test_split(X, E, test_size=0.5)

# Report the sizes that were actually produced instead of hard-coding them.
print(f'{X_train.shape[0]} examples of {tuple(X_train.shape[1:])}-matrices', X_train.shape)

# Flatten every coordinate matrix into a single vector per example.
X_train = X_train.reshape((X_train.shape[0], -1))
X_test = X_test.reshape((X_test.shape[0], -1))

print(f'{X_train.shape[0]} examples of vectors', X_train.shape)

Now we will build a neural network to handle this data

We can check that this does indeed work as a $\mathbb{R}^{18} \rightarrow \mathbb{R}$ function.

In [None]:
# Sanity check: an untrained model maps one 18-dimensional vector to a single
# number. The output gets its own name so that the dataset energies `E` loaded
# earlier are not overwritten (re-running the split cell would otherwise break).
model = NeuralNetworkModel(vector_dim=18)
E_single = model(X_train[0:1])
print('Output of the model for a single example', E_single)

We can try training the model

In [None]:
model = NeuralNetworkModel(18)
loss_history = training_loop(model, X_train, E_train, epochs=10000, lr=0.003)

In [None]:
fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(loss_history)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_yscale('log')

In [None]:
from imlms.potentials.plot_parity import plot_parity

E_pred_train = model(X_train).detach().numpy()
E_pred_test = model(X_test).detach().numpy()

fig, axes = plt.subplots(1, 2, figsize=(7, 3), layout="constrained")
plot_parity(axes[0], E_train, E_pred_train)
plot_parity(axes[1], E_test, E_pred_test)

Another metric that is interesting is the loss on the test set as a function of the size of the training set. 

If the model is capable of learning, the loss on the test set should decrease as more training data is provided.

In [None]:
sizes = [5, 15, 25]
losses = []

# Flatten the targets once so they line up element by element with the
# flattened predictions below.
E_test_np = E_test.detach().numpy().reshape(-1)

for train_size in sizes:
    model = NeuralNetworkModel(18)
    loss_history = training_loop(model, X_train[0:train_size], E_train[0:train_size], epochs=10000, lr=0.003)

    # The model returns one column per example; flatten it so the subtraction
    # cannot broadcast into a 2-D array against 1-D targets.
    E_pred_test = model(X_test).detach().numpy().reshape(-1)
    # Mean of the squared residuals. Squaring the *sum* of residuals would let
    # positive and negative errors cancel.
    test_loss = np.mean((E_test_np - E_pred_test) ** 2)
    losses.append(test_loss)

In [None]:
fig, ax = plt.subplots(figsize=(3, 3))

ax.plot(sizes, losses, 'o-')
ax.set_xlabel('Training set size')
ax.set_ylabel('Test loss')

### Exercise: Coulomb Matrix

In order to build accurate models we need a better descriptor for atomic systems. 
An early proposal for such a descriptor is the **Coulomb matrix** given by

$$
\begin{equation}
C_{ij} =
    \begin{cases}
        Z_i^2 & \text{if } i = j \\
        \frac{Z_iZ_j}{r_{ij}} & \text{if } i \neq j
    \end{cases}
\end{equation}
$$

Where $Z_i$ is the atomic number of atom $i$ and $r_{ij}$ is the distance between atoms $i$ and $j$.


Consider an atomic configuration with 4 atoms of the same species, then the Coulomb matrix is given by

$$
C = Z^2 \begin{bmatrix}
1 & r_{12}^{-1} & r_{13}^{-1} & r_{14}^{-1} \\
r_{21}^{-1} & 1 & r_{23}^{-1} & r_{24}^{-1} \\
r_{31}^{-1} & r_{32}^{-1} & 1 & r_{34}^{-1} \\
r_{41}^{-1} & r_{42}^{-1} & r_{43}^{-1} & 1 \\
\end{bmatrix}
$$

We want a vector so one possibility is to just flatten the matrix e.g. we make a 
vector $v_C$

$$
v_C = [C_{11}, C_{12}, ... , ... , C_{NN}] 
$$

In [None]:
def get_coulomb_matrix(X, Z=6):
    """Return the flattened Coulomb matrix of a single-species structure.

    Diagonal entries are ``Z**2``; the off-diagonal entry ``(i, j)`` is
    ``Z**2`` divided by the Euclidean distance between atoms ``i`` and ``j``.

    Args:
        X: Array-like of shape ``(n, d)`` with one row of coordinates per atom.
        Z: Atomic number shared by all atoms. Defaults to 6, the value that
            was previously hard-coded.

    Returns:
        1-D NumPy array of length ``n * n``: the matrix flattened row by row.
    """
    X = np.asarray(X, dtype=float)
    # Pairwise distances via broadcasting: (n, 1, d) - (1, n, d) -> (n, n, d).
    distances = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    # Self-distances are zero; setting them to 1 makes the division below put
    # exactly Z**2 on the diagonal and avoids dividing by zero there.
    np.fill_diagonal(distances, 1.0)
    return (Z**2 / distances).flatten()

In [None]:
X, E = get_carbon_cluster_data(n=8) # n is the number of atoms in the cluster
X_train, E_train, X_test, E_test = train_test_split(X, E, test_size=0.5)

# Compute the Coulomb matrix for the training and test sets
X_train = torch.tensor(np.array([get_coulomb_matrix(x) for x in X_train])).float()
X_test = torch.tensor(np.array([get_coulomb_matrix(x) for x in X_test])).float()

In order to make a model using this descriptor we need to know how many dimensions it has. 
Before, our Cartesian descriptor had 18 dimensions, so we instantiated our model like `NeuralNetworkModel(vector_dim=18)`. 

In [None]:
# Size the input layer from the descriptor itself (8 atoms give 8 * 8 = 64
# entries) so the cell stays correct if the cluster size is changed.
model = NeuralNetworkModel(vector_dim=X_train.shape[1])
loss_history = training_loop(model, X_train, E_train, epochs=10000, lr=0.001)

In [None]:
fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(loss_history)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_yscale('log')

In [None]:
E_pred_train = model(X_train).detach().numpy()
E_pred_test = model(X_test).detach().numpy()

fig, axes = plt.subplots(1, 2, figsize=(7, 3), layout="constrained")
plot_parity(axes[0], E_train, E_pred_train)
plot_parity(axes[1], E_test, E_pred_test)

There are clearly some outliers, but for a lot of the test set the Coulomb matrix description does get it pretty correct. 

### Exercise: What is invariance anyway? 

In [None]:
from imlms.potentials.load_carbon_data import get_invariances_examples
from agox.utils.plot import plot_atoms, plot_cell

In [None]:
chain_atoms, ring_atoms = get_invariances_examples()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(6, 3))

labels = ['Chain', 'Ring']
for ax, atoms, label in zip(axes, [chain_atoms, ring_atoms], labels):
    plot_cell(ax, atoms.cell, collection_kwargs={'alpha': 0})
    plot_atoms(ax, atoms)
    ax.set_title(label)

We want some way of comparing how similar these two structures are. There are many 
possibilities, but one fairly intuitive way is

$$
d(\vec{v_i}, \vec{v_j}) = \frac{\vec{v_i}\cdot\vec{v_j}}{|\vec{v_i}||\vec{v_j}|}
$$
This metric measures the similarity between two vectors and always returns a value 
between -1 and 1 (between 0 and 1 when no component is negative) - with a value of one meaning that the vectors point in the same direction, 
for example a vector compared with itself: $d(v_i, v_i) = 1$.

Implement a function `dot_product_similarity` to calculate this. 

Hint: You may find `np.dot` and `np.linalg.norm` useful.

In [None]:
def dot_product_similarity(v1, v2):
    """Return the dot product of ``v1`` and ``v2`` divided by the product of their norms."""
    overlap = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return overlap / norm_product

In [None]:
X1 = chain_atoms.get_positions().flatten() # 18-dimensional vector
X2 = ring_atoms.get_positions().flatten() # 18-dimensional vector

d11 = dot_product_similarity(X1, X1)
d22 = dot_product_similarity(X2, X2)
d12 = dot_product_similarity(X1, X2)
print(f'Similarity chain vs chain: {d11:.3f}')
# Report the ring's own self-similarity (d22); this line used to print d11.
print(f'Similarity ring vs ring: {d22:.3f}')
print(f'Similarity ring vs chain: {d12:.3f}')

So the vectors of Cartesian coordinates are different, as expected, so an algorithm 
does get a signal that these structures are not the same. 

However, it seems like a small difference - so let's try to investigate further.
To do so we will consider transformations under which the energy of the system is invariant. 
Specifically

- **Translation**: Rigid movement of the entire structure. 

- **Rotation**: A rotation of the structure.

- **Permutation**: Changing the order of the rows of the Cartesian coordinates.

The cells below create a plot of transformed versions of the chain and ring structures.

In [None]:
from imlms.potentials.load_carbon_data import transforms

atoms_dict = {}
for base_atoms, base_label in zip([chain_atoms, ring_atoms], ['chain', 'ring']):
    for transform_label, transform in transforms.items():
        atoms_dict[f"{base_label}-{transform_label}"] = transform(base_atoms)

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(8, 4), layout="constrained")

for ax, (label, atoms) in zip(axes.flatten(), atoms_dict.items()):

    plot_atoms(ax, atoms)
    plot_cell(ax, atoms.cell, collection_kwargs={'alpha': 0.1})

    for atom in atoms:
        ax.text(atom.position[0], atom.position[1], atom.index, va='center', ha='center')

    ax.set_title(label)

We have 8 configurations here, so what we will do is make an 8x8 matrix that contains 
the similarity measure between each combination of these eight.

In [None]:
# Flatten the Cartesian coordinates of every configuration once, then fill the
# matrix of pairwise similarities.
cartesian_vectors = [atoms.get_positions().flatten() for atoms in atoms_dict.values()]

D = np.zeros((len(atoms_dict), len(atoms_dict)))

for i, v1 in enumerate(cartesian_vectors):
    for j, v2 in enumerate(cartesian_vectors):
        D[i, j] = dot_product_similarity(v1, v2)

We can then visualize this matrix 

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

cax = ax.matshow(D, vmin=D.min()*0.90, vmax=1)

ax.set_xticks(range(len(atoms_dict)))
ax.set_yticks(range(len(atoms_dict)))
ax.set_xticklabels(atoms_dict.keys(), rotation=90)
ax.set_yticklabels(atoms_dict.keys())

for i in range(len(atoms_dict)):
    for j in range(len(atoms_dict)):
        ax.text(j, i, f'{D[i, j]:.2f}', ha='center', va='center', color='black', fontsize=8)

# Red box around upper 4x4
rect = plt.Rectangle((-0.5, -0.5), 4, 4, edgecolor='red', facecolor='none', linewidth=2, clip_on=False, zorder=2)
ax.add_patch(rect)

# Blue box around lower 4x4
rect = plt.Rectangle((3.5, 3.5), 4, 4, edgecolor='blue', facecolor='none', linewidth=2, clip_on=False, zorder=2)
ax.add_patch(rect)

This might be a little complicated to decipher, however here are some key points: 

- The diagonal is the self-similarity and is 1.0 - as we expect everything is exactly similar to itself.

- The (4x4)-matrix on the upper-left, marked with red, is the chain compared to itself with different transforms applied. Ideally we would want all these to be 1.0 such that our machine learning algorithm sees these as identical. 

- The same applies to the (4x4)-matrix on the lower-right, marked with blue, which is for the ring structure and its transformed copies. 

- The two off-diagonal (4x4)-matrices are then for the ring vs chain - here we see another manifestation of the problem with using the Cartesian coordinates - the ring and the chain appear about as similar to each other (similarities around 0.95) as a rotated copy of the chain is to the original chain (similarity of 0.94).

Make the similarity matrix but use the Coulomb matrix rather than the Cartesian coordinates.

In [None]:
N_configurations = len(atoms_dict)
D = np.zeros((N_configurations, N_configurations))

# Compute the Coulomb-matrix descriptor of every configuration once. The
# previous placeholder values (v1 = v2 = 1) made every similarity equal to 1.
coulomb_vectors = [get_coulomb_matrix(atoms.get_positions()) for atoms in atoms_dict.values()]

for i, v1 in enumerate(coulomb_vectors):
    for j, v2 in enumerate(coulomb_vectors):
        similarity = dot_product_similarity(v1, v2)
        D[i, j] = similarity

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

cax = ax.matshow(D, vmin=D.min()*0.90, vmax=1)

ax.set_xticks(range(len(atoms_dict)))
ax.set_yticks(range(len(atoms_dict)))

ax.set_xticklabels(atoms_dict.keys(), rotation=90)
ax.set_yticklabels(atoms_dict.keys())

for i in range(len(atoms_dict)):
    for j in range(len(atoms_dict)):
        ax.text(j, i, f'{D[i, j]:.2f}', ha='center', va='center', color='black', fontsize=8)

Notice that we now have two (3x3)-blocks that are exactly 1.0! 

This is because the Coulomb matrix is **invariant** to rotation and translation. 
That is, it doesn't change if the molecule is translated or rotated. 

However, we can see that it is not invariant to a permutation of the atoms! 

There are at least two ways of making a permutation invariant version of the Coulomb matrix

1. Before flattening to a vector sort the rows and the columns by their norm. E.g. such that the 
first row of the matrix is the row with the smallest norm, the second the second smallest and so on - and 
equivalently for the columns. 

2. Rather than using the matrix itself, one can use its eigenvalues sorted from smallest to largest (or vice versa). 
This is, somewhat, analogous to comparing two molecules by their energy levels that appear as the eigenvalues of a 
Hamiltonian. 

The function `sort_matrix` performs the sorting required for the first option. 
Your task is to implement the `get_sorted_coulomb_matrix`-function in the next cell
after that and make the plot of the similarity matrix with that.

In [None]:
def sort_matrix(matrix):
    """Reorder the rows and columns of a 2-D array by ascending Euclidean norm.

    Rows are ordered by the norm of each row and columns by the norm of each
    column; the two orderings are determined independently.
    """
    by_row_norm = np.argsort(np.linalg.norm(matrix, axis=1))
    by_col_norm = np.argsort(np.linalg.norm(matrix, axis=0))

    # np.ix_ selects the reordered rows and columns in a single indexing step.
    return matrix[np.ix_(by_row_norm, by_col_norm)]

In [None]:
def get_sorted_coulomb_matrix(X):
    """Return the flattened Coulomb matrix with rows and columns sorted by norm.

    Args:
        X: Coordinates with one row per atom, as accepted by
            ``get_coulomb_matrix``.

    Returns:
        1-D array of length ``n * n`` where ``n = len(X)``.
    """
    n = len(X)
    # get_coulomb_matrix returns a flat vector; restore the square matrix so
    # that its rows and columns can be reordered before flattening again.
    coulomb = get_coulomb_matrix(X).reshape(n, n)
    return sort_matrix(coulomb).flatten()

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

cax = ax.matshow(D, vmin=D.min()*0.90, vmax=1)

ax.set_xticks(range(len(atoms_dict)))
ax.set_yticks(range(len(atoms_dict)))

ax.set_xticklabels(atoms_dict.keys(), rotation=90)
ax.set_yticklabels(atoms_dict.keys())

for i in range(len(atoms_dict)):
    for j in range(len(atoms_dict)):
        ax.text(j, i, f'{D[i, j]:.2f}', ha='center', va='center', color='black', fontsize=8)

You should now have that this is four blocks: 

1. Upper-left: All versions of the chain are completely similar. 

2. Lower-right: All versions of the ring are completely similar. 

3. Lower-left / Upper-right: All comparisons between versions of rings and chains are equally similar. 

### Exercise: Use the sorted Coulomb matrix descriptor

Not giving you any code this time, you can find anything you need in the previous exercises.

Do the following; 

1. Get the dataset and split into train and test.
2. Convert the Cartesian coordinates to sorted Coulomb matrix vectors
3. Train the `NeuralNetworkModel`
4. Analyze the training and the performance of the model.

### Exercise: Bond distance histogram descriptor

In [None]:
from scipy.spatial.distance import pdist

def bond_histogram_descriptor(X, n_bins=20, r_max=10.0):
    """Return a histogram of all pairwise distances in a structure.

    Args:
        X: Array of shape ``(n, d)`` with one row of coordinates per atom.
        n_bins: Number of equally wide bins. The default of 20 matches the 20
            bars drawn by the plotting cell that follows.
        r_max: Upper edge of the last bin, in the units of ``X``. Distances
            above it are not counted. NOTE(review): 10.0 is a chosen default,
            confirm that it covers the structures of interest.

    Returns:
        1-D float array of length ``n_bins`` with the number of atom pairs
        whose distance falls in each bin.
    """
    distances = pdist(X)  # Condensed vector holding each pair's distance once
    counts, _ = np.histogram(distances, bins=n_bins, range=(0.0, r_max))
    return counts.astype(float)


In [None]:
chain_atoms, ring_atoms = get_invariances_examples()

chain_histogram = bond_histogram_descriptor(chain_atoms.get_positions())
ring_histogram = bond_histogram_descriptor(ring_atoms.get_positions())


fig, ax = plt.subplots(figsize=(3, 3))

ax.bar(np.arange(20), chain_histogram, alpha=0.5, label='Chain')
ax.bar(np.arange(20), ring_histogram, alpha=0.5, label='Ring')

ax.set_xlabel('Bond length')
ax.set_ylabel('Frequency')
ax.legend()

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

cax = ax.matshow(D, vmin=D.min()*0.90, vmax=1)

ax.set_xticks(range(len(atoms_dict)))
ax.set_yticks(range(len(atoms_dict)))

ax.set_xticklabels(atoms_dict.keys(), rotation=90)
ax.set_yticklabels(atoms_dict.keys())

for i in range(len(atoms_dict)):
    for j in range(len(atoms_dict)):
        ax.text(j, i, f'{D[i, j]:.2f}', ha='center', va='center', color='black', fontsize=8)