In [1]:
from sklearn.model_selection import train_test_split
from datasetLoader import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib qt

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
pore_widths = np.load("data/initial kernels/Size_Kernel_Silica_Adsorption.npy")
pressures = np.load("data/initial kernels/Pressure_Silica.npy")
with open("data/initial kernels/Kernel_Silica_Adsorption.npy", 'rb') as f:
    data_sorb = np.load(f)

x, y = load_dataset('data/datasets/silica_random.npz')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=1)

x_exp, y_exp = load_dataset('data/datasets/SMP_CUT_NOT_ZERO.npz')

x_train_exp, x_test_exp, y_train_exp, y_test_exp = train_test_split(x_exp, y_exp, test_size=0.15, random_state=1)

In [6]:
i = np.random.randint(0, len(x_train))
plt.plot(pore_widths, y_train[i], marker=".")
plt.grid()
plt.show()

In [7]:
i = np.random.randint(0, len(x_train))
plt.plot(pressures[:-10], x_train[i], marker=".")
plt.grid()
plt.show()

In [4]:
class IsothermDataset(Dataset):
    def __init__(self, isotherms, transform=None):
        self.data = torch.tensor(isotherms, dtype=torch.float32).to(device)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        if self.transform:
            x = self.transform(x)
        return x, x

dataset = IsothermDataset(np.concatenate((x_train_exp, x_train_exp)))
dataset_test = IsothermDataset(np.concatenate((x_test_exp, x_test_exp)))


batch_size = 512
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

In [5]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return x_recon, z

input_dim = 448
latent_dim = 16
epochs = 200
learning_rate = 1e-3

model = Autoencoder(input_dim=input_dim, latent_dim=latent_dim)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

def train_autoencoder(model, loader, loader_test):
    model.train()
    total_loss = 0
    total_vloss = 0
    for x, _ in loader:
        optimizer.zero_grad()
        x_recon, _ = model(x)
        loss = criterion(x_recon, x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for x, _ in loader_test:
            x_recon, _  = model(x)
            vloss = criterion(x_recon, x)
            total_vloss += vloss.item()

    return total_loss / len(loader.dataset), total_vloss / len(loader_test.dataset)


# sample_z = model.encoder(torch.tensor(isotherms_np[0], dtype=torch.float32))

In [9]:
for epoch in range(1, epochs+1):
    loss, vloss = train_autoencoder(model, loader,loader_test)
    if epoch % 1 == 0:
        print(f"Epoch {epoch}/{epochs}, Loss: {loss*100:.8f} Test loss: {vloss*100:.8f}")

Epoch 1/200, Loss: 0.00002977 Test loss: 0.00003961
Epoch 2/200, Loss: 0.00002448 Test loss: 0.00002959
Epoch 3/200, Loss: 0.00002201 Test loss: 0.00002942
Epoch 4/200, Loss: 0.00002089 Test loss: 0.00002840
Epoch 5/200, Loss: 0.00002038 Test loss: 0.00002761
Epoch 6/200, Loss: 0.00002025 Test loss: 0.00002858
Epoch 7/200, Loss: 0.00002035 Test loss: 0.00002915
Epoch 8/200, Loss: 0.00002208 Test loss: 0.00003152
Epoch 9/200, Loss: 0.00002714 Test loss: 0.00004133
Epoch 10/200, Loss: 0.00002569 Test loss: 0.00003298
Epoch 11/200, Loss: 0.00002190 Test loss: 0.00002802
Epoch 12/200, Loss: 0.00002091 Test loss: 0.00002742
Epoch 13/200, Loss: 0.00002008 Test loss: 0.00002709
Epoch 14/200, Loss: 0.00002082 Test loss: 0.00003315
Epoch 15/200, Loss: 0.00002422 Test loss: 0.00004077
Epoch 16/200, Loss: 0.00004248 Test loss: 0.00008431
Epoch 17/200, Loss: 0.00004545 Test loss: 0.00004041
Epoch 18/200, Loss: 0.00003042 Test loss: 0.00003758
Epoch 19/200, Loss: 0.00002451 Test loss: 0.00003073
Ep

In [10]:
torch.save(model, "data/models/torch/autoencoder_exp.pt")

In [11]:
model = torch.load("data/models/torch/autoencoder_exp.pt", weights_only=False)
model.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=448, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=448, bias=True)
  )
)

In [12]:
model.eval()
latent_vectors_train = model.encoder(torch.tensor(x_train, dtype=torch.float32).to(device)).detach().cpu().numpy()
latent_vectors_test = model.encoder(torch.tensor(x_test, dtype=torch.float32).to(device)).detach().cpu().numpy()
latent_vectors_test_exp = model.encoder(torch.tensor(x_test_exp, dtype=torch.float32).to(device)).detach().cpu().numpy()

In [13]:
np.random.seed(0)
labels = None 

pca = PCA(n_components=2)
latent_pca = pca.fit_transform(latent_vectors_train[:100])
latent_pca_exp = pca.fit_transform(latent_vectors_test[:100])

tsne = TSNE(n_components=2, init='pca', random_state=0)
latent_tsne = tsne.fit_transform(latent_vectors_train[:100])
latent_tsne_exp = tsne.fit_transform(latent_vectors_test[:100])

plt.figure()
plt.scatter(latent_pca[:, 0], latent_pca[:, 1], label="train")
plt.scatter(latent_pca_exp[:, 0], latent_pca_exp[:, 1], label="exp")
plt.title("PCA of Latent Space")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()

plt.figure()
plt.scatter(latent_tsne[:, 0], latent_tsne[:, 1], label="train")
plt.scatter(latent_tsne_exp[:, 0], latent_tsne_exp[:, 1], label="exp")
for i in range(latent_tsne_exp.shape[0]):
        plt.text(latent_tsne_exp[i, 0], latent_tsne_exp[i, 1], str(i), fontsize=8, ha='center', va='center')
plt.title("t-SNE of Latent Space")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.legend()
plt.show()



In [20]:
from xgboost import XGBRegressor
bst = XGBRegressor(n_estimators=500, max_depth=5)
bst.fit(latent_vectors_train, y_train, verbose=True)

In [43]:
model_name = 'forest_exp.ubj'
bst.save_model(f"data/models/torch/{model_name}")

In [158]:
accuracies = []
for i in range(1, bst.n_estimators + 1):
    y_pred = bst.predict(latent_vectors_test[:500, :], iteration_range=(0, i))
    acc = np.sum(np.abs(y_test[:500, :]-y_pred))/len(y_test[:500, :])
    accuracies.append(acc)
    print(f"Error using {i} trees: {acc:.4f}")

Error using 1 trees: 7.6471
Error using 2 trees: 5.7654
Error using 3 trees: 4.4765
Error using 4 trees: 3.5926
Error using 5 trees: 2.9841
Error using 6 trees: 2.5630
Error using 7 trees: 2.2792
Error using 8 trees: 2.0820
Error using 9 trees: 1.9480
Error using 10 trees: 1.8525
Error using 11 trees: 1.7820
Error using 12 trees: 1.7316
Error using 13 trees: 1.6910
Error using 14 trees: 1.6570
Error using 15 trees: 1.6278
Error using 16 trees: 1.6031
Error using 17 trees: 1.5842
Error using 18 trees: 1.5637
Error using 19 trees: 1.5468
Error using 20 trees: 1.5317
Error using 21 trees: 1.5176
Error using 22 trees: 1.5055
Error using 23 trees: 1.4927
Error using 24 trees: 1.4821
Error using 25 trees: 1.4727
Error using 26 trees: 1.4621
Error using 27 trees: 1.4523
Error using 28 trees: 1.4433
Error using 29 trees: 1.4357
Error using 30 trees: 1.4279
Error using 31 trees: 1.4212
Error using 32 trees: 1.4145
Error using 33 trees: 1.4081
Error using 34 trees: 1.4014
Error using 35 trees: 1

In [34]:
plt.plot(accuracies, marker=".")
plt.grid(True)
plt.show()

In [84]:
preds = bst.predict(latent_vectors_test_exp)

In [127]:
def plot_preds(x, y, preds): 
    NX, NY = 3, 4
    figure, axis = plt.subplots(NX, NY)
    for i in range(NX):
        for j in range(NY):
            k = np.random.randint(0, len(preds)) 
            iso_axis = axis[i, j].twiny()
            iso_axis.set_xlabel("P/P$^0$",fontsize=8)
            iso_axis.plot(pressures[:-10], x[k], label="Isotherm", color = 'green')
            kernel = (data_sorb.T[:-10])
            iso_axis.plot(pressures[:-10], np.dot(kernel, preds[k][:128]), label="Isotherm by model", color="red")
            axis[i, j].set_title(f"№ {k}")
            axis[i, j].title.set_size(10)
            axis[i, j].grid()
            axis[i, j].set_xlabel("nm",fontsize=8)
            axis[i, j].plot(pore_widths, (preds[k]), marker=".", label=f"Model PSD")
            axis[i, j].plot(pore_widths, y[k], marker=".", label="PSD")
    plt.subplots_adjust(hspace=0.6, right=0.95, left=0.05, bottom=0.05, top=0.9)
    plt.legend()
    axis[0, 0].legend()
    plt.show()
    
plot_preds(x_test_exp, y_test_exp, preds)

In [56]:
from tools import model_tester
from inverse import fit_linear

error_lst, roughness_lst = model_tester.test_model_predictions(preds, x_test_exp, kernel=data_sorb[:, :-10])
kde_x, kde_error, kde_fun = model_tester.calculate_kde_data(error_lst, stop=150)
print("average error:", np.mean(error_lst))
plt.plot(kde_x, kde_error, label=model_name)
plt.grid(True)
plt.legend()
plt.plot()

average error: 19.576227837865584


[]

In [14]:
class PSD_model(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PSD_model, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, output_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, output_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, output_dim),
            nn.ReLU()
        )

    def forward(self, x):
        psd = self.model(x)
        return psd

class Isotherm_PSD_Dataset(Dataset):
    def __init__(self, x, y, transform=None):
        self.x = torch.tensor(x, dtype=torch.float32).to(device)
        self.y = torch.tensor(y, dtype=torch.float32).to(device)
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        if self.transform:
            x = self.transform(x)
        return x, y

train_PSD = Isotherm_PSD_Dataset(latent_vectors_train, (y_train))
test_PSD = Isotherm_PSD_Dataset(latent_vectors_test, (y_test))

batch_size = 512
PSD_loader = DataLoader(train_PSD, batch_size=batch_size, shuffle=True)
PSD_loader_test = DataLoader(test_PSD, batch_size=batch_size, shuffle=False)


model_PSD = PSD_model(input_dim=latent_dim, output_dim=128)
model_PSD.to(device)

optimizer = optim.Adam(model_PSD.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

def train_PSD_model(model, loader, loader_test):
    model.train()
    total_loss = 0
    total_vloss = 0
    for x, y in loader:
        optimizer.zero_grad()
        y_recon = model(x)
        loss = criterion(y_recon, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for x, y in loader_test:
            y_recon  = model(x)
            vloss = criterion(y_recon, y)
            total_vloss += vloss.item()

    return total_loss / len(loader.dataset), total_vloss / len(loader_test.dataset)


In [15]:
epochs = 300
for epoch in range(1, epochs+1):
    loss, vloss = train_PSD_model(model_PSD, PSD_loader, PSD_loader_test)
    if epoch % 1 == 0:
        print(f"Epoch {epoch}/{epochs}, Loss: {loss*100:.8f} Test loss: {vloss*100:.8f}")

Epoch 1/300, Loss: 0.00278849 Test loss: 0.00155914
Epoch 2/300, Loss: 0.00161460 Test loss: 0.00106561
Epoch 3/300, Loss: 0.00105126 Test loss: 0.00087033
Epoch 4/300, Loss: 0.00088638 Test loss: 0.00070937
Epoch 5/300, Loss: 0.00081221 Test loss: 0.00059989
Epoch 6/300, Loss: 0.00073967 Test loss: 0.00071905
Epoch 7/300, Loss: 0.00067042 Test loss: 0.00063340
Epoch 8/300, Loss: 0.00068083 Test loss: 0.00059451
Epoch 9/300, Loss: 0.00061890 Test loss: 0.00054444
Epoch 10/300, Loss: 0.00055777 Test loss: 0.00052487
Epoch 11/300, Loss: 0.00057298 Test loss: 0.00043978
Epoch 12/300, Loss: 0.00062566 Test loss: 0.00044594
Epoch 13/300, Loss: 0.00057907 Test loss: 0.00040018
Epoch 14/300, Loss: 0.00057865 Test loss: 0.00050906
Epoch 15/300, Loss: 0.00062882 Test loss: 0.00047552
Epoch 16/300, Loss: 0.00051313 Test loss: 0.00037603
Epoch 17/300, Loss: 0.00047756 Test loss: 0.00044809
Epoch 18/300, Loss: 0.00045021 Test loss: 0.00040530
Epoch 19/300, Loss: 0.00048411 Test loss: 0.00035291
Ep

In [119]:
model_PSD.eval()
y_train_PSD = model_PSD.model(torch.tensor(latent_vectors_train, dtype=torch.float32).to(device)).detach().cpu().numpy()
y_test_PSD = model_PSD.model(torch.tensor(latent_vectors_test, dtype=torch.float32).to(device)).detach().cpu().numpy()
y_test_exp_PSD = model_PSD.model(torch.tensor(latent_vectors_test_exp, dtype=torch.float32).to(device)).detach().cpu().numpy()

In [161]:
math_psds = [fit_linear(x_test_exp[i], data_sorb[:, :-10], 0).x for i in range(len(x_test_exp))]
restored_isotherms = [np.dot(data_sorb[:, :-10].T, psd) for psd in math_psds]

In [167]:
model_tester.plot_testing_graphs(y_test_exp_PSD, x_test_exp, restored_isotherms, data_sorb[:, :-10].T, model_name)

In [168]:
plot_preds(x_test_exp, y_test_exp, y_test_exp_PSD)

In [123]:
i = np.random.randint(0, len(y_test_exp_PSD))
plt.plot(pore_widths, y_test_exp_PSD[i], marker=".", label="model")
plt.plot(pore_widths, y_test_exp[i], marker=".", label="exp")
plt.legend()
plt.grid(True)
plt.show()
i

1198

In [124]:
restored_isotherm = np.dot(data_sorb.T, y_test_exp_PSD[i])
plt.plot(pressures, restored_isotherm, marker=".", label="model")
plt.plot(pressures[:-10], x_test_exp[i], marker=".", label="exp")
plt.legend()
plt.grid(True)
plt.show()

In [298]:
len(y_test_exp_PSD)

3411