In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

# Read the dataframe
df1 = pd.read_pickle('augmented_data_10k.pkl')
df2 = pd.read_pickle('augmented_data.pkl')

df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,id,embedding,medoids,cluster_sizes
0,hep-ph/0610334,"[[-0.09863124, -0.025400802, -0.005175168, 0.0...","[[-0.113838255, -0.013086513, -0.026049882, 0....","[55, 19, 55, 8]"
1,2104.06416,"[[-0.084981225, -0.08507558, 0.03543399, 0.086...","[[-0.13890694, -0.045757502, 0.0331088, 0.0221...","[61, 49, 40, 28]"
2,hep-ph/9606269,"[[-0.116921924, -0.031099621, 0.09050446, 0.07...","[[-0.09846101, 0.05293004, 0.047359765, -0.025...","[28, 19, 12, 7]"
3,hep-ph/9811382,"[[-0.05011667, -0.0072394763, -0.017491272, 0....","[[-0.10917934, -0.025503034, -0.004675309, 0.0...","[46, 33, 10, 23]"
4,1304.2781,"[[-0.05021094, -0.04983033, -0.02687403, -0.02...","[[-0.054514293, -0.08432221, -0.044620816, -0....","[7, 7, 8, 3]"
...,...,...,...,...
25166,909992,"[[-0.07872735, -0.009727927, 0.023001013, -0.0...","[[0.020977847, 0.00994313, 0.016100913, -0.020...","[3, 1, 1, 1]"
25167,910046,"[[0.0024761495, 0.029174268, -0.121854655, 0.0...","[[-0.04889003, -0.027657501, -0.03703226, 0.00...","[3, 1, 2, 1]"
25168,910075,"[[-0.03798147, 0.0035953722, 0.03408878, 0.035...","[[0.011452884, 0.14479369, -0.02908832, 0.0719...","[3, 2, 1, 1]"
25169,910092,"[[-0.022506248, -0.034485348, -0.053791, 0.072...","[[-0.022506248, -0.034485348, -0.053791, 0.072...","[2, 1, 2, 1]"


In [2]:

# Convert medoids to list of numpy arrays
medoids_list = df['medoids'].apply(lambda x: np.array(x).reshape(-1))

# Stack them into a single numpy array and convert to PyTorch tensor
medoids_np = np.stack(medoids_list.to_numpy())
print(medoids_np[0].shape)
medoids_tensor = torch.FloatTensor(medoids_np)

# Create DataLoader
dataset = TensorDataset(medoids_tensor)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")





(1536,)
Using cuda device


In [3]:
from model import SimpleAutoencoder
# Model Initialization
model = SimpleAutoencoder().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
print(next(iter(dataloader))[0].shape)

# from model import CNN_Autoencoder
# # Model Initialization
# model = CNN_Autoencoder().to(device)
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# criterion = nn.MSELoss()
# print(next(iter(dataloader))[0].shape)


# from model import RecurrentAutoencoder
# # Load the trained model
# model = RecurrentAutoencoder()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# criterion = nn.MSELoss()
# print(next(iter(dataloader))[0].shape)

# from model import TransformerAutoencoder
# # Example usage
# embed_dim = 384  # Example embedding dimension
# num_heads = 4    # Example number of heads in multi-head attention
# dim_feedforward = 1024  # Example feedforward dimension
# num_layers = 2  # Example number of layers in the transformer encoder
# seq_length = 4  # Original sequence length

# model = TransformerAutoencoder(embed_dim, num_heads, dim_feedforward, num_layers, seq_length).to(device)
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# criterion = nn.MSELoss()
# print(next(iter(dataloader))[0].shape)

torch.Size([256, 1536])


In [4]:

# Training Loop
num_epochs = 200
best_loss = float('inf')  # Initialize with a very high value

for epoch in range(num_epochs):
    for batch_idx, (data,) in enumerate(dataloader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data)
        loss.backward()
        optimizer.step()

        # Check if this is the best model so far
        if loss.item() < best_loss:
            best_loss = loss.item()
            # Save the model state
            torch.save(model.state_dict(), 'trained_SimpleAutoencoder_best+arxiv.pth')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')



Epoch [1/200], Loss: 0.0024
Epoch [2/200], Loss: 0.0023
Epoch [3/200], Loss: 0.0022
Epoch [4/200], Loss: 0.0022
Epoch [5/200], Loss: 0.0021
Epoch [6/200], Loss: 0.0021
Epoch [7/200], Loss: 0.0021
Epoch [8/200], Loss: 0.0021
Epoch [9/200], Loss: 0.0020
Epoch [10/200], Loss: 0.0020
Epoch [11/200], Loss: 0.0019
Epoch [12/200], Loss: 0.0019
Epoch [13/200], Loss: 0.0019
Epoch [14/200], Loss: 0.0019
Epoch [15/200], Loss: 0.0019
Epoch [16/200], Loss: 0.0019
Epoch [17/200], Loss: 0.0019
Epoch [18/200], Loss: 0.0019
Epoch [19/200], Loss: 0.0019
Epoch [20/200], Loss: 0.0018
Epoch [21/200], Loss: 0.0018
Epoch [22/200], Loss: 0.0018
Epoch [23/200], Loss: 0.0018
Epoch [24/200], Loss: 0.0018
Epoch [25/200], Loss: 0.0018
Epoch [26/200], Loss: 0.0018
Epoch [27/200], Loss: 0.0018
Epoch [28/200], Loss: 0.0018
Epoch [29/200], Loss: 0.0017
Epoch [30/200], Loss: 0.0017
Epoch [31/200], Loss: 0.0017
Epoch [32/200], Loss: 0.0018
Epoch [33/200], Loss: 0.0017
Epoch [34/200], Loss: 0.0018
Epoch [35/200], Loss: 0

In [5]:
# def encode_dataConv(model, numpy_arrays):
#     encoded_tensors = []  # To collect the encoded tensors
    
#     for arr in numpy_arrays:
#         # Check if the array can be reshaped to (1, 4, 384)
#         if arr.size == 1536:
#             reshaped_array = arr.reshape(1, 4, 384)
            
#             # Convert NumPy array to PyTorch tensor
#             input_tensor = torch.tensor(reshaped_array, dtype=torch.float32).to(device)
            
#             with torch.no_grad():  # Disable gradient computation
#                 encoded_tensor = model.encoder(input_tensor)  # Use only the encoder part
#                 encoded_tensors.append(encoded_tensor.cpu())  # Move tensor back to CPU
#         else:
#             print(f"Skipping array of shape {arr.shape}. Cannot be reshaped to (1, 4, 384).")
    
#     return encoded_tensors  # Return the list of all encoded tensors
# random_array = np.random.randn(256, 4, 384)

# encoded_tensors = encode_dataConv(model, random_array)
# print(len(encoded_tensors))
# encoded_tensors[0].shape