In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

# Read the dataframe
# Loads the pre-computed table from a pickle file in the current working
# directory. Per the displayed output it has the columns `id`, `embedding`,
# `medoids` and `cluster_sizes`.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load 'augmented_data.pkl' if it comes from a trusted source.
df = pd.read_pickle('augmented_data.pkl')
# Bare last expression: renders the (truncated) frame as the cell output.
df

Unnamed: 0,id,embedding,medoids,cluster_sizes
0,1,"[[-0.0019472323, 0.058371827, 0.0812831, 0.030...","[[-0.004593551, 0.051833656, -0.013445671, -0....","[10, 7, 9, 1]"
1,2,"[[0.01151042, -0.021297293, -0.004139077, 0.03...","[[0.004454516, 0.011180584, 0.053998474, -0.02...","[6, 2, 5, 1]"
2,6,"[[0.013927452, 0.035443924, 0.016817052, -0.01...","[[-0.0025131523, 0.072745346, 0.04038468, -0.0...","[2, 2, 2, 4]"
3,9,"[[0.02544553, -0.03236037, 0.0035475865, 0.070...","[[-0.013504671, 0.07948076, 0.097698964, 0.042...","[2, 2, 1, 1]"
4,13,"[[0.017213065, -0.013364788, 0.013486441, -0.0...","[[-0.0059717577, 0.035555597, 0.024298443, -0....","[3, 3, 1, 3]"
...,...,...,...,...
15742,909992,"[[-0.07872735, -0.009727927, 0.023001013, -0.0...","[[0.020977847, 0.00994313, 0.016100913, -0.020...","[3, 1, 1, 1]"
15743,910046,"[[0.0024761495, 0.029174268, -0.121854655, 0.0...","[[-0.04889003, -0.027657501, -0.03703226, 0.00...","[3, 1, 2, 1]"
15744,910075,"[[-0.03798147, 0.0035953722, 0.03408878, 0.035...","[[0.011452884, 0.14479369, -0.02908832, 0.0719...","[3, 2, 1, 1]"
15745,910092,"[[-0.022506248, -0.034485348, -0.053791, 0.072...","[[-0.022506248, -0.034485348, -0.053791, 0.072...","[2, 1, 2, 1]"


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Turn every row's `medoids` entry into its own numpy array.
# (np.array is passed directly; wrapping it in a lambda added nothing.)
medoids_list = df['medoids'].apply(np.array)

# Stack the per-row arrays along a new leading axis into one numpy array.
# np.stack raises if the rows do not all share the same shape.
medoids_np = np.stack(medoids_list.to_numpy())
print(medoids_np[0].shape)

# Float32 PyTorch copy of the same data. torch.tensor(..., dtype=...) is the
# documented constructor; the legacy torch.FloatTensor(ndarray) form is
# discouraged. The Keras pipeline in this cell consumes `medoids_np`, not
# this tensor.
medoids_tensor = torch.tensor(medoids_np, dtype=torch.float32)


# Choose the PyTorch backend name by priority: CUDA GPU, then Apple MPS,
# falling back to the CPU when neither accelerator is reported available.
# Within this cell the value is only used for the message printed below.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")


# Hyper-parameters of the sequence autoencoder
latent_dim = 64   # width of the compressed (encoded) vector
timesteps = 4     # sequence length: there are 4 sentences per sample
input_dim = 384   # embedding dimension of each sequence element

# Encoder: an LSTM that reads the (timesteps, input_dim) sequence and emits
# only its final output, a single latent_dim-wide vector per sample.
inputs = Input(shape=(timesteps, input_dim))
encoder_lstm = LSTM(latent_dim)
encoded = encoder_lstm(inputs)

# Decoder: repeat the latent vector once per timestep, then let a second
# LSTM emit an input_dim-wide vector at every step (return_sequences=True),
# so the output has the same (timesteps, input_dim) shape as the input.
repeat_latent = RepeatVector(timesteps)
decoded = repeat_latent(encoded)
decoder_lstm = LSTM(input_dim, return_sequences=True)
decoded = decoder_lstm(decoded)

# End-to-end model mapping the input sequence to its reconstruction
autoencoder = Model(inputs=inputs, outputs=decoded)

# Train with Adam on the mean squared reconstruction error
autoencoder.compile(loss='mse', optimizer='adam')

# Build the tf.data input pipelines for training and validation
BATCH_SIZE = 32

# Split data into training and validation sets (fixed random_state keeps the
# split identical between runs)
train_data, val_data = train_test_split(medoids_np, test_size=0.2, random_state=42)

# tf.data shuffles through a sliding buffer: a buffer smaller than the
# dataset only mixes neighbouring elements. The arrays are already in memory,
# so size the buffer to the whole training set for a uniform shuffle each
# epoch (the previous fixed value of 100 covered under 1% of the samples).
SHUFFLE_BUFFER_SIZE = len(train_data)

# Autoencoder training pairs: the target of every sample is the sample itself
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_data))
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_data))
val_dataset = val_dataset.batch(BATCH_SIZE)  # No need to shuffle validation data


# The autoencoder is already compiled (adam / mse) right after it is built,
# so it is not compiled a second time here.

# Define callbacks
# - stop once val_loss has not improved for 5 consecutive epochs and roll the
#   model back to its best weights
# - write the model to 'best_model.h5' whenever val_loss reaches a new minimum
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min')

# Train the model; keep the History object so the loss curves can be
# inspected later instead of leaving its repr as the cell output
history = autoencoder.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=[early_stopping, model_checkpoint]
)




(4, 384)
Using cuda device
Epoch 1/10
Epoch 2/10
  9/394 [..............................] - ETA: 5s - loss: 0.0018

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x18eff7b4b80>

In [6]:
def get_encoder(autoencoder, encoder_layer_index=1):
    """
    Extracts the encoder part of the autoencoder.

    Builds a new model that maps the autoencoder's input to the output of
    one of its existing layers, so no layer is re-created or re-trained.

    Parameters:
    autoencoder (tf.keras.Model): The trained autoencoder model.
    encoder_layer_index (int): Position in ``autoencoder.layers`` of the
        layer whose output is the encoded representation. Defaults to 1,
        which for the autoencoder built in this notebook is the encoding
        LSTM that directly follows the input layer.

    Returns:
    tf.keras.Model: The encoder model.

    Raises:
    IndexError: If ``encoder_layer_index`` is outside ``autoencoder.layers``.
    """
    # Indexing the layer list is positional; callers with a different
    # architecture should pass the index of their own bottleneck layer.
    encoder_output = autoencoder.layers[encoder_layer_index].output
    return Model(inputs=autoencoder.input, outputs=encoder_output)

# Restore the weights stored in the checkpoint file written during training
autoencoder.load_weights('best_model.h5')

# Derive the encoder from the autoencoder that now holds the restored weights
encoder_model = get_encoder(autoencoder)

# Full round trip: reconstruct every medoid sequence
reconstructed_embeddings = autoencoder.predict(medoids_np)

# Encoder only: one latent vector per medoid sequence
encoded_data = encoder_model.predict(medoids_np)
print(encoded_data.shape)

(15747, 64)
