In [1]:
import torch

import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
import pickle
import os
import torch.nn.functional as F


# def read_triplets(file_path):
#     with open(file_path, 'rb') as f:
#         triplets = pickle.load(f)
#     anchors, positives, negatives = zip(*triplets)
#     return torch.tensor(anchors), torch.tensor(positives), torch.tensor(negatives)

def read_triplets(*file_paths):
    all_anchors = []
    all_positives = []
    all_negatives = []

    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            triplets = pickle.load(f)
        anchors, positives, negatives = zip(*triplets)
        all_anchors.extend(anchors)
        all_positives.extend(positives)
        all_negatives.extend(negatives)

    return torch.tensor(all_anchors), torch.tensor(all_positives), torch.tensor(all_negatives)

def prepare_data(anchors, positives, negatives):
    dataset = TensorDataset(anchors, positives, negatives)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    return DataLoader(train_dataset, batch_size=256, shuffle=True), DataLoader(val_dataset, batch_size=256, shuffle=True)

def train_model(model, train_loader, optimizer, device):
    model.train()  # Set the model to training mode
    for anchors, positives, negatives in train_loader:
        # Move data to the target device
        anchors, positives, negatives = anchors.to(device), positives.to(device), negatives.to(device)
        # Zero the optimizer gradients
        optimizer.zero_grad()
        # Pass data through the model
        anchor_output, positive_output, negative_output = model(anchors), model(positives), model(negatives)
        # Compute the loss
        loss = triplet_loss(anchor_output, positive_output, negative_output)
        # Backward pass
        loss.backward()
        # Update weights
        optimizer.step()
    return loss.item()

def validate_model(model, val_loader, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():
        for anchors, positives, negatives in val_loader:
            anchors, positives, negatives = anchors.to(device), positives.to(device), negatives.to(device)
            anchor_output, positive_output, negative_output = model(anchors), model(positives), model(negatives)
            loss = triplet_loss(anchor_output, positive_output, negative_output)
            total_loss += loss.item()
    return total_loss / len(val_loader)


def online_train(model, optimizer, device):
    # TODO: implement online training here
    pass


# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torch

# Ensure CUDA is available
if torch.cuda.is_available():
    # Get the name of the GPU
    gpu_name = torch.cuda.get_device_name(torch.cuda.current_device())
    print("GPU Name:", gpu_name)
else:
    print("CUDA is not available. Running on CPU.")

# # Load data
# anchors, positives, negatives = read_triplets('triplets_medoids.pkl')
# train_loader, val_loader = prepare_data(anchors, positives, negatives)
# Load data from both files
anchors, positives, negatives = read_triplets('triplets_medoids.pkl', 'triplets_medoids_arxiv.pkl')

# Prepare data loaders
train_loader, val_loader = prepare_data(anchors, positives, negatives)

# Function to compute the loss for each triplet
def compute_triplet_loss(model, loader, device):
    model.eval()  # Set model to evaluation mode
    losses = []
    triplets = []
    with torch.no_grad():  # No need to track gradients
        for i, (anchor, positive, negative) in enumerate(loader):
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
            anchor_out, positive_out, negative_out = model(anchor), model(positive), model(negative)
            loss = triplet_loss(anchor_out, positive_out, negative_out)
            losses.append(loss.item())
            triplets.append((anchor, positive, negative))
    return triplets, losses

# Function to select triplets with loss above a threshold
def filter_hard_triplets(triplets, losses, threshold):
    hard_triplets = [triplet for triplet, loss in zip(triplets, losses) if loss > threshold]
    return hard_triplets
from models import TransformerEncoder

def create_model(embed_dim, num_heads, dim_feedforward, num_layers, dropout=0.1):
    model = TransformerEncoder(
        embed_dim=embed_dim, 
        num_heads=num_heads, 
        dim_feedforward=dim_feedforward, 
        num_layers=num_layers, 
        dropout=dropout
    )
    return model

def triplet_loss(anchor, positive, negative, margin=0.5):
    distance_positive = (anchor - positive).pow(2).sum(-1)  # Distance between anchor and positive
    distance_negative = (anchor - negative).pow(2).sum(-1)  # Distance between anchor and negative
    losses = F.relu(distance_positive - distance_negative + margin)
    return losses.mean()  # Return the mean triplet loss



GPU Name: NVIDIA GeForce RTX 4090


  return torch.tensor(all_anchors), torch.tensor(all_positives), torch.tensor(all_negatives)


In [2]:

from models import TransformerEncoder
embed_dim = 384  # Example embedding dimension
num_heads = 4    # Example number of heads in multi-head attention
dim_feedforward = 1024  # Example feedforward dimension
num_layers = 2  # Example number of layers in the transformer encoder
seq_length = 4  # Original sequence length

model = TransformerEncoder(embed_dim, num_heads, dim_feedforward, num_layers, 0.2).to(device)


# from models import LSTMEncoder
# # Model hyperparameters
# embedding_dim = 384
# sequence_length = 4
# hidden_dim = 1024  # Hidden dimensions for the LSTM
# output_dim = 128  # This is the desired output dimension
# num_layers = 2    # Number of LSTM layers
# dropout = 0.5     # Dropout rate

# # Initialize the model
# model = LSTMEncoder(
#     embedding_dim=embedding_dim, 
#     sequence_length=sequence_length, 
#     hidden_dim=hidden_dim, 
#     output_dim=output_dim, 
#     num_layers=num_layers, 
#     dropout=dropout
# ).to(device)


# Parameters for the convolutional encoder
# from models import ConvolutionalEncoder
# num_channels = [16, 32, 64]  # Example: 3 convolutional layers with increasing channels
# sequence_length = 384  # Example: Length of the input sequence
# embedding_dim = 4  # Example: Number of input channels
# output_dim = 128  # Desired dimension of the final output embeddings

# model = ConvolutionalEncoder(sequence_length, num_channels, embedding_dim, output_dim).to(device)

# Example usage:
# Define the input dimension, for example, it could be the flattened size of an image or the size of the feature vector
from models import SimpleLinearModel

input_dim = 4*384  # This is the flattened input dimension
hidden_dims = [1024, 512, 256]  
output_dim = 128  
dropout_rate = 0.4  

model = SimpleLinearModel(input_dim, hidden_dims, output_dim, dropout_rate).to(device)





In [3]:

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Train the model
epochs = 50  # Increase the number of epochs
patience = 10  # Patience for early stopping
best_val_loss = float('inf')  # Start with a high best validation loss
epochs_no_improve = 0  # Counter for epochs without improvement

for epoch in range(epochs):
    
    train_loss = train_model(model, train_loader, optimizer, device)
    val_loss = validate_model(model, val_loader, device)
    print(f"Epoch {epoch+1}/{epochs}.. Train loss: {train_loss:.3f}.. Validation loss: {val_loss:.3f}")

    # Check if the validation loss has improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'triplet_model_SimpleLinearModel+Arxiv.pth')
        print(f"Model saved with validation loss {val_loss:.3f}")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        # If the validation loss has not improved for 'patience' epochs, switch to online training
        if epochs_no_improve == patience:
            print("Switching to online training...")
            online_train(model, optimizer, device)
            break  # Exit the offline training loop




Epoch 1/50.. Train loss: 0.104.. Validation loss: 0.119
Model saved with validation loss 0.119
Epoch 2/50.. Train loss: 0.091.. Validation loss: 0.092
Model saved with validation loss 0.092
Epoch 3/50.. Train loss: 0.072.. Validation loss: 0.080
Model saved with validation loss 0.080
Epoch 4/50.. Train loss: 0.047.. Validation loss: 0.072
Model saved with validation loss 0.072
Epoch 5/50.. Train loss: 0.057.. Validation loss: 0.068
Model saved with validation loss 0.068
Epoch 6/50.. Train loss: 0.040.. Validation loss: 0.064
Model saved with validation loss 0.064
Epoch 7/50.. Train loss: 0.034.. Validation loss: 0.060
Model saved with validation loss 0.060
Epoch 8/50.. Train loss: 0.035.. Validation loss: 0.058
Model saved with validation loss 0.058
Epoch 9/50.. Train loss: 0.060.. Validation loss: 0.057
Model saved with validation loss 0.057
Epoch 10/50.. Train loss: 0.034.. Validation loss: 0.056
Model saved with validation loss 0.056
Epoch 11/50.. Train loss: 0.048.. Validation loss

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

# Load the DataFrame from 'cleaned_dataset.csv'
unique_id_to_topic = pd.read_csv('cleaned_dataset_10k.csv')

# Function to merge specified subcategories into 'Physics'
def merge_into_physics(category):
    physics_subcategories = ['hep-ph', 'astro-ph', 'hep-th','gr-gc', 'hep-ex', 'nucl-th', 'quant-ph', 'nucl-ex', 'hep-lat']
    if category in physics_subcategories:
        return 'physics'
    return category

def merge_into_math(category):
    physics_subcategories = ['math','cond-mat']
    if category in physics_subcategories:
        return 'math'
    return category

# # Apply the function to the unique_primary_category column
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_physics)
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_math)

# Load the EmbeddingsDataFrame from 'EmbeddingsDataFrame.pkl'
with open('augmented_data_10k.pkl', 'rb') as f:
    embeddings_df = pickle.load(f)

# Merge the two DataFrames based on 'id'
merged_df = pd.merge(embeddings_df, unique_id_to_topic, left_on='id', right_on='id')

merged_df = merged_df[['id', 'medoids', 'unique_primary_category']]
merged_df



Unnamed: 0,id,medoids,unique_primary_category
0,hep-ph/0610334,"[[-0.113838255, -0.013086513, -0.026049882, 0....",hep-ph
1,2104.06416,"[[-0.13890694, -0.045757502, 0.0331088, 0.0221...",hep-ph
2,hep-ph/9606269,"[[-0.09846101, 0.05293004, 0.047359765, -0.025...",hep-ph
3,hep-ph/9811382,"[[-0.10917934, -0.025503034, -0.004675309, 0.0...",hep-ph
4,1304.2781,"[[-0.054514293, -0.08432221, -0.044620816, -0....",hep-ph
...,...,...,...
9419,2308.09211,"[[-0.0042298124, -0.033133063, 0.060381196, -0...",econ
9420,2212.03704,"[[-0.09859083, -0.034708254, 0.04424262, -0.02...",econ
9421,2309.09299,"[[0.04684168, -0.020495592, 0.0053604506, 0.02...",econ
9422,1910.11154,"[[0.006068893, -0.0030239178, -0.046500694, -0...",econ


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

from models import SimpleLinearModel

input_dim = 4*384  # This is the flattened input dimension
hidden_dims = [1024, 512, 256]     
output_dim = 128  
dropout_rate = 0.4  

model = SimpleLinearModel(input_dim, hidden_dims, output_dim, dropout_rate).to(device)
# Load the trained model
model.load_state_dict(torch.load('triplet_model_SimpleLinearModel+Arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

def encode_data(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    # Assuming 'embedding' is a numpy array with shape (4, 384)
    for i in numpy_arrays:
        embedding_tensor = torch.tensor(i, dtype=torch.float).unsqueeze(0)  # Add batch dimension

        # Assuming 'model' is an instance of the TransformerEncoder class
        # and is already loaded with the trained weights and sent to the appropriate device
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            encoded_embedding = model(embedding_tensor.to(device))
        encoded_tensors.append(encoded_embedding.to("cpu"))
        
    return encoded_tensors


medoids = merged_df['medoids'].values
medoids[0].shape

encoded_tensors = encode_data(model, medoids)
len(encoded_tensors)

print(encoded_tensors[0].shape)

embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
embeddings.shape
del model

Using cuda device
torch.Size([1, 128])


In [3]:
import torch
from torchviz import make_dot
import torch.onnx



device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Create a dummy input tensor
dummy_input = torch.randn(1, 4*384, device=device)  # Batch size of 1



from models import SimpleLinearModel

input_dim = 4*384  # This is the flattened input dimension
hidden_dims = [1024, 512, 256]     
output_dim = 128  
dropout_rate = 0.4  

model = SimpleLinearModel(input_dim, hidden_dims, output_dim, dropout_rate).to(device)

# Get the model output from the dummy input
model_output = model(dummy_input)

# Create a dummy input tensor with the appropriate shape and move it to the same device
dummy_input = torch.randn(1, 1,4*384, device=device)  # Batch size of 1


# Ensure the model is in evaluation mode
model.eval()

# Define the file path for the ONNX model
onnx_model_path = "Linear_encoder_model.onnx"

# Export the model
torch.onnx.export(model,                     # model being run
                  dummy_input,               # model input (or a tuple for multiple inputs)
                  onnx_model_path,           # where to save the model
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=11,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names=['input'],     # the model's input names
                  output_names=['output'],   # the model's output names
                  dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                'output': {0: 'batch_size'}})

In [4]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_SimpleLinearModel.html")

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [5]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy for Simple Linear Model: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy for Simple Linear Model: 56.82%
Test precision: 57.17%
Test recall: 56.51%


In [6]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy from Simple Linear Model: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro',zero_division=0)  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy from Simple Linear Model: 54.85%
SVM Test precision: 54.92%
SVM Test recall: 54.62%


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

from models import ConvolutionalEncoder
num_channels = [16, 32, 64]  # Example: 3 convolutional layers with increasing channels
sequence_length = 384  # Example: Length of the input sequence
embedding_dim = 4  # Example: Number of input channels
output_dim = 128  # Desired dimension of the final output embeddings

model = ConvolutionalEncoder(sequence_length, num_channels, embedding_dim, output_dim)
# Load the trained model
model.load_state_dict(torch.load('triplet_model_ConvolutionalEncoder+Arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

def encode_data(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    # Assuming 'embedding' is a numpy array with shape (4, 384)
    for i in numpy_arrays:
        embedding_tensor = torch.tensor(i, dtype=torch.float).unsqueeze(0)  # Add batch dimension

        # Assuming 'model' is an instance of the TransformerEncoder class
        # and is already loaded with the trained weights and sent to the appropriate device
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            encoded_embedding = model(embedding_tensor.to(device))
        encoded_tensors.append(encoded_embedding.to("cpu"))
        
    return encoded_tensors


medoids = merged_df['medoids'].values
medoids[0].shape

encoded_tensors = encode_data(model, medoids)
len(encoded_tensors)

print(encoded_tensors[0].shape)

embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
embeddings.shape
del model

Using cuda device
torch.Size([1, 128])


In [8]:
import torch
from torchviz import make_dot

# Define your model and move it to the appropriate device (either CPU or GPU)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = ConvolutionalEncoder(sequence_length, num_channels, embedding_dim, output_dim)
model.to(device)

# Create a dummy input tensor with the appropriate shape and move it to the same device
dummy_input = torch.randn(1, embedding_dim, sequence_length, device=device)  # Batch size of 1

import torch
import torch.onnx

# Ensure the model is in evaluation mode
model.eval()

# Define the file path for the ONNX model
onnx_model_path = "convolutional_encoder_model.onnx"

# Export the model
torch.onnx.export(model,                     # model being run
                  dummy_input,               # model input (or a tuple for multiple inputs)
                  onnx_model_path,           # where to save the model
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=11,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names=['input'],     # the model's input names
                  output_names=['output'],   # the model's output names
                  dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                'output': {0: 'batch_size'}})

In [9]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_ConvolutionalEncoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [10]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy from Convolutional Encoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy from Convolutional Encoder: 61.96%
Test precision: 62.21%
Test recall: 61.75%


In [11]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy from Convolutional Encoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy from Convolutional Encoder: 61.96%
SVM Test precision: 62.27%
SVM Test recall: 61.62%


In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

from models import LSTMEncoder
# Model hyperparameters
embedding_dim = 384
sequence_length = 4
hidden_dim = 1024  # Hidden dimensions for the LSTM
output_dim = 128  # This is the desired output dimension
num_layers = 2    # Number of LSTM layers
dropout = 0.5     # Dropout rate

# Initialize the model
model = LSTMEncoder(
    embedding_dim=embedding_dim, 
    sequence_length=sequence_length, 
    hidden_dim=hidden_dim, 
    output_dim=output_dim, 
    num_layers=num_layers, 
    dropout=dropout
).to(device)

# Load the trained model
model.load_state_dict(torch.load('triplet_model_LSTMEncoder+Arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

def encode_data(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    # Assuming 'embedding' is a numpy array with shape (4, 384)
    for i in numpy_arrays:
        embedding_tensor = torch.tensor(i, dtype=torch.float).unsqueeze(0)  # Add batch dimension

        # Assuming 'model' is an instance of the TransformerEncoder class
        # and is already loaded with the trained weights and sent to the appropriate device
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            encoded_embedding = model(embedding_tensor.to(device))
        encoded_tensors.append(encoded_embedding.to("cpu"))
        
    return encoded_tensors


medoids = merged_df['medoids'].values
medoids[0].shape

encoded_tensors = encode_data(model, medoids)
len(encoded_tensors)

print(encoded_tensors[0].shape)

embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
embeddings.shape
del model

Using cuda device
torch.Size([1, 128])


In [13]:
import torch
from torchviz import make_dot
import torch
import torch.onnx

# Define your model and move it to the appropriate device (either CPU or GPU)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from models import LSTMEncoder
# Model hyperparameters
embedding_dim = 384
sequence_length = 4
hidden_dim = 1024  # Hidden dimensions for the LSTM
output_dim = 128  # This is the desired output dimension
num_layers = 2    # Number of LSTM layers
dropout = 0.5     # Dropout rate

# Initialize the model
model = LSTMEncoder(
    embedding_dim=embedding_dim, 
    sequence_length=sequence_length, 
    hidden_dim=hidden_dim, 
    output_dim=output_dim, 
    num_layers=num_layers, 
    dropout=dropout
).to(device)

# Create a dummy input tensor with the appropriate shape and move it to the same device
dummy_input = torch.randn(sequence_length, 1, embedding_dim, device=device)  # Adjusted shape


# Ensure the model is in evaluation mode
model.eval()

# Define the file path for the ONNX model
onnx_model_path = "LSTM_encoder_model.onnx"

# Export the model
torch.onnx.export(model,                     # model being run
                  dummy_input,               # model input (or a tuple for multiple inputs)
                  onnx_model_path,           # where to save the model
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=11,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names=['input'],     # the model's input names
                  output_names=['output'],   # the model's output names
                  dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                'output': {0: 'batch_size'}})


Exporting a model to ONNX with a batch_size other than 1, with a variable length with LSTM can cause an error when running the ONNX model with a different batch size. Make sure to save the model with a batch size of 1, or define the initial states (h0/c0) as inputs of the model. 



In [14]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_LSTMEncoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [15]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy from LSTM Encoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy from LSTM Encoder: 61.75%
Test precision: 62.68%
Test recall: 61.74%


In [16]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy from LSTM Encoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy from LSTM Encoder: 61.22%
SVM Test precision: 61.73%
SVM Test recall: 61.01%


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

from models import TransformerEncoder
embed_dim = 384  # Example embedding dimension
num_heads = 4    # Example number of heads in multi-head attention
dim_feedforward = 1024  # Example feedforward dimension
num_layers = 2  # Example number of layers in the transformer encoder
seq_length = 4  # Original sequence length

model = TransformerEncoder(embed_dim, num_heads, dim_feedforward, num_layers, 0.2).to(device)
# Load the trained model
model.load_state_dict(torch.load('triplet_model_TransformerEncoder+Arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

def encode_data(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    # Assuming 'embedding' is a numpy array with shape (4, 384)
    for i in numpy_arrays:
        embedding_tensor = torch.tensor(i, dtype=torch.float).unsqueeze(0)  # Add batch dimension

        # Assuming 'model' is an instance of the TransformerEncoder class
        # and is already loaded with the trained weights and sent to the appropriate device
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            encoded_embedding = model(embedding_tensor.to(device))
        encoded_tensors.append(encoded_embedding.to("cpu"))
        
    return encoded_tensors


medoids = merged_df['medoids'].values
medoids[0].shape

encoded_tensors = encode_data(model, medoids)
len(encoded_tensors)

print(encoded_tensors[0].shape)

embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
embeddings.shape
del model

Using cuda device
torch.Size([1, 128])


In [18]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_TransformerEncoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [19]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy from Transformer Encoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy from Transformer Encoder: 61.64%
Test precision: 62.41%
Test recall: 61.30%


In [20]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy from Transformer Encoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy from Transformer Encoder: 59.79%
SVM Test precision: 61.21%
SVM Test recall: 59.35%
