In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

# Load the DataFrame from 'cleaned_dataset.csv'
unique_id_to_topic = pd.read_csv('cleaned_dataset_10k.csv')

# Load the EmbeddingsDataFrame from 'EmbeddingsDataFrame.pkl'
with open('augmented_data_10k.pkl', 'rb') as f:
    embeddings_df = pickle.load(f)

# Merge the two DataFrames based on 'id'
merged_df = pd.merge(embeddings_df, unique_id_to_topic, left_on='id', right_on='id')

merged_df = merged_df[['id', 'medoids', 'unique_primary_category']]
merged_df



Unnamed: 0,id,medoids,unique_primary_category
0,hep-ph/0610334,"[[-0.113838255, -0.013086513, -0.026049882, 0....",hep-ph
1,2104.06416,"[[-0.13890694, -0.045757502, 0.0331088, 0.0221...",hep-ph
2,hep-ph/9606269,"[[-0.09846101, 0.05293004, 0.047359765, -0.025...",hep-ph
3,hep-ph/9811382,"[[-0.10917934, -0.025503034, -0.004675309, 0.0...",hep-ph
4,1304.2781,"[[-0.054514293, -0.08432221, -0.044620816, -0....",hep-ph
...,...,...,...
9419,2308.09211,"[[-0.0042298124, -0.033133063, 0.060381196, -0...",econ
9420,2212.03704,"[[-0.09859083, -0.034708254, 0.04424262, -0.02...",econ
9421,2309.09299,"[[0.04684168, -0.020495592, 0.0053604506, 0.02...",econ
9422,1910.11154,"[[0.006068893, -0.0030239178, -0.046500694, -0...",econ


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")



Using cuda device


In [3]:

def encode_data(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    
    for arr in numpy_arrays:
        # Check if the array can be reshaped to (1, 1536)
        if arr.size == 1536:
            reshaped_array = arr.reshape(1, 1536)
            
            # Convert NumPy array to PyTorch tensor
            input_tensor = torch.tensor(reshaped_array, dtype=torch.float32).to(device)
            
            with torch.no_grad():  # Disable gradient computation
                encoded_tensor = model.encoder(input_tensor)  # Use only the encoder part
                encoded_tensors.append(encoded_tensor.cpu())  # Move tensor back to CPU
        else:
            print(f"Skipping array of shape {arr.shape}. Cannot be reshaped to (1, 1536).")
    
    return encoded_tensors  # Return the list of all encoded tensors

def encode_dataConv(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    
    for arr in numpy_arrays:
        # Check if the array can be reshaped to (1, 4, 384)
        if arr.size == 1536:
            reshaped_array = arr.reshape(1, 4, 384)
            
            # Convert NumPy array to PyTorch tensor
            input_tensor = torch.tensor(reshaped_array, dtype=torch.float32).to(device)
            
            with torch.no_grad():  # Disable gradient computation
                encoded_tensor = model.encoder(input_tensor)  # Use only the encoder part
                encoded_tensors.append(encoded_tensor.cpu())  # Move tensor back to CPU
        else:
            print(f"Skipping array of shape {arr.shape}. Cannot be reshaped to (1, 4, 384).")
    
    return encoded_tensors  # Return the list of all encoded tensors


In [4]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import SimpleAutoencoder
# Load the trained model
model = SimpleAutoencoder()
model.load_state_dict(torch.load('trained_SimpleAutoencoder_best+arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_data(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(9424, 128)

In [5]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [6]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of SimpleAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of SimpleAutoencoder: 75.54%
Test precision: 76.61%
Test recall: 75.35%


In [7]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of SimpleAutoencoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of SimpleAutoencoder: 73.00%
SVM Test precision: 72.93%
SVM Test recall: 72.83%


In [8]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import CNN_Autoencoder
# Load the trained model
model = CNN_Autoencoder()
model.load_state_dict(torch.load('trained_CNN_Autoencoder_best+arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_dataConv(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(9424, 128)

In [9]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [10]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of CNN_Autoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of CNN_Autoencoder: 71.83%
Test precision: 72.58%
Test recall: 71.48%


In [11]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of CNN_Autoencoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of CNN_Autoencoder: 72.63%
SVM Test precision: 72.90%
SVM Test recall: 72.47%


In [12]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import RecurrentAutoencoder
# Load the trained model
model = RecurrentAutoencoder()
model.load_state_dict(torch.load('trained_RecurrentAutoencoder_best+arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_dataConv(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(9424, 128)

In [13]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [14]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of RecurrentAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of RecurrentAutoencoder: 76.50%
Test precision: 77.03%
Test recall: 76.34%


In [15]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of RecurrentAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of RecurrentAutoencoder: 76.45%
SVM Test precision: 76.62%
SVM Test recall: 76.43%


In [16]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import TransformerAutoencoder

# Load the trained model
embed_dim = 384  # Example embedding dimension
num_heads = 4    # Example number of heads in multi-head attention
dim_feedforward = 1024  # Example feedforward dimension
num_layers = 2  # Example number of layers in the transformer encoder
seq_length = 4  # Original sequence length

model = TransformerAutoencoder(embed_dim, num_heads, dim_feedforward, num_layers, seq_length).to(device)
model.load_state_dict(torch.load('trained_TransformerAutoencoder_best+arxiv.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_dataConv(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(9424, 128)

In [17]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [18]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of TransformerAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of TransformerAutoencoder: 76.39%
Test precision: 76.72%
Test recall: 76.16%


In [19]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of TransformerAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of TransformerAutoencoder: 77.35%
SVM Test precision: 77.53%
SVM Test recall: 77.31%
