In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

# Load the DataFrame from 'cleaned_dataset.csv'
unique_id_to_topic = pd.read_csv('cleaned_dataset.csv')

# Load the EmbeddingsDataFrame from 'EmbeddingsDataFrame.pkl'
with open('augmented_data.pkl', 'rb') as f:
    embeddings_df = pickle.load(f)

# Merge the two DataFrames based on 'id'
merged_df = pd.merge(embeddings_df, unique_id_to_topic, left_on='id', right_on='id')

merged_df = merged_df[['id', 'medoids', 'unique_primary_category']]
merged_df



Unnamed: 0,id,medoids,unique_primary_category
0,hep-ph/0610334,"[[-0.113838255, -0.013086513, -0.026049882, 0....",hep-ph
1,2104.06416,"[[-0.13890694, -0.045757502, 0.0331088, 0.0221...",hep-ph
2,hep-ph/9606269,"[[-0.09846101, 0.05293004, 0.047359765, -0.025...",hep-ph
3,hep-ph/9811382,"[[-0.10917934, -0.025503034, -0.004675309, 0.0...",hep-ph
4,1304.2781,"[[-0.054514293, -0.08432221, -0.044620816, -0....",hep-ph
...,...,...,...
1882,2007.07091,"[[0.012824047, 0.048758022, -0.018256145, -0.0...",econ
1883,2006.00368,"[[0.045670357, -0.006828758, 0.026731724, -0.0...",econ
1884,2107.03440,"[[-0.06988597, -0.0019644916, -0.030532172, -0...",econ
1885,1910.00073,"[[0.033800874, -0.021397091, 0.009115859, 0.05...",econ


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")



Using cuda device


In [3]:

def encode_data(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    
    for arr in numpy_arrays:
        # Check if the array can be reshaped to (1, 1536)
        if arr.size == 1536:
            reshaped_array = arr.reshape(1, 1536)
            
            # Convert NumPy array to PyTorch tensor
            input_tensor = torch.tensor(reshaped_array, dtype=torch.float32).to(device)
            
            with torch.no_grad():  # Disable gradient computation
                encoded_tensor = model.encoder(input_tensor)  # Use only the encoder part
                encoded_tensors.append(encoded_tensor.cpu())  # Move tensor back to CPU
        else:
            print(f"Skipping array of shape {arr.shape}. Cannot be reshaped to (1, 1536).")
    
    return encoded_tensors  # Return the list of all encoded tensors

def encode_dataConv(model, numpy_arrays):
    encoded_tensors = []  # To collect the encoded tensors
    
    for arr in numpy_arrays:
        # Check if the array can be reshaped to (1, 4, 384)
        if arr.size == 1536:
            reshaped_array = arr.reshape(1, 4, 384)
            
            # Convert NumPy array to PyTorch tensor
            input_tensor = torch.tensor(reshaped_array, dtype=torch.float32).to(device)
            
            with torch.no_grad():  # Disable gradient computation
                encoded_tensor = model.encoder(input_tensor)  # Use only the encoder part
                encoded_tensors.append(encoded_tensor.cpu())  # Move tensor back to CPU
        else:
            print(f"Skipping array of shape {arr.shape}. Cannot be reshaped to (1, 4, 384).")
    
    return encoded_tensors  # Return the list of all encoded tensors


In [4]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import SimpleAutoencoder
# Load the trained model
model = SimpleAutoencoder()
model.load_state_dict(torch.load('trained_SimpleAutoencoder_best.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_data(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(1887, 128)

In [5]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [6]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of SimpleAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of SimpleAutoencoder: 69.58%
Test precision: 71.37%
Test recall: 68.43%


In [7]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of SimpleAutoencoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of SimpleAutoencoder: 70.90%
SVM Test precision: 72.63%
SVM Test recall: 71.59%


In [8]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import CNN_Autoencoder
# Load the trained model
model = CNN_Autoencoder()
model.load_state_dict(torch.load('trained_CNN_Autoencoder_best.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_dataConv(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(1887, 128)

In [9]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [10]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of CNN_Autoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of CNN_Autoencoder: 54.76%
Test precision: 56.41%
Test recall: 55.60%


In [11]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of CNN_Autoencoder: {test_accuracy * 100:.2f}%")
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of CNN_Autoencoder: 56.35%
SVM Test precision: 61.59%
SVM Test recall: 58.83%



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [12]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import RecurrentAutoencoder
# Load the trained model
model = RecurrentAutoencoder()
model.load_state_dict(torch.load('trained_RecurrentAutoencoder_best.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_dataConv(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(1887, 128)

In [13]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [14]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of RecurrentAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of RecurrentAutoencoder: 73.81%
Test precision: 75.71%
Test recall: 74.54%


In [15]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of RecurrentAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of RecurrentAutoencoder: 73.54%
SVM Test precision: 74.59%
SVM Test recall: 74.29%


In [16]:
import sys
sys.path.append("C:/Users/chris/OneDrive/Desktop/Diplomatiki2/AutoencoderCreation")
from model import TransformerAutoencoder

# Load the trained model
embed_dim = 384  # Example embedding dimension
num_heads = 4    # Example number of heads in multi-head attention
dim_feedforward = 1024  # Example feedforward dimension
num_layers = 2  # Example number of layers in the transformer encoder
seq_length = 4  # Original sequence length

model = TransformerAutoencoder(embed_dim, num_heads, dim_feedforward, num_layers, seq_length).to(device)
model.load_state_dict(torch.load('trained_TransformerAutoencoder_best.pth'))
model = model.to(device)
model.eval()  # Set the model to evaluation mode

medoids = merged_df['medoids'].values
encoded_tensors = encode_dataConv(model, medoids)
encoded_tensors[0].shape
embeddings = np.stack(encoded_tensors)  # Convert to a 2D NumPy array
embeddings = embeddings.reshape(embeddings.shape[0], -1)
model.cpu()
del model
embeddings.shape

(1887, 128)

In [17]:
import umap.umap_ as umap

import plotly.express as px


# Perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=20, min_dist=0.0,metric="cosine")
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for the 3D embeddings
embeddings_3d_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embeddings_3d_df['Category'] = merged_df['unique_primary_category'].values

# Create the 3D scatter plot using Plotly
fig = px.scatter_3d(embeddings_3d_df, x='x', y='y', z='z', color='Category')
fig.show()
fig.write_html("3D_plot_model_autoencoder.html")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [18]:

unique_primary_category = merged_df['unique_primary_category'].values
# Label encode the unique_primary_category
le = LabelEncoder()

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings.shape[1],)),  # Input layer
    layers.Dense(512, activation='relu'),  # Hidden layer
    layers.Dense(len(le.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test accuracy of TransformerAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy of TransformerAutoencoder: 72.49%
Test precision: 74.79%
Test recall: 73.17%


In [19]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_primary_category)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # Modify the kernel and C parameter as per your requirements

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy of TransformerAutoencoder: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy of TransformerAutoencoder: 76.72%
SVM Test precision: 77.12%
SVM Test recall: 77.12%


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

# Load the DataFrame
unique_id_to_topic = pd.read_csv('cleaned_dataset.csv')
unique_id_to_topic = unique_id_to_topic.dropna(subset=['full_text'])

# Function to load GloVe embeddings
def load_glove_embeddings(path):
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Function to create document embeddings
def document_embedding(doc, embeddings_index):
    words = doc.split()
    word_embeddings = [embeddings_index.get(word, np.zeros((200,))) for word in words]  # Using 200 dimensions
    doc_embedding = np.mean(word_embeddings, axis=0)
    return doc_embedding

# Load GloVe embeddings (assuming you have the 200d file)
glove_embeddings = load_glove_embeddings('glove.6B.200d.txt')  # Update path as necessary

# Apply document embedding to each document
doc_embeddings = np.array([document_embedding(doc, glove_embeddings) for doc in unique_id_to_topic['full_text']])
print(len(doc_embeddings))
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(doc_embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(200,)),  # Adjust input shape to GloVe dimensions
    layers.Dense(512, activation='relu'),
    layers.Dense(len(le.classes_), activation='softmax')
])

# Compile and train the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)


# Make predictions and calculate precision and recall
from sklearn.metrics import precision_score, recall_score
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
precision = precision_score(y_test, y_pred_classes, average='macro')
recall = recall_score(y_test, y_pred_classes, average='macro')
print(f"GloVe Test accuracy: {test_accuracy * 100:.2f}%")
print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


1888
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
GloVe Test accuracy: 50.53%
Test precision: 51.45%
Test recall: 50.76%


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Assuming unique_id_to_topic and doc_embeddings are already defined as per previous steps

# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets using the GloVe embeddings
X_train, X_test, y_train, y_test = train_test_split(doc_embeddings, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # You can modify the kernel and C parameter as needed

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Test accuracy: {test_accuracy * 100:.2f}%")

# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed
print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")


SVM Test accuracy: 47.88%
SVM Test precision: 53.30%
SVM Test recall: 48.70%


In [22]:
import pandas as pd
import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
from gensim.utils import simple_preprocess
from sklearn.decomposition import PCA

# Load the DataFrame from 'cleaned_dataset.csv'
unique_id_to_topic = pd.read_csv('cleaned_dataset.csv')
unique_id_to_topic = unique_id_to_topic.dropna(subset=['full_text'])  # Option 1: Drop rows with NaN values

# Function to merge specified subcategories into 'Physics'
def merge_into_physics(category):
    physics_subcategories = ['hep-ph', 'astro-ph', 'hep-th','gr-gc', 'hep-ex', 'nucl-th', 'quant-ph', 'nucl-ex', 'hep-lat']
    if category in physics_subcategories:
        return 'physics'
    return category

def merge_into_math(category):
    physics_subcategories = ['math','cond-mat']
    if category in physics_subcategories:
        return 'math'
    return category

# # Apply the function to the unique_primary_category column
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_physics)
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_math)


# Tokenize text and prepare documents for Doc2Vec
documents = [TaggedDocument(simple_preprocess(doc), [i]) for i, doc in enumerate(unique_id_to_topic['full_text'])]

# Build and train Doc2Vec model
d2v_model = Doc2Vec(documents, vector_size=128, window=5, min_count=1, workers=10)
d2v_model.train(documents, total_examples=d2v_model.corpus_count, epochs=10)

# Generate embeddings
embeddings_d2v = np.array([d2v_model.infer_vector(doc.words) for doc in documents])
# # Apply PCA


# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings_d2v, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings_d2v.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(len(le.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Doc2Vec Test accuracy: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Doc2Vec Test accuracy: 75.66%
Test precision: 74.36%
Test recall: 74.50%


In [23]:
# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings_d2v, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # You can modify the kernel and C parameter as needed

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"SVM Test accuracy: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

SVM Test accuracy: 74.07%
SVM Test precision: 73.74%
SVM Test recall: 72.98%


In [24]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.decomposition import PCA

# Load the DataFrame from 'cleaned_dataset.csv'
unique_id_to_topic = pd.read_csv('cleaned_dataset.csv')
unique_id_to_topic = unique_id_to_topic.dropna(subset=['full_text'])  # Drop rows with NaN values

# Function to merge specified subcategories into 'Physics'
def merge_into_physics(category):
    physics_subcategories = ['hep-ph', 'astro-ph', 'hep-th','gr-gc', 'hep-ex', 'nucl-th', 'quant-ph', 'nucl-ex', 'hep-lat']
    if category in physics_subcategories:
        return 'physics'
    return category

def merge_into_math(category):
    physics_subcategories = ['math','cond-mat']
    if category in physics_subcategories:
        return 'math'
    return category

# Apply the function to the unique_primary_category column
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_physics)
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_math)

# Load the BERT model (you can choose other models)
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings
embeddings_sbert = sbert_model.encode(unique_id_to_topic['full_text'].tolist(), show_progress_bar=True)
sbert_model.to('cuda')
# Apply PCA
# pca = PCA(n_components=128)  # Reduce to 64 dimensions
# embeddings_sbert = pca.fit_transform(embeddings_sbert)

# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings_sbert, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings_sbert.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(len(le.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Sentence BERT Test accuracy: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np


# Make predictions on the test set
y_pred = model.predict(X_test,verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Sentence BERT Test accuracy: 66.14%
Test precision: 66.74%
Test recall: 64.76%


In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings_sbert, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # You can modify the kernel and C parameter as needed

# Train the SVM model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Sentence BERT + SVM Test accuracy: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")

Sentence BERT + SVM Test accuracy: 60.85%
SVM Test precision: 61.80%
SVM Test recall: 59.83%


In [26]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
# from sklearn.decomposition import PCA  # Uncomment if PCA is needed

# Load the DataFrame
unique_id_to_topic = pd.read_csv('cleaned_dataset.csv')
unique_id_to_topic = unique_id_to_topic.dropna(subset=['full_text'])  # Drop rows with NaN values

# Function to merge specified subcategories into 'Physics'
def merge_into_physics(category):
    physics_subcategories = ['hep-ph', 'astro-ph', 'hep-th','gr-gc', 'hep-ex', 'nucl-th', 'quant-ph', 'nucl-ex', 'hep-lat']
    if category in physics_subcategories:
        return 'physics'
    return category

def merge_into_math(category):
    physics_subcategories = ['math','cond-mat']
    if category in physics_subcategories:
        return 'math'
    return category

# Apply the function to the unique_primary_category column
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_physics)
# unique_id_to_topic['unique_primary_category'] = unique_id_to_topic['unique_primary_category'].apply(merge_into_math)

# Load the LaBSE model
labse_model = SentenceTransformer('LaBSE')
labse_model.to('cuda')  # Uncomment if you want to use GPU

# Generate embeddings using LaBSE
embeddings_labse = labse_model.encode(unique_id_to_topic['full_text'].tolist(), show_progress_bar=True)

# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings_labse, y_encoded, test_size=0.2, random_state=42)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(embeddings_labse.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(len(le.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"LaBSE Test accuracy: {test_accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score
import numpy as np

# Make predictions on the test set
y_pred = model.predict(X_test, verbose=None)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Calculate precision and recall
precision = precision_score(y_test, y_pred_classes, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred_classes, average='macro')  # Change average as needed

print(f"Test precision: {precision * 100:.2f}%")
print(f"Test recall: {recall * 100:.2f}%")


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LaBSE Test accuracy: 64.02%
Test precision: 65.48%
Test recall: 64.74%


In [27]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Assuming 'unique_id_to_topic' and 'embeddings_labse' are already defined as per your previous code

# Label encode the unique_primary_category
le = LabelEncoder()
y_encoded = le.fit_transform(unique_id_to_topic['unique_primary_category'])

# Split the data into training and testing sets using the LaBSE embeddings
X_train, X_test, y_train, y_test = train_test_split(embeddings_labse, y_encoded, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', C=1)  # You can modify the kernel and C parameter as needed

# Train the SVM model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
test_accuracy = accuracy_score(y_test, y_pred)
print(f"LaBSE + SVM Test accuracy: {test_accuracy * 100:.2f}%")

# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')  # Change average as needed
recall = recall_score(y_test, y_pred, average='macro')  # Change average as needed

print(f"SVM Test precision: {precision * 100:.2f}%")
print(f"SVM Test recall: {recall * 100:.2f}%")


LaBSE + SVM Test accuracy: 68.25%
SVM Test precision: 68.62%
SVM Test recall: 68.16%
