In [1]:
######################################
### Setting up Modules and loading the whole data frame
###################################


### import XPU for my local Intel Laptop 
import torch
import intel_extension_for_pytorch as ipex

### import modules and model 
from transformers import AutoTokenizer, AutoModel
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

# Select XPU if available, otherwise fallback to CPU.
# Hard-coding "xpu" performs no availability check, so there was no actual
# fallback; query the XPU backend explicitly (it may be absent on some builds).
_xpu_backend = getattr(torch, "xpu", None)
device = torch.device(
    "xpu" if _xpu_backend is not None and _xpu_backend.is_available() else "cpu"
)
print("Using device:", device)

# Checkpoint name shared by the tokenizer and the encoder
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder, move it onto the selected device and put it in eval mode
model = AutoModel.from_pretrained(model_name)
model = model.to(device).eval()

# Load the DataFrame stored in df_clean.pkl
df = pd.read_pickle("df_clean.pkl")

# One fixed plot color per field, so every figure uses the same palette
color_mapping = dict([
    ("Physics", "Purple"),
    ("Computer Science", "orange"),
    ("Mathematics", "blue"),
    ("Statistics", "red"),
    ("Quantitative Biology", "green"),
    ("Quantitative Finance", "brown"),
    ("Other", "gray"),
])

  from .autonotebook import tqdm as notebook_tqdm


Using device: xpu


In [None]:
# Field -> plot color lookup.
# NOTE: this repeats the mapping already defined in the setup cell above;
# the two definitions have to be kept in sync if either one is edited.
color_mapping = dict(zip(
    ("Physics", "Computer Science", "Mathematics", "Statistics",
     "Quantitative Biology", "Quantitative Finance", "Other"),
    ("Purple", "orange", "blue", "red", "green", "brown", "gray"),
))

In [None]:
##########################
####### Selecting the Subset to be analysed

# Sampling parameters (tune here instead of editing the call below)
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42

# Draw a reproducible random subset and renumber its rows from 0.
# The size is capped at len(df): sampling without replacement raises a
# ValueError when more rows are requested than the frame contains.
# Disabled alternative (year filter): [(df["year"] >= 1990) & (df["year"] <= 2024)]
subset_df = (
    df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)


In [None]:
############################
###### create Word Embeddings in Latent Space


# Define a helper function to extract the [CLS] embedding for a given text
def get_cls_embedding(text, max_length=512):
    """Return the first-token ([CLS]) embedding of ``text`` as a NumPy array.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    text : str
        Text to embed. Input longer than ``max_length`` tokens is truncated.
    max_length : int, default 512
        Token limit handed to the tokenizer. The default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    numpy.ndarray
        Last-layer hidden state of the first token, moved to the CPU, with the
        leading batch dimension squeezed out (1-D for a single input string).
    """
    # Tokenize with truncation and request PyTorch tensors
    inputs = tokenizer(text, max_length=max_length, truncation=True, return_tensors="pt")
    # The model and its inputs must live on the same device
    inputs = {key: val.to(device) for key, val in inputs.items()}
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
    # First position of the sequence holds the [CLS] token; shape (1, hidden_size)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    # Drop the batch dimension and hand back a NumPy array
    return cls_embedding.squeeze(0).cpu().numpy()

# Scaling experiment, currently disabled:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
# embeddings = scaler.fit_transform(raw_embeddings)

# Embed each sampled abstract individually and stack the resulting vectors
# into a single array (one row per abstract)
embeddings = np.array(
    [get_cls_embedding(abstract) for abstract in subset_df["abstract"]]
)
print("Embeddings shape:", embeddings.shape)


In [None]:
##########################################
####### Run a PCA and estimate Explained Variance incl. Scree Plot

from sklearn.decomposition import PCA

# Fit a PCA that keeps every component on the abstract embeddings
pca = PCA()
pca.fit(embeddings)

# Explained variance per component in percent, plus 1-based component numbers
explained_variance = 100 * pca.explained_variance_ratio_
components = 1 + np.arange(explained_variance.size)

# Keep only the components that explain at least 1% of the variance
mask = explained_variance >= 1
filtered_components, filtered_explained_variance = (
    components[mask],
    explained_variance[mask],
)

# Scree plot: one bar per retained component, y axis fixed to 0-100 %
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(filtered_components, filtered_explained_variance, color='skyblue')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance (%)')
ax.set_title('Scree Plot (Components with ≥ 1% Explained Variance)')
ax.set_xticks(filtered_components)
ax.set_ylim(0, 100)
fig.tight_layout()
plt.show()

########## create cumulative scree plot
# Accumulate over ALL components first and only then apply the >= 1% mask, so
# each bar is the true running total up to that component. PCA orders its
# components by decreasing explained variance, so the mask is a leading prefix
# and the plotted values are unchanged; this form stays correct even if the
# filter ever selects a non-prefix set.
cumulative_variance_explained = np.cumsum(explained_variance)[mask]

# Plot cumulative variance (label typo and incomplete title fixed)
plt.figure(figsize=(10, 6))
plt.bar(filtered_components, cumulative_variance_explained, color='skyblue')
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Explained Variance (%)')
plt.title('Cumulative Scree Plot (Components with ≥ 1% Explained Variance)')
plt.xticks(filtered_components)
plt.ylim(0, 100)
plt.tight_layout()
plt.show()

In [None]:
############################
###### plot PCA with colors 

# Inclusive (first_year, last_year) ranges; one figure is drawn per range
year_bins = [
    (2000, 2001),
    (2002, 2003),
    (2004, 2005),
    (2023, 2024),
    (2025, 2025),
]

# Distinct field labels that occur in the sampled subset
unique_fields = subset_df["field"].unique()

# Coordinates of every abstract embedding in the fitted PCA space
pca_projection = pca.transform(embeddings)

for start, end in year_bins:
    # Rows whose publication year falls inside the inclusive [start, end] bin
    mask_years = (subset_df["year"] >= start) & (subset_df["year"] <= end)
    df_binned_years = subset_df[mask_years]
    print(f"Number of publications from {start} to {end}: {len(df_binned_years)}")

    # An empty bin would only produce a blank figure and a legend warning
    # about missing labelled artists, so skip it after reporting the count.
    if df_binned_years.empty:
        continue

    plt.figure(figsize=(12, 8))
    has_labelled_points = False

    # Plot each field with its specific color from the color mapping
    for field in unique_fields:
        # Fields without an assigned color are not drawn
        if field not in color_mapping:
            continue
        # Combine year range and field; convert to a plain boolean array so
        # the NumPy projection is indexed by position, not by pandas index.
        combined_mask = (mask_years & (subset_df["field"] == field)).to_numpy()
        if not combined_mask.any():
            continue
        plt.scatter(
            pca_projection[combined_mask, 0],  # PC1
            pca_projection[combined_mask, 1],  # PC2
            label=f"{field} ({start}-{end})",  # Add year range to label
            color=color_mapping[field],
            alpha=0.7,
            s=80  # Point size
        )
        has_labelled_points = True

    plt.title("PCA of SciBERT Embeddings", fontsize=15)
    plt.xlabel("Principal Component 1", fontsize=12)
    plt.ylabel("Principal Component 2", fontsize=12)
    # Only request a legend when at least one labelled scatter was drawn
    if has_labelled_points:
        plt.legend(fontsize=10)
    plt.grid(linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Report the share of variance captured by the two plotted components
pc1_variance, pc2_variance = explained_variance[0], explained_variance[1]
combined_variance = pc1_variance + pc2_variance
print(f"PC1 explains {pc1_variance:.2f}% of the variance")
print(f"PC2 explains {pc2_variance:.2f}% of the variance")
print(f"Together they explain {combined_variance:.2f}% of the variance")




In [None]:
#############################
### Create SciBERT embeddings of the official subcategory descriptions as a reference

# create dataframe from pickle file
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_cats_map = pd.read_pickle("df_cats_map.pkl")

# Marker text that identifies a long description as a mere placeholder
PLACEHOLDER_TEXT = "Description coming soon"

embedding_sources = []  # Only used to verify the long/short selection, could be removed

# Generate embeddings for each category, using long_description if available,
# otherwise short_description
embeddings_cat = []
for long_description, short_description in zip(
    df_cats_map["long_description"], df_cats_map["short_description"]
):
    # A long description counts as usable only if it is a non-blank string
    # without the placeholder text. The isinstance check keeps a missing
    # value (e.g. NaN) from raising a TypeError in the substring test.
    has_long_description = (
        isinstance(long_description, str)
        and long_description.strip() != ""
        and PLACEHOLDER_TEXT not in long_description
    )
    if has_long_description:
        embedding_sources.append("long")
        embeddings_cat.append(get_cls_embedding(long_description))
    else:
        embedding_sources.append("short")
        embeddings_cat.append(get_cls_embedding(short_description))

embeddings_cat = np.array(embeddings_cat)
print("Embeddings shape:", embeddings_cat.shape)

# Tally how often the long vs. the short description was embedded
source_counts = pd.Series(embedding_sources).value_counts()
print("Count of descriptions used:", source_counts, sep="\n")

In [None]:
##########################
#### plotting embeddings_cat in the same PCA and same way 

# Project the category embeddings with the PCA fitted on the abstracts
pca_projection_cats = pca.transform(embeddings_cat)

# Create a figure for plotting category embeddings
plt.figure(figsize=(12, 8))
has_labelled_points = False

# Iterate over the fields that occur in the category table itself. Looping
# over the fields of the abstract sample instead would silently drop every
# category whose field happens to be absent from that sample.
for field in df_cats_map["field"].dropna().unique():
    # Fields without an assigned color are not drawn
    if field not in color_mapping:
        continue
    # Plain boolean array so the NumPy projection is indexed by position
    mask_cats = (df_cats_map["field"] == field).to_numpy()
    plt.scatter(
        pca_projection_cats[mask_cats, 0],  # PC1
        pca_projection_cats[mask_cats, 1],  # PC2
        label=field,  # Simply use the field name
        color=color_mapping[field],
        alpha=0.7,
        s=80  # Point size
    )
    has_labelled_points = True

plt.title("PCA of SciBERT Category Embeddings", fontsize=15)
plt.xlabel("Principal Component 1", fontsize=12)
plt.ylabel("Principal Component 2", fontsize=12)
# Only request a legend when at least one labelled scatter was drawn
if has_labelled_points:
    plt.legend(fontsize=10)
plt.grid(linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
############################
### MDS Attempt
# Disabled experiment: the whole cell body below is one triple-quoted string
# literal, so none of it executes.
# If it is ever re-enabled, note that it reassigns the notebook-level names
# `unique_fields` and `mask`, and that `Axes3D` and `manifold` are imported
# but not referenced anywhere in the snippet.
'''
from sklearn.manifold import MDS
from mpl_toolkits.mplot3d import Axes3D
from sklearn import manifold

mds = MDS (n_components=2, random_state=0)
mds_projection = mds.fit_transform(embeddings)

# Plot the MDS projection
plt.figure(figsize=(12, 8))

# Get unique fields for the plot
unique_fields = subset_df["field"].unique()

# Plot each field with its specific color from the color mapping
for field in unique_fields:
    if field in color_mapping:  # Check if the field is in our color mapping
        mask = subset_df["field"] == field
        plt.scatter(
            mds_projection[mask, 0],  # MDS dimension 1
            mds_projection[mask, 1],  # MDS dimension 2
            label=field,
            color=color_mapping[field],
            alpha=0.7,
            s=80  # Point size
        )

plt.title("MDS of SciBERT Embeddings", fontsize=15)
plt.xlabel("MDS Dimension 1", fontsize=12)
plt.ylabel("MDS Dimension 2", fontsize=12)
plt.legend(fontsize=10)
plt.grid(linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()'''

In [None]:
####################################
#### creating 3D PCA Plot

# Disabled experiment: the cell body below is a triple-quoted string literal
# and does not execute.
# The snippet fits its own 3-component PCA (`pca_3d`) instead of reusing `pca`,
# reassigns `unique_fields`, and colors fields from a "tab10" colormap rather
# than from `color_mapping`.
# NOTE(review): `plt.cm.get_cmap` may be deprecated or removed in newer
# Matplotlib releases; confirm against the installed version before re-enabling.
'''
from mpl_toolkits.mplot3d import Axes3D  # Required for 3D plotting
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Assume embeddings is your NumPy array of shape (num_samples, 768)
pca_3d = PCA(n_components=3)
embeddings_3d = pca_3d.fit_transform(embeddings)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Example: color-code by field if you have that column in your DataFrame
# Create a color mapping (this is just an example; adjust as needed)
unique_fields = subset_df["field"].unique()
cmap = plt.cm.get_cmap("tab10", len(unique_fields))
field_to_color = {field: cmap(i) for i, field in enumerate(unique_fields)}

for field in unique_fields:
    idx = subset_df["field"] == field
    ax.scatter(
        embeddings_3d[idx, 0],
        embeddings_3d[idx, 1],
        embeddings_3d[idx, 2],
        label=field,
        color=field_to_color[field],
        alpha=0.7,
        s=50
    )

ax.set_title("3D PCA of SciBERT Abstract Embeddings")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.legend()
plt.show()'''


In [None]:
####################################
#### creating 3D interactive PCA Plot

# Disabled experiment: the cell body below is a triple-quoted string literal
# and does not execute.
# The snippet uses `PCA` without importing it, so it depends on the import made
# in the earlier PCA cell; `nbformat` is imported but not referenced in the
# snippet. It would (re)assign `pca_3d` and `embeddings_3d`.
'''
import plotly.express as px
import nbformat 

# Suppose 'embeddings' is your NumPy array of SciBERT embeddings (num_samples x 768)
# Run PCA to reduce to 3 dimensions:
pca_3d = PCA(n_components=3)
embeddings_3d = pca_3d.fit_transform(embeddings)

# Create a DataFrame with the PCA components and your grouping variable ('field')
df_pca = pd.DataFrame(embeddings_3d, columns=["PC1", "PC2", "PC3"])
df_pca["field"] = subset_df["field"].values  # Ensure the order matches your embeddings

# Create the interactive 3D scatter plot
fig = px.scatter_3d(
    df_pca, x="PC1", y="PC2", z="PC3",
    color="field",                # Color-code by field
    title="Interactive 3D PCA of SciBERT Abstract Embeddings"
)

fig.show()
'''

In [None]:
##########################################
##### Deciding whether to use the "abstract" or the "title" column to create the embedding vectors
##### -> Question to be answered: do the abstracts exceed the maximum input length of 512 tokens?

# Disabled analysis: the cell body below is a triple-quoted string literal and
# does not execute; it is kept as a record of the token-length check and the
# result noted at its end.
# The snippet operates on `df_csLG`, which is not defined in any cell visible
# in this notebook, so it cannot be re-enabled as-is.
'''
# Computing the tokenized length of each abstract.
df_csLG["abstract_token_length"] = df_csLG["abstract"].apply(
    lambda x: len(tokenizer.encode(x, add_special_tokens=True))
)

# Checking the maximum, mean, and distribution.
max_length = df_csLG["abstract_token_length"].max()
mean_length = df_csLG["abstract_token_length"].mean()
description = df_csLG["abstract_token_length"].describe()

print("Max token length:", max_length)
print("Mean token length:", mean_length)
print(description)

#### Estimating number of rows above 512
# Filter rows where token length exceeds 512 from cs.LG abstracts
df_long_abstracts = df_csLG[df_csLG["abstract_token_length"] > 512]

# How many such rows?
count_long = len(df_long_abstracts)

print(f"Number of abstracts above 512 tokens: {count_long}")


##############      Console Output: Number of abstracts above 512 tokens: 61   (i.e. 61/100000)
##############      -> Therefore decision to use Abstract as opposed to titles 

'''