# Simple case

In [1]:
import numpy as np
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity

# Generate random feature vectors
np.random.seed(42)
A = np.random.rand(3, 5)  # 3 documents, 5 features
B = np.random.rand(2, 5)  # 2 keywords, 5 features

# Compute cosine similarity (3×2 matrix)
sim_matrix = cosine_similarity(A, B)

# Convert matrix to a DataFrame for Plotly
import pandas as pd
df = pd.DataFrame(sim_matrix, 
                  index=[f"Doc {i+1}" for i in range(A.shape[0])], 
                  columns=[f"Keyword {i+1}" for i in range(B.shape[0])])

# Create heatmap with Plotly
fig = px.imshow(df, text_auto=".2f", color_continuous_scale="Blues",
                labels={"x": "Keywords", "y": "Documents", "color": "Similarity"})

# Show the plot
fig.show()


## Real case 


In [3]:

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic

# Load or train two different BERTopic models
model_1 = BERTopic.load("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\model_1")  # First topic model  # Load first model
model_2 = BERTopic.load("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\model_2")  # Load second model

# Get topic info and filter out outlier topics (-1)
topic_info_1 = model_1.get_topic_info().query('Topic != -1')
topic_info_2 = model_2.get_topic_info().query('Topic != -1')

# Extract topic embeddings for valid topics (excluding outlier -1)
embeddings_1 = model_1.topic_embeddings_[topic_info_1['Topic'].values]
embeddings_2 = model_2.topic_embeddings_[topic_info_2['Topic'].values]

# Extract topic names
topic_names_1 = topic_info_1['Name'].values
topic_names_2 = topic_info_2['Name'].values

# Check shapes
print(f"Model 1 - Number of Topics: {len(topic_names_1)}, Shape of Embeddings: {embeddings_1.shape}")
print(f"Model 2 - Number of Topics: {len(topic_names_2)}, Shape of Embeddings: {embeddings_2.shape}")

# Ensure the shapes match before computing cosine similarity
if len(topic_names_1) != embeddings_1.shape[0] or len(topic_names_2) != embeddings_2.shape[0]:
    raise ValueError("Mismatch between number of topics and embeddings.")

# Compute cosine similarity matrix (topics from model_1 vs. model_2)
similarity_matrix = cosine_similarity(embeddings_1, embeddings_2)

# Convert to a DataFrame for visualization
df = pd.DataFrame(similarity_matrix, 
                  index=topic_names_1,  # Use topic names from model 1
                  columns=topic_names_2)  # Use topic names from model 2

# Plot heatmap using Plotly
fig = px.imshow(df, text_auto=".2f", color_continuous_scale="Blues",
                labels={"x": "Biology Topics", "y": "Philosophy Topics", "color": "Cosine Similarity"},
                title="Topic Similarity Between Two BERTopic Models")

# Show plot
fig.show()
fig.show(renderer='browser')

Model 1 - Number of Topics: 33, Shape of Embeddings: (33, 384)
Model 2 - Number of Topics: 14, Shape of Embeddings: (14, 384)


In [4]:

model_2 = BERTopic.load("model_2")  # Second topic model

# Extract topic embeddings (ignoring the outlier topic -1)
embeddings_1 = np.array([emb for i, emb in enumerate(topic_embeddings_) if i != -1])
embeddings_2 = np.array([emb for i, emb in enumerate(model_2.topic_embeddings_) if i != -1])

# Compute cosine similarity matrix (topics from model_1 vs. model_2)
similarity_matrix = cosine_similarity(embeddings_1, embeddings_2)

# Convert to a DataFrame for visualization
import pandas as pd
df = pd.DataFrame(similarity_matrix, 
                  index=[f"Model1_Topic {i}" for i in range(len(embeddings_1))], 
                  columns=[f"Model2_Topic {i}" for i in range(len(embeddings_2))])

# Plot heatmap using Plotly
fig = px.imshow(df, text_auto=".2f", color_continuous_scale="Blues",
                labels={"x": "Model 2 Topics", "y": "Model 1 Topics", "color": "Cosine Similarity"})

# Show plot
fig.show()

ValueError: Make sure to either pass a valid directory or HF model.