In [3]:
import hdbscan
import os
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Define directory paths
main_directory = os.path.join('/', *os.getcwd().split('/')[:-1])
data_folder = os.path.join(main_directory, 'data')

# Load dataset
df = pd.read_csv(f"{data_folder}/ai_research_papers_balanced_10_years.csv")

#  /Users/jyotbuch/Desktop/Emerging_Research_Trends_Clustering/data/ai_research_papers_balanced_10_years_embeddings.npz
# Load embeddings
embeddings = np.load(f"{data_folder}/ai_research_papers_balanced_10_years_embeddings.npz")

# Normalize the embeddings
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)

# Apply HDBSCAN clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
df["cluster"] = clusterer.fit_predict(embeddings_scaled)

# Reduce embeddings to 2D using UMAP
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embeddings_2d = umap_model.fit_transform(embeddings_scaled)

# Add 2D projections to DataFrame
df["x"] = embeddings_2d[:, 0]
df["y"] = embeddings_2d[:, 1]

# Plot Clusters
plt.figure(figsize=(10, 6))
scatter = plt.scatter(df["x"], df["y"], c=df["cluster"], cmap="Spectral", alpha=0.7)
plt.colorbar(scatter, label="Cluster Label")
plt.title("HDBSCAN Clustering of AI Research Papers (UMAP Projection)")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

print(f"Clustering complete! Found {df['cluster'].nunique()} research topics.")

  from .autonotebook import tqdm as notebook_tqdm


ValueError: Length of values (500) does not match length of index (1230)

In [4]:
print(f"Dataset size: {df.shape[0]} rows")
print(f"Embeddings size: {embeddings.shape[0]} rows")

Dataset size: 1230 rows
Embeddings size: 500 rows
