In [12]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.cluster import HDBSCAN, KMeans
import time
from tqdm import tqdm
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import calinski_harabasz_score,silhouette_score
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.decomposition import PCA
from umap import UMAP
import optuna
import ast
from collections import Counter
import xgboost as xgb
from algorithms import *

In [None]:
questions = pd.read_pickle(os.path.join(Path.cwd().parent,'data','stackexchange_embeddings.pkl'))
tags = pd.read_parquet(os.path.join(Path.cwd().parent,'data', "tag_embeddings.parquet"))
original_data = pd.read_csv(os.path.join(Path.cwd().parent,'data', 'stackexchange_dataset.csv'))
tags.embedding.values
t_stacked = np.stack(tags.embedding.values)
original_data.index = original_data.question_id
data = original_data.drop(columns=['question_id'])
data = original_data[~original_data.index.duplicated()]
data.loc[:,('tags')] = data.loc[:,('tags')].apply(ast.literal_eval)

In [10]:
data.drop(columns=['question_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['question_id'], inplace=True)


In [13]:
tag_counter = Counter(tag for tags in data['tags'] for tag in tags)
tag_freq = pd.Series(tag_counter).sort_values(ascending=False)

In [15]:
threshold = 100
frequent_tags = tag_freq[tag_freq >= threshold].index.tolist()
infrequent_tags = tag_freq[tag_freq < threshold].index.tolist()

In [20]:
data2 = data.copy()
data2['tags'] = data2['tags'].apply(lambda tags: [tag for tag in tags if tag in frequent_tags])

In [57]:
tags2 = tags[tags.index.isin(frequent_tags)]
tags2.shape

(411, 1)

In [58]:
t_stacked = np.stack(tags2.embedding.values)

In [None]:
tree_builder = DirectHierarchicalLabelTree(
    tag_embeddings=t_stacked,
    tag_names=tags2.index,
    branching_factors=[100, 100],
)

In [60]:
tree_builder.build_tree()

{'root': {'is_leaf': False,
  'n_clusters': 100,
  'centroids': array([[ 0.01263871, -0.00276341,  0.00317389, ...,  0.00679785,
           0.0017936 ,  0.02754182],
         [ 0.0276288 ,  0.00284742, -0.00936931, ..., -0.00206651,
           0.00130427,  0.00765856],
         [ 0.02843124,  0.00492957, -0.00991798, ..., -0.01031823,
          -0.00032595,  0.01016234],
         ...,
         [ 0.00044189, -0.00094863, -0.04499175, ..., -0.00432266,
          -0.00438768, -0.00504249],
         [ 0.02803127,  0.02599724,  0.00838054, ..., -0.00938909,
           0.00142133,  0.00442287],
         [ 0.01375601, -0.0133375 , -0.003146  , ..., -0.00540397,
          -0.00967053, -0.00736161]], shape=(100, 4096), dtype=float32),
  'children': ['root_c0',
   'root_c1',
   'root_c2',
   'root_c3',
   'root_c4',
   'root_c5',
   'root_c6',
   'root_c7',
   'root_c8',
   'root_c9',
   'root_c10',
   'root_c11',
   'root_c12',
   'root_c13',
   'root_c14',
   'root_c15',
   'root_c16',
   'roo

In [61]:
tree_builder.visualize_tree_statistics()

Total tags: 411
Level 0: 1 clusters
Level 1: 100 clusters
Level 2: 167 clusters


In [None]:
import numpy as np
import re
from collections import Counter

def find_representative_tags2(
    tree_builder,
    tags,
    n_centroids=100,
    similarity_metric="euclidean",
):
    if similarity_metric not in ["euclidean", "cosine"]:
        raise ValueError(f"Unsupported similarity metric: {similarity_metric}")

    root_centroids = np.asarray(tree_builder.tree['root']['centroids'])
    
    # Handle case where actual clusters might be fewer than requested n_centroids
    actual_n_centroids = root_centroids.shape[0]
    
    cluster_means = {}
    for i in range(actual_n_centroids):
        cluster_means[i] = root_centroids[i]

    all_embeddings = tree_builder.norm_embeddings 
    N = len(tags)
    D = all_embeddings.shape[1]

    cluster_idxs = np.full(N, -1, dtype=int)
    pattern_root_child = re.compile(r"root_c(\d+)")

    for i in range(N):
        # tag_to_path gives the leaf path, e.g., "root_c5_c2" or "root_c0"
        tag_path = tree_builder.tag_to_path.get(i, "")
        
        # We extract the first number after root_c to identify the Level 1 cluster
        m = pattern_root_child.search(tag_path)
        if m:
            cluster_idxs[i] = int(m.group(1))

    # Initialize result dictionary
    h1_t = {idx: None for idx in range(n_centroids)}

    # Loop through clusters to find representatives
    for idx in range(n_centroids):
        if idx not in cluster_means:
            continue

        mean = cluster_means[idx].reshape(1, D)
        
        mask = cluster_idxs == idx
        if not mask.any():
            continue
            
        true_indices = np.nonzero(mask)[0]
        candidate_embs = all_embeddings[mask]  # (m, D)

        if similarity_metric == "euclidean":
            diffs = candidate_embs - mean
            dists = np.linalg.norm(diffs, axis=1)
            argmin = int(np.argmin(dists))
            closest_dist = float(dists[argmin])
        else:  # cosine
            mean_norm = mean / (np.linalg.norm(mean) + 1e-10)
            cand_norm = candidate_embs / (np.linalg.norm(candidate_embs, axis=1, keepdims=True) + 1e-10)
            
            cosine_sims = np.dot(cand_norm, mean_norm.T).squeeze()
            if cosine_sims.ndim == 0: cosine_sims = np.array([cosine_sims]) # single item case
            
            argmin = int(np.argmax(cosine_sims))
            closest_dist = float(1.0 - cosine_sims[argmin])

        chosen_tag_idx = true_indices[argmin]
        closest_tag_name = tags[chosen_tag_idx]
        closest_tag_emb = all_embeddings[chosen_tag_idx]

        cluster_tag_names = [tags[i] for i in true_indices]

        top_popular_tags = []
        tag_counter = Counter(cluster_tag_names)
        for tag, _ in tag_counter.most_common(5):
            top_popular_tags.append(tag)

        h1_t[idx] = {
            "closest_tag": closest_tag_name,
            "closest_dist": closest_dist,
            "cluster_mean": cluster_means[idx],
            "tag_embedding": closest_tag_emb,
            "cluster_names": cluster_tag_names,
            "cluster_size": len(cluster_tag_names),
            "top_5_popular_tags": top_popular_tags,
        }

    return h1_t

In [77]:
h1_t = find_representative_tags2(tree_builder, tags2.index,similarity_metric="cosine")

In [80]:
centroid_tags = pd.DataFrame.from_dict(h1_t, orient='index')
centroid_tags.index = centroid_tags['closest_tag'].values
centroid_tags.drop(columns=['closest_tag'], inplace=True)

In [87]:
centroid_tags.iloc[centroid_tags['cluster_size'].argmax()]

closest_dist                                                    0.07937
cluster_mean          [0.018479984, -0.0058807596, -0.0067218333, -0...
tag_embedding         [0.02201923, 0.010853411, -0.012791802, -0.035...
cluster_names         [database, mongodb, sql, elasticsearch, fireba...
cluster_size                                                         15
top_5_popular_tags    [database, mongodb, sql, elasticsearch, firebase]
Name: database, dtype: object

In [91]:
centroid_tags['cluster_mean'].head(1)

selenium    [0.012638707, -0.002763408, 0.0031738898, -0.0...
Name: cluster_mean, dtype: object

In [None]:
# centroid_tags.to_parquet(os.path.join(Path.cwd().parent,'data', 'centroid-popular-tags.parquet'))

In [98]:
def closest_frequent_tag(tags, name_of_interest, infrequent_tags):
    infrequent_set = set(infrequent_tags)
    candidates = tags[~tags.index.isin(infrequent_set)]
    if name_of_interest in candidates.index:
        candidates = candidates.drop(name_of_interest)
    if candidates.empty:
        return '', 0
    candidate_matrix = np.stack(candidates.embedding.values)
    target_vector = tags.loc[name_of_interest].embedding.reshape(1, -1)
    similarities = cosine_similarity(target_vector, candidate_matrix).flatten()
    best_idx = np.argmax(similarities)
    
    return candidates.index[best_idx], similarities[best_idx]

def closest_centroid_tag(tags, centroid_tags, name_of_interest):
    if name_of_interest in centroid_tags.index:
        return name_of_interest, 1.0
    candidate_matrix = np.stack(centroid_tags['cluster_mean'].values)
    target_vector = tags.loc[name_of_interest].embedding.reshape(1, -1)
    similarities = cosine_similarity(target_vector, candidate_matrix).flatten()
    best_idx = np.argmax(similarities)
    
    return centroid_tags.index[best_idx], similarities[best_idx]

In [100]:
closest_centroid_tag(tags,centroid_tags, 'ntp')

('datetime', np.float64(0.8098148848579392))

In [103]:
centroid_tags.head(2)

Unnamed: 0,closest_dist,cluster_mean,tag_embedding,cluster_names,cluster_size,top_5_popular_tags
selenium,0.048786,"[0.012638707, -0.002763408, 0.0031738898, -0.0...","[0.013658555, -0.001592277, 0.0061122323, -0.0...","[selenium, firefox, selenium-webdriver, seleni...",4,"[selenium, firefox, selenium-webdriver, seleni..."
android,0.075348,"[0.027628796, 0.0028474198, -0.009369309, -0.0...","[0.021739224, 0.013535966, -0.013542618, -0.00...","[flutter, java, android, android-studio, mobil...",7,"[flutter, java, android, android-studio, mobile]"


In [115]:
centroid_tags.shape

(100, 6)

---

In [107]:
questions = pd.read_pickle(os.path.join(Path.cwd().parent,'data','stackexchange_embeddings.pkl'))

In [104]:
exploded_series = centroid_tags['cluster_names'].explode()
tag_to_class_map = pd.Series(exploded_series.index, index=exploded_series.values).to_dict()

In [109]:
questions['tags'] = data.loc[data.index,'tags']
questions.head(2)

Unnamed: 0_level_0,title_embedding,question_text_embedding,tags
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
79802517,"[0.008553513, -0.009437113, 0.009673767, -0.02...","[0.0013518566, -0.015674047, -0.004076924, -0....","[c#, entity-framework]"
79802934,"[0.019479005, 0.007850029, -0.020600174, -0.02...","[0.009473968, -0.014583107, -0.019139914, 0.00...",[ntp]


In [111]:
questions['tags'] = questions['tags'].apply(
    lambda tag_list: [closest_centroid_tag(tags, centroid_tags, tag)[0] for tag in tag_list]
)

In [112]:
questions.head(2)

Unnamed: 0_level_0,title_embedding,question_text_embedding,tags
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
79802517,"[0.008553513, -0.009437113, 0.009673767, -0.02...","[0.0013518566, -0.015674047, -0.004076924, -0....","[.net, entity-framework]"
79802934,"[0.019479005, 0.007850029, -0.020600174, -0.02...","[0.009473968, -0.014583107, -0.019139914, 0.00...",[datetime]


In [114]:
centroid_tags.loc['entity-framework']

closest_dist                                                   0.054664
cluster_mean          [0.01717913, -0.017281244, 0.0089047905, -0.02...
tag_embedding         [0.013604954, -0.017427877, 0.008887872, -0.02...
cluster_names           [entity-framework, entity-framework-core, linq]
cluster_size                                                          3
top_5_popular_tags      [entity-framework, entity-framework-core, linq]
Name: entity-framework, dtype: object

In [None]:
# questions.to_pickle(os.path.join(Path.cwd().parent,'data','stackexchange_reduced_tags_embeddings.pkl'))