In [137]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.cluster import HDBSCAN, KMeans
import time
from tqdm import tqdm
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import calinski_harabasz_score,silhouette_score
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.decomposition import PCA
from umap import UMAP
import optuna

In [4]:
# Both embedding files live in the `data` directory next to the notebook's folder.
data_dir = Path.cwd().parent / 'data'
questions = np.load(data_dir / "question_embeddings.npy")
tags = pd.read_parquet(data_dir / "tag_embeddings.parquet")

In [5]:
tags

Unnamed: 0_level_0,embedding
tag,Unnamed: 1_level_1
c#,"[0.015589412, 0.0028190461, -0.02159284, -0.00..."
entity-framework,"[0.013604954, -0.017427877, 0.008887872, -0.02..."
ntp,"[0.0025991949, 0.020412365, -0.025822664, -0.0..."
python,"[0.0044617336, 0.017571222, -0.028520588, -0.0..."
pandas,"[0.013557544, 0.023968168, -0.04845796, -0.009..."
...,...
gocardless,"[0.031818543, 0.0014768182, 0.0064690523, -0.0..."
daxstudio,"[0.01692661, -0.008786624, -0.008296905, -0.02..."
woocommerce-email,"[0.032131393, -0.0161308, 0.029701, 0.01068102..."
automapper-6,"[0.01038144, 0.00035326037, 0.0029597834, -0.0..."


In [None]:
class DualEncoderMatcher:
    """Rank tags for questions by cosine similarity of pre-computed embeddings.

    Question and tag embeddings are L2-normalised once at construction, so the
    dot product of a question row with a tag row is their cosine similarity.
    """

    def __init__(self, question_embeddings, tag_embeddings, tag_names):
        """
        Args:
            question_embeddings: 2-D array-like, one row per question.
            tag_embeddings: 2-D array-like, one row per tag.
            tag_names: indexable collection of tag labels; position ``i``
                names row ``i`` of ``tag_embeddings``.
        """
        self.question_embeddings = question_embeddings
        self.tag_embeddings = tag_embeddings
        self.tag_names = tag_names

        self.question_embeddings_norm = self._normalize(question_embeddings)
        self.tag_embeddings_norm = self._normalize(tag_embeddings)

    def _normalize(self, embeddings):
        """L2 normalize embeddings (row-wise); accepts any 2-D array-like."""
        # Explicit conversion so a list of vectors is handled the same way as
        # an ndarray instead of relying on reflected-operator coercion.
        embeddings = np.asarray(embeddings)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        # The epsilon keeps all-zero rows from dividing by zero.
        return embeddings / (norms + 1e-8)

    @staticmethod
    def _validate_k(k):
        """Reject negative ``k``; a negative value would silently mis-slice."""
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k}")
        return k

    def predict_top_k(self, question_idx, k=5):
        """Return the ``k`` most similar tags for one question.

        Args:
            question_idx: row index into the question embeddings.
            k: number of tags to return; ``0`` yields an empty list and values
                larger than the number of tags return every tag.

        Returns:
            List of ``(tag_name, similarity)`` pairs, best match first.

        Raises:
            ValueError: if ``k`` is negative.
        """
        k = self._validate_k(k)
        similarities = self.question_embeddings_norm[question_idx] @ self.tag_embeddings_norm.T

        # Reverse first, then truncate: unlike ``[-k:]`` this is empty for k == 0.
        top_k_indices = np.argsort(similarities)[::-1][:k]
        top_k_tags = [self.tag_names[i] for i in top_k_indices]
        top_k_scores = similarities[top_k_indices]

        return list(zip(top_k_tags, top_k_scores))

    def batch_predict(self, question_indices, k=5):
        """Return the ``k`` most similar tags for each of several questions.

        Args:
            question_indices: sequence of row indices into the question
                embeddings (e.g. a list or ``range``).
            k: number of tags per question; see ``predict_top_k``.

        Returns:
            One list of ``(tag_name, similarity)`` pairs per requested
            question, in the order the indices were given.

        Raises:
            ValueError: if ``k`` is negative.
        """
        k = self._validate_k(k)
        similarities = self.question_embeddings_norm[question_indices] @ self.tag_embeddings_norm.T

        top_k_indices = np.argsort(similarities, axis=1)[:, ::-1][:, :k]

        predictions = []
        # Walk the similarity rows directly so the result always has exactly
        # one entry per computed row.
        for row_scores, row_top in zip(similarities, top_k_indices):
            top_tags = [self.tag_names[idx] for idx in row_top]
            top_scores = row_scores[row_top]
            predictions.append(list(zip(top_tags, top_scores)))

        return predictions


In [22]:
questions[:10_000]

array([[ 0.00135186, -0.01567405, -0.00407692, ..., -0.00051256,
        -0.01407612,  0.03170738],
       [ 0.00947397, -0.01458311, -0.01913991, ...,  0.01237694,
         0.0026381 ,  0.00367426],
       [-0.00196639, -0.02462533, -0.02752738, ..., -0.02915011,
        -0.01131252,  0.00531276],
       ...,
       [ 0.00475327, -0.03546025, -0.02484306, ..., -0.00861502,
        -0.05014591,  0.0091939 ],
       [-0.01815439, -0.00208452, -0.01850394, ..., -0.00084887,
         0.00824582,  0.00753479],
       [ 0.03789256, -0.01389924, -0.02730731, ..., -0.01784343,
        -0.00101591, -0.00454621]], shape=(10000, 4096))

In [34]:
# Stack the per-tag vectors held in the `embedding` column into one 2-D matrix
# (one row per tag) so it can be used directly for linear algebra.
t_stacked = np.stack(tags.embedding.values)
t_stacked.shape

(22753, 4096)

In [35]:
np.linalg.norm(t_stacked, axis=1, keepdims=True)

array([[0.99999972],
       [1.00000007],
       [0.99999997],
       ...,
       [0.99999902],
       [1.00000045],
       [1.00000032]], shape=(22753, 1))

In [26]:
questions[:10_000].shape

(10000, 4096)

In [24]:
np.linalg.norm(questions[:10_000], axis=1, keepdims=True)


array([[1.00000024],
       [0.99999981],
       [1.00000021],
       ...,
       [1.00000067],
       [1.00000047],
       [0.99999971]], shape=(10000, 1))

In [36]:
matcher = DualEncoderMatcher(questions[:10_000], t_stacked, tags.index)

In [37]:
predictions = matcher.predict_top_k(question_idx=0, k=5)
print(f"Top 5 predictions: {predictions}")


Top 5 predictions: [('entity-framework', np.float64(0.5814004526467879)), ('linq-to-entities', np.float64(0.5646984342057525)), ('entity-framework-6', np.float64(0.5354078775941149)), ('entity-framework-5', np.float64(0.5260456250104916)), ('entity-framework-4.1', np.float64(0.5250464008985329))]


In [38]:

batch_predictions = matcher.batch_predict(
    question_indices=range(1000),  # First 1000 questions
    k=5
)

In [40]:
batch_predictions[0]

[('entity-framework', np.float64(0.5814004526467882)),
 ('linq-to-entities', np.float64(0.5646984342057526)),
 ('entity-framework-6', np.float64(0.535407877594115)),
 ('entity-framework-5', np.float64(0.5260456250104918)),
 ('entity-framework-4.1', np.float64(0.5250464008985329))]

In [41]:
original_data = pd.read_csv(os.path.join(Path.cwd().parent,'data', 'stackexchange_dataset.csv'))

In [46]:
original_data.iloc[0]['tags']

"['c#', 'entity-framework']"

In [44]:
original_data.iloc[0]['question_text']

'I am looking for a better way to use the .Include clause of Entity Framework. I want to avoid duplicate code. I have a lot of classes, and every class has a method which looks something like this: query = (from n in currentDBContext.FBBuchungenCollection .Include(x => x.BelegHerkunft) .Include(x => x.Buchungsordner) .Include(x => x.Buchungsperiode).ThenInclude(x => x.Geschaeftsjahr) .Include(x => x.BuchungsUser) .Include(x => x.Erfassungsart) .Include(x => x.ErstellUser) .Include(x => x.Mandant).ThenInclude(x => x.HauptAdresse) .Include(x => x.StornoUser) .Include(x => x.Teilbuchungen).ThenInclude(x => x.FremdWaehrung) .Include(x => x.Teilbuchungen).ThenInclude(x => x.KKArt) .Include(x => x.Teilbuchungen).ThenInclude(x => x.KKKonto) .Include(x => x.Teilbuchungen).ThenInclude(x => x.Konto) .Include(x => x.Teilbuchungen).ThenInclude(x => x.KoReVerteilung).ThenInclude(x => x.Periodenverteilungen).ThenInclude(x => x.Kontierungen).ThenInclude(x => x.Kontierungsangaben).ThenInclude(x => x.K

In [None]:
class DirectHierarchicalLabelTree:
    """Hierarchical grouping of tags built by recursive k-means on their embeddings.

    Nodes live in the flat dict ``self.tree`` keyed by path ids: ``'root'``,
    then ``'<parent>_c<cluster_id>'`` for every child. Internal nodes hold
    ``is_leaf=False``, ``n_clusters``, ``centroids`` and ``children``; leaf
    nodes hold ``is_leaf=True``, ``tags``, ``tag_indices`` and ``centroid``.
    ``self.tag_to_path`` maps each tag's row index to the id of its leaf node.
    """

    def __init__(self, tag_embeddings, tag_names, branching_factors=(100, 100),
                 min_leaf_size=5, random_state=42):
        """
        Args:
            tag_embeddings: 2-D array-like, one row per tag.
            tag_names: tag labels aligned with the rows of ``tag_embeddings``
                (list, ``pd.Index``, ...).
            branching_factors: hierarchy structure [level1_clusters, level2_clusters]
            min_leaf_size: a node with this many tags or fewer is stored as a
                leaf instead of being clustered further.
            random_state: seed passed to every ``KMeans`` fit.
        """
        self.tag_embeddings = tag_embeddings
        self.tag_names = tag_names
        # Copy into a fresh list: never alias the (shared) default or the
        # caller's own sequence.
        self.branching_factors = list(branching_factors)
        self.min_leaf_size = min_leaf_size
        self.random_state = random_state
        self.tree = {}
        self.tag_to_path = {}
        # name -> row index, built lazily on the first get_cluster_path call
        self._tag_index = None

    def build_tree(self):
        """Cluster all tags from the root down and return the node dict.

        Any previously built tree is discarded first so that a rebuild cannot
        leave stale nodes behind.
        """
        self.tree = {}
        self.tag_to_path = {}
        self._recursive_cluster(
            embeddings=np.asarray(self.tag_embeddings),
            tag_indices=np.arange(len(self.tag_names)),
            level=0,
            parent_id='root'
        )
        return self.tree

    def _recursive_cluster(self, embeddings, tag_indices, level, parent_id):
        """Fill in node ``parent_id`` and recurse into its clusters.

        Args:
            embeddings: rows of the tag embeddings belonging to this node.
            tag_indices: original row indices of those embeddings.
            level: depth of this node (0 for the root).
            parent_id: id under which this node is stored in ``self.tree``.
        """
        # Base case: reached max depth or too few tags
        if level >= len(self.branching_factors) or len(tag_indices) <= self.min_leaf_size:
            # Store leaf tags
            leaf_tags = [self.tag_names[i] for i in tag_indices]
            self.tree[parent_id] = {
                'is_leaf': True,
                'tags': leaf_tags,
                'tag_indices': tag_indices,
                'centroid': embeddings.mean(axis=0)
            }
            # Record paths for all tags in this leaf
            for idx in tag_indices:
                self.tag_to_path[idx] = parent_id
            return

        # Apply k-means at current level; never ask for more clusters than points
        n_clusters = min(self.branching_factors[level], len(tag_indices))

        kmeans = KMeans(
            n_clusters=n_clusters,
            init='k-means++',
            n_init=20,
            max_iter=500,
            random_state=self.random_state
        )

        cluster_labels = kmeans.fit_predict(embeddings)

        # Store node information. Row i of 'centroids' belongs to the child
        # named '<parent_id>_c<i>'.
        self.tree[parent_id] = {
            'is_leaf': False,
            'n_clusters': n_clusters,
            'centroids': kmeans.cluster_centers_,
            'children': []
        }

        # Recursively process each cluster
        for cluster_id in range(n_clusters):
            mask = cluster_labels == cluster_id
            if not mask.any():
                # No point was assigned to this label: skip it rather than
                # create a leaf with no tags and a NaN centroid.
                continue

            child_id = f"{parent_id}_c{cluster_id}"
            self.tree[parent_id]['children'].append(child_id)

            self._recursive_cluster(
                embeddings[mask],
                tag_indices[mask],
                level + 1,
                child_id
            )

    def get_cluster_path(self, tag_name):
        """Return the leaf node id holding ``tag_name``.

        Works for any iterable ``tag_names`` (a ``pd.Index`` has no ``.index``
        method, so a lookup table is used instead of ``list.index``).

        Returns:
            The leaf id, or ``None`` if the tree has not been built yet.

        Raises:
            ValueError: if ``tag_name`` is not one of ``tag_names``.
        """
        if self._tag_index is None:
            lookup = {}
            for position, name in enumerate(self.tag_names):
                # setdefault keeps the first occurrence, like list.index
                lookup.setdefault(name, position)
            self._tag_index = lookup

        try:
            tag_idx = self._tag_index[tag_name]
        except KeyError:
            raise ValueError(f"{tag_name!r} is not in tag_names") from None
        return self.tag_to_path.get(tag_idx, None)

    def visualize_tree_statistics(self):
        """Print the total tag count and the number of nodes at each depth."""
        print(f"Total tags: {len(self.tag_names)}")

        level_counts = {0: 1}
        for node_id in self.tree:
            if node_id == 'root':
                continue
            # Each '_c' segment in a node id is one level below the root.
            level = node_id.count('_c')
            level_counts[level] = level_counts.get(level, 0) + 1

        for level, count in sorted(level_counts.items()):
            print(f"Level {level}: {count} clusters")

In [None]:

# Two-level hierarchy over all tags; `branching_factors` is
# [level1_clusters, level2_clusters] as described in the class docstring.
tree_builder = DirectHierarchicalLabelTree(
    tag_embeddings=t_stacked,
    tag_names=tags.index,
    branching_factors=[100, 100] 
)

In [51]:
tree_builder.build_tree()

{'root': {'is_leaf': False,
  'n_clusters': 100,
  'centroids': array([[ 0.01314796, -0.00666611, -0.01345625, ..., -0.00378615,
          -0.01279567,  0.00506993],
         [ 0.0291119 ,  0.00219823, -0.00156632, ..., -0.00134885,
          -0.00589906, -0.00124969],
         [ 0.00611679, -0.01480442,  0.00854416, ..., -0.0079063 ,
          -0.00779623,  0.0133387 ],
         ...,
         [ 0.01581322, -0.00537735, -0.00568448, ..., -0.00810855,
          -0.01347901, -0.00086362],
         [ 0.01377514, -0.00864554, -0.00809561, ..., -0.01294104,
          -0.00096438, -0.00042522],
         [ 0.01536255, -0.01240802, -0.00419845, ...,  0.00236259,
           0.00564149,  0.00486155]], shape=(100, 4096)),
  'children': ['root_c0',
   'root_c1',
   'root_c2',
   'root_c3',
   'root_c4',
   'root_c5',
   'root_c6',
   'root_c7',
   'root_c8',
   'root_c9',
   'root_c10',
   'root_c11',
   'root_c12',
   'root_c13',
   'root_c14',
   'root_c15',
   'root_c16',
   'root_c17',
   'roo

In [52]:
tree_builder.visualize_tree_statistics()

Hierarchical Label Tree Statistics:
Total tags: 22753
Level 0: 1 clusters
Level 1: 100 clusters
Level 2: 9885 clusters


In [86]:
tree_builder.tag_to_path[0]

'root_c16_c12'

In [96]:
tree_builder.tag_to_path[0].split('_c')

['root', '16', '12']

In [None]:
np.mean(tree_builder.tree['root_c16']['centroids'], axis=0)

array([ 0.00887133,  0.00042595, -0.00304763, ..., -0.00895712,
       -0.00428712,  0.0040433 ], shape=(4096,))

In [116]:
tst_mean = np.mean(tree_builder.tree['root_c16']['centroids'], axis=0)
tst_v = np.stack(tree_builder.tree[tree_builder.tag_to_path[1]]['centroid'])

In [127]:
euclidean_distances(tst_mean.reshape(1, -1), tst_v.reshape(1, -1))[0][0]

np.float64(0.5633195376844613)

In [None]:
# Collect every tag that falls under top-level cluster 0 and remember the one
# whose leaf centroid is closest to the mean of that cluster's sub-centroids.
children_0_tags = []
closest_tag, closest_dist = None, float('inf')

# The reference point is the same for every tag in the cluster, so compute it
# once up front instead of on each iteration.
embedding_cluster_mean = np.mean(
    tree_builder.tree['root_c0']['centroids'], axis=0
).reshape(1, -1)

for i, tag_name in enumerate(tags.index):
    tag_path = tree_builder.tag_to_path[i]
    tag_path_parts = tag_path.split('_c')
    if tag_path_parts[1] != '0':
        continue

    children_0_tags.append(tag_name)
    embedding_val = np.asarray(tree_builder.tree[tag_path]['centroid']).reshape(1, -1)
    dist = euclidean_distances(embedding_val, embedding_cluster_mean)[0][0]
    if dist < closest_dist:
        closest_dist = dist
        closest_tag = tag_name
    
    

In [134]:
h1_t = {centroid_idx: [] for centroid_idx in range(100)}


In [147]:
len(tree_builder.tree['root']['children'])

100

In [None]:
def find_representative_tags(tree_builder, tags, n_centroids=100, use_tag_embeddings=False):
    """Pick one representative tag for each top-level cluster of the label tree.

    For every child of the root whose cluster index is below ``n_centroids``,
    a reference point is computed (the mean of that child's sub-cluster
    centroids, or the child's own centroid when the child is a leaf). The tag
    of that cluster whose vector is closest (Euclidean) to the reference point
    is chosen.

    Args:
        tree_builder: object exposing ``tree`` (node dict) and ``tag_to_path``
            (tag row index -> leaf node id); ``tag_embeddings`` is also read
            when ``use_tag_embeddings`` is true.
        tags: object whose ``.index`` holds the tag names in row order.
        n_centroids: number of top-level cluster slots in the result.
        use_tag_embeddings: by default (False) each tag is represented by the
            centroid of the leaf it sits in, so all tags of one leaf tie and
            the first one wins. Pass True to compare each tag's own embedding
            row instead.

    Returns:
        Dict mapping every cluster index in ``range(n_centroids)`` to either
        ``None`` (no such cluster / no tags) or a dict with the keys
        ``closest_tag``, ``closest_dist``, ``cluster_mean`` and
        ``tag_embedding``.
    """
    tree = tree_builder.tree

    # Reference point per top-level cluster.
    cluster_means = {}
    pattern_cluster = re.compile(r'_c(\d+)$')
    for key in tree['root'].get('children', []):
        m = pattern_cluster.search(key)
        if not m:
            continue
        idx = int(m.group(1))
        if not 0 <= idx < n_centroids:
            continue

        node = tree[key]
        if node.get('is_leaf'):
            # Small top-level clusters are stored as leaves and only carry a
            # single 'centroid' (no 'centroids' array).
            cluster_means[idx] = np.asarray(node['centroid'])
            continue

        centroids = np.asarray(node['centroids'])
        if centroids.size == 0:
            continue
        cluster_means[idx] = np.mean(centroids, axis=0)

    h1_t = {idx: None for idx in range(n_centroids)}

    N = len(tags.index)
    if N == 0:
        # Nothing to rank; np.stack below would fail on an empty list.
        return h1_t

    cluster_idxs = np.full(N, -1, dtype=int)
    embeddings = []
    raw_embeddings = np.asarray(tree_builder.tag_embeddings) if use_tag_embeddings else None

    # First '_c<digits>' segment of a path is the top-level cluster index.
    pattern_any_c = re.compile(r'_c(\d+)')

    for i in range(N):
        tag_path = tree_builder.tag_to_path[i]

        m = pattern_any_c.search(tag_path)
        if m:
            cluster_idxs[i] = int(m.group(1))
        # else: stays -1 and is ignored below

        if raw_embeddings is not None:
            embeddings.append(raw_embeddings[i])
        else:
            embeddings.append(np.asarray(tree[tag_path]['centroid']))

    embeddings = np.stack(embeddings)       # shape (N, D)
    D = embeddings.shape[1]

    for idx in range(n_centroids):
        if idx not in cluster_means:
            continue

        mask = (cluster_idxs == idx)
        if not mask.any():
            continue

        mean = cluster_means[idx].reshape(1, D)          # (1, D)
        # vectorized euclidean distances computation
        dists = np.linalg.norm(embeddings[mask] - mean, axis=1)  # (m,)

        argmin = int(np.argmin(dists))
        # Map the position inside the masked subset back to the global row.
        chosen_tag_idx = np.nonzero(mask)[0][argmin]

        h1_t[idx] = {
            'closest_tag': tags.index[chosen_tag_idx],
            'closest_dist': float(dists[argmin]),
            'cluster_mean': cluster_means[idx],
            'tag_embedding': embeddings[chosen_tag_idx]
        }

    return h1_t


In [149]:
h1_t = find_representative_tags(tree_builder, tags, n_centroids=100)

In [153]:
# One row per top-level cluster, re-labelled by the representative tag's name.
centroid_100_tags = pd.DataFrame.from_dict(h1_t, orient='index')
# pop() removes the column and hands back its values in one step.
centroid_100_tags.index = centroid_100_tags.pop('closest_tag').to_numpy()
centroid_100_tags

Unnamed: 0,closest_dist,cluster_mean,tag_embedding
denodb,0.320131,"[0.014417376148063496, -0.007384050656546192, ...","[0.0116502746, -0.008069663799999998, -0.01297..."
memory,0.385309,"[0.028688511345533328, 0.0014271237903345237, ...","[0.034116245, 0.010937103414285715, -0.0185003..."
database-administration,0.322269,"[0.007003796571137018, -0.01554148007059889, 0...","[0.014782062733333332, -0.012128053166666665, ..."
rpc,0.256773,"[0.013661504061644281, -0.002496577893879999, ...","[0.0054010459, -0.0003999848199999998, -0.0013..."
stablexui,0.352327,"[0.025036833951179758, -0.00036814772256623003...","[0.026074393999999994, 0.0023631919622222218, ..."
...,...,...,...
pytorch,0.380658,"[0.016381107836795236, 0.013594593070475718, -...","[0.008694216432857142, 0.025070296428571424, -..."
field,0.269966,"[0.017792385638042314, 0.00044468486129393747,...","[0.013452607600000001, 0.010712051466666667, -..."
netlify,0.387222,"[0.01575377066778831, -0.004307782403267663, -...","[0.009402036083333334, -0.004371085221666666, ..."
word-table,0.370249,"[0.01430713080853095, -0.010452169643353429, -...","[0.016797865071428573, -0.018058393357142856, ..."


In [156]:
# Slimmed-down frame: drop() already returns a new object, leaving
# `centroid_100_tags` untouched.
centroids_lite = centroid_100_tags.drop(columns=['closest_dist', 'tag_embedding'])
# centroids_lite.to_parquet(os.path.join(Path.cwd().parent,'data', 'centroid_100_tags_lite.parquet'))
centroids_lite

Unnamed: 0,cluster_mean
denodb,"[0.014417376148063496, -0.007384050656546192, ..."
memory,"[0.028688511345533328, 0.0014271237903345237, ..."
database-administration,"[0.007003796571137018, -0.01554148007059889, 0..."
rpc,"[0.013661504061644281, -0.002496577893879999, ..."
stablexui,"[0.025036833951179758, -0.00036814772256623003..."
...,...
pytorch,"[0.016381107836795236, 0.013594593070475718, -..."
field,"[0.017792385638042314, 0.00044468486129393747,..."
netlify,"[0.01575377066778831, -0.004307782403267663, -..."
word-table,"[0.01430713080853095, -0.010452169643353429, -..."


In [166]:
# Match questions against the cluster means, labelled by each cluster's
# representative tag. The per-row vectors are stacked into one 2-D array so the
# matcher receives the same kind of input as it does for the full tag matrix.
dem = DualEncoderMatcher(
    question_embeddings=questions[:10_000],
    tag_embeddings=np.stack(centroids_lite['cluster_mean'].to_numpy()),
    tag_names=centroids_lite.index.tolist()
)

In [173]:
original_data.iloc[2]['question_text']

'I am new to pandas library in python. When I loaded a file and was printing the output of df.info into console, the data is getting printed first instead of the text that I have printed. What is causing this behavior(since it was not async I couldn\'t understand how it is behaving) import os import pandas as pd dataset_directory_path = "/home/user/python-temp/datasets" file_datasets = os.listdir(dataset_directory_path) datasets = dict() for file_dataset in file_datasets: dataset_name = file_dataset.split(\'.\')[0] dataset_data = pd.read_csv(os.path.join(dataset_directory_path, file_dataset)) datasets[dataset_name] = dataset_data un_processed_customers_data = datasets["customers"] print("Data structure for Df", un_processed_customers_data.info()) Ouput: <class \'pandas.core.frame.DataFrame\'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Index 10000 non-null int64 1 Customer Id 10000 non-null object

In [171]:
predictions = dem.predict_top_k(question_idx=2, k=5)
print(f"Top 5 predictions: {predictions}")


Top 5 predictions: [('python', np.float64(0.38295367797329094)), ('word-table', np.float64(0.3696944707340029)), ('ironpdf', np.float64(0.3647400168217892)), ('django', np.float64(0.3297345619706671)), ('apache-spark', np.float64(0.3283243442282066))]
