In [1]:
import pandas as pd
import os
import ast
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.cluster import HDBSCAN, KMeans
import time
from tqdm import tqdm
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import calinski_harabasz_score,silhouette_score
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.decomposition import PCA
from umap import UMAP
import optuna
from algorithms import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
questions = np.load(os.path.join(Path.cwd().parent,'data', "question_embeddings_reduced.npy"))
tags = np.load(os.path.join(Path.cwd().parent,'data', "tag_embeddings_reduced.npy"))
tag_names = pd.read_parquet(os.path.join(Path.cwd().parent,'data', "tag_embeddings.parquet")).index.tolist()

In [3]:
# questions, tags

In [None]:
model = DirectHierarchicalLabelTree(
    tag_embeddings=tags,
    tag_names=tag_names,
    branching_factors=[30, 100],
    )

In [5]:
model.build_tree()

{'root': {'is_leaf': False,
  'n_clusters': 30,
  'centroids': array([[ 0.0346487 ,  0.44179145,  0.9743783 , ...,  5.8904204 ,
           5.2228503 ,  3.790582  ],
         [ 0.01399324,  0.33877188,  0.74150777, ...,  4.9367514 ,
           5.027527  ,  4.016882  ],
         [ 0.02164239,  0.308098  ,  0.802525  , ...,  5.528275  ,
           5.048002  ,  3.8638873 ],
         ...,
         [ 9.392605  ,  0.46268606,  0.7877343 , ...,  5.189184  ,
           5.0104485 ,  3.883345  ],
         [ 0.02684761,  0.32583976,  0.79607415, ...,  5.3017774 ,
           5.0818152 ,  3.9736855 ],
         [-0.01164233,  0.28389096,  0.9158489 , ...,  5.307026  ,
           5.122044  ,  3.9416614 ]], shape=(30, 100), dtype=float32),
  'children': ['root_c0',
   'root_c1',
   'root_c2',
   'root_c3',
   'root_c4',
   'root_c5',
   'root_c6',
   'root_c7',
   'root_c8',
   'root_c9',
   'root_c10',
   'root_c11',
   'root_c12',
   'root_c13',
   'root_c14',
   'root_c15',
   'root_c16',
   'root_c

In [6]:
model.visualize_tree_statistics()

Total tags: 22753
Level 0: 1 clusters
Level 1: 30 clusters
Level 2: 2912 clusters


In [7]:
h1_t = find_representative_tags(tree_builder=model, tags=tag_names,n_centroids=30)

In [8]:
centroid_100_tags = pd.DataFrame.from_dict(h1_t, orient='index')
centroid_100_tags.index = centroid_100_tags['closest_tag'].values
centroid_100_tags.drop(columns=['closest_tag'], inplace=True)
centroid_100_tags.head(5)

Unnamed: 0,closest_dist,cluster_mean,tag_embedding,cluster_size,top_5_popular_tags
jackhenry-jxchange,0.163803,"[0.03265746, 0.43158105, 0.96425986, 0.754395,...","[0.0348195, 0.44542092, 0.9631855, 0.7556813, ...",293,"[stripe-payments, stripe-payments-js, nfc, ope..."
ms-release-management,0.301975,"[0.01419038, 0.33625367, 0.75058687, 0.4378765...","[0.0124511, 0.3332421, 0.7607521, 0.4398694, 0...",512,"[git, git-lfs, azure-devops, azure-pipelines, ..."
nsfetchedresultscontroller,0.24357,"[0.021016814, 0.30880412, 0.8113877, 0.3647373...","[0.018170116, 0.29826337, 0.7920201, 0.3836398...",830,"[data.table, tabular-editor, datagrid, primeng..."
indexeddb,0.197217,"[0.023174245, 0.38816428, 0.7571122, 0.3885716...","[0.025483228, 0.376365, 0.75070536, 0.39624017...",1311,"[entity-framework, pandas, database, mongodb, ..."
svd,0.36204,"[-0.002116722, 0.2987644, 1.0821071, 0.296734,...","[-0.009780283, 0.2787495, 1.0140694, 0.2471242...",621,"[delphi, stdvector, numpy, differential-equati..."


In [None]:
centroid_100_tags.index

Index(['jackhenry-jxchange', 'ms-release-management',
       'nsfetchedresultscontroller', 'indexeddb', 'svd', 'doctoolchain',
       'high-availability', 'mobile-development', 'cornerstone', 'sum',
       'wagtail', 'spring-cloud-function', 'android-cardview', 'streaming',
       'time-tracking', 'aspose.words', 'storyboard', 'jaspic',
       'custom-training', 'google-app-maker', 'junit-runner', 'heatmap',
       'xmlserializer', 'ractivejs', 'undefined-behavior', 'fileutils',
       'android-networking', 'typo3', 'document.write', 'bare-metal'],
      dtype='object')

In [None]:
(questions)

array([[4.8297772, 4.098127 , 1.3978301, ..., 6.0315194, 4.4951262,
        3.9698122],
       [6.0218472, 4.9324474, 2.0986912, ..., 6.0374565, 4.46779  ,
        4.0001073],
       [5.3929715, 6.145968 , 2.357819 , ..., 6.013956 , 4.4575744,
        3.9892414],
       ...,
       [2.9614463, 6.109658 , 2.38245  , ..., 5.9989014, 4.4371834,
        3.9829078],
       [3.3357139, 4.4645443, 4.964673 , ..., 6.027926 , 4.4659896,
        4.005909 ],
       [1.4921734, 5.252755 , 3.3661427, ..., 5.99878  , 4.438702 ,
        3.9807606]], shape=(99992, 100), dtype=float32)

In [None]:
matcher = DualEncoderMatcher(
    question_embeddings=questions,
    tag_embeddings=np.vstack(centroid_100_tags['cluster_mean'].values),
    tag_names=centroid_100_tags.index.tolist(),
)

In [None]:
top_k = matcher.predict_top_k(1, k=5)

In [None]:
original_data = pd.read_csv(os.path.join(Path.cwd().parent,'data','stackexchange_dataset.csv'))
original_data.index = original_data['question_id']
original_data.drop(columns=['question_id'],inplace=True)
original_data = original_data[~original_data.index.duplicated()]
original_data.loc[:,('tags')] = original_data.loc[:,('tags')].apply(ast.literal_eval)

In [None]:
questions_indexed = pd.DataFrame(index=original_data.index)
questions_indexed['predicted_tags'] = list(questions)

In [None]:
questions_indexed.head(3)

Unnamed: 0_level_0,predicted_tags
question_id,Unnamed: 1_level_1
79802517,"[4.8297772, 4.098127, 1.3978301, 9.298622, 5.4..."
79802934,"[6.0218472, 4.9324474, 2.0986912, 8.825055, 4...."
79802909,"[5.3929715, 6.145968, 2.357819, 7.2965007, 4.5..."


In [None]:
# original_data['tags'].apply(lambda x: 1 if 'python' in x else 0).to_list()

In [None]:
idx_test = original_data.index[original_data['tags'].apply(lambda x: True if 'python' in x else False).to_list()]
idx_test

Index([79802909, 79802796, 79802906,  6918493, 79219163, 79802672, 79398519,
       79799821, 79358078, 75372024,
       ...
       79663235, 79188312, 79663249, 79663207, 29236267, 78454606, 75329557,
       38667728, 79663212, 79659671],
      dtype='int64', name='question_id', length=11489)

In [None]:
python_results = questions_indexed.loc[idx_test]
python_results.head(3)

Unnamed: 0_level_0,predicted_tags
question_id,Unnamed: 1_level_1
79802909,"[5.3929715, 6.145968, 2.357819, 7.2965007, 4.5..."
79802796,"[4.611259, 4.9051175, 4.63642, 7.9914837, 4.41..."
79802906,"[1.5987006, 6.086713, 3.7549717, 7.726263, 2.9..."


In [None]:
python_results.shape[0]

11489

In [None]:
most_common_python_tags = {}
for i in range(python_results.shape[0]-1):
    top_tags = matcher.predict_top_k(i, k=1)
    if top_tags[0][0] in most_common_python_tags:
        most_common_python_tags[top_tags[0][0]] += 1
    else:
        most_common_python_tags[top_tags[0][0]] = 1
    

In [None]:
most_common_python_tags

{'storyboard': 10668, 'typo3': 820}