In [1]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once at notebook level; embed() below reuses this instance.
model_use = SentenceTransformer("distiluse-base-multilingual-cased")

def embed(sentences):
    """
    Encode sentences with the notebook-level ``model_use`` encoder.

    Parameters
    ----------
    sentences : input forwarded unchanged to ``model_use.encode``

    Return
    ------
    The value returned by ``model_use.encode(sentences)``
    """
    return model_use.encode(sentences)

2022-08-08 12:29:26.608104: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-08 12:29:26.608142: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
import hdbscan
import umap

def generate_clusters(
    message_embeddings,
    n_neighbors,
    n_components,
    min_cluster_size,
    random_state=None,
    metric="cosine",
    cluster_selection_method="eom",
    min_dist=0.03,
):
    """
    Reduce embedding dimensionality with UMAP, then cluster the reduced points with HDBSCAN.

    Parameters
    ----------
    message_embeddings : embeddings to cluster, forwarded unchanged to
        ``umap.UMAP.fit_transform`` (presumably an (n_samples, n_features) array)
    n_neighbors  : int, number of nearest neighbors in UMAP
    n_components : int, number of dimensions of the UMAP output
    min_cluster_size : int, min cluster size for HDBSCAN
    random_state : int, RandomState instance or None, optional (default: None);
        forwarded to UMAP.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used by np.random.
    metric : str, distance metric used by UMAP on the input embeddings (default: "cosine").
        HDBSCAN always runs with the euclidean metric on the reduced embeddings.
    cluster_selection_method : 'eom' (Excess of mass) or 'leaf', for HDBSCAN
    min_dist : float, UMAP ``min_dist`` parameter (default: 0.03)

    Return
    ---------
    clusters : fitted ``hdbscan.HDBSCAN`` object (cluster labels are in ``clusters.labels_``)
    umap_embeddings : the reduced embeddings returned by ``UMAP.fit_transform``
    dbcv : ``clusters.relative_validity_``, intended as an approximation of the DBCV score
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric=metric,
        random_state=random_state,
        min_dist=min_dist,
    )
    umap_embeddings = reducer.fit_transform(message_embeddings)

    clusters = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric="euclidean",
        # The minimum spanning tree is needed to read relative_validity_ below.
        gen_min_span_tree=True,
        cluster_selection_method=cluster_selection_method,
    ).fit(umap_embeddings)

    dbcv = clusters.relative_validity_
    return clusters, umap_embeddings, dbcv

In [3]:
import pandas as pd 

# Load the cleaned dataset; get_top_n below reads its "full_text" column and sums the category columns.
df = pd.read_csv("./data/cleaned_data.csv")
def get_top_n(dataframe, top_n, text_col="full_text"):
    """
    Keep the text column and the ``top_n`` most frequent category columns.

    Every column other than ``text_col`` is treated as a category indicator and
    ranked by its column sum (ties keep their original column order). Rows with
    no non-zero value in any selected category are dropped.

    Parameters
    -----------
    dataframe : pandas.DataFrame
        input dataframe; must contain ``text_col``
    top_n : int,
        number of desired category
    text_col : str, optional (default: "full_text")
        name of the text column, excluded from the ranking and kept in the output

    Return
    -------
    df_top : pandas.DataFrame
        ``text_col`` plus the ``top_n`` selected category columns, restricted to
        rows with at least one selected label, with a fresh 0..n-1 index
    top_ : list
        names of the selected category columns, most frequent first

    Raises
    ------
    KeyError
        if ``text_col`` is not a column of ``dataframe``
    """
    if text_col not in dataframe.columns:
        raise KeyError(f"column {text_col!r} not found in dataframe")

    # Select categories by name, not by position, so column order does not matter.
    categories = [column for column in dataframe.columns if column != text_col]
    counts = dataframe[categories].sum()
    top_ = counts.sort_values(ascending=False, kind="stable").index[:top_n].tolist()

    df_top = dataframe[[text_col] + top_]
    # Remove rows without any of the selected labels
    has_label = (df_top[top_] != 0).any(axis=1)
    df_top = df_top[has_label].reset_index(drop=True)
    return df_top, top_

# Keep the 5 most frequent categories and only the rows labelled with at least one of them.
num_labels = 5
df_, label_cols = get_top_n(df, num_labels)
df_.head()

Unnamed: 0,full_text,java,python,javascript,ios,android
0,What is the advantage of storing schema in avr...,1,0,0,0,0
1,How do you get JavaScript/jQuery Intellisense ...,0,0,1,0,0
2,Mocking Static Blocks in Java\nMy motto for Ja...,1,0,0,0,0
3,Getting random row through SQLAlchemy\nHow do ...,0,1,0,0,0
4,Python dictionary from an object's fields\nDo ...,0,1,0,0,0


In [4]:
# Inspect row count, dtypes and non-null counts of the filtered frame.
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12700 entries, 0 to 12699
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   full_text   12700 non-null  object
 1   java        12700 non-null  int64 
 2   python      12700 non-null  int64 
 3   javascript  12700 non-null  int64 
 4   ios         12700 non-null  int64 
 5   android     12700 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 595.4+ KB


In [7]:
from tqdm.notebook import tqdm
import hdbscan

# Number of questions to embed and cluster (presumably a subset to keep the run short; verify).
N_QUESTIONS = 1000

print('Create embeddings : ')
# Slice before converting to a list so only the needed texts are materialized.
embeddings_questions = embed(df_["full_text"].iloc[:N_QUESTIONS].tolist())

print("Clustering .. ")
clusters, umap_embeddings, dbcv = generate_clusters(
    message_embeddings=embeddings_questions,
    n_neighbors=5,
    n_components=10,
    min_cluster_size=3,
    random_state=42,  # fixed seed for the UMAP step
    metric="cosine",
    cluster_selection_method="eom",
)

Create embeddings : 
Clustering .. 


In [10]:
# Number of distinct labels assigned by HDBSCAN.
# NOTE(review): HDBSCAN marks noise points with label -1, so this count includes
# the noise label whenever some points were left unclustered — confirm before
# reporting it as the number of clusters.
len(set(clusters.labels_))

100