In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [2]:
# Read the cleaned question dataset (UTF-8 encoded CSV) into a DataFrame
csv_path = r'Data/cleaned_data.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
# Preview the first five rows
df.head(5)

Unnamed: 0,Head,Body,Tags,Text,Tags Count
0,Brain Segmentation To 3D Model,\r\nMy goal is to take a dataset of Brain Tumo...,"['computer-vision', 'python']",brain segmentation model goal dataset brain tu...,2
1,Active Learning regression with Random Forest,\r\nI have a dataset of about 8k points and I ...,"['machine-learning', 'regression', 'uncertaint...",active learning regression random forest I dat...,5
2,Comparing Reinforcement Learning models,\r\nI am currently completing my thesis on opt...,"['reinforcement-learning', 'policy-gradients',...",compare reinforcement learning model I current...,4
3,"Why ""Good Model"" that performs great on holdou...",\r\nI have this binary regression model that h...,"['deep-learning', 'deep-neural-networks', 'pre...",good model perform great holdout validation da...,5
4,What are Reservoir computers used for today?,\r\nReservoir computers were very popular in t...,"['machine-learning', 'recurrent-neural-network...",reservoir computer use today reservoir compute...,3


In [3]:
# Character length of each cleaned document. `.str.len()` returns NaN for a
# missing Text value (where `apply(len)` would raise TypeError); NaN fails both
# comparisons in the filter below, so such rows are dropped.
df['LenofText'] = df['Text'].str.len()

# Hand-entered reference lengths; their difference (437) sets the width of the
# accepted length window around the median.
# NOTE(review): these look like the 75th / 25th percentiles of LenofText copied
# from an earlier summary -- confirm, and recompute if the dataset changes.
LEN_UPPER_REF = 708
LEN_LOWER_REF = 271
len_spread = LEN_UPPER_REF - LEN_LOWER_REF

# Asymmetric window: half a spread below the median, one and a half above.
median_len = df['LenofText'].median()
minval = median_len - 0.5 * len_spread
maxval = median_len + 1.5 * len_spread
print("minval:",minval,"maxval:",maxval)

# Shape before filtering
print(df.shape)

# Keep only documents whose length lies strictly inside (minval, maxval).
# `df` is reassigned here, so re-running this cell filters the already
# filtered frame against a freshly computed median.
df = df[(df['LenofText'] > minval) & (df['LenofText'] < maxval)]
# Shape after filtering
print(df.shape)

minval: 216.5 maxval: 1090.5
(48802, 6)
(35704, 6)


In [4]:
# Data processing (pandas and numpy are already imported in the first cell;
# re-importing them here is harmless)
import pandas as pd
import numpy as np
# Text preprocessing
import nltk
# WordNet lemmatizer instance; it is not referenced by any other cell visible
# in this notebook
wn = nltk.WordNetLemmatizer()
# Dimension reduction
from umap import UMAP
# Density-based clustering
from hdbscan import HDBSCAN

In [None]:
# Dimensionality reduction applied to the sentence embeddings before clustering.
# random_state is fixed so repeated runs give the same projection.
umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.05, metric="cosine", random_state=100
)

# Sentence embedding model used to encode each document
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Clustering of the reduced embeddings; clusters need at least 15 documents
hdb_model = HDBSCAN(
    min_cluster_size=15,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# Assemble the BERTopic pipeline from the three components above.
# calculate_probabilities=True is required by the later cell that reads
# topic_model.probabilities_.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdb_model,
    language="english",
    calculate_probabilities=True,
    n_gram_range=(1, 3),
    nr_topics=200,
    verbose=True
)

# fit_transform is documented to take a list of strings. `df` was row-filtered
# earlier, so df["Text"] carries a non-contiguous index; converting to a plain
# list keeps BERTopic's internal positional document ids aligned with the
# returned topics / probabilities.
documents = df["Text"].tolist()

# Fit the model and assign a topic (and topic probabilities) to every document
topics, probabilities = topic_model.fit_transform(documents)

In [None]:
# Summary table of the fitted topics (one row per topic)
topic_overview = topic_model.get_topic_info()
topic_overview

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,22150,-1_model_datum_use_test,"[model, datum, use, test, value, variable, dis...",[test datum accuracy high training datum I use...
1,0,3053,0_action_state_reward_policy,"[action, state, reward, policy, agent, self, g...",[monte carlo epsilon soft approach compute max...
2,1,1084,1_loss_lstm_train_layer,"[loss, lstm, train, layer, model, epoch, input...",[cnn model learn I want train model predict em...
3,2,1000,2_bayesian_prior_posterior_theta,"[bayesian, prior, posterior, theta, distributi...",[posterior variance vs variance posterior mean...
4,3,809,3_image_object_detection_face,"[image, object, detection, face, box, detect, ...",[handle multiple object instance object detect...
...,...,...,...,...,...
95,94,18,94_birthday_people_share birthday_birthday par...,"[birthday, people, share birthday, birthday pa...",[american randomly choose need chance live adj...
96,95,17,95_cointegration_johansen_vecm_cointegration test,"[cointegration, johansen, vecm, cointegration ...",[sort conclusion cointegration cointegration t...
97,96,17,96_matrix_hat_projection_design,"[matrix, hat, projection, design, orthogonal, ...",[data matrix predictor matrix observation matr...
98,97,16,97_kappa_cohen_fleiss_cohen kappa,"[kappa, cohen, fleiss, cohen kappa, agreement,...",[use cohen kappa judgement I use cohen kappa c...


In [None]:
# Bar charts of the leading keywords for the top 16 topics
keyword_bars = topic_model.visualize_barchart(top_n_topics=16)
keyword_bars

In [None]:
# Plot how term scores fall off with term rank within each topic
term_rank_fig = topic_model.visualize_term_rank()
term_rank_fig

In [None]:
# Intertopic distance map of the fitted topics
intertopic_fig = topic_model.visualize_topics()
intertopic_fig

In [None]:
# Hierarchical clustering view of how the top 10 topics relate to each other
hierarchy_fig = topic_model.visualize_hierarchy(top_n_topics=10)
hierarchy_fig

In [None]:
# Topic probability distribution for the first row of the fitted
# probabilities, showing only topics at or above the 0.015 threshold
first_doc_probs = topic_model.probabilities_[0]
topic_model.visualize_distribution(first_doc_probs, min_probability=0.015)

In [None]:
# Free-text query to match against the fitted topics
new_review = "What are all the tests to check if the data sample is properly representing the population ?"
# Number of closest topics to retrieve
num_of_topics = 3
# Search the model for the topics most similar to the query text
topic_matches = topic_model.find_topics(new_review, top_n=num_of_topics)
similar_topics, similarity = topic_matches
# Report the matched topic ids with their similarity scores (2 decimals)
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 3 similar topics are [8, 38, 77], and the similarities are [0.5  0.38 0.37]


In [None]:
# Print the top keywords for each of the most similar topics.
# Iterating over the returned topic ids (capped at num_of_topics) instead of
# indexing with range(num_of_topics) avoids an IndexError when fewer topics
# were returned than requested.
for topic_id in similar_topics[:num_of_topics]:
    print(f'The top keywords for topic {topic_id} are:')
    print(topic_model.get_topic(topic_id))

The top keywords for topic 8 are:
[('test', 0.032552260861836435), ('hypothesis', 0.014503456762420137), ('power', 0.013033089620190104), ('value', 0.012028706998739997), ('sample', 0.011839406935943906), ('null', 0.010562881291732426), ('group', 0.00992152228227011), ('null hypothesis', 0.009259918000594258), ('sample size', 0.009115195156264699), ('size', 0.008787406010101794)]
The top keywords for topic 38 are:
[('census', 0.033183048675949144), ('zip code', 0.023627849377536595), ('zip', 0.02178187635912109), ('county', 0.021214795551213313), ('datum', 0.019140389118501203), ('crime', 0.016470400143609665), ('election', 0.014871992003081756), ('city', 0.012676418066033968), ('find', 0.01218410288477609), ('look', 0.01215465092508235)]
The top keywords for topic 77 are:
[('permutation', 0.10190531866276197), ('permutation test', 0.06326805587819234), ('test', 0.03356073657718246), ('length', 0.015388951537045602), ('sample', 0.014502430691291777), ('test statistic', 0.01354920768823

In [None]:
# Persist the fitted topic model to disk.
# NOTE(review): the destination is an absolute, machine-specific Windows path;
# consider a project-relative, configurable location.
# NOTE(review): the ".pkl" target suggests pickle serialization -- only load
# such files from trusted sources.
model_output_path = r"D:\Github\Tag App\stackexchange_topic_model.pkl"
topic_model.save(model_output_path)