In [1]:
!pip install -qU sentence_transformers umap-learn hdbscan bertopic

In [2]:
# Kaggle notebook preamble. The runtime is the kaggle/python Docker image
# (https://github.com/kaggle/docker-python), which ships with the usual
# analytics libraries preinstalled.

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input datasets live in the read-only "../input/" directory. Enumerate every
# file below it so the exact paths are visible before anything is loaded.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stackexchange-tag-dataset/cleaned_data.csv


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import os

In [28]:
# Load only the 'Text' column of the cleaned StackExchange dataset; `usecols`
# skips any other columns at parse time instead of reading and discarding them.
df = pd.read_csv(r'/kaggle/input/stackexchange-tag-dataset/cleaned_data.csv', usecols=['Text'])
df.head()

Unnamed: 0,Text
0,brain segmentation model goal dataset brain tu...
1,active learning regression random forest I dat...
2,compare reinforcement learning model I current...
3,good model perform great holdout validation da...
4,reservoir computer use today reservoir compute...


In [29]:
# Keep only documents whose character length lies strictly inside a window
# around the median length.
# NOTE(review): 708 and 271 look like hand-copied length statistics and 32 an
# ad-hoc adjustment to the lower bound — confirm where they come from.
LEN_SPREAD = 708 - 271   # distance from the median to the upper bound
LOWER_BOUND_SHIFT = 32   # raises the lower bound, narrowing the window below the median

# Vectorised length; unlike .apply(len) this yields NaN (row later dropped by
# the comparison) instead of raising if a Text value is missing.
df['LenofText'] = df['Text'].str.len()
median_len = df['LenofText'].median()
minval = median_len - LEN_SPREAD + LOWER_BOUND_SHIFT
maxval = median_len + LEN_SPREAD
print("minval:",minval,"maxval:",maxval)

print("Original Data",df.shape)
# Both bounds are exclusive. .copy() detaches the result from the original
# frame so later column assignments do not write into a slice.
df = df[(df['LenofText']<maxval) & (df['LenofText']>minval)].copy()
print("Filtered Data",df.shape)

minval: 30.0 maxval: 872.0
Original Data (48803, 2)
Filtered Data (40469, 2)


In [30]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# Hugging Face tokenizers emit a warning (and disable their own parallelism)
# when the process forks after they have been used. Deciding the setting up
# front keeps the fit log clean; setdefault leaves an explicit user choice
# untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# UMAP: project the sentence embeddings down to 5 components using cosine
# distance. random_state pins the seed for reproducibility.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',n_jobs=1,random_state=101)

# Sentence-embedding model used to encode the documents.
sentence_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# HDBSCAN: cluster the reduced embeddings; a cluster needs at least 100 members.
# NOTE(review): prediction_data=True is presumably needed for the per-topic
# probabilities requested below — confirm.
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
    min_samples=5,
    prediction_data=True)

# Additional keyword representations computed alongside the default one.
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)

# Each key becomes an extra representation column named after it.
representation_model = {
    "KeyBERT": keybert,
    "MMR": mmr,
}

# Assemble the BERTopic pipeline from the components above.
# NOTE(review): nr_topics=200 can only reduce the topic count when clustering
# yields more than 200 topics — confirm this cap is intended.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    calculate_probabilities=True,
    n_gram_range=(1, 3),
    verbose=True,
    nr_topics=200,
    representation_model=representation_model
)

# Fit on a plain list of strings, the input type BERTopic documents, instead of
# a pandas Series whose index is non-contiguous after the length filter.
docs = df["Text"].tolist()
topics, probabilities = topic_model.fit_transform(docs)

Batches:   0%|          | 0/1265 [00:00<?, ?it/s]

2023-11-03 17:48:30,642 - BERTopic - Transformed documents to Embeddings
2023-11-03 17:49:19,645 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-11-03 17:49:39,966 - BERTopic - Clustered reduced embeddings
2023-11-03 17:51:13,640 - BERTopic - Reduced number of topics from 77 to 77


In [31]:
# Show the summary table of topics for the fitted model.
# NOTE(review): topic -1 is presumably BERTopic's outlier bucket — confirm.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,14729,-1_datum_use_model_variable,"[datum, use, model, variable, regression, valu...","[regression, dataset, predict, prediction, neu...","[datum, use, model, variable, regression, valu...",[I need quadratic linear coefficient glm binar...
1,0,119,0_mimic_mimic iii_iii_patient,"[mimic, mimic iii, iii, patient, admission, ta...","[mimic iii database, mimic ii database, physio...","[mimic, mimic iii, iii, patient, admission, ta...",[mimic iii waveform database somebody help cla...
2,1,251,1_drug_fda_openfda_api,"[drug, fda, openfda, api, fda gov, ndc, gov, j...","[drug ndc json, https open fda, api fda gov, d...","[drug, fda, openfda, api, fda gov, ndc, gov, j...",[api use find information base ndc code I find...
3,2,174,2_bootstrap_bootstrappe_sample_interval,"[bootstrap, bootstrappe, sample, interval, boo...","[bootstrappe sample, bootstrap estimate, boots...","[bootstrap, bootstrappe, sample, interval, boo...",[estimate confidence interval mean bootstrap m...
4,3,2170,3_action_policy_reward_state,"[action, policy, reward, state, agent, reinfor...","[reinforcement learning, reinforcement learn, ...","[action, policy, reward, state, agent, reinfor...",[state action value state value function equiv...
...,...,...,...,...,...,...,...
72,71,219,71_word_vec_word vec_vector,"[word, vec, word vec, vector, embed, embedding...","[use word vec, word vec model, vec word, word ...","[word, vec, word vec, vector, embed, embedding...",[word vec handle unseen new word bypass new cl...
73,72,265,72_datum_open_open datum_company,"[datum, open, open datum, company, dataset, go...","[open datum, open data, open source, datum set...","[datum, open, open datum, company, dataset, go...",[protocol usgov open datum available gov creat...
74,73,230,73_census_population_datum_county,"[census, population, datum, county, city, leve...","[www census gov, census gov geo, census block,...","[census, population, datum, county, city, leve...",[income dataset census tract block level I cur...
75,74,537,74_map_shapefile_datum_find,"[map, shapefile, datum, find, road, gis, look,...","[gis datum, geospatial, openstreetmap, spatial...","[map, shapefile, datum, find, road, gis, look,...",[seek free gis datum usa equivalent swedish ge...


In [32]:
# Visualize the top keywords per topic as bar charts, limited to 16 topics
# through top_n_topics.
topic_model.visualize_barchart(top_n_topics=16)

In [33]:
# Visualize the term rank decrease for the fitted topics, using the library's
# default settings.
topic_model.visualize_term_rank()

In [34]:
# Visualize the intertopic distance for the fitted topics, using the library's
# default settings.
topic_model.visualize_topics()

In [35]:
# Visualize connections between topics using hierarchical clustering
topic_model.visualize_hierarchy()

In [36]:
# Plot the topic-probability distribution of the first entry in the fitted
# model's probabilities, hiding topics below the 0.015 threshold.
first_doc_probs = topic_model.probabilities_[0]
topic_model.visualize_distribution(first_doc_probs, min_probability=0.015)

In [37]:
# Free-text query to match against the fitted topics
new_review = "What are all the tests to check if the data sample is properly representing the population ?"
# Number of closest topics to retrieve
num_of_topics = 3
# find_topics hands back the matching topic ids and their similarity scores
matches = topic_model.find_topics(
    new_review,
    top_n=num_of_topics,
)
similar_topics, similarity = matches
# Report the matches, rounding the similarities to two decimals
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 3 similar topics are [46, 45, 56], and the similarities are [0.76 0.75 0.75]


In [38]:
# Print the top keywords for each of the most similar topics.
# Iterating over the ids directly (capped at num_of_topics) avoids an
# IndexError should fewer topics be returned than were requested.
for topic_id in similar_topics[:num_of_topics]:
    print(f'The top keywords for topic {topic_id} are:')
    print(topic_model.get_topic(topic_id))

The top keywords for topic 46 are:
[('distribution', 0.03500990863610241), ('test', 0.027622827166774715), ('kolmogorov', 0.021004746219876504), ('smirnov', 0.020413452824537763), ('kolmogorov smirnov', 0.020302465402689283), ('fit', 0.016399566018616512), ('smirnov test', 0.01638240908309052), ('kolmogorov smirnov test', 0.016246217614190754), ('normal', 0.014386525730384582), ('datum', 0.01352241647493662)]
The top keywords for topic 45 are:
[('chi', 0.055989474065455444), ('chi square', 0.049263347008503), ('test', 0.04478659585540312), ('square', 0.035757307623663706), ('square test', 0.03286577208648563), ('chi square test', 0.03276541407104839), ('fisher', 0.025885248157264684), ('exact test', 0.021225185588143465), ('fisher exact', 0.021099947087345597), ('fisher exact test', 0.020202828971244396)]
The top keywords for topic 56 are:
[('interval', 0.05555788124718985), ('confidence', 0.05170300611353267), ('confidence interval', 0.04989136545622027), ('population', 0.029914656684

In [39]:
# Save the fitted topic model under a relative path (the current working
# directory).
# NOTE(review): no `serialization` argument is passed, so the library default
# applies — presumably pickle, given the .pkl name; confirm, and only load such
# files from trusted sources.
topic_model.save("stackexchange_topic_key_model.pkl")