# Analysis using BERTopic
Before running this notebook, run `00_generate_embeddings.ipynb` to extract the text from the PDFs.

In [16]:
# Load libraries
import os
import pandas as pd 
from bertopic import BERTopic # if installation fails cpp build tools might be the issue https://visualstudio.microsoft.com/de/visual-cpp-build-tools/
from bertopic.vectorizers import ClassTfidfTransformer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Make sure the NLTK stop-word corpus is available locally.
# `quiet=True` keeps the download log, which contains the user-specific local
# nltk_data path, out of the saved notebook output.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hoolj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Relative paths where the data is stored
RAW_DATA_PATH = 'data/raw'
TRANSFORMED_DATA_PATH = 'data/transformed'
MODELS_PATH = 'data/models'

# Create the models directory if it does not exist yet. `exist_ok=True` makes the
# call idempotent, so the cell can be re-run safely without a separate existence check.
os.makedirs(MODELS_PATH, exist_ok=True)

# Name of the embedding model that BERTopic uses in its first step to embed the documents.
# No stop words are removed at this stage, as transformers work best with uncleansed text.
EMBEDDING_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2'

# Vectorizer that is applied after the embeddings are generated in order to derive the topic terms.
# The English, German, French and Italian stop words are passed to it (in that order) so that
# meaningless words are not used as topic terms.
VECTORIZER_MODEL = CountVectorizer(
    stop_words=[
        word
        for language in ('english', 'german', 'french', 'italian')
        for word in stopwords.words(language)
    ]
)

# Optional c-TF-IDF model that can be used to additionally reduce frequent words in the topic terms
CTFIDF_MODEL = ClassTfidfTransformer(reduce_frequent_words=True)

# Input file (read from TRANSFORMED_DATA_PATH) and the name used for the stored model
file_name = 'dataset_1.parquet'
model_name = 'bertopic_naive'

# Set once a model has been fitted or loaded
model = None

# Read the data set
df = pd.read_parquet(f'{TRANSFORMED_DATA_PATH}/{file_name}')

# Only the text is needed: the embeddings generated in 00_generate_embeddings.ipynb are
# ignored because BERTopic regenerates the embeddings in this tutorial
docs = df['chunk_text'].tolist()

In [18]:
# Location of the stored model; used for the existence check, for saving and for loading,
# so that changing `model_name` keeps all three consistent
model_path = f'{MODELS_PATH}/{model_name}'

# If the model was not yet fitted, fit a new model
if not os.path.exists(model_path):
    print('Fit new model')

    # Create and fit model
    model = BERTopic(language='multilingual', embedding_model=EMBEDDING_MODEL, vectorizer_model=VECTORIZER_MODEL, verbose=True).fit(docs)

    # The commented out version would use a ctfidf_model
    # model = BERTopic(language='multilingual', embedding_model=EMBEDDING_MODEL, vectorizer_model=VECTORIZER_MODEL, ctfidf_model=CTFIDF_MODEL, verbose=True).fit(docs)

    # Save model so that later runs can reuse it instead of fitting again
    model.save(model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=EMBEDDING_MODEL)

# If the model was fitted already, we reuse it
else:
    print('Load existing model')

    # Load from the same path that was checked above
    model = BERTopic.load(model_path)

Load existing model


In [19]:
# Show how often each topic occurs (columns `Topic` and `Count` in the output).
# Topic -1 is the non-topic.
model.get_topic_freq()

Unnamed: 0,Topic,Count
3,-1,1080
74,0,210
20,1,82
75,2,81
34,3,80
...,...,...
257,358,11
304,359,11
94,360,11
302,361,11


In [20]:
# Visualize all topics in a first step (before the number of topics is reduced further below)
model.visualize_topics()

In [21]:
# Show the topics as bar charts
# NOTE(review): presumably the top terms per topic — confirm against the BERTopic documentation
model.visualize_barchart()

In [22]:
# Reduce the number of topics to 30
# NOTE(review): the call returns the BERTopic object itself (see the cell output), so `model`
# appears to be modified in place; re-running earlier cells afterwards would then show the
# reduced topics, and re-running this cell starts from the already reduced model — confirm.
model.reduce_topics(docs, nr_topics=30)

2024-04-14 11:50:50,332 - BERTopic - Topic reduction - Reducing number of topics


2024-04-14 11:50:50,956 - BERTopic - Topic reduction - Reduced number of topics from 364 to 30


<bertopic._bertopic.BERTopic at 0x204d05f9d60>

In [23]:
# Visualize the topics in a second step, after the topic reduction above
model.visualize_topics()

In [24]:
# Extract hierarchical topics and their representations from the documents
hierarchical_topics = model.hierarchical_topics(docs)

# Visualize these representations as a hierarchy (last expression, so the figure is the cell output)
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 28/28 [00:00<00:00, 185.56it/s]


Print out topics with their topic descriptors and the matched text chunks

In [25]:
# Topic frequencies, most frequent topic first
df_freq = model.get_topic_freq().sort_values('Count', ascending=False)

# Label every text chunk of the data set with its predicted topic
# NOTE(review): these labels come from `transform`, while the counts in `df_freq` come from
# the model itself, so the number of chunks printed per topic may differ from `Count` — confirm.
topics, probabilities = model.transform(docs)
df['topic'] = topics

# Print the descriptors and text chunks of every topic; the non-topic (-1) is skipped.
# Iterating over the filtered `Topic` column avoids `iterrows()` and the nested condition.
for topic_id in df_freq.loc[df_freq['Topic'] != -1, 'Topic']:

    # Topic descriptors: the first element of each entry returned by `get_topic`
    topic_vals = model.get_topic(topic_id)
    description = ', '.join(val[0] for val in topic_vals)
    print(description)

    # Print out all texts that were labelled with this topic
    for text in df.loc[df['topic'] == topic_id, 'chunk_text']:
        print('-', text)

    print('****************')

Batches:   0%|          | 0/327 [00:00<?, ?it/s]

2024-04-14 12:01:42,900 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


recherche, art, consentement, daten, einwilligung, forschung, information, patienten, général, proben
- Der Regierungsrat des Kantons Aargau begrüsst, dass zukünftig in jeder Ethikkommission mindestens eine Person Einsitz nehmen muss, die  Fachkenntnisse im Bereich "Informationstechnologie im Gesundheitsbereich" besitzt.
- swissethics  Generelle Anmerkungen:  swissethics und die kantonalen Ethikkommissionen anerkennen die vielen zielführenden Vorschläge des BAG zur Verordnungsrevision HFG. Die  Verwirklichung der Vorschläge wird einerseits dem Schutz der Forschungsteilnehmenden gerecht und andererseits die Arbeit der  Ethikkommissionen beeinflussen. Gesamthaft sind die Vorschläge der Revision sehr zu begrüssen, insbesondere was die Implementierung der  zunehmenden Digitalisierung und Technisierung der Forschung betrifft. Expl
- as die Implementierung der  zunehmenden Digitalisierung und Technisierung der Forschung betrifft. Explizit hervorgehoben und begrüsst seien die Implementierung 