### Attention

If you are using a Windows based system then the installation of `bertopic` might fail because cpp build tools are missing.

If that is the case, here is a starting link to get the installation going: https://visualstudio.microsoft.com/de/visual-cpp-build-tools/

# Setup

In [16]:
# Load libraries
import os
import pandas as pd 
from bertopic import BERTopic # if installation fails cpp build tools might be the issue 
from bertopic.vectorizers import ClassTfidfTransformer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# from transformers import AutoTokenizer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import os
import pandas as pd
import re

# Load stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hoolj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Parameters

In [17]:
# Paths
TEXTS_DIRECTORY = 'data/texts'
CLUSTERING_DIRECTORY = 'data/clustering'


# Model settings
EMBEDDING_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2' # The embedding model is used by BERTopic to generate embeddings in a first step. Here no stop words are used as transformers work best with uncleansed text.
CTFIDF_MODEL = ClassTfidfTransformer(reduce_frequent_words=True) # The CTFIDF Model can be used to additionally remove frequent words from the topic terms
VECTORIZER_MODEL = CountVectorizer(stop_words=stopwords.words('german')) # The vectorizer model is used after the embeddings are generated to generate topic terms. Here we use stop words to prevent meaningles words from being uses as topic term

# Number of lines to skip at the beginning of each txt file (header text)
lines_to_skip = 8

# Line patterns to ignore
page_pattern_1 = re.compile(r'Seite \d+ von \d+', re.IGNORECASE)
page_pattern_2 = re.compile(r'Seite \d/\d', re.IGNORECASE) 

# Number of tokens and overlap used to split up the text into manageable chunks
tokens_per_chunk = 128 # 128 is the maximum for paraphrase-multilingual-MiniLM-L12-v2
chunk_overlap = 20

In [18]:
# Crate folders if the don't exist
if not os.path.exists(CLUSTERING_DIRECTORY):
    os.makedirs(CLUSTERING_DIRECTORY)

# Read and cleanse text

In [19]:
# Placeholder to store all valid chunks
texts = []

# Placeholder to store skipped lines/chunks
skipped_lines = []

# # Tokenizer object used to split up text into chunks
# tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/{EMBEDDING_MODEL}')

# Text splitter object
text_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=chunk_overlap, 
    model_name=f'sentence-transformers/{EMBEDDING_MODEL}', 
    tokens_per_chunk=tokens_per_chunk
)

# Read all text files
for text_file in os.listdir(TEXTS_DIRECTORY):

    file_path = f'{TEXTS_DIRECTORY}/{text_file}'

    # Load all lines of text file into lines variable
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Placeholder for used text content
    content = ''

    # Iterate over all lines (skip first lines as specified in parameters)
    for line_index in range(len(lines)):
        line = lines[line_index]
        # Add lines to content if it does not match an exclusion criterion
        if line_index < lines_to_skip or page_pattern_1.match(line.strip()) or page_pattern_2.match(line.strip()) or line.strip() == '\n' or len(line.strip()) < 2:
            skipped_lines.append(line)
        else:
            content += line
    
    # Cleanse content a little bit
    content = content.replace('\t', ' ')

    # Split up text content into chunks
    chunks = text_splitter.split_text(content)

    # Add text chunks and indexes for file and chunk to texts variable
    for index in range(len(chunks)):
        texts.append([text_file[:-4], index, chunks[index]])

# Write all excluded lines to disk
with open(f'{CLUSTERING_DIRECTORY}/skipped_text.txt', 'w', encoding='utf-8') as skipped_text_file:
    for skipped_text in skipped_lines:
        skipped_text_file.write(skipped_text)

# Generate chunks dataframe and save write it to disk
df = pd.DataFrame(texts, columns=['text_id', 'chunk_id', 'chunk_text'])
df.to_csv(f'{CLUSTERING_DIRECTORY}/text_chunks.csv', index=False, sep='\t')

# Runn Topic Clustering

In [20]:
# Put all text chunks into list
docs = df['chunk_text'].to_list()

# Fit the BERTopic model
model = BERTopic(embedding_model=EMBEDDING_MODEL, vectorizer_model=VECTORIZER_MODEL, ctfidf_model=CTFIDF_MODEL, verbose=True).fit(docs)

# Generate topic classifications
topics, probabilities = model.transform(docs)

# Add topic classifications to dataframe and write it to disk
df['topic'] = topics
df['topic_probability'] = probabilities
df.to_csv(f'{CLUSTERING_DIRECTORY}/text_chunks_topics.csv', index=False, sep='\t')

2024-05-01 11:03:17,081 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2024-05-01 11:03:35,372 - BERTopic - Embedding - Completed ✓
2024-05-01 11:03:35,374 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 11:03:47,493 - BERTopic - Dimensionality - Completed ✓
2024-05-01 11:03:47,493 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-01 11:03:47,505 - BERTopic - Cluster - Completed ✓
2024-05-01 11:03:47,505 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-01 11:03:47,554 - BERTopic - Representation - Completed ✓


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2024-05-01 11:04:03,222 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-01 11:04:03,222 - BERTopic - Dimensionality - Completed ✓
2024-05-01 11:04:03,222 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-05-01 11:04:03,240 - BERTopic - Cluster - Completed ✓


# Create List of all topics

In [21]:
# Create two dataframes, one with the topic id and the keywords, and one with the topic id and the keywords together with the keyword importance
df_1 = []
df_2 = []

for key, val in model.get_topics().items():
    df_1.append([key, [x[0] for x in val]])
    df_2.append([key, val])

# Write dataframes to disk
pd.DataFrame(df_1, columns=['topic', 'keywords']).to_csv(f'{CLUSTERING_DIRECTORY}/topics_with_keywords.csv', index=False, sep='\t')
pd.DataFrame(df_2, columns=['topic', 'keywords_prob']).to_csv(f'{CLUSTERING_DIRECTORY}/topics_with_keywords_and_keywordimportance.csv', index=False, sep='\t')

# Visualizations

In [22]:
df_text_topics = df.groupby('text_id')['topic'].agg(lambda x: list(set(x))).reset_index()
df_text_topics.to_csv(f'{CLUSTERING_DIRECTORY}/text_topics.csv', index=False, sep='\t')

In [23]:
# Create dimensionality reduced map of topics

visualization = model.visualize_topics()

visualization.write_html(f'{CLUSTERING_DIRECTORY}/topic_visualization_map.html')
visualization

In [24]:
# Create cluster hierarchy

visualization = model.visualize_hierarchy(hierarchical_topics=model.hierarchical_topics(docs))

visualization.write_html(f'{CLUSTERING_DIRECTORY}/topic_visualization_hierarchical.html')
visualization

100%|██████████| 5/5 [00:00<00:00, 355.58it/s]
