In [43]:
from bertopic import BERTopic
import pandas as pd
import re
import spacy
import fitz
import os
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
df = pd.read_csv('../dataset/anfragen_oldenburg_detailed.csv')

# Pre-processing data

In [26]:
df['Beschreibung_original'] = df['Beschreibung']
# lowercasing
df['Beschreibung'] = df['Beschreibung'].str.lower()

# removal of punctuation and digits
def clean_entries(entry):
    entry = re.sub(r'[\d]', ' ', entry) # remove digits
    entry = re.sub(r'[^\w\s]', ' ', entry) # remove non-word characters
    return entry
df['Beschreibung'] = df['Beschreibung'].apply(clean_entries)

# removal of stopwords (e.g., der, die, das, ein)
nlp = spacy.load("de_core_news_sm")
def remove_stopwords(entry):
    doc = nlp(entry)
    return " ".join([token.text for token in doc if not token.is_stop and not token.is_punct and not token.is_space])
df['Beschreibung'] = df['Beschreibung'].apply(remove_stopwords)



In [27]:
# remove street names
street_doc = fitz.open('./Strassenverzeichnis-Oldb.pdf')
street_names = []
for i, page in enumerate(street_doc):
    lines = [line.strip() for line in page.get_text().splitlines() if line.strip()]
    
    if i == 0:
        lines = lines[11:]
    else:
        lines = lines[8:]

    # two special cases for long street names and one for weird formatting
    if i == 2:
        idx = lines.index('ANNETTE-VON-DROSTE-HUELSHOFF-')
        lines[idx] = lines[idx] + ' ' + lines[idx + 1]
        del lines[idx + 1]
    if i == 7:
        idx = lines.index('FRIEDRICH-CHRISTOPH-DAHLMANN-')
        lines[idx] = lines[idx] + ' ' + lines[idx + 1]
        del lines[idx + 1]
    if i == 15:
        idx = lines.index('LENZWEG')
        corrected = []
        corrected.append(lines[idx])
        corrected.extend(lines[idx + 4:idx + 6])
        corrected.extend(lines[idx + 10::3])
        street_names.extend(corrected)
        continue 
    streets = lines[::3]
    street_names += streets

# extend street names to account for ß and str. and take care of umlaute
street_names = [name.lower() for name in street_names]
# umlaute
def generate_umlaut_variants(name):
    variants = set()
    variants.add(name)

    replacements = [
        ('ae', 'ä'),
        ('oe', 'ö'),
        ('ue', 'ü'),
        ('ä', 'ae'),
        ('ö', 'oe'),
        ('ü', 'ue'),
    ]

    for a, b in replacements:
        if a in name:
            variants.add(name.replace(a, b))
        if b in name:
            variants.add(name.replace(b, a))
    
    return variants
street_names_ext = set()
name_patterns = []
for name in street_names:
    umlaut_names = generate_umlaut_variants(name)
    street_names_ext.update(umlaut_names)
for name in street_names_ext:
    ext_name = name
    if 'strasse' in name:
        ext_name = name.replace('strasse', r'\s?(strasse|straße|str\.?)')
    name_patterns.append(ext_name)

street_names_pattern = r'\b(?:' + '|'.join(name_patterns) + r')\b'
df['Beschreibung'] = df['Beschreibung'].str.replace(street_names_pattern, '', flags=re.IGNORECASE, regex=True)

In [28]:
# remove hausnummer, nr. etc.
hausnummer_pattern = r'\b(haus)?(nummer|nr\.?)\b'
df['Beschreibung'] = df['Beschreibung'].str.replace(hausnummer_pattern, '', flags=re.IGNORECASE, regex=True)

In [29]:
# remove stadtteile (stadtteile according to Wikipedia)
stadtteile = [
    'zentrum',
    'dobben',
    'haarenesch',
    'bahnhofsviertel',
    'gerichtsviertel',
    'ziegelhof',
    'ehnern',
    'bürgeresch',
    'buergeresch',
    'donnerschwee',
    'osternburg',
    'drielake',
    'everstehn',
    'hundsmühler höhe',
    'hundsmuehler hoehe',
    'thomasburg',
    'bloherfelde',
    'haarentor',
    'wechloy',
    'bürgerfelde',
    'buergerfelde',
    'rauhehorst',
    'dietrichsfeld',
    'alexandersfeld',
    'flugplatz',
    'ofenerdiek',
    'nadorst',
    'etzhorn',
    'ohmstede',
    'bornhorst',
    'neuenwege',
    'kloster blankenburg',
    'kreyenbrück',
    'kreyenbrueck',
    'bümmerstede',
    'buemmerstede',
    'tweelbäke west',
    'tweelbaeke west',
    'krusenbusch',
    'drielaker moor'
]
stadtteile_pattern = r'\b(?:' + '|'.join(re.escape(stadtteil) for stadtteil in stadtteile) + r')\b'
df['Beschreibung'] = df['Beschreibung'].str.replace(stadtteile_pattern, '', flags=re.IGNORECASE, regex=True)

In [30]:
# lemmatization (e.g., reducing gegangen to gehen)
def lemmatize(entry):
    doc = nlp(entry)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space])
df['Beschreibung'] = df['Beschreibung'].apply(lemmatize)

Removal of categories, e.g., Straßen or Wilde Müllkippe, would also be an option. In some sense, people are primed towards using these phrases, since they have previously selected categories. But removing e.g., Straßen would come at the expense of losing a lot of relevant information.

In [31]:
display(df['Beschreibung'].head())

0    Famila liegen rechts Müll Gebüsch bitte entfernen
1                 Ecke diverser Abfall illegal ablegen
2            Gang Höhe erneut illegal Abfall entsorgen
3    Containerstellplatz ecke diverser Hausmüll ent...
4                          Gefrier Kühlschrank ablegen
Name: Beschreibung, dtype: object

# Topic model

In [33]:
docs = df['Beschreibung'].dropna()

In [10]:
topic_model = BERTopic(language="german")
topics, probs = topic_model.fit_transform(docs)

In [34]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,240,-1_liegen_stehen_höhe_weg,"[liegen, stehen, höhe, weg, müll, gehweg, spie...",[befinden Absenkung mindestens cm breit cm Loc...
1,0,96,0_müll_wild_mülleimer_liegen,"[müll, wild, mülleimer, liegen, parkplatz, meh...",[erneut vermüllen Restmüll Sperrmüll blau Säck...
2,1,82,1_ampel_straße_signal_schlaglöch,"[ampel, straße, signal, schlaglöch, richtung, ...",[Ampel Defekt Grün leuchten wüsting kommend St...
3,2,71,2_fußball_werbung_aufkleber_unterhaltungsfußball,"[fußball, werbung, aufkleber, unterhaltungsfuß...",[Aufkleber Werbung Unterhaltungsfußball vfb Ol...
4,3,45,3_gefährlich_auto_straße_beschädigen,"[gefährlich, auto, straße, beschädigen, gefähr...",[schlaglöch Weg Straßenverkehr gefährden Zenti...
5,4,44,4_stehen_fahrzeug_kennzeichen_auto,"[stehen, fahrzeug, kennzeichen, auto, parkplat...",[mehrere Monat stehen Kurve Dersagauwegs alt P...
6,5,41,5_sperrmüll_gebotszeich_mast_drückampel,"[sperrmüll, gebotszeich, mast, drückampel, kap...",[Druckknopf Ampel Höhe hnr südlich Seite defek...
7,6,35,6_fahrrad_radfahrer_fahrradweg_rollstuhlfahrer,"[fahrrad, radfahrer, fahrradweg, rollstuhlfahr...",[Fußweg stehen stark regenfäll Bereichsweis Wa...
8,7,30,7_bushaltestellen_bus_schwelle_liegen,"[bushaltestellen, bus, schwelle, liegen, busha...",[Schliefenstraße Einmündung befinden quer Fahr...
9,8,29,8_glascontainer_müll_ecke_papier,"[glascontainer, müll, ecke, papier, altglas, g...",[Parkbucht Glascontainer Wilhelm Krüger strn s...


In [35]:
# Export the top 25 words for each topic
path = './topic_model_results'
file= os.path.join(path, 'top_words_per_topic.txt')
topic_info = topic_model.get_topic_info()
with open(file, 'w', encoding='utf-8') as f:
    for topic_id in topic_info[topic_info.Topic != -1].Topic:
        words = topic_model.get_topic(topic_id)
        f.write(f'Topic {topic_id}:\n')
        for word, score in words[:10]:
            f.write(f'{word} ({score:.4f})\n')
        f.write('\n')

In [39]:
topic_model.visualize_barchart(top_n_topics=20, topics=topic_info[topic_info.Topic != -1].Topic.tolist(), n_words=10)
#fig = topic_model.visualize_barchart(top_n_topics=20, topics=topic_info[topic_info.Topic != -1].Topic.tolist(), n_words=10)
#fig.write_image("./topic_model results/top_words_per_topic.png", width=1200, height=800)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [49]:
#topic_model.visualize_topics()
# Generate the interactive Plotly figure
fig = topic_model.visualize_topics()

# Export to HTML file (specify your desired path)
fig.write_html("./topic_model_results/topic_model_visualization.html")

In [56]:
topic_model.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [55]:
topic_model.visualize_hierarchy()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [41]:
# export top documents per topic
file= os.path.join(path, 'top_docs_per_topic.txt')
representative_docs = topic_model.get_representative_docs()
with open(file, 'w', encoding='utf-8') as f:
    for topic_id, docs in representative_docs.items():
        if topic_id == -1:
            continue
        f.write(f'Topic {topic_id}:\n')
        for i, doc in enumerate(docs[:3]):
            f.write(f'Document {i+1}:\n')
            f.write(f'{doc.strip()}\n\n')
        f.write('\n\n')

In [42]:
# export original/unprocessed documents per topic
df['Topic'] = topics
file= os.path.join(path, 'top_original_docs_per_topic.txt')
with open(file, 'w', encoding='utf-8') as f:
    topic_info = topic_model.get_topic_info()
    for topic_id in topic_info[topic_info.Topic != -1].Topic:
        f.write(f"Topic {topic_id}:\n")

        top_docs = df[df["Topic"] == topic_id].head(3)

        for i, row in top_docs.iterrows():
            f.write(f"  Document {i}:\n")
            f.write(f"  {row['Beschreibung_original'].strip()}\n\n")

        f.write("\n\n")