In [1]:
# Setup cell: imports every dependency of the notebook and performs the
# module-level initialisation (seeds, env vars, OCR reader, local script path).

# --- Data handling and plotting ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- OCR / language tooling ---
import spacy
from spacy_langdetect import LanguageDetector
import easyocr
import de_core_news_sm
import en_core_web_sm
from langdetect import detect, DetectorFactory
# Fix langdetect's seed to a constant so repeated runs start from the same value.
DetectorFactory.seed = 0

# --- Standard library ---
import re
import string
import sys
from pathlib import Path

# --- Dimensionality reduction and clustering ---
import umap
import hdbscan

# --- Topic modelling / sentence embeddings ---
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# Default-configured BERTopic instance.
# NOTE(review): `topic_model` is not referenced in the cells visible here.
topic_model = BERTopic()

import os
# NOTE(review): presumably meant to silence the Hugging Face tokenizers
# parallelism warning; it is set after the libraries above are imported —
# confirm whether it needs to come first.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# OCR reader for English and German text, requesting GPU execution.
# NOTE(review): `reader` is not used in the cells visible here; confirm the
# behaviour on machines without a GPU.
reader = easyocr.Reader(['en', 'de'], gpu=True) 

# Make the sibling "Scripts" directory (one level above the current working
# directory) importable; the membership check keeps re-runs from appending twice.
module_path = str(Path.cwd().parents[0] / "Scripts")
if module_path not in sys.path:
    sys.path.append(module_path)

# Local helper; must be imported after the sys.path adjustment above.
from notebook_scripts import split_array

## PATHS

In [2]:
# Locations of the CSV files used by this notebook, relative to the notebook's
# working directory.
# Cleaned corpus: read as a cache if present, otherwise written after it is built.
corpus_df_path = "../../Data/corpus.csv"
# Per-poster text table (the "Path" and "Most_Likely" columns are read from it).
poster_text_df_path = "../../Data/poster_text.csv"

## CONSTANTS

In [11]:
# Tunable parameters for the embedding / reduction / clustering pipeline.

# Name of the sentence-transformers model passed to SentenceTransformer(...).
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'

# UMAP settings: neighbourhood size and target dimensionality.
N_NEIGHBORS = 15
N_COMPONENTS = 15

# HDBSCAN setting: value passed as `min_cluster_size`.
MIN_CLUSTER_SIZE = 5

# Corpus filter: rows are kept only when split_array(text) > NUMBER_WORDS.
# (presumably a minimum word count — confirm against notebook_scripts.split_array)
NUMBER_WORDS = 5

## 1) Data

In [12]:
# Load the poster text table and keep only the poster path and its text.
# `.copy()` detaches the two-column selection from the frame it was sliced
# from, so the column assignment below does not raise SettingWithCopyWarning.
df = pd.read_csv(poster_text_df_path)[["Path", "Most_Likely"]].copy()

# "Most_Likely" is read from CSV as one string per row (a stringified list of
# text fragments, see the preview below). Normalise it to plain `str` and map
# missing values to "" so later `.split(",")` calls never receive a float NaN.
df["Most_Likely"] = df["Most_Likely"].fillna("").astype(str)
df.head()

Unnamed: 0,Path,Most_Likely
0,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['FORUM', 'Kirchner', 'Das', 'expressionistisc..."
1,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['SEST', 'I4', 'g', "" ' ^ ."", '1472', 'Das Pa ..."
2,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['U', 'M 1 V R (; E R', 'KUNSTHALLE', 'LICHTWA..."
3,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['PRIS O NS', 'AUSSTELLUNGE', 'DER FRIEDRICH-S..."
4,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['Christoph Brech', ""it's about time"", '10 Mai..."


In [13]:
# Load the cleaned corpus from its CSV cache; if the cache file does not exist,
# build the corpus from the poster text in `df` and write the cache.
# `df` is deliberately left untouched so this cell can be re-run after a cache hit.
try:
    corpus_df = pd.read_csv(corpus_df_path)

except FileNotFoundError:
    # Only a missing cache triggers a rebuild; any other failure (parse error,
    # permissions, ...) should surface instead of being silently swallowed.
    print("No file found. Creating Corpus...")
    corpus = []

    for i, row in enumerate(df["Most_Likely"], 1):
        # NOTE(review): `cleaner` is neither defined nor imported in the cells
        # visible here — confirm where it comes from (notebook_scripts?).
        cleaned_row = cleaner(row.split(","))
        # One single-element row per poster -> one "Text" column.
        corpus.append([cleaned_row])
        if i % 1000 == 0:
            print(f"[{i}/{len(df)}] processed successfully.")

    corpus_df = pd.DataFrame(corpus, columns=["Text"])
    # `index=False` keeps the row index out of the file, so reloading the cache
    # yields a frame with only the "Text" column.
    corpus_df.to_csv(corpus_df_path, index=False)

print(f"Corpus Length: {len(corpus_df)}")

Corpus Length: 17786


### 1.1) Data Preprocessing

In [14]:
# Drop short texts: a row survives only when split_array(text) is strictly
# greater than NUMBER_WORDS.
has_enough_words = corpus_df["Text"].apply(lambda text: split_array(text) > NUMBER_WORDS)
processed_corpus_df = corpus_df[has_enough_words]

# Plain Python list of the surviving texts, used as input for the encoder.
corpus = processed_corpus_df["Text"].tolist()
print(f"Original Length: {len(corpus_df)} - Processed Length: {len(corpus)}")

Original Length: 17786 - Processed Length: 10663


## 2) Models
Candidate sentence-transformer models (the one in use is selected via `MODEL_NAME`):
- multi-qa-MiniLM-L6-cos-v1
- distilbert-base-nli-mean-tokens
- nq-distilbert-base-v1

In [15]:
# Encode every text in `corpus` into one dense sentence vector.
model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(corpus, show_progress_bar=True)

# Keep the 2-D (n_texts, embedding_dim) layout returned by `encode`: each row
# is the vector of one text. Flattening it (e.g. `reshape(-1, 1)`) would turn
# every scalar into its own "sample", so UMAP/HDBSCAN would no longer operate
# on texts and the cluster labels could not be aligned with `corpus`.
assert embeddings.ndim == 2 and embeddings.shape[0] == len(corpus), embeddings.shape

Batches:   0%|          | 0/334 [00:00<?, ?it/s]

In [None]:
# Project the embeddings down to N_COMPONENTS dimensions (cosine metric);
# the result is the input for clustering.
umap_reducer = umap.UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    metric='cosine',
)
umap_embeddings = umap_reducer.fit_transform(embeddings)

In [None]:
# Fit HDBSCAN on the reduced embeddings; `cluster.labels_` is read later for plotting.
hdbscan_params = dict(
    min_cluster_size=MIN_CLUSTER_SIZE,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster = hdbscan.HDBSCAN(**hdbscan_params).fit(umap_embeddings)

In [None]:
# Prepare data: a separate 2-D UMAP projection used only for plotting.
# It must have exactly two components to fill the 'x'/'y' columns below;
# the clustering itself was fit on the N_COMPONENTS-dimensional projection.
umap_data = umap.UMAP(n_neighbors=N_NEIGHBORS,
                      n_components=2,
                      min_dist=0.0, metric='cosine').fit_transform(embeddings)

result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters: label -1 rows are drawn in grey as outliers, all other
# rows are coloured by their cluster label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
clustered_points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.5, cmap='hsv_r')
# The title reports the pipeline configuration from the constants cell.
ax.set_title(f"Model: {MODEL_NAME} | {N_COMPONENTS} Components | Min Cluster Size {MIN_CLUSTER_SIZE} | {N_NEIGHBORS} Neighbors")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
fig.colorbar(clustered_points, ax=ax)
plt.show()