### Notebook for exploring the data

In [1]:
from pathlib import Path
import json

In [2]:
metadata_files = [json.load(open(file)) for file in list(Path("metadata").glob("*.json"))[:100000]]

In [3]:
#metadata_files = [metadata for metadata in metadata_files if metadata["language_metadata"]["language"] == "de"]

In [11]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
header_texts = []
for metadata in metadata_files:
    pdf_toc = [toc[1] for toc in metadata["pdf_statistics"]["table_of_contents"]]
    docling_toc = metadata["docling_metadata"]["section_headers"]
    header_texts.extend(pdf_toc + docling_toc)
    pdf_title = metadata["pdf_statistics"]["title"]
    if pdf_title and len(pdf_title) > 0:
        header_texts.append(pdf_title)

In [13]:
%pip install spacy
!python -m spacy download de_core_news_sm

[0mNote: you may need to restart the kernel to use updated packages.
2025-03-21 12:04:10.454901: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-21 12:04:10.454954: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-21 12:04:10.459878: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-21 12:04:10.466524: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate

In [15]:
import spacy
from tqdm import tqdm
nlp = spacy.load('de_core_news_sm')
lemmas = []
for header in tqdm(header_texts[:10000]):
    lemmas.extend([t.lemma_.lower() for t in nlp(header) if t.pos_ == "NOUN"])


100%|██████████| 10000/10000 [00:44<00:00, 224.98it/s]


In [16]:
# Get German stop words
german_stop_words = set(stopwords.words('german'))

# Filter out stop words from lemmas
lemmas = [lemma for lemma in lemmas if lemma not in german_stop_words]

# Remove numbers, special characters, and single characters
lemmas = [lemma for lemma in lemmas if lemma.isalpha() and len(lemma) > 1]

In [17]:
from collections import Counter

# Count word frequencies
word_counts = Counter(lemmas)

# Get the 10 most common words
most_common_words = word_counts.most_common(200)

# Display the most common words
[(count, word) for word, count in most_common_words]

[(91, 'datum'),
 (71, 'information'),
 (70, 'uhr'),
 (53, 'kosten'),
 (52, 'anlage'),
 (48, 'gmbh'),
 (42, 'art'),
 (41, 'sitzung'),
 (41, 'tagesordnung'),
 (40, 'hinweis'),
 (34, 'antrag'),
 (34, 'name'),
 (34, 'kontakt'),
 (34, 'vorlage'),
 (33, 'begründung'),
 (33, 'produkt'),
 (32, 'beschlussvorlage'),
 (31, 'verein'),
 (30, 'anfrage'),
 (30, 'sachverhalt'),
 (29, 'jahr'),
 (29, 'einladung'),
 (28, 'zweck'),
 (28, 'angabe'),
 (27, 'teil'),
 (27, 'änderung'),
 (27, 'recht'),
 (26, 'aufgabe'),
 (26, 'herr'),
 (26, 'ag'),
 (25, 'kind'),
 (24, 'ziel'),
 (24, 'april'),
 (23, 'datenschutz'),
 (23, 'application'),
 (22, 'beratungsfolge'),
 (22, 'anmeldung'),
 (22, 'juni'),
 (22, 'auswirkung'),
 (22, 'mai'),
 (21, 'schule'),
 (20, 'stadt'),
 (20, 'juli'),
 (19, 'angebot'),
 (19, 'product'),
 (18, 'mitglied'),
 (18, 'grund'),
 (17, 'mitgliedschaft'),
 (17, 'stand'),
 (17, 'lage'),
 (16, 'preis'),
 (16, 'person'),
 (16, 'text'),
 (16, 'manufacturer'),
 (16, 'pflicht'),
 (16, 'haftung'),
 (16

In [18]:
language_cnt = {}
for metadata in metadata_files:
    language_cnt[metadata["language_metadata"]["language"]] = language_cnt.get(metadata["language_metadata"]["language"], 0) + 1

language_cnt

{'fr': 1378,
 'de': 35701,
 'en': 7224,
 'pt': 242,
 'el': 461,
 'unknown': 304,
 'nl': 443,
 'hr': 508,
 'cs': 161,
 'it': 1028,
 'es': 1039,
 'pl': 303,
 'fi': 29,
 'bg': 37,
 'no': 86,
 'ru': 207,
 'sv': 116,
 'ca': 241,
 'da': 147,
 'ro': 121,
 'lv': 41,
 'hu': 196,
 'tr': 83,
 'sk': 85,
 'sl': 71,
 'et': 54,
 'id': 34,
 'zh-cn': 9,
 'cy': 43,
 'uk': 44,
 'fa': 24,
 'ja': 12,
 'mk': 48,
 'ta': 1,
 'lt': 16,
 'tl': 37,
 'vi': 19,
 'ar': 26,
 'so': 27,
 'sw': 14,
 'sq': 15,
 'he': 13,
 'ko': 11,
 'hi': 3,
 'af': 13,
 'th': 1,
 'gu': 1,
 'bn': 1}

In [19]:
total_docs = sum(language_cnt.values())

language_percentages = {lang: (count/total_docs)*100 for lang, count in language_cnt.items()}

sorted_percentages = dict(sorted(language_percentages.items(), key=lambda x: x[1], reverse=True))

for lang, pct in sorted_percentages.items():
    print(f"{lang}: {pct:.2f}%")


de: 70.39%
en: 14.24%
fr: 2.72%
es: 2.05%
it: 2.03%
hr: 1.00%
el: 0.91%
nl: 0.87%
unknown: 0.60%
pl: 0.60%
pt: 0.48%
ca: 0.48%
ru: 0.41%
hu: 0.39%
cs: 0.32%
da: 0.29%
ro: 0.24%
sv: 0.23%
no: 0.17%
sk: 0.17%
tr: 0.16%
sl: 0.14%
et: 0.11%
mk: 0.09%
uk: 0.09%
cy: 0.08%
lv: 0.08%
bg: 0.07%
tl: 0.07%
id: 0.07%
fi: 0.06%
so: 0.05%
ar: 0.05%
fa: 0.05%
vi: 0.04%
lt: 0.03%
sq: 0.03%
sw: 0.03%
he: 0.03%
af: 0.03%
ja: 0.02%
ko: 0.02%
zh-cn: 0.02%
hi: 0.01%
ta: 0.00%
th: 0.00%
gu: 0.00%
bn: 0.00%
