In [3]:
# Standard
import os
import pandas as pd # Datenanalyse
from datetime import datetime # Datumsangaben


In [7]:
# ERste Analyse mit Daten von Marcel
# CSV Datei  einlesen
df_or = pd.read_csv("output/cluster_oeffentlich.csv")
df_wm = pd.read_csv("output/cluster_wirtschaft.csv")
df_gm = pd.read_csv("output/cluster_grossemedien.csv")
df_rm = pd.read_csv("output/cluster_regiomedien.csv")
df_di = pd.read_csv("output/cluster_digital.csv")
df_tech = pd.read_csv("output/cluster_tech.csv")


In [3]:
# Spalten
for name, df in zip(
    ["Öffentlich", "Wirtschaft", "Groß", "Regio", "Divers", "Tech"],
    [df_or, df_wm, df_gm, df_rm, df_di, df_tech]
):
    print(f"\n[{name}] Columns:", df.columns.tolist())


[Öffentlich] Columns: ['count', 'word', 'source', 'date']

[Wirtschaft] Columns: ['count', 'word', 'source', 'date']

[Groß] Columns: ['count', 'word', 'source', 'date']

[Regio] Columns: ['count', 'word', 'source', 'date']

[Divers] Columns: ['count', 'word', 'source', 'date']

[Tech] Columns: ['count', 'word', 'source', 'date']


In [4]:
for name, df in zip(
    ["Öffentlich", "Wirtschaft", "Groß", "Regio", "Divers", "Tech"],
    [df_or, df_wm, df_gm, df_rm, df_di, df_tech]
):
    print(f"{name}: {df.shape}")

Öffentlich: (2783770, 4)
Wirtschaft: (8107394, 4)
Groß: (19194483, 4)
Regio: (5742758, 4)
Divers: (5144626, 4)
Tech: (5232188, 4)


In [5]:
df_or.isnull().sum()

count      0
word      81
source     0
date       0
dtype: int64

In [7]:
df_or[df_or["word"].isnull()].sample(5, random_state=1)

Unnamed: 0,count,word,source,date
1752451,1,,dlf,2023-11-29
773554,1,,dlf,2022-07-26
781226,1,,dlf,2022-07-30
2180510,1,,tagesschau,2024-06-23
810992,1,,dlf,2022-08-14


In [14]:
df_or.sample(5, random_state=1)

Unnamed: 0,count,word,source,date
2201196,1,frühere,dlf,2024-07-02
2074551,1,verstärkung.,dlf,2024-05-05
1393432,1,krywyj,tagesschau,2023-06-13
1945289,4,russische,dlf,2024-03-02
2034636,1,pädagogen,dlf,2024-04-15


In [8]:
# Prüfung NaN
df_null = df_or[df_or["word"].isnull()]
print(f"[INFO] Anzahl leerer Zeilen: {len(df_null)}")

[INFO] Anzahl leerer Zeilen: 81


In [9]:
df_null["source"].value_counts()

source
dlf           51
tagesschau    30
Name: count, dtype: int64

In [10]:
df_null["date"].value_counts().sort_index()

date
2021-04-28    1
2021-05-13    1
2021-06-06    1
2021-06-07    1
2021-06-08    1
             ..
2024-12-05    1
2025-01-11    1
2025-01-12    1
2025-02-16    1
2025-02-17    1
Name: count, Length: 78, dtype: int64

In [11]:
# Fehlende Werte speichern - falls für spätere Analyse wichtig sind
df_null.to_csv("output/leere_zeilen_dlf_tagesschau.csv", index=False)


In [12]:
# Fehlende Werte löschen
df_or = df_or[df_or["word"].notnull()]

In [13]:
# Prüfung NaN
df_null = df_or[df_or["word"].isnull()]
print(f"[INFO] Anzahl leerer Zeilen: {len(df_null)}")

[INFO] Anzahl leerer Zeilen: 0


In [8]:
# Alles in einem DataFrame vereinen
df_medien = pd.concat([df_or, df_wm, df_gm, df_rm, df_di, df_tech], ignore_index=True)

In [8]:
# Alles kleinschreiben zur Sicherheit
df_medien["word"] = df_medien["word"].str.lower()

# Nur Wörter, die "bvg" enthalten
df_bvg = df_medien[df_medien["word"].str.contains("bvg", na=False)]
print("[INFO] Anzahl Zeilen mit 'bvg' im Wort:", len(df_bvg))

MemoryError: Unable to allocate 353. MiB for an array with shape (46205219,) and data type float64

In [9]:
# Alle Wörter vorher kleinschreiben
df_medien["word"] = df_medien["word"].astype(str).str.lower()

# RAM-schonender: einfache Filterfunktion
mask = df_medien["word"].apply(lambda w: "bvg" in w if isinstance(w, str) else False)
df_bvg = df_medien.loc[mask]

MemoryError: Unable to allocate 353. MiB for an array with shape (46205219,) and data type float64

In [12]:
df_rm = pd.read_csv("output/cluster_regiomedien.csv")
df_rm["word"] = df_rm["word"].astype(str).str.lower()
df_rm[df_rm["word"].str.contains("bvg", na=False)].shape

(2140, 4)

In [13]:
def bvg_haeufigkeit_pro_cluster(csv_path, cluster_name):
    try:
        df = pd.read_csv(csv_path, usecols=["word", "count", "date", "source"])
        df["word"] = df["word"].astype(str).str.lower()
        df = df[df["word"].str.contains("bvg", na=False)]
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        df["year"] = df["date"].dt.year
        grouped = df.groupby("year")["count"].sum()
        return cluster_name, grouped
    except Exception as e:
        print(f"[ERROR] Fehler bei {cluster_name}: {e}")
        return cluster_name, None

In [14]:
cluster_files = {
    "Öffentlich-rechtlich": "output/cluster_oeffentlich.csv",
    "Wirtschaft": "output/cluster_wirtschaft.csv",
    "Große Medien": "output/cluster_grossemedien.csv",
    "Regional": "output/cluster_regiomedien.csv",
    "Digital": "output/cluster_digital.csv",
    "Tech": "output/cluster_tech.csv"
}

results = {}
for name, path in cluster_files.items():
    cluster_name, grouped = bvg_haeufigkeit_pro_cluster(path, name)
    if grouped is not None:
        results[cluster_name] = grouped

[ERROR] Fehler bei Große Medien: Unable to allocate 146. MiB for an array with shape (19194483,) and data type float64


In [1]:
# Neuer Versuch
def wortsuche_cluster(csv_path, suchwort="bvg"):
    df = pd.read_csv(csv_path, usecols=["word", "count", "date", "source"])
    df["word"] = df["word"].astype(str).str.lower()
    df_treffer = df[df["word"].str.contains(suchwort, na=False)]
    return df_treffer

In [4]:
df_bvg_rm = wortsuche_cluster("output/cluster_regiomedien.csv", "bvg")
df_bvg_rm["date"] = pd.to_datetime(df_bvg_rm["date"], errors="coerce")
df_bvg_rm["year"] = df_bvg_rm["date"].dt.year

# Überblick:
df_bvg_rm["year"].value_counts().sort_index()

year
2021    447
2022    495
2023    412
2024    580
2025    206
Name: count, dtype: int64

In [9]:
# Diese Zeilen liefern dir die "bvg"-Artikel
df_bvg = df_medien.loc[df_medien["word"].str.contains("bvg", na=False)]
print(df_bvg.shape)

(2422, 4)


In [10]:
df_bvg["word"].value_counts().head(10)

word
bvg                 1291
bvg:                 195
bvg-bussen            76
bvg-bus               76
bvg.                  65
bvg-app               43
bvg-chefin            35
bvg-streik            35
bvg,                  33
bvg-fahrinfo-app      26
Name: count, dtype: int64

In [11]:
# Summe der Nennungen
df_bvg["count"].sum()

np.int64(3369)

In [12]:
# Top 10 Begriffe mit BVG
df_bvg.groupby("word")["count"].sum().sort_values(ascending=False).head(10)

word
bvg            2005
bvg:            226
bvg-bus          98
bvg-bussen       82
bvg-app          72
bvg.             69
bvg-streik       62
bvg-chefin       42
bvg-account      36
bvg,             34
Name: count, dtype: int64

In [13]:
df_bvg["date"] = pd.to_datetime(df_bvg["date"], errors="coerce")
df_bvg["year"] = df_bvg["date"].dt.year
bvg_zeit = df_bvg.groupby("year")["count"].sum()
print(bvg_zeit)

year
2021     646
2022     701
2023     609
2024    1056
2025     357
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bvg["date"] = pd.to_datetime(df_bvg["date"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bvg["year"] = df_bvg["date"].dt.year


In [14]:
# Häufigkeit nach Quellen
df_bvg["source"].value_counts()

source
berliner        1127
tagesspiegel    1011
taz               92
netzpolitik       71
ntv               17
zeit              14
heise             13
welt              12
wiwo              11
mm                10
stern             10
spiegel            8
sz                 6
dlf                6
t3n                5
faz                4
handelsblatt       2
abendblatt         2
tagesschau         1
Name: count, dtype: int64

In [15]:
df_bvg.groupby("source")["count"].sum().sort_values(ascending=False)

source
tagesspiegel    1571
berliner        1445
netzpolitik      115
taz              111
ntv               18
zeit              14
welt              13
heise             13
wiwo              13
mm                10
stern             10
spiegel            8
dlf                6
t3n                6
sz                 6
faz                5
abendblatt         2
handelsblatt       2
tagesschau         1
Name: count, dtype: int64

In [16]:
pivot = df_bvg.pivot_table(index="year", columns="source", values="count", aggfunc="sum", fill_value=0)
pivot.head()

source,abendblatt,berliner,dlf,faz,handelsblatt,heise,mm,netzpolitik,ntv,spiegel,stern,sz,t3n,tagesschau,tagesspiegel,taz,welt,wiwo,zeit
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021,0,115,5,1,2,2,0,0,0,0,3,0,0,0,497,7,0,13,1
2022,0,181,0,3,0,0,0,0,2,5,1,0,0,0,493,14,1,0,1
2023,2,359,0,0,0,3,9,0,2,0,2,2,6,0,187,31,1,0,5
2024,0,580,1,1,0,6,1,115,5,2,2,2,0,0,289,43,5,0,4
2025,0,210,0,0,0,2,0,0,9,1,2,2,0,1,105,16,6,0,3


In [18]:
# Test für Handelsblatt
from bs4 import BeautifulSoup

# Beispiel-Dateiname vom Handelsblatt
html_datei = "D:/DBU/ADSC11 ADS-01/Studienarbeit/newspaper-scraping/input/raw/data-lake/2025-02-28-handelsblatt.html"

# 1. HTML-Datei einlesen
with open(html_datei, "r", encoding="utf-8") as f:
    html = f.read()

# 2. Soup-Objekt erstellen
soup = BeautifulSoup(html, "html.parser")

# 3. Nur gezielte Tags durchsuchen
relevante_tags = soup.find_all(["p", "h2", "h3"])
relevanter_text = " ".join(tag.get_text(separator=" ").strip() for tag in relevante_tags)

# 4. Ergebnis anzeigen
print("[INFO] Länge des extrahierten Texts:", len(relevanter_text))
print("[INFO] Textvorschau:\n", relevanter_text[:500])
print("[INFO] Enthält 'bvg'? →", "bvg" in relevanter_text.lower())

[INFO] Länge des extrahierten Texts: 0
[INFO] Textvorschau:
 
[INFO] Enthält 'bvg'? → False


In [19]:
# 1. CSV laden
df_wm = pd.read_csv("output/cluster_wirtschaft.csv")

# 2. Nur Handelsblatt-Artikel filtern
df_hb = df_wm[df_wm["source"] == "handelsblatt"]

# 3. Top 10 Wörter nach Gesamtanzahl
top10_hb = df_hb.groupby("word")["count"].sum().sort_values(ascending=False).head(10)

# 4. Anzeigen
print(top10_hb)

word
:               47721
mehr…           44711
image           23869
handelsblatt    21714
 mehr…          19880
merken          15286
play            12846
thema           11337
unternehmen      8136
kommentieren     7682
Name: count, dtype: int64


In [20]:
df_hb_bvg = df_hb[df_hb["word"].str.lower() == "bvg"]
print(df_hb_bvg["count"].sum())

0


In [21]:
df_hb[df_hb["word"].str.contains("bvg", na=False, case=False)].groupby("word")["count"].sum().sort_values(ascending=False)

word
(bvg)         1
bvg-chefin    1
Name: count, dtype: int64

In [15]:
# Grobe Analyse
# 1. Medienbeiträge filtern, die über die BVG sprechen
# Nur Zeilen mit dem Wort "bvg"
df_bvg = df_or[df_or["word"] == "bvg"]
print(f"[INFO] Anzahl BVG-Nennungen: {len(df_bvg)}")

[INFO] Anzahl BVG-Nennungen: 0


In [19]:
df_or[df_or["word"].str.startswith("bvg", na=False)]["word"].value_counts().head(20)

word
bvg-urteil         3
bvg-direktor       3
bvg-tarifstreit    1
Name: count, dtype: int64

In [18]:
df_or[df_or["word"].str.lower().isin(["bvg", "bvg.", "bvg,", "bvg)"])]

Unnamed: 0,count,word,source,date


In [16]:
df_or["word"].value_counts().head(50)

word
impressum               2856
wirtschaft              2856
deutschland             2856
nachrichten             2856
ard                     2856
audios                  2856
service                 2856
kontakt                 2856
live                    2855
sendungen               2853
europa                  2852
livestream              2852
gesellschaft            2852
programm                2816
aktuelle                2814
podcasts                2812
menschen                2724
sport                   2633
datenschutzerklärung    2627
welt                    2482
deutsche                2453
ukraine                 2373
russland                2366
jahren                  2361
wissen                  2339
startseite              2324
kultur                  2317
hintergrund             2317
werden.                 2315
deutschen               2278
usa                     2270
krieg                   2264
podcast                 2258
100                     2237
verbrauch

In [17]:
df_or[df_or["word"].str.contains("bvg", na=False)]

Unnamed: 0,count,word,source,date
9935,1,bvg-urteil,dlf,2021-04-08
187566,1,bvg-urteil,dlf,2021-08-05
196356,1,bvg-direktor,dlf,2021-08-11
197899,1,bvg-direktor,dlf,2021-08-12
199216,1,bvg-direktor,dlf,2021-08-13
2074833,1,bvg-urteil,dlf,2024-05-05
2706307,1,bvg-tarifstreit,tagesschau,2025-01-27


In [21]:
import re
from bs4 import BeautifulSoup

def process_html_test(html, stopwords_list=None):
    """Bereinigt HTML-Inhalt testweise robuster (für Medienanalyse)"""
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator=" ").lower()

    # Wörter mit mindestens 2 Buchstaben (keine Satzzeichen etc.)
    tokens = re.findall(r"\b[a-zäöüß]{2,}\b", text)

    if stopwords_list is not None:
        tokens = [t for t in tokens if t not in stopwords_list]

    return tokens

In [22]:
html = "<p>Die BVG, so sagt man, fährt selten pünktlich.</p>"
print(process_html_test(html))

['die', 'bvg', 'so', 'sagt', 'man', 'fährt', 'selten', 'pünktlich']


In [1]:
def extract_text_by_medium(html, medium):
    soup = BeautifulSoup(html, "html.parser")

    if medium == "tagesschau":
        text_tags = soup.find_all("p", class_=True)
    elif medium == "deutschlandfunk":
        text_tags = soup.find_all("p", class_=True)
    elif medium == "handelsblatt":
        text_tags = soup.find_all(["h2", "h3"])
    elif medium == "tagesspiegel":
        text_tags = soup.find_all("p")
    else:
        # Fallback: alles nehmen
        return soup.get_text(separator=" ", strip=True)

    return " ".join(t.get_text(separator=" ", strip=True) for t in text_tags)

In [2]:
text = extract_text_by_medium(html, newspaper["name"])
tokens = process_html_test(text, stopwords_list)

NameError: name 'html' is not defined