In [None]:
# Imports + Load + Basic cleaning 
import re
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, entropy

FILE_PATH = r"...\PDA_Data_Final.xlsx"

# Load the restaurant table that every later cell reads through `df`.
df = pd.read_excel(FILE_PATH)

# Columns this notebook expects to find in the workbook.
expected_cols = [
    "city", "title", "address", "Öffnungsklasse", "Services",
    "Restaurantkonzept", "Küchenregion", "Kette oder unabhängig",
    "Kettenzuordnung", "Preiskategorie"
]

# Report (but do not abort on) columns that are absent from the file.
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    print("Missing columns:", missing)

# Cast every available expected column to the nullable string dtype and
# trim surrounding whitespace.
for c in expected_cols:
    if c in df.columns:
        df[c] = df[c].astype("string").str.strip()

# Placeholder written into missing cells, per column. "Kettenzuordnung" gets
# an empty string rather than "Unbekannt".
fill_values = {
    "city": "Unbekannt",
    "Küchenregion": "Unbekannt",
    "Restaurantkonzept": "Unbekannt",
    "Services": "Unbekannt",
    "Öffnungsklasse": "Unbekannt",
    "Preiskategorie": "Unbekannt",
    "Kettenzuordnung": "",
}
for col_name, fill_value in fill_values.items():
    # Same presence guard as the cleaning loop above: a column that was
    # already reported as missing must not raise a KeyError here.
    if col_name in df.columns:
        df[col_name] = df[col_name].fillna(fill_value)

# Shared matplotlib defaults for all figures in the notebook.
plt.rcParams.update({
    "figure.dpi": 120,
    "savefig.dpi": 300,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
    "axes.grid": True,
    "grid.alpha": 0.25,
})

In [None]:
#  Helper functions 
def top_n_with_other(series, n=20, other_label="Other"):
    """Keep the ``n`` most frequent values and relabel the rest.

    Parameters
    ----------
    series : pandas.Series
        Categorical values to bucket.
    n : int
        Number of most frequent distinct values to keep.
    other_label : str
        Replacement for every value outside the top ``n``.

    Returns
    -------
    pandas.Series
        ``series`` itself when it has at most ``n`` distinct values
        (missing values count as one distinct value), otherwise a new
        series with the less frequent values replaced by ``other_label``.
    """
    frequencies = series.value_counts(dropna=False)
    if len(frequencies) > n:
        # value_counts is sorted by descending frequency, so the first n
        # index entries are the values to keep.
        keep = frequencies.index[:n]
        is_kept = series.isin(keep)
        return series.where(is_kept, other_label)
    return series

def rotate_xticks(ax, rotation=45, ha="right"):
    """Rotate the x tick labels of ``ax`` and set their horizontal alignment.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose x tick labels are adjusted.
    rotation : float
        Label rotation passed to ``tick_params``.
    ha : str
        Horizontal alignment applied to every current x tick label.
    """
    ax.tick_params(axis="x", labelrotation=rotation)
    for tick_label in ax.get_xticklabels():
        tick_label.set_horizontalalignment(ha)

def annotate_bars(ax, fmt="{:.0f}"):
    """Write each bar's height just above the bar.

    Only patches with a finite, strictly positive height are labelled.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose ``patches`` are annotated.
    fmt : str
        ``str.format`` template applied to the bar height.
    """
    # Text is centred on the bar and lifted 2 points above its top edge.
    label_style = {
        "ha": "center",
        "va": "bottom",
        "fontsize": 9,
        "xytext": (0, 2),
        "textcoords": "offset points",
    }
    for bar in ax.patches:
        height = bar.get_height()
        if not np.isfinite(height) or height <= 0:
            continue
        x_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(fmt.format(height), (x_center, height), **label_style)

def percent_stack(ax):
    """Format the y axis of ``ax`` as a 0–100 % share axis.

    Sets the y limits to [0, 1], labels the axis "Anteil" and renders tick
    values as whole-number percentages.
    """
    def _as_percent(value, pos):
        # `pos` is part of the formatter call signature and is not used.
        return f"{int(round(value*100))}%"

    ax.set_ylim(0, 1)
    ax.set_ylabel("Anteil")
    ax.yaxis.set_major_formatter(_as_percent)

def plot_distribution(col, top_n=25, title=None):
    """Show a bar chart of the value counts of ``df[col]``.

    Missing values are shown as "Unbekannt"; values outside the ``top_n``
    most frequent ones are merged into a single "Other" bar.

    Parameters
    ----------
    col : str
        Column of the global ``df`` to plot.
    top_n : int
        Number of most frequent categories to keep as separate bars.
    title : str or None
        Figure title. Any falsy value falls back to
        ``"Distribution: <col>"``.
    """
    values = top_n_with_other(
        df[col].fillna("Unbekannt"), n=top_n, other_label="Other"
    )
    frequencies = values.value_counts()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(frequencies.index.astype(str), frequencies.values)

    heading = title if title else f"Distribution: {col}"
    ax.set_title(heading)
    ax.set_xlabel(col)
    ax.set_ylabel("Anzahl")

    rotate_xticks(ax, 45, "right")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    annotate_bars(ax)
    fig.tight_layout()
    plt.show()

In [None]:
# Distribution of restaurant concepts (30 most frequent, rest as "Other")
plot_distribution(
    col="Restaurantkonzept",
    top_n=30,
    title="Restaurantkonzept – Verteilung",
)

In [None]:
# Distribution of cuisine regions (30 most frequent, rest as "Other").
# The title is a single space, so the figure gets a blank heading instead of
# the function's default "Distribution: ..." text.
plot_distribution(
    col="Küchenregion",
    top_n=30,
    title=" ",
)

In [None]:
# Distribution of service types (10 most frequent, rest as "Other").
# The title is a single space, so the figure gets a blank heading instead of
# the function's default "Distribution: ..." text.
plot_distribution(
    col="Services",
    top_n=10,
    title=" ",
)

In [None]:
# Distribution of opening-hours classes (10 most frequent, rest as "Other").
# The title is a single space, so the figure gets a blank heading instead of
# the function's default "Distribution: ..." text.
plot_distribution(
    col="Öffnungsklasse",
    top_n=10,
    title=" ",
)

In [None]:
# Top 30 cities by number of restaurants
# `city_counts` is also read by the Pareto cell that follows.
city_counts = df["city"].value_counts()
top_k = 30
top_cities = city_counts.head(top_k)

fig, ax = plt.subplots(figsize=(11, 5.5))
ax.bar(top_cities.index.astype(str), top_cities.values)

ax.set_title(f"Top {top_k} Städte nach Anzahl der Restaurants")
ax.set_xlabel("Stadt")
ax.set_ylabel("Anzahl Restaurants")

rotate_xticks(ax, 45, "right")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Use the shared helper instead of a hand-copied annotation loop; bar heights
# are integer counts, so its default "{:.0f}" format prints the same labels.
annotate_bars(ax)

fig.tight_layout()
plt.show()

In [None]:
# City concentration (Pareto curve / cumulative share of restaurants)
fig, ax = plt.subplots(figsize=(8.5, 5.2))

# city_counts is sorted by descending count, so the running sum of shares
# gives the cumulative share covered by the k largest cities.
cum = city_counts.div(city_counts.sum()).cumsum()
ax.plot(np.arange(1, len(cum) + 1), cum.to_numpy())

ax.set(
    title="  ",
    xlabel="Städterang (1 = größte Stadt)",
    ylabel="Kumulativer Anteil",
)

# Slight headroom above 100 % so the end of the curve is not clipped.
ax.set_ylim(0, 1.02)
ax.yaxis.set_major_formatter(lambda x, pos: f"{int(round(x*100))}%")

for side in ("top", "right"):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# 100%-Stacked: Restaurantkonzept-Mix nach Stadt (Top 20 + Sonstige)
def plot_100pct_stacked_by_city(col, top_k=20):
    """Draw a 100 % stacked bar chart of the ``col`` mix per city.

    Cities outside the ``top_k`` most frequent ones are merged into a
    "Sonstige" bar. Bars are ordered by the number of rows in each city
    bucket, largest first.

    Parameters
    ----------
    col : str
        Column of the global ``df`` whose category shares are stacked.
    top_k : int
        Number of most frequent cities kept as separate bars.
    """
    city_top = top_n_with_other(df["city"], n=top_k, other_label="Sonstige")

    # Work from absolute counts first: after row normalisation every row sums
    # to 1, so row sums of the share table cannot serve as a sort key.
    counts = pd.crosstab(city_top, df[col])
    city_sizes = counts.sum(axis=1).sort_values(ascending=False, kind="stable")
    counts = counts.loc[city_sizes.index]

    # Row-normalise: each city's categories add up to 1.
    tab = counts.div(city_sizes, axis=0)

    fig, ax = plt.subplots(figsize=(12.5, 6.2))
    bottom = np.zeros(len(tab))
    colors = plt.cm.tab20(np.linspace(0, 1, tab.shape[1]))
    x_labels = tab.index.astype(str)

    # Stack one bar segment per category on top of the previous ones.
    for i, c in enumerate(tab.columns):
        segment = tab[c].values
        ax.bar(
            x_labels,
            segment,
            bottom=bottom,
            label=str(c),
            color=colors[i],
        )
        bottom += segment

    ax.set_title(" ")
    ax.set_ylabel("Anteil")
    ax.set_ylim(0, 1)
    ax.yaxis.set_major_formatter(lambda x, pos: f"{int(round(x*100))}%")

    rotate_xticks(ax, 45, "right")
    # Legend is placed outside the axes, to the right of the plot area.
    ax.legend(ncol=2, bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    fig.tight_layout()
    plt.show()

# Restaurant-concept mix for the 20 largest cities plus "Sonstige".
plot_100pct_stacked_by_city(
    col="Restaurantkonzept",
    top_k=20,
)

In [None]:
# Heatmap: Stadt × Küchenregion (Anteile innerhalb der Stadt)
def heatmap_city_x_category(cat_col, top_city=25, top_cat=20):
    """Show a heatmap of category shares within each city.

    Rows are cities, columns are values of ``cat_col``; each cell is the
    share of that category among the city's rows. Cities and categories
    outside their respective top lists are merged into "Sonstige".

    Parameters
    ----------
    cat_col : str
        Column of the global ``df`` used for the heatmap columns.
    top_city : int
        Number of most frequent cities kept as separate rows.
    top_cat : int
        Number of most frequent categories kept as separate columns.
    """
    bucketed_col = cat_col + "_top"

    d = df.copy()
    d["city_top"] = top_n_with_other(d["city"], n=top_city, other_label="Sonstige")
    d[bucketed_col] = top_n_with_other(d[cat_col], n=top_cat, other_label="Sonstige")

    # normalize="index" turns each row into shares that sum to 1.
    shares = pd.crosstab(d["city_top"], d[bucketed_col], normalize="index")
    n_rows, n_cols = shares.shape

    fig, ax = plt.subplots(figsize=(12.5, 7.2))
    image = ax.imshow(
        shares.values,
        aspect="auto",
        interpolation="nearest",
        cmap="viridis",
    )

    ax.set_title(f"Anteil-Heatmap: Stadt × {cat_col}")

    ax.set_yticks(np.arange(n_rows))
    ax.set_yticklabels(shares.index.astype(str))

    ax.set_xticks(np.arange(n_cols))
    ax.set_xticklabels(shares.columns.astype(str))
    rotate_xticks(ax, 45, "right")

    colorbar = fig.colorbar(image, ax=ax, fraction=0.035, pad=0.02)
    colorbar.set_label("Anteil innerhalb der Stadt")

    fig.tight_layout()
    plt.show()

# Cuisine-region shares for the 25 largest cities and 20 largest regions.
heatmap_city_x_category(
    cat_col="Küchenregion",
    top_city=25,
    top_cat=20,
)

In [None]:
# Zusammenhang: Restaurantkonzept × Küchenregion (standardisierte Residuen + Cramérs V)
def cramers_v_from_chi2(chi2, n, r, k):
    """Compute Cramér's V from a chi-square statistic.

    Parameters
    ----------
    chi2 : float
        Chi-square statistic of the contingency table.
    n : int
        Total number of observations in the table.
    r, k : int
        Number of rows and columns of the table.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * min(r - 1, k - 1)))``, or NaN when that
        denominator is not positive (empty table or a single row/column).
    """
    smaller_dim = min(r - 1, k - 1)
    denom = n * smaller_dim
    if denom > 0:
        return math.sqrt(chi2 / denom)
    return np.nan

def chi2_residual_heatmap(a, b, top_a=25, top_b=25):
    """Show a residual heatmap for the ``a`` × ``b`` contingency table.

    Both columns are reduced to their most frequent values (rest merged
    into "Sonstige"), a chi-square test is run on the cross-tabulation,
    and each cell is coloured by (observed - expected) / sqrt(expected).
    The test statistic, p-value, degrees of freedom and Cramér's V are
    printed in the title.

    Parameters
    ----------
    a, b : str
        Columns of the global ``df`` used for rows and columns.
    top_a, top_b : int
        Number of most frequent values of ``a`` / ``b`` kept separately.
    """
    d = df.copy()
    # Bucket the two columns in sequence: first a, then b.
    for column, limit in ((a, top_a), (b, top_b)):
        d[column] = top_n_with_other(d[column], n=limit, other_label="Sonstige")

    ct = pd.crosstab(d[a], d[b])
    observed = ct.values
    chi2, p, dof, expected = chi2_contingency(observed)

    v = cramers_v_from_chi2(chi2, observed.sum(), *ct.shape)

    resid_df = pd.DataFrame(
        (observed - expected) / np.sqrt(expected),
        index=ct.index.astype(str),
        columns=ct.columns.astype(str),
    )
    resid_values = resid_df.values
    n_rows, n_cols = resid_df.shape

    fig, ax = plt.subplots(figsize=(12.5, 7.2))
    # Symmetric colour scale, capped at the 95th percentile of |residual| so a
    # few extreme cells do not wash out the rest of the map.
    vmax = np.nanpercentile(np.abs(resid_values), 95)
    im = ax.imshow(
        resid_values,
        aspect="auto",
        interpolation="nearest",
        cmap="coolwarm",
        vmin=-vmax,
        vmax=vmax,
    )

    ax.set_title(
        f"Zusammenhang: {a} × {b}\n"
        f"Chi²={chi2:.1f}, p={p:.3g}, df={dof}, Cramérs V={v:.3f}"
    )

    ax.set_yticks(np.arange(n_rows))
    ax.set_yticklabels(resid_df.index)

    ax.set_xticks(np.arange(n_cols))
    ax.set_xticklabels(resid_df.columns)
    rotate_xticks(ax, 45, "right")

    cbar = fig.colorbar(im, ax=ax, fraction=0.035, pad=0.02)
    cbar.set_label("Standardisierte Residuen")

    fig.tight_layout()
    plt.show()

# Association between restaurant concept and cuisine region (top 25 each).
chi2_residual_heatmap(
    a="Restaurantkonzept",
    b="Küchenregion",
    top_a=25,
    top_b=25,
)

In [None]:
# Küchenregion-Diversität nach Stadt: Entropie vs. Stadtgröße (Top 25 Städte)
def hhi(shares):
    """Return the sum of squared shares (Herfindahl-Hirschman index).

    Parameters
    ----------
    shares : array-like of float
        Share values; only strictly positive entries contribute.

    Returns
    -------
    numpy.float64
        Sum of the squared positive shares (0.0 for empty input).
    """
    values = np.asarray(shares, dtype=float)
    positive = values[values > 0]
    return np.sum(np.square(positive))

def city_diversity_metrics(cat_col, top_city=50, top_cat=25):
    """Compute per-city diversity metrics for the categories in ``cat_col``.

    Cities and categories outside their respective top lists are merged
    into "Sonstige" before the shares are computed.

    Parameters
    ----------
    cat_col : str
        Column of the global ``df`` whose category mix is measured.
    top_city : int
        Number of most frequent cities kept as separate rows.
    top_cat : int
        Number of most frequent categories kept separately.

    Returns
    -------
    pandas.DataFrame
        One row per city bucket with columns "Stadt", "Anzahl" (rows in
        the bucket), "Entropie" (Shannon entropy of the category shares,
        base 2) and "HHI" (sum of squared shares), sorted by "Anzahl"
        in descending order.
    """
    bucketed_col = cat_col + "_top"

    d = df.copy()
    d["city_top"] = top_n_with_other(d["city"], n=top_city, other_label="Sonstige")
    d[bucketed_col] = top_n_with_other(d[cat_col], n=top_cat, other_label="Sonstige")

    # Category shares within each city (rows sum to 1).
    shares = pd.crosstab(d["city_top"], d[bucketed_col], normalize="index")

    def _entropy_bits(row):
        return entropy(row.values, base=2)

    def _concentration(row):
        return hhi(row.values)

    # Align the bucket sizes with the row order of the share table.
    sizes = d["city_top"].value_counts().reindex(shares.index)

    summary = pd.DataFrame({
        "Stadt": shares.index.astype(str),
        "Anzahl": sizes.values,
        "Entropie": shares.apply(_entropy_bits, axis=1).values,
        "HHI": shares.apply(_concentration, axis=1).values,
    })
    return summary.sort_values("Anzahl", ascending=False)

metrics_cuisine = city_diversity_metrics("Küchenregion", top_city=50, top_cat=25)

fig, ax = plt.subplots(figsize=(10, 5))

# "Sonstige" is the aggregate bucket for every city outside the top 50, not a
# city. Drop it before taking the 25 largest, otherwise it is plotted as if it
# were a single city. `metrics_cuisine` itself is left unchanged.
m = (
    metrics_cuisine[metrics_cuisine["Stadt"] != "Sonstige"]
    .sort_values("Anzahl", ascending=False)
    .head(25)
)

ax.scatter(m["Anzahl"], m["Entropie"])
ax.set_title("Küchenregion-Diversität nach Stadt (Shannon-Entropie, Basis 2) – Top 25")
ax.set_xlabel("Stadtgröße (Anzahl Restaurants)")
ax.set_ylabel("Entropie (höher = vielfältiger)")

ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Top 25 chains by count (from "Kettenzuordnung") – exploratory
# Flag rows whose chain assignment is a non-empty string.
df["is_chain_assigned"] = df["Kettenzuordnung"].astype("string").str.len().fillna(0) > 0

# Count chain names among flagged rows, discarding placeholder values.
chain_counts = (
    df.loc[df["is_chain_assigned"], "Kettenzuordnung"]
    .astype("string")
    .str.strip()
    .replace({"Missing": pd.NA, "missing": pd.NA, "Unbekannt": pd.NA, "": pd.NA})
    .dropna()
    .value_counts()
    .head(25)
)

fig, ax = plt.subplots(figsize=(11, 5.5))
ax.bar(chain_counts.index.astype(str), chain_counts.values)

ax.set_title("Top 25 Ketten nach Anzahl (aus „Kettenzuordnung“) – explorativ")
ax.set_xlabel("Kette")
ax.set_ylabel("Anzahl Restaurants")

rotate_xticks(ax, 45, "right")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Use the shared helper instead of a hand-copied annotation loop; bar heights
# are integer counts, so its default "{:.0f}" format prints the same labels.
annotate_bars(ax)

fig.tight_layout()
plt.show()

In [None]:
# 2D-Strukturkarte (One-Hot-Kategorien - SVD)
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that are one-hot encoded for the 2-D projection.
cols_mca = ["Restaurantkonzept", "Küchenregion", "Services", "Öffnungsklasse"]
X = df[cols_mca].fillna("Unbekannt").astype("string")

# The sparse-output keyword differs between scikit-learn versions: try
# `sparse_output` first and fall back to `sparse` if it is rejected.
try:
    enc = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    enc = OneHotEncoder(handle_unknown="ignore", sparse=True)

Xo = enc.fit_transform(X)

# Project the one-hot matrix onto two components (fixed seed).
svd = TruncatedSVD(n_components=2, random_state=42)
Z = svd.fit_transform(Xo)

fig, ax = plt.subplots(figsize=(8.5, 6.5))
ax.scatter(Z[:, 0], Z[:, 1], s=6, alpha=0.35)

ax.set(
    title="2D-Strukturkarte (One-Hot-Kategorien → SVD)",
    xlabel="Komponente 1",
    ylabel="Komponente 2",
)

for side in ("top", "right"):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()