In [1]:
import pandas as pd

# Load the two source CSV files into separate frames.
df_fw = pd.read_csv("data/Davos_Apartments.csv")   # holiday apartments ("Ferienwohnungen")
df_hotel = pd.read_csv("data/Davos_Hotels.csv")    # hotels


In [2]:
# Tag every row with its origin so the two sources stay distinguishable after they are combined.
df_fw["type"] = "ferienwohnung"
df_hotel["type"] = "hotel"


In [3]:
# Stack both frames row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
df_combined = pd.concat([df_fw, df_hotel], ignore_index=True)


In [4]:
# Persist the combined table (without the index column); the next cell reads this file back.
df_combined.to_csv("data/Davos_combined.csv", index=False)


In [5]:
# Fold each "*_raw" column into its cleaned counterpart, then write the result.
# pandas (pd) is imported in the first cell.
df = pd.read_csv("data/Davos_combined.csv")

# The raw centre-distance column may carry either of two names; take the first one present.
raw_zentrum = next(
    (name for name in ("distance_zentrum_raw", "distance_centre_raw") if name in df.columns),
    None,
)

# (target column, raw column) pairs, processed in this order: ski lift, centre, room size.
for target_col, raw_col in (
    ("distance_skilift", "distance_skilift_raw"),
    ("distance_zentrum", raw_zentrum),
    ("zimmer_grösse", "size_raw"),
):
    if raw_col is None or raw_col not in df.columns:
        continue  # nothing to merge for this pair

    if target_col in df.columns:
        # Keep existing target values; fill only the gaps from the raw column.
        df[target_col] = df[target_col].combine_first(df[raw_col])
    else:
        df[target_col] = df[raw_col]

    df = df.drop(columns=[raw_col])

# Save into the local data folder.
df.to_csv("data/Davos_combined_clean.csv", index=False)

df.head()


Unnamed: 0,web_scraper_order,web_scraper_start_url,price_raw,type,distance_zentrum,distance_skilift,zimmer_grösse
0,1763925941-1,https://www.booking.com/searchresults.de.html?...,CHF 411,ferienwohnung,"Zentrum: 0,8 km",100 m vom Skilift entfernt,Apartment mit Blick auf die Berge
1,1763925941-2,https://www.booking.com/searchresults.de.html?...,CHF 134,ferienwohnung,"Zentrum: 0,9 km",600 m vom Skilift entfernt,Comfort Studio (2 Erwachsene)
2,1763925941-3,https://www.booking.com/searchresults.de.html?...,CHF 120,ferienwohnung,"Zentrum: 1,4 km","1,2 km vom Skilift entfernt",Zweibettzimmer
3,1763925941-4,https://www.booking.com/searchresults.de.html?...,CHF 199,ferienwohnung,"Zentrum: 0,8 km",300 m vom Skilift entfernt,Standard Doppelzimmer
4,1763925941-5,https://www.booking.com/searchresults.de.html?...,CHF 285,ferienwohnung,"Zentrum: 1,2 km",100 m vom Skilift entfernt,Executive Doppelzimmer


In [6]:
# Reduce entries such as "Zentrum: 0,8 km" to the bare distance text.
zentrum_text = df["distance_zentrum"].astype(str)
zentrum_text = zentrum_text.str.replace("Zentrum", "", case=False, regex=False)  # drop the label, any casing
zentrum_text = zentrum_text.str.replace(":", "", regex=False)                    # drop the colon
df["distance_zentrum"] = zentrum_text.str.strip()                                # trim whitespace


In [7]:
# Normalise non-breaking spaces and strip thousands separators from price_raw, then save.
df["price_raw"] = (
    df["price_raw"]
    .astype(str)
    .str.replace("\u00A0", " ", regex=False)  # non-breaking space -> regular space
    # Remove a dot only when exactly three digits follow it (a thousands separator,
    # e.g. "1.170" -> "1170"). A decimal point such as in "99.50" is left alone;
    # stripping every dot between digits would turn that price into "9950".
    .str.replace(r"(?<=\d)\.(?=\d{3}(?!\d))", "", regex=True)
    .str.strip()
)

# Write the cleaned frame back to the shared CSV.
df.to_csv("data/Davos_combined_clean.csv", index=False)

# Quick preview.
df["price_raw"].head(10)

0     CHF 411
1     CHF 134
2     CHF 120
3     CHF 199
4     CHF 285
5     CHF 165
6     CHF 198
7    CHF 1170
8     CHF 464
9     CHF 679
Name: price_raw, dtype: object

In [8]:
# Strip the phrase "vom Skilift entfernt" (any casing) from the ski-lift distance and save.
if "distance_skilift" in df.columns:
    skilift_text = df["distance_skilift"].astype(str)
    skilift_text = skilift_text.str.replace("\u00A0", " ", regex=False)  # normalise non-breaking spaces
    skilift_text = skilift_text.str.replace(
        r"\s*vom\s*skilift\s*entfernt", "", regex=True, case=False
    )
    df["distance_skilift"] = skilift_text.str.strip()

    # Write to the same cleaned CSV that the other cells use.
    df.to_csv("data/Davos_combined_clean.csv", index=False)

# Quick preview.
df["distance_skilift"].head(10)

0     100 m
1     600 m
2    1,2 km
3     300 m
4     100 m
5     500 m
6     650 m
7     550 m
8     200 m
9     550 m
Name: distance_skilift, dtype: object

In [9]:
def km_to_meter(val):
    """Normalise a distance value to the text form ``"<whole metres> m"``.

    Parameters
    ----------
    val : str | int | float | Any
        A distance such as ``"1,2 km"``, ``"650 m"``, ``"300"`` or a plain
        number. A decimal comma is accepted in text input.

    Returns
    -------
    str | Any
        ``"<n> m"`` when a finite number could be parsed. Otherwise the input
        is returned unchanged (text input is returned stripped). This covers
        unparseable text, NaN/inf and values of any other type.
    """
    import math  # local import so the function does not depend on another cell

    def _format(meters):
        # round() instead of int(): float("1.005") * 1000 is 1004.9999999999999,
        # which int() would truncate to 1004.
        return f"{round(meters)} m" if math.isfinite(meters) else None

    if isinstance(val, str):
        val = val.strip()

        # "km" must be tested before "m" because every "km" string also contains "m".
        for unit, factor in (("km", 1000), ("m", 1)):
            if unit in val:
                number_text = val.replace(unit, "").replace(",", ".").strip()
                try:
                    formatted = _format(float(number_text) * factor)
                except ValueError:
                    return val
                return val if formatted is None else formatted

        # Bare number given as text, e.g. "300" or "12.5".
        if val.replace(".", "").isdigit():
            try:
                formatted = _format(float(val))
            except ValueError:
                # e.g. "1.2.3" passes the digit gate but is not a number
                return val
            return val if formatted is None else formatted

        return val

    # Real numbers.
    if isinstance(val, int):
        return f"{int(val)} m"
    if isinstance(val, float):
        formatted = _format(val)  # None for NaN / inf, which int() could not convert
        return val if formatted is None else formatted

    return val


# Rewrite both distance columns element-wise with km_to_meter, which returns text of the form "<n> m".
df["distance_skilift"] = df["distance_skilift"].apply(km_to_meter)
df["distance_zentrum"] = df["distance_zentrum"].apply(km_to_meter)


In [10]:
# Preview the first ten converted ski-lift distances.
df["distance_skilift"].head(10)

0     100 m
1     600 m
2    1200 m
3     300 m
4     100 m
5     500 m
6     650 m
7     550 m
8     200 m
9     550 m
Name: distance_skilift, dtype: object

In [11]:
# Rename only. NOTE(review): the values stay text such as "100 m"; nothing here converts them to numbers.
df = df.rename(columns={"distance_skilift": "distance_skilift_meters"})

In [12]:


import re


def distance_to_meters(val):
    """Convert a textual distance such as ``"1,2 km"`` or ``"650 m"`` to whole metres.

    Parameters
    ----------
    val : Any
        Missing values yield ``pd.NA``. Non-string values are passed through
        unchanged. Strings are lower-cased and a decimal comma is accepted.

    Returns
    -------
    int | pd.NA | Any
        The distance in metres, rounded to the nearest integer, or ``pd.NA``
        when the text holds no number or no ``km`` / ``m`` unit.
    """
    if pd.isna(val):
        return pd.NA

    if not isinstance(val, str):
        return val

    v = val.lower().replace(",", ".").strip()

    # First well-formed decimal number. A lone "." (as in "ca. 500 m") must not
    # be captured, because float(".") raises ValueError.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", v)
    if match is None:
        return pd.NA
    number = float(match.group())

    # km -> metres; tested first because "km" also contains "m".
    # round() rather than int(): 1.005 * 1000 == 1004.9999999999999.
    if "km" in v:
        return round(number * 1000)

    # m -> metres
    if "m" in v:
        return round(number)

    return pd.NA


In [13]:
# Preview the first rows of the frame after the distance clean-up.
df.head()

Unnamed: 0,web_scraper_order,web_scraper_start_url,price_raw,type,distance_zentrum,distance_skilift_meters,zimmer_grösse
0,1763925941-1,https://www.booking.com/searchresults.de.html?...,CHF 411,ferienwohnung,800 m,100 m,Apartment mit Blick auf die Berge
1,1763925941-2,https://www.booking.com/searchresults.de.html?...,CHF 134,ferienwohnung,900 m,600 m,Comfort Studio (2 Erwachsene)
2,1763925941-3,https://www.booking.com/searchresults.de.html?...,CHF 120,ferienwohnung,1400 m,1200 m,Zweibettzimmer
3,1763925941-4,https://www.booking.com/searchresults.de.html?...,CHF 199,ferienwohnung,800 m,300 m,Standard Doppelzimmer
4,1763925941-5,https://www.booking.com/searchresults.de.html?...,CHF 285,ferienwohnung,1200 m,100 m,Executive Doppelzimmer


In [14]:
# Rename only. NOTE(review): the values stay text such as "800 m"; nothing here converts them to numbers.
df = df.rename(columns={"distance_zentrum": "distance_zentrum_meters"})

In [15]:
# Save to the local data folder (overwrites the cleaned CSV, index column omitted).
df.to_csv("data/Davos_combined_clean.csv", index=False)

In [16]:
# Working copy of the room text: cast to str, NBSP -> space, trimmed, lower-cased.
s = (
    df["zimmer_grösse"]
    .astype(str)
    .str.replace("\u00A0", " ", regex=False)
    .str.strip()
    .str.lower()
)

# Room type classification.
def _detect_typ(txt):
    """Classify a lower-cased room description by keyword.

    Returns ``pd.NA`` for missing or empty text (including the literal
    ``"nan"``). Otherwise returns the label of the first category with a
    keyword contained in ``txt``, checked in this order: apartment, studio,
    chalet, suite, zimmer. Returns ``"other"`` when nothing matches.
    """
    if pd.isna(txt) or txt == "" or txt == "nan":
        return pd.NA

    # Order matters: earlier categories win when several keywords occur.
    keyword_table = (
        ("apartment", ("apartment", "ap. ", "apt", "apart")),
        ("studio", ("studio",)),
        ("chalet", ("chalet",)),
        ("suite", ("suite", "loft")),
        ("zimmer", ("doppelzimmer", "einzelzimmer", "zweibett", "dreibett", "bett", "zimmer")),
    )
    for label, keywords in keyword_table:
        for keyword in keywords:
            if keyword in txt:
                return label

    return "other"

df["zimmer_typ"] = s.apply(_detect_typ)

# --- Room count ---

# (A) Explicit statements such as "2 zimmer" or "3 schlafzimmer".
num_rooms = s.str.extract(r'(\d+)\s*(?:zimmer|schlafzimmer)')[0]

# (B) Double and single rooms always count as one room.
# The group is non-capturing: str.contains emits a UserWarning for patterns
# with capture groups, and the groups are not needed for a boolean match.
is_double_or_single = s.str.contains(r'\b(?:doppelzimmer|einzelzimmer)\b', na=False)

# (C) Bed counts are NOT room counts; extracted separately and not stored in df here.
num_beds = s.str.extract(r'(\d+)\s*(?:bett|betten)\b')[0]

# Start from the explicit count (NaN where none was found), then force
# double/single rooms to 1. Everything else stays missing.
zimmer_count = pd.to_numeric(num_rooms, errors="coerce")
zimmer_count = zimmer_count.mask(is_double_or_single, 1)

# Nullable integer dtype so missing counts remain <NA> instead of forcing floats.
df["zimmer_count"] = zimmer_count.astype("Int64")


# Balcony flag
df["has_balkon"] = s.str.contains("balkon", na=False)

# Not saved in this cell; the frame is written to CSV by a later cell.


  is_double_or_single = s.str.contains(r'\b(doppelzimmer|einzelzimmer)\b', na=False)


In [17]:
# Make sure df exists (fallback: reload the cleaned CSV).
if "df" not in globals():
    df = pd.read_csv("data/Davos_combined_clean.csv")

# Fallback: every listing without a room count is counted as one room.
# This includes double and single rooms: a "Doppelzimmer" is one room for two
# people, so it must not be stored as 2 rooms.
mask_missing = df["zimmer_count"].isna()
df.loc[mask_missing, "zimmer_count"] = 1

# Keep the column in the nullable integer dtype.
df["zimmer_count"] = df["zimmer_count"].astype("Int64")


# Save and show a short preview.
df.to_csv("data/Davos_combined_clean.csv", index=False)
df[["zimmer_grösse", "zimmer_count"]].head(20)

Unnamed: 0,zimmer_grösse,zimmer_count
0,Apartment mit Blick auf die Berge,1
1,Comfort Studio (2 Erwachsene),1
2,Zweibettzimmer,1
3,Standard Doppelzimmer,1
4,Executive Doppelzimmer,1
5,Einzelzimmer,1
6,Standard Einzelzimmer,1
7,Chalet,1
8,Deluxe Apartment,1
9,Apartment mit 2 Schlafzimmern,2


In [18]:
# Rename "zimmer_grösse" to "zimmer_description" in df, then save.
if "df" not in globals():
    df = pd.read_csv("data/Davos_combined_clean.csv")

if "zimmer_grösse" in df.columns:
    # Reassign instead of inplace=True so re-running the cell stays predictable.
    df = df.rename(columns={"zimmer_grösse": "zimmer_description"})
    df.to_csv("data/Davos_combined_clean.csv", index=False)

# The preview must be the last top-level expression: Jupyter does not render an
# expression nested inside an `if` body. filter(items=...) yields an empty frame
# rather than raising when the column is absent.
df.filter(items=["zimmer_description"]).head(10)


In [19]:
# Drop the scraper bookkeeping columns from df (where present) and save.
cols_to_drop = ["web_scraper_order", "web_scraper_start_url"]

# errors="ignore" skips labels that are not in the frame.
df = df.drop(columns=cols_to_drop, errors="ignore")

df.to_csv("data/Davos_combined_clean.csv", index=False)
df.head()

Unnamed: 0,price_raw,type,distance_zentrum_meters,distance_skilift_meters,zimmer_description,zimmer_typ,zimmer_count,has_balkon
0,CHF 411,ferienwohnung,800 m,100 m,Apartment mit Blick auf die Berge,apartment,1,False
1,CHF 134,ferienwohnung,900 m,600 m,Comfort Studio (2 Erwachsene),studio,1,False
2,CHF 120,ferienwohnung,1400 m,1200 m,Zweibettzimmer,zimmer,1,False
3,CHF 199,ferienwohnung,800 m,300 m,Standard Doppelzimmer,zimmer,1,False
4,CHF 285,ferienwohnung,1200 m,100 m,Executive Doppelzimmer,zimmer,1,False


In [20]:
import os
from getpass import getpass

from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import pandas as pd

# Connection settings come from the environment so no secret lives in the notebook.
# SECURITY: the password was previously hard-coded in this cell; treat it as
# exposed and rotate it.
db_user = os.environ.get("DAVOS_DB_USER", "davos_user")
db_password = os.environ.get("DAVOS_DB_PASSWORD") or getpass(f"MySQL password for {db_user}: ")

# URL.create escapes special characters in the credentials, which a hand-built
# URL string does not.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=db_user,
        password=db_password,
        host=os.environ.get("DAVOS_DB_HOST", "localhost"),
        port=int(os.environ.get("DAVOS_DB_PORT", "3306")),
        database=os.environ.get("DAVOS_DB_NAME", "davos_db"),
        query={"charset": "utf8mb4"},
    )
)

# Load the cleaned CSV produced by the earlier cells.
df = pd.read_csv("data/Davos_combined_clean.csv")

# if_exists="replace" drops and recreates the table on every run.
df.to_sql(
    "ferienwohnung_hotel",
    con=engine,
    if_exists="replace",
    index=False
)

# Read a few rows back to confirm the upload.
pd.read_sql("SELECT * FROM ferienwohnung_hotel LIMIT 5;", con=engine)


OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on 'localhost' ([Errno 111] Connection refused)")
(Background on this error at: https://sqlalche.me/e/20/e3q8)