In [2]:
import pandas as pd#
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import lightgbm as lgb
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

from ydata_profiling import ProfileReport


import xgboost as xgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Preprocessing & Feature Engineering


In [35]:
def preprocess_data (df): 
    df.replace('""', np.nan, inplace=True)
    df.dropna(inplace=True)
    df["LivingSpace"] = pd.to_numeric(df["LivingSpace"], errors="coerce")
    df["Rooms"] = pd.to_numeric(df["Rooms"], errors="coerce")
    df.dropna(subset=['ConstructionYear'], inplace=True)
    df.dropna(subset=['Object_price'], inplace=True)
    df.dropna(subset=['Rooms'], inplace=True)
    df.dropna(subset=['LivingSpace'], inplace=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df['ZipCode']= df['ZipCode'].astype(str)
    df['LivingSpace']= df['LivingSpace'].astype(float)
    df = df.reindex()
    df = df.reset_index(drop=True)

    # keep only the most promising features
    # sns.heatmap(df.corr(), cmap="RdBu")
    important_num_cols = list(df.corr()["Object_price"][(df.corr()["Object_price"]>0.20) | (df.corr()["Object_price"]<-0.20)].index)
    cat_cols = [col for col in df.columns if df[col].dtype == 'object']
    important_cols = important_num_cols + cat_cols
    df = df[important_cols]

    # one-hot encoding
    y = df['Object_price']
    X = df.drop('Object_price', axis=1)
    X = pd.get_dummies(X, columns=cat_cols)

    return X, y


df = pd.read_excel(r'data\flats_to_rent_wue_preprocessed_0407.xlsx')
X, y = preprocess_data(df)
print(X.shape)
print(y.shape)

(138, 49)
(138,)


  important_num_cols = list(df.corr()["Object_price"][(df.corr()["Object_price"]>0.20) | (df.corr()["Object_price"]<-0.20)].index)


# Explorative Data Analysis

In [3]:
df = pd.read_excel(r'data\flats_to_rent_wue_preprocessed_0407.xlsx')
profile = ProfileReport(df, title="Flats -  Würzburg - Rent - Overview", explorative=True)
profile.to_file("eda-wue-rent-all.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Data Split


In [None]:
def data_split(X, y, train_size=0.8, random_state=42):
    train_X, test_X, train_y, test_y = train_test_split(X, y, 
                                                  train_size=train_size, 
                                                  random_state = random_state)
    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, 
                                                  train_size=train_size, 
                                                  random_state = random_state)
    return train_X, test_X, train_y, test_y, val_X, val_y

train_X, test_X, train_y, test_y, val_X, val_y = data_split(X, y)

# Train Different Models

## Baseline 

In [None]:
def scrape_avg_rental_prices():
    url = "https://www.wohnungsboerse.net/mietspiegel-Wuerzburg/2772"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    script_tag = soup.find("script", string=lambda text: "pdfData" in text)
    rental_price = 0
    if script_tag:
        script_content = script_tag.string
        start_index = script_content.find("avg_rent_price: ") + len("avg_rent_price: '")
        end_index = script_content.find("',", start_index)
        rental_price = script_content[start_index:end_index]
        rental_price = (
            rental_price.replace("€/m2", "").replace(".", "").replace(",", ".")
        )
        rental_price = rental_price.strip()
        rental_price = float(rental_price)
        print(f"Extrcated rental price per square meter via scraper: {rental_price}")
    else:
        print("The script tag containing the rental price was not found.")
    return rental_price


def scrape_avg_buy_prices():
    url = "https://www.wohnungsboerse.net/immobilienpreise-Wuerzburg/2772"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    p_element = soup.find("p", class_="mb-8")
    buy_price = 0
    if p_element:
        pattern = r"\d{1,3}(?:\.\d{3})*(?:,\d{2})?€/m²"
        match = re.search(pattern, p_element.text)
        if match:
            buy_price = match.group()
            buy_price = buy_price.replace("€/m²", "").replace(".", "").replace(",", ".")
            print(f"Extrcated buy price per square meter via scraper: {buy_price}")
        else:
            print("Price not found")
    else:
        print("The element ontaining the buy price was not found.")
    return buy_price

avg_rentprice = scrape_avg_rental_prices()
buyprice = scrape_avg_buy_prices()

In [None]:
def baseline_avg_rent_price(val_X, val_y, runname="baseline"):
    avg_price_per_sqm_rent = scrape_avg_rental_prices()
    print(f"Average rental price per sqm: {avg_price_per_sqm_rent}")
    
    baseline_preds = val_X['LivingSpace'] * avg_price_per_sqm_rent
    baseline_mae = mean_absolute_error(val_y, baseline_preds)
    baseline_r2 = r2_score(val_y, baseline_preds)
    baseline_mse = mean_squared_error(val_y, baseline_preds)

    with mlflow.start_run(run_name=runname):
        mlflow.log_metric("mse", baseline_mse)
        mlflow.log_metric("mae", baseline_mae)
        mlflow.log_metric("r2", baseline_r2)

    print(f"Baseline Mae: {baseline_mae}")
    print(f"Baseline MSE: {baseline_mse}")    
    print(f"Baseline R2 Score: {baseline_r2}")

    return avg_price_per_sqm_rent, baseline_mae, baseline_mse, baseline_r2

baseline_avg_rent_price(val_X, val_y, runname="baseline")

In [None]:
def baseline_avg_buy_price(val_X, val_y, runname="baseline"):
    avg_price_per_sqm_buy = scrape_avg_buy_prices()
    print(f"Average rental price per sqm: {avg_price_per_sqm_buy}")
    
    baseline_preds = val_X['LivingSpace'] * avg_price_per_sqm_buy
    baseline_mae = mean_absolute_error(val_y, baseline_preds)
    baseline_r2 = r2_score(val_y, baseline_preds)
    baseline_mse = mean_squared_error(val_y, baseline_preds)

    with mlflow.start_run(run_name=runname):
        mlflow.log_metric("mse", baseline_mse)
        mlflow.log_metric("mae", baseline_mae)
        mlflow.log_metric("r2", baseline_r2)

    print(f"Baseline Mae: {baseline_mae}")
    print(f"Baseline MSE: {baseline_mse}")    
    print(f"Baseline R2 Score: {baseline_r2}")

    return avg_price_per_sqm_buy, baseline_mae, baseline_mse, baseline_r2

In [20]:
mlflow.set_tracking_uri("http://localhost:5000")

## Regression - Linear, Lasso, Ridge

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_and_eval_linear(X_train, y_train, X_val, y_val, runname ="linear-regression"):
    mlflow.sklearn.autolog()
    with mlflow.start_run(run_name=runname):
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)
        
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        mlflow.log_metric("mae", mae_train)
        mlflow.log_metric("mse", mse_train)
        mlflow.log_metric("r2", r2_train)
    return model, mae, mse, r2, mae_train, mse_train, r2_train

train_and_eval_linear(train_X, train_y, val_X, val_y, runname="linear-regression")



MAE: 174.06
MSE: 60887.43
R2 Score: 0.74


(LinearRegression(),
 174.0608502915187,
 60887.42932009267,
 0.7416748394649766,
 110.6579085035783,
 23748.43205730316,
 0.9298036397061346)

In [25]:
def train_and_eval_lasso(X_train, y_train, X_val, y_val, runname ="lasso-regression"):
    mlflow.sklearn.autolog()
    with mlflow.start_run(run_name=runname):
        model = Lasso()
        model.fit(X_train, y_train)
        
        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)
        
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        mlflow.log_metric("mae", mae_train)
        mlflow.log_metric("mse", mse_train)
        mlflow.log_metric("r2", r2_train)
    return model, mae, mse, r2, mae_train, mse_train, r2_train

train_and_eval_lasso(train_X, train_y, val_X, val_y, runname="lasso-regression")



(Lasso(),
 177.7017030222935,
 69339.67627920092,
 0.7058147600204208,
 129.25425351412179,
 26787.135348427222,
 0.9208217452157864)

In [26]:
def train_and_eval_ridge(X_train, y_train, X_val, y_val, runname ="ridge-regression"):
    mlflow.sklearn.autolog()
    with mlflow.start_run(run_name=runname):
        model = Lasso()
        model.fit(X_train, y_train)
        
        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)
        
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        mlflow.log_metric("mae", mae_train)
        mlflow.log_metric("mse", mse_train)
        mlflow.log_metric("r2", r2_train)
    return model, mae, mse, r2, mae_train, mse_train, r2_train

train_and_eval_ridge(train_X, train_y, val_X, val_y, runname="ridge-regression")



(Lasso(),
 177.7017030222935,
 69339.67627920092,
 0.7058147600204208,
 129.25425351412179,
 26787.135348427222,
 0.9208217452157864)

## XGBRegressor


In [19]:
def train_and_eval_xgb(train_X, train_y, val_X, val_y, test_X, test_y, early_stopping_rounds=30, max_depth=6, n_estimators=1000):
    mlflow.xgboost.autolog()
    with mlflow.start_run():
        model = xgb.XGBRegressor(eval_metric=['rmse', 'mae'], early_stopping_rounds=early_stopping_rounds, random_state=42, max_depth=max_depth, n_estimators=n_estimators)
        mlflow.log_param("early_stopping_rounds", early_stopping_rounds)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("n_estimators", n_estimators)
        model.fit(X=train_X, 
                  y=train_y,
                  eval_set=[(val_X, val_y)],
                  verbose=True
        )

        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)

        preds= model.predict(val_X)
        mae = mean_absolute_error(val_y, preds)
        mse = mean_squared_error(val_y, preds)
        r2 = r2_score(val_y, preds)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)        
        return model, mae, mse, r2, mae_train, mse_train, r2_train

train_and_eval_xgb(train_X, train_y, val_X, val_y, test_X, test_y, early_stopping_rounds=30, max_depth=6, n_estimators=1000)


[0]	validation_0-rmse:789.05448	validation_0-mae:689.82105
[1]	validation_0-rmse:573.56434	validation_0-mae:484.09524
[2]	validation_0-rmse:429.39471	validation_0-mae:339.46749
[3]	validation_0-rmse:327.95919	validation_0-mae:240.33495
[4]	validation_0-rmse:271.51062	validation_0-mae:173.21899
[5]	validation_0-rmse:263.05629	validation_0-mae:177.64294
[6]	validation_0-rmse:271.60454	validation_0-mae:183.17610
[7]	validation_0-rmse:288.47457	validation_0-mae:182.65352
[8]	validation_0-rmse:309.84957	validation_0-mae:189.41346
[9]	validation_0-rmse:330.46469	validation_0-mae:201.99247
[10]	validation_0-rmse:346.54070	validation_0-mae:211.82935
[11]	validation_0-rmse:349.93356	validation_0-mae:215.36463
[12]	validation_0-rmse:355.03683	validation_0-mae:220.38869
[13]	validation_0-rmse:357.20198	validation_0-mae:222.20600
[14]	validation_0-rmse:360.83612	validation_0-mae:229.00910
[15]	validation_0-rmse:363.40318	validation_0-mae:231.41579
[16]	validation_0-rmse:365.21245	validation_0-mae:

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=30,
              enable_categorical=False, eval_metric=['rmse', 'mae'],
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=42, ...),
 173.2189884255149,
 73718.01714812755,
 0.6872389124198179,
 210.16370678294788,
 89450.79483926565,
 0.7355985351808199)

# Random Forest

In [None]:
def train_and_eval_rf(train_X, train_y, val_X, val_y, n_estimators=50, random_state=0):
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        model.fit(train_X, train_y)
        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)
        preds= model.predict(val_X)
        mae = mean_absolute_error(val_y, preds)
        mse = mean_squared_error(val_y, preds)
        r2 = r2_score(val_y, preds)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)
        return model, mae, mse, r2, mae_train, mse_train, r2_train
    
train_and_eval_rf(train_X, train_y, val_X, val_y, n_estimators=50, random_state=0)

## ElasticNet

In [None]:
def train_and_eval_elasticnet(train_X, train_y, val_X, val_y, runname="elasticNet"):
    mlflow.sklearn.autolog()
    with mlflow.start_run(run_name=runname):
        model = ElasticNet()
        model.fit(train_X, train_y)
        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)
        preds= model.predict(val_X)
        mae = mean_absolute_error(val_y, preds)
        mse = mean_squared_error(val_y, preds)
        r2 = r2_score(val_y, preds)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)
        return model, mae, mse, r2, mae_train, mse_train, r2_train

train_and_eval_elasticnet(train_X, train_y, val_X, val_y)