In [80]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import lightgbm as lgb
from matplotlib import pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import shap
from ctgan import CTGAN

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import mlflow
import mlflow.sklearn
import mlflow.xgboost
import matplotlib.pyplot as plt
import json
import optuna

# Point MLflow at the tracking server; every run started below is logged there.
# NOTE(review): assumes an MLflow server is reachable at this address — confirm before running.
mlflow.set_tracking_uri("http://localhost:5000")

# Exploratory Data Analysis

In [81]:
# Build a ydata-profiling report for the rental-flat data and write it to an HTML file.
# NOTE(review): the input paths use a backslash separator, which only resolves on Windows;
# the output files further down use forward slashes.
df = pd.read_excel(r'data\flats_to_rent_wue_preprocessed_combined.xlsx')
profile = ProfileReport(df, title="Flats -  Würzburg - Rent - Overview", explorative=True)
profile.to_file("eda-wue-rent-all.html")

# Same report for the houses-to-buy data. `df` and `profile` are rebound here,
# so only the houses frame is left in the namespace after this cell.
df = pd.read_excel(r'data\houses_to_buy_wue_preprocessed_1207.xlsx')
profile = ProfileReport(df, title="Houses -  Würzburg - Buy - Overview", explorative=True)
profile.to_file("eda-wue-houses.html")

# Preprocessing & Feature Engineering


In [82]:
def determineHighCorrCols(df):
    """Select the columns considered relevant for predicting ``Object_price``.

    The selection is the union of
      * numeric/boolean columns whose Pearson correlation with ``Object_price``
        is above 0.20 in absolute value (``Object_price`` itself included),
      * all string/object columns,
      * ``ConstructionYear`` and ``ZipCode`` (always added).

    Side effect: the column labels of ``df`` are normalised in place
    (literal ``\\uXXXX`` escapes are decoded, German umlauts and ``ß`` are
    transliterated), so the returned names match ``df`` afterwards.

    Args:
        df: DataFrame containing an ``Object_price`` column.

    Returns:
        List of unique column names, in first-seen order.
    """
    # Decode literal "\uXXXX" sequences that ended up in the column names.
    df.columns = [
        re.sub(r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), col)
        for col in df.columns
    ]
    df.columns = [
        col.replace("ö", "oe").replace("ä", "ae").replace("ü", "ue").replace("ß", "ss")
        for col in df.columns
    ]

    # Correlate numeric/boolean columns only: DataFrame.corr() raises on string
    # columns since pandas 2.0. The matrix is computed once instead of three times.
    price_corr = df.select_dtypes(include=["number", "bool"]).corr()["Object_price"]
    important_num_cols = list(price_corr[price_corr.abs() > 0.20].index)

    # Accept both the classic object dtype and the dedicated pandas string dtypes.
    cat_cols = [
        col
        for col in df.columns
        if df[col].dtype == "object" or pd.api.types.is_string_dtype(df[col].dtype)
    ]

    # dict.fromkeys drops duplicates while keeping order; without it ZipCode or
    # ConstructionYear could be listed twice and later produce duplicate columns.
    important_cols = list(
        dict.fromkeys(important_num_cols + cat_cols + ["ConstructionYear", "ZipCode"])
    )
    print(important_cols)
    return important_cols

In [83]:
def preprocess_data(df, feature_set):
    """Clean the raw listing data and one-hot encode its categorical columns.

    Steps: normalise column labels, treat the literal string ``""`` as missing,
    drop every row with a missing value, cast ``LivingSpace``/``Rooms`` to float
    and ``ZipCode`` to str, keep only ``feature_set`` and one-hot encode all
    string columns.

    The caller's DataFrame is left untouched; a new frame is returned.

    Args:
        df: Raw listing data; must contain ``LivingSpace``, ``Rooms`` and
            ``ZipCode`` in addition to every column named in ``feature_set``.
        feature_set: Column names (after label normalisation) to keep.

    Returns:
        DataFrame with a fresh 0..n-1 index, numeric columns first, followed by
        the dummy columns of every encoded categorical column.
    """
    print(f"Used feature set for preprocessing:{feature_set}")

    def _normalise_label(col):
        # Decode literal "\uXXXX" sequences, then transliterate German characters.
        decoded = re.sub(
            r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), col
        )
        return (
            decoded.replace("ö", "oe")
            .replace("ä", "ae")
            .replace("ü", "ue")
            .replace("ß", "ss")
        )

    # rename() returns a new frame, so the input's column labels are not modified.
    df = df.rename(columns=_normalise_label)
    df = df.replace('""', np.nan)
    # NOTE(review): rows are dropped before the feature selection, so a missing
    # value in a column outside feature_set still removes the row.
    df = df.dropna()
    df = df.astype({"LivingSpace": float, "Rooms": float, "ZipCode": str})
    df = df[feature_set].reset_index(drop=True)

    # Accept both the classic object dtype and the dedicated pandas string dtypes,
    # otherwise ZipCode would stay un-encoded on pandas versions that infer them.
    cat_cols = [
        col
        for col in df.columns
        if df[col].dtype == "object" or pd.api.types.is_string_dtype(df[col].dtype)
    ]
    return pd.get_dummies(df, columns=cat_cols)

# Data Split


In [85]:
def data_split(df, train_size=0.8, random_state=42):
    """Split ``df`` into train, validation and test parts and persist them.

    The frame is first split into (train+val) / test, then the (train+val) part
    is split again with the same ``train_size``. With the default of 0.8 this
    gives roughly 64 % train, 16 % validation and 20 % test.

    Side effect: each part (features plus target) is written to
    ``data/df_train.xlsx``, ``data/df_val.xlsx`` and ``data/df_test.xlsx``.

    Args:
        df: Frame containing the target column ``Object_price``.
        train_size: Fraction kept on the training side of each of the two splits.
        random_state: Seed used for both splits.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``
    """
    target = df["Object_price"]
    features = df.drop("Object_price", axis=1)

    # First cut: hold out the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, train_size=train_size, random_state=random_state
    )
    # Second cut: carve the validation set out of the remainder.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, train_size=train_size, random_state=random_state
    )

    # Persist every part with its target column attached.
    exports = (
        ("data/df_train.xlsx", X_train, y_train),
        ("data/df_val.xlsx", X_val, y_val),
        ("data/df_test.xlsx", X_test, y_test),
    )
    for target_path, X_part, y_part in exports:
        pd.concat([X_part, y_part], axis=1).to_excel(target_path)

    return X_train, y_train, X_val, y_val, X_test, y_test

# Hyperparameter Tuning with OPTUNA Framework

Hyperparameter tuning is not part of the pipeline because of its long run time (especially when MLflow autologging is enabled); instead, the best parameters found are saved to JSON files. It may make sense to re-run the tuning when the underlying data changes substantially.

In [86]:
# Load the train/validation split used by all tuning objectives below (they read
# X_train, y_train, X_val and y_val as globals).
# NOTE(review): data_split() only writes df_train/df_val/df_test; the X_*/y_* exports
# are commented out there, so these files must already exist from an earlier run.
# NOTE(review): backslash paths resolve on Windows only.
X_train = pd.read_excel(r"data\X_train.xlsx")
# "Unnamed: 0" is the index column that to_excel() wrote into the file.
X_train = X_train.drop(columns=["Unnamed: 0"])
# The targets are read as single-column DataFrames (see the shapes printed below).
y_train = pd.read_excel(r"data\y_train.xlsx")
y_train = y_train.drop(columns=["Unnamed: 0"])

X_val = pd.read_excel(r"data\X_val.xlsx")
X_val = X_val.drop(columns=["Unnamed: 0"])
y_val = pd.read_excel(r"data\y_val.xlsx")
y_val = y_val.drop(columns=["Unnamed: 0"])

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_val shape: ", X_val.shape)
print("y_val shape: ", y_val.shape)


X_train shape:  (108, 46)
y_train shape:  (108, 1)
X_val shape:  (27, 46)
y_val shape:  (27, 1)


In [87]:
# Switch MLflow autologging off for the tuning cells that follow.
mlflow.autolog(disable=True)

In [88]:
def objective(trial):
    """Optuna objective for XGBoost: validation MSE of one sampled configuration.

    Reads the globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    # NOTE(review): random_state is sampled like a hyperparameter, so the search
    # also picks the seed that happens to score best on the validation set.
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return mean_squared_error(y_val, y_pred)


# A seeded sampler makes the sequence of suggested configurations repeatable
# across notebook runs; the unseeded default gives different parameters each time.
study = optuna.create_study(
    direction='minimize',
    study_name='regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print('Best params:', best_params)

# Persist the winner; train_xgb() loads this file when hyperparameter_tuning=True.
with open(r'hyperparameter_tuned/best_params_xgb.json', 'w') as f:
    json.dump(best_params, f)

## Random Forest

In [89]:
def objective(trial):
    """Optuna objective for the random forest: validation MSE of one sampled configuration.

    Reads the globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # NOTE(review): the seed is sampled like a hyperparameter, so the search also
    # picks the seed that happens to score best on the validation set.
    random_state = trial.suggest_int('random_state', 1, 1000)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state
    )
    # y_train is loaded as a single-column DataFrame; flatten it so the forest
    # receives the 1-D target it expects instead of a column vector.
    model.fit(X_train, np.ravel(y_train))
    y_pred = model.predict(X_val)
    return mean_squared_error(y_val, y_pred)


# A seeded sampler makes the sequence of suggested configurations repeatable
# across notebook runs; the unseeded default gives different parameters each time.
study = optuna.create_study(
    direction='minimize',
    study_name='random_forest_regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print('Best params:', best_params)

# Persist the winner; train_rf() loads this file when hyperparameter_tuning=True.
with open(r'hyperparameter_tuned/best_params_rf.json', 'w') as f:
    json.dump(best_params, f)

## Lasso

In [90]:
def objective(trial):
    """Optuna objective for Lasso: validation MSE of one sampled configuration.

    Reads the globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    alpha = trial.suggest_float('alpha', 0.01, 1.0)
    # NOTE(review): Lasso only uses random_state when selection="random"; with the
    # default selection it has no influence on the fit.
    random_state = trial.suggest_int('random_state', 1, 1000)

    model = Lasso(alpha=alpha, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return mean_squared_error(y_val, y_pred)


# A seeded sampler makes the sequence of suggested configurations repeatable
# across notebook runs.
study = optuna.create_study(
    direction='minimize',
    study_name='lasso_regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# 100 trials like every other study in this notebook; with a single trial the
# "best" parameters are just one random draw.
study.optimize(objective, n_trials=100)

best_params = study.best_params
print('Best params:', best_params)

# Persist the winner; train_lasso() loads this file when hyperparameter_tuning=True.
with open(r'hyperparameter_tuned/best_params_lasso.json', 'w') as f:
    json.dump(best_params, f)

## Ridge 

In [91]:
def objective(trial):
    """Optuna objective for Ridge: validation MSE of one sampled configuration.

    Reads the globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    alpha = trial.suggest_float('alpha', 0.01, 1.0)
    # NOTE(review): Ridge only uses random_state with the "sag"/"saga" solvers.
    random_state = trial.suggest_int('random_state', 1, 1000)

    model = Ridge(alpha=alpha, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return mean_squared_error(y_val, y_pred)


# A seeded sampler makes the sequence of suggested configurations repeatable
# across notebook runs; the unseeded default gives different parameters each time.
study = optuna.create_study(
    direction='minimize',
    study_name='ridge_regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print('Best params:', best_params)

# Persist the winner; train_ridge() loads this file when hyperparameter_tuning=True.
with open(r'hyperparameter_tuned/best_params_ridge.json', 'w') as f:
    json.dump(best_params, f)

## ElasticNet 

In [92]:
def objective(trial):
    """Optuna objective for ElasticNet: validation MSE of one sampled configuration.

    Reads the globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    alpha = trial.suggest_float('alpha', 0.01, 1.0)
    l1_ratio = trial.suggest_float('l1_ratio', 0.01, 1.0)
    # NOTE(review): ElasticNet only uses random_state when selection="random".
    random_state = trial.suggest_int('random_state', 1, 1000)
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return mean_squared_error(y_val, y_pred)


# A seeded sampler makes the sequence of suggested configurations repeatable
# across notebook runs; the unseeded default gives different parameters each time.
study = optuna.create_study(
    direction='minimize',
    study_name='elasticnet_regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print('Best params:', best_params)

# Persist the winner; train_elasticnet() loads this file when hyperparameter_tuning=True.
with open(r'hyperparameter_tuned/best_params_elasticnet.json', 'w') as f:
    json.dump(best_params, f)

# Training, Evaluation and Logging Of Models

## Baseline 

In [93]:
def scrape_avg_rental_prices():
    """Scrape the average rent per square metre for Würzburg from wohnungsboerse.net.

    The value is read from the inline script that contains ``pdfData``; the text
    following ``avg_rent_price: '`` up to the closing ``',`` is converted from
    German number formatting to a float.

    Returns:
        The price as a float, or ``0`` when the script tag or the price entry
        cannot be located.

    Raises:
        requests.HTTPError: The page answered with an error status.
        requests.Timeout: The server did not answer within 30 seconds.
        ValueError: The extracted text is not a number.
    """
    url = "https://www.wohnungsboerse.net/mietspiegel-Wuerzburg/2772"
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Script tags without inline text hand None to the filter, so guard before
    # the substring test instead of raising TypeError.
    script_tag = soup.find(
        "script", string=lambda text: text is not None and "pdfData" in text
    )
    rental_price = 0
    if script_tag is None:
        print("The script tag containing the rental price was not found.")
        return rental_price

    script_content = script_tag.string
    key_index = script_content.find("avg_rent_price: ")
    if key_index == -1:
        # find() returned -1: slicing from there would yield unrelated text.
        print("The rental price entry was not found in the script tag.")
        return rental_price

    # Skip the key and the opening quote of the value.
    start_index = key_index + len("avg_rent_price: '")
    end_index = script_content.find("',", start_index)
    if end_index == -1:
        print("The end of the rental price entry was not found in the script tag.")
        return rental_price

    raw_price = script_content[start_index:end_index]
    # German formatting: "." is the thousands separator, "," the decimal separator.
    raw_price = raw_price.replace("€/m2", "").replace(".", "").replace(",", ".")
    rental_price = float(raw_price.strip())
    print(f"Extrcated rental price per square meter via scraper: {rental_price}")
    return rental_price

In [94]:
def scrape_avg_buy_prices():
    """Scrape the average purchase price per square metre for Würzburg.

    The price is taken from the first ``<p class="mb-8">`` element of the
    wohnungsboerse.net price page by matching a German-formatted number
    directly followed by ``€/m²``.

    Returns:
        The price as a float, or ``0`` when the element or the price pattern
        is not found.

    Raises:
        requests.HTTPError: The page answered with an error status.
        requests.Timeout: The server did not answer within 30 seconds.
    """
    url = "https://www.wohnungsboerse.net/immobilienpreise-Wuerzburg/2772"
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    p_element = soup.find("p", class_="mb-8")
    buy_price = 0
    if p_element is None:
        print("The element ontaining the buy price was not found.")
        return buy_price

    pattern = r"\d{1,3}(?:\.\d{3})*(?:,\d{2})?€/m²"
    match = re.search(pattern, p_element.text)
    if match is None:
        print("Price not found")
        return buy_price

    # German formatting: "." is the thousands separator, "," the decimal separator.
    # Convert to float so callers can multiply the price with the living space;
    # the matched text alone is a string.
    buy_price = float(
        match.group().replace("€/m²", "").replace(".", "").replace(",", ".")
    )
    print(f"Extrcated buy price per square meter via scraper: {buy_price}")
    return buy_price

In [95]:
def baseline_rent(val_X, val_y, runname="baseline_rent"):
    """Return the scraped average rent per square metre used by the rent baseline.

    ``val_X``, ``val_y`` and ``runname`` are accepted but not used; the caller
    derives the baseline predictions from the returned price.
    """
    sqm_rent = scrape_avg_rental_prices()
    print(f"Average rental price per sqm: {sqm_rent}")
    return sqm_rent

In [96]:
def baseline_buy(X_val, y_val, runname="baseline_buy"):
    """Evaluate the "living space x average price per sqm" baseline for purchases.

    The scraped average price is multiplied with ``LivingSpace`` to predict the
    object price; MAE, MSE and R2 against ``y_val`` are logged to an MLflow run.

    Args:
        X_val: Validation features; must contain ``LivingSpace``.
        y_val: True object prices for ``X_val``.
        runname: Name of the MLflow run that receives the metrics.

    Returns:
        ``(avg_price_per_sqm_buy, baseline_mae, baseline_mse, baseline_r2)``
    """
    # Coerce to float: the scraper may hand back the matched text as a string,
    # and multiplying a numeric Series by a string fails.
    avg_price_per_sqm_buy = float(scrape_avg_buy_prices())
    print(f"Average buy price per sqm: {avg_price_per_sqm_buy}")

    baseline_preds = X_val["LivingSpace"] * avg_price_per_sqm_buy
    baseline_mae = mean_absolute_error(y_val, baseline_preds)
    baseline_r2 = r2_score(y_val, baseline_preds)
    baseline_mse = mean_squared_error(y_val, baseline_preds)

    with mlflow.start_run(run_name=runname):
        mlflow.log_metric("mse", baseline_mse)
        mlflow.log_metric("mae", baseline_mae)
        mlflow.log_metric("r2", baseline_r2)

    print(f"Baseline Mae: {baseline_mae}")
    print(f"Baseline MSE: {baseline_mse}")
    print(f"Baseline R2 Score: {baseline_r2}")

    return avg_price_per_sqm_buy, baseline_mae, baseline_mse, baseline_r2

## Regression (Linear, Lasso, Ridge)

In [97]:
def train_linear(
    X_train,
    y_train,
):
    """Fit an ordinary least squares regression and return the fitted estimator."""
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return regressor


def train_lasso(
    X_train,
    y_train,
    hyperparameter_tuning=True,
):
    """Fit a Lasso regression and return the fitted estimator.

    With ``hyperparameter_tuning=True`` the constructor arguments are read from
    ``hyperparameter_tuned/best_params_lasso.json``; otherwise the estimator's
    defaults are used.
    """
    tuned_params = {}
    if hyperparameter_tuning:
        with open(r"hyperparameter_tuned/best_params_lasso.json") as params_file:
            tuned_params = json.load(params_file)

    # An empty mapping leaves every constructor argument at its default.
    regressor = Lasso(**tuned_params)
    regressor.fit(X_train, y_train)
    return regressor


def train_ridge(
    X_train,
    y_train,
    hyperparameter_tuning=True,
):
    """Fit a Ridge regression and return the fitted estimator.

    With ``hyperparameter_tuning=True`` the constructor arguments are read from
    ``hyperparameter_tuned/best_params_ridge.json``; otherwise the estimator's
    defaults are used.
    """
    tuned_params = {}
    if hyperparameter_tuning:
        with open(r"hyperparameter_tuned/best_params_ridge.json") as params_file:
            tuned_params = json.load(params_file)

    # An empty mapping leaves every constructor argument at its default.
    regressor = Ridge(**tuned_params)
    regressor.fit(X_train, y_train)
    return regressor

## Random Forest

In [109]:
def train_rf(
    X_train,
    y_train,
    hyperparameter_tuning=True,
):
    """Fit a random forest regressor and return the fitted estimator.

    With ``hyperparameter_tuning=True`` the constructor arguments are read from
    ``hyperparameter_tuned/best_params_rf.json``; otherwise the estimator's
    defaults are used.
    """
    tuned_params = {}
    if hyperparameter_tuning:
        with open(r"hyperparameter_tuned/best_params_rf.json") as params_file:
            tuned_params = json.load(params_file)

    # An empty mapping leaves every constructor argument at its default.
    forest = RandomForestRegressor(**tuned_params)
    forest.fit(X_train, y_train)
    return forest

## XGBRegressor


In [99]:
def train_xgb(
    X_train,
    y_train,
    X_val,
    y_val,
    early_stopping_rounds=30,
    max_depth=6,
    n_estimators=100,
    hyperparameter_tuning=True,
    explain=True,
):
    """Fit an XGBoost regressor, optionally draw SHAP plots, and return the model.

    Args:
        X_train, y_train: Training data.
        X_val, y_val: Validation data passed to ``fit`` as ``eval_set``.
        early_stopping_rounds, max_depth, n_estimators: Only used when
            ``hyperparameter_tuning`` is False.
        hyperparameter_tuning: When True, the constructor arguments are read from
            ``hyperparameter_tuned/best_params_xgb.json`` and the three
            arguments above are ignored.
        explain: When True (default, the previous behaviour), SHAP waterfall
            plots for a few example rows and a beeswarm plot are drawn. Pass
            False to skip the SHAP computation, e.g. during retraining.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    if hyperparameter_tuning:
        with open(r"hyperparameter_tuned/best_params_xgb.json") as f:
            best_params = json.load(f)
        model = xgb.XGBRegressor(**best_params)
    else:
        model = xgb.XGBRegressor(
            eval_metric=["rmse", "mae"],
            early_stopping_rounds=early_stopping_rounds,
            random_state=42,
            max_depth=max_depth,
            n_estimators=n_estimators,
        )
    model.fit(X=X_train, y=y_train, eval_set=[(X_val, y_val)], verbose=True)

    if explain:
        explainer = shap.Explainer(model)
        shap_values = explainer(X_train)
        # Example rows to explain; rows beyond the training set are skipped so a
        # small training set does not end in an IndexError.
        for row in (6, 5, 9, 10):
            if row < len(X_train):
                shap.plots.waterfall(shap_values[row])
        shap.plots.beeswarm(shap_values)

    return model

## ElasticNet

In [100]:
def train_elasticnet(
    X_train, y_train, hyperparameter_tuning=True
):
    """Fit an ElasticNet regression and return the fitted estimator.

    With ``hyperparameter_tuning=True`` the constructor arguments are read from
    ``hyperparameter_tuned/best_params_elasticnet.json``; otherwise the
    estimator's defaults are used.
    """
    tuned_params = {}
    if hyperparameter_tuning:
        with open(r"hyperparameter_tuned/best_params_elasticnet.json") as params_file:
            tuned_params = json.load(params_file)

    # An empty mapping leaves every constructor argument at its default.
    regressor = ElasticNet(**tuned_params)
    regressor.fit(X_train, y_train)
    return regressor

# Complete Pipeline

### Regular Pipeline

In [111]:
def _split_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE and R2 of one prediction set as a dict."""
    mse = mean_squared_error(y_true, y_pred)
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mse,
        # Derived from the MSE: the `squared=False` switch of
        # mean_squared_error is not available in current scikit-learn releases.
        "rmse": float(np.sqrt(mse)),
        "r2": r2_score(y_true, y_pred),
    }


def _log_split_metrics(metrics, suffix=""):
    """Log every metric of ``metrics`` to the active MLflow run as ``<name><suffix>``."""
    for metric_name, value in metrics.items():
        mlflow.log_metric(f"{metric_name}{suffix}", value)


def pipeline_from_extracted(df, feature_set, model_name="lasso", hpt=True):
    """Preprocess, split, train one model and log its metrics to MLflow.

    Args:
        df: Raw listing data as read from the Excel export.
        feature_set: Columns (including ``Object_price``) kept by preprocessing.
        model_name: One of ``"lasso"``, ``"ridge"``, ``"rf"``, ``"xgb"``,
            ``"elasticnet"``, ``"linear"`` or ``"baseline-rent"``.
        hpt: Passed to the trainers as ``hyperparameter_tuning``.

    Returns:
        ``(model, mae, mse, r2, mae_train, mse_train, r2_train)``. ``mae``,
        ``mse`` and ``r2`` refer to the validation split. For
        ``"baseline-rent"`` the model and the training metrics are ``None``.

    Raises:
        ValueError: ``model_name`` is not one of the supported names.
    """
    supported_models = (
        "lasso", "ridge", "rf", "xgb", "elasticnet", "linear", "baseline-rent",
    )
    # Fail before any file is written or run is opened; otherwise the unknown
    # name only surfaces later as an AttributeError on a None model.
    if model_name not in supported_models:
        raise ValueError(
            f"Model not found: {model_name!r}. Model_name must be one of "
            f"{', '.join(repr(name) for name in supported_models)} "
            "or configure the pipeline manually."
        )

    mlflow.end_run()
    df = preprocess_data(df, feature_set)
    print("Done with preprocessing")
    X_train, y_train, X_val, y_val, X_test, y_test = data_split(df)
    print("Done with data split")

    if model_name == "xgb":
        mlflow.xgboost.autolog()
    else:
        mlflow.sklearn.autolog()

    with mlflow.start_run(run_name=model_name):
        if model_name == "baseline-rent":
            # Baseline: living space times the scraped average rent per sqm.
            avg_price = baseline_rent(X_val, y_val)
            val_metrics = _split_metrics(y_val, X_val["LivingSpace"] * avg_price)
            test_metrics = _split_metrics(y_test, X_test["LivingSpace"] * avg_price)
            _log_split_metrics(val_metrics)
            _log_split_metrics(test_metrics, "_test")
            return (
                None,
                val_metrics["mae"],
                val_metrics["mse"],
                val_metrics["r2"],
                None,
                None,
                None,
            )

        if model_name == "lasso":
            model = train_lasso(X_train, y_train, hyperparameter_tuning=hpt)
        elif model_name == "ridge":
            model = train_ridge(X_train, y_train, hyperparameter_tuning=hpt)
        elif model_name == "rf":
            model = train_rf(X_train, y_train, hyperparameter_tuning=hpt)
        elif model_name == "xgb":
            model = train_xgb(
                X_train, y_train, X_val, y_val, hyperparameter_tuning=hpt
            )
        elif model_name == "elasticnet":
            model = train_elasticnet(X_train, y_train, hyperparameter_tuning=hpt)
        else:  # "linear"
            model = train_linear(X_train, y_train)

        train_metrics = _split_metrics(y_train, model.predict(X_train))
        val_metrics = _split_metrics(y_val, model.predict(X_val))
        test_metrics = _split_metrics(y_test, model.predict(X_test))

        # Validation metrics carry no suffix, matching the baseline runs.
        _log_split_metrics(test_metrics, "_test")
        _log_split_metrics(train_metrics, "_train")
        _log_split_metrics(val_metrics)

    print("Done with train")
    mlflow.end_run()
    return (
        model,
        val_metrics["mae"],
        val_metrics["mse"],
        val_metrics["r2"],
        train_metrics["mae"],
        train_metrics["mse"],
        train_metrics["r2"],
    )

### Augmented Data Approach with CTGAN

In [102]:
def complete_from_extracted_with_augemented_training_data(
    df,
    feature_set,
    model_name,
    augmentation_epochs=30,
    n_added_fake_train_data=1000,
):
    """Train one model on real plus CTGAN-generated rows and log its metrics.

    Steps: clean ``df``, split it, fit a CTGAN on the training split, append up
    to ``n_added_fake_train_data`` plausible synthetic rows to the training
    data, one-hot encode all splits with a common column set, train the model
    and log train/validation/test metrics to an MLflow run. Validation and test
    data stay purely real.

    Args:
        df: Raw listing data. When ``None`` the rental Excel export is read.
        feature_set: Columns (including ``Object_price``) to keep.
        model_name: One of ``"lasso"``, ``"ridge"``, ``"rf"``, ``"xgb"``,
            ``"elasticnet"``, ``"linear"`` or ``"baseline-rent"``.
        augmentation_epochs: Number of CTGAN training epochs.
        n_added_fake_train_data: Upper bound of synthetic rows to add; fewer
            are added when the plausibility filters reject too many samples.

    Returns:
        ``(model, mae, mse, r2, mae_train, mse_train, r2_train)``. ``mae``,
        ``mse`` and ``r2`` refer to the validation split. For
        ``"baseline-rent"`` the model and the training metrics are ``None``.

    Raises:
        ValueError: ``model_name`` is not one of the supported names.
    """
    supported_models = (
        "lasso", "ridge", "rf", "xgb", "elasticnet", "linear", "baseline-rent",
    )
    if model_name not in supported_models:
        raise ValueError(
            f"Model not found: {model_name!r}. Model_name must be one of "
            f"{', '.join(repr(name) for name in supported_models)}"
        )

    def _normalise_label(col):
        # Decode literal "\uXXXX" sequences, then transliterate German characters,
        # so the labels match the names used in feature_set.
        decoded = re.sub(
            r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), col
        )
        return (
            decoded.replace("ö", "oe")
            .replace("ä", "ae")
            .replace("ü", "ue")
            .replace("ß", "ss")
        )

    def _metrics(y_true, y_pred):
        mse = mean_squared_error(y_true, y_pred)
        return {
            "mae": mean_absolute_error(y_true, y_pred),
            "mse": mse,
            "rmse": float(np.sqrt(mse)),
            "r2": r2_score(y_true, y_pred),
        }

    def _log(metrics, suffix=""):
        for metric_name, value in metrics.items():
            mlflow.log_metric(f"{metric_name}{suffix}", value)

    def _is_categorical(series):
        return series.dtype == "object" or pd.api.types.is_string_dtype(series.dtype)

    # Use the frame handed in by the caller; only fall back to the file when
    # nothing was passed.
    if df is None:
        df = pd.read_excel(r"data\flats_to_rent_wue_preprocessed_combined.xlsx")

    df = df.rename(columns=_normalise_label)
    df = df.replace('""', np.nan).dropna()
    df = df.astype({"LivingSpace": float, "Rooms": float, "ZipCode": str})
    df = df[feature_set].reset_index(drop=True)

    # data_split() expects the complete frame including the target column.
    X_train, y_train, X_val, y_val, X_test, y_test = data_split(df)

    # --- Synthetic training rows -------------------------------------------
    numerical_features = ["Rooms", "LivingSpace", "Object_price"]
    df_cttrain = pd.concat([X_train, y_train], axis=1).dropna()
    # The CTGAN input copy uses integer-valued numeric columns; the real
    # training data keeps its original values.
    df_cttrain[numerical_features] = df_cttrain[numerical_features].astype("int64")
    categorical_features = [
        col for col in df_cttrain.columns if col not in numerical_features
    ]
    ctgan = CTGAN(verbose=True)
    ctgan.fit(df_cttrain, categorical_features, epochs=augmentation_epochs)

    # Oversample threefold, then keep only plausible rows
    # (more than 20 living space, price above 120).
    samples = ctgan.sample(n_added_fake_train_data * 3)
    samples["Object_price"] = samples["Object_price"].clip(lower=0)
    samples = samples[samples["LivingSpace"] > 20]
    samples = samples[samples["Object_price"] > 120]
    samples = samples.head(n_added_fake_train_data)

    X_train = pd.concat(
        [X_train, samples.drop(columns=["Object_price"])], ignore_index=True
    )
    y_train = pd.concat([y_train, samples["Object_price"]], ignore_index=True)

    # --- Encoding ------------------------------------------------------------
    # Encode all splits together so they share exactly the same dummy columns,
    # including categories that occur in only one split.
    combined = pd.concat([X_train, X_val, X_test], keys=["train", "val", "test"])
    cat_cols = [col for col in combined.columns if _is_categorical(combined[col])]
    encoded = pd.get_dummies(combined, columns=cat_cols)
    X_train = encoded.loc["train"]
    X_val = encoded.loc["val"]
    X_test = encoded.loc["test"]
    print(f"X_train shape: {X_train.shape}")
    print(f"X_val shape: {X_val.shape}")
    print(f"X_test shape: {X_test.shape}")

    # --- Training and evaluation ----------------------------------------------
    mlflow.end_run()
    with mlflow.start_run(run_name=model_name):
        if model_name == "baseline-rent":
            # Baseline: living space times the scraped average rent per sqm.
            avg_price = baseline_rent(X_val, y_val)
            val_metrics = _metrics(y_val, X_val["LivingSpace"] * avg_price)
            test_metrics = _metrics(y_test, X_test["LivingSpace"] * avg_price)
            _log(val_metrics)
            _log(test_metrics, "_test")
            print("Done with train")
            return (
                None,
                val_metrics["mae"],
                val_metrics["mse"],
                val_metrics["r2"],
                None,
                None,
                None,
            )

        # The trainers return only the fitted model.
        if model_name == "lasso":
            model = train_lasso(X_train, y_train)
        elif model_name == "ridge":
            model = train_ridge(X_train, y_train)
        elif model_name == "rf":
            model = train_rf(X_train, y_train)
        elif model_name == "xgb":
            model = train_xgb(X_train, y_train, X_val, y_val)
        elif model_name == "elasticnet":
            model = train_elasticnet(X_train, y_train)
        else:  # "linear"
            model = train_linear(X_train, y_train)

        train_metrics = _metrics(y_train, model.predict(X_train))
        val_metrics = _metrics(y_val, model.predict(X_val))
        test_metrics = _metrics(y_test, model.predict(X_test))
        _log(train_metrics, "_train")
        _log(val_metrics)
        _log(test_metrics, "_test")

    print("Done with train")
    return (
        model,
        val_metrics["mae"],
        val_metrics["mse"],
        val_metrics["r2"],
        train_metrics["mae"],
        train_metrics["mse"],
        train_metrics["r2"],
    )

# Execution Complete Pipeline

In [103]:
# Columns passed as `feature_set` to the pipelines below: the target
# ("Object_price") followed by the features.
# NOTE(review): the names must match the column labels after the umlaut
# transliteration done in preprocessing (e.g. "einbaukueche"); "barriefrei" and
# "ferne" look like truncated labels — verify against the data file.
feature_set_selected = [
    "Object_price",
    "LivingSpace",
    "ZipCode",
    "Rooms",
    "altbau_(bis_1945)",
    "balkon",
    "barriefrei",
    "dachgeschoss",
    "einbaukueche",
    "neubau",
    "parkett",
    "stellplatz",
    "bad/wc_getrennt",
    "personenaufzug",
    "garten",
    "garage",
    "renoviert",
    "terrasse",
    "wanne",
    "zentralheizung",
    "abstellraum",
    "ferne",
    "fussbodenheizung",
    "gartennutzung",
    "kelleranteil",
]

In [112]:
# Read the raw data once. Every pipeline call receives its own copy, so a call
# cannot affect the input of the next one and the file is not re-read per model.
df = pd.read_excel(r"data\flats_to_rent_wue_preprocessed_combined.xlsx")
mlflow.set_experiment("wue-rent-fs-app-no-hpt")
model_names =["rf",  "ridge", "xgb", "elasticnet", "linear", "baseline-rent", "lasso",]

for model_name in model_names:
    # Close a run that a failed earlier iteration may have left open.
    mlflow.end_run()
    print("Starting with model: ", model_name)
    pipeline_from_extracted(
        df.copy(), model_name=model_name, feature_set=feature_set_selected, hpt=False
    )
    print("Done with model: ", model_name)
    print("---------------------------------------------------------------------------")
    print("---------------------------------------------------------------------------")

Starting with model:  rf
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', 'stellplatz', 'bad/wc_getrennt', 'personenaufzug', 'garten', 'garage', 'renoviert', 'terrasse', 'wanne', 'zentralheizung', 'abstellraum', 'ferne', 'fussbodenheizung', 'gartennutzung', 'kelleranteil']
Done with preprocessing
Done with data split




Done with train
Done with model:  rf
---------------------------------------------------------------------------
Starting with model:  ridge
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', 'stellplatz', 'bad/wc_getrennt', 'personenaufzug', 'garten', 'garage', 'renoviert', 'terrasse', 'wanne', 'zentralheizung', 'abstellraum', 'ferne', 'fussbodenheizung', 'gartennutzung', 'kelleranteil']
Done with preprocessing
Done with data split




Done with train
Done with model:  ridge
---------------------------------------------------------------------------
Starting with model:  xgb
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', 'stellplatz', 'bad/wc_getrennt', 'personenaufzug', 'garten', 'garage', 'renoviert', 'terrasse', 'wanne', 'zentralheizung', 'abstellraum', 'ferne', 'fussbodenheizung', 'gartennutzung', 'kelleranteil']
Done with preprocessing
Done with data split
[0]	validation_0-rmse:975.49976	validation_0-mae:845.57492
[1]	validation_0-rmse:749.26720	validation_0-mae:618.15297
[2]	validation_0-rmse:599.25921	validation_0-mae:474.74173
[3]	validation_0-rmse:482.26499	validation_0-mae:358.96303
[4]	validation_0-rmse:414.90881	validation_0-mae:302.03719
[5]	validation_0-rmse:366.23603	validation_0-mae:264.90933
[6]	validation_0-rmse:336.23308	validation_0-mae:243.57824
[7]	validation_



Done with train
Done with model:  xgb
---------------------------------------------------------------------------
Starting with model:  elasticnet
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', 'stellplatz', 'bad/wc_getrennt', 'personenaufzug', 'garten', 'garage', 'renoviert', 'terrasse', 'wanne', 'zentralheizung', 'abstellraum', 'ferne', 'fussbodenheizung', 'gartennutzung', 'kelleranteil']
Done with preprocessing
Done with data split




Done with train
Done with model:  elasticnet
---------------------------------------------------------------------------
Starting with model:  linear
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', 'stellplatz', 'bad/wc_getrennt', 'personenaufzug', 'garten', 'garage', 'renoviert', 'terrasse', 'wanne', 'zentralheizung', 'abstellraum', 'ferne', 'fussbodenheizung', 'gartennutzung', 'kelleranteil']
Done with preprocessing
Done with data split




Done with train
Done with model:  linear
---------------------------------------------------------------------------
Starting with model:  baseline-rent
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', 'stellplatz', 'bad/wc_getrennt', 'personenaufzug', 'garten', 'garage', 'renoviert', 'terrasse', 'wanne', 'zentralheizung', 'abstellraum', 'ferne', 'fussbodenheizung', 'gartennutzung', 'kelleranteil']
Done with preprocessing
Done with data split
Extrcated rental price per square meter via scraper: 11.21
Average rental price per sqm: 11.21
Done with model:  baseline-rent
---------------------------------------------------------------------------
Starting with model:  lasso
Used feature set for preprocessing:['Object_price', 'LivingSpace', 'ZipCode', 'Rooms', 'altbau_(bis_1945)', 'balkon', 'barriefrei', 'dachgeschoss', 'einbaukueche', 'neubau', 'parkett', '



Done with train
Done with model:  lasso
---------------------------------------------------------------------------


## Execution of Pipeline with synthetic data added to training base

In [None]:
# Read the raw data once. Every pipeline call receives its own copy, so a call
# cannot affect the input of the next one and the file is not re-read per model.
df = pd.read_excel(r"data\flats_to_rent_wue_preprocessed_combined.xlsx")
mlflow.set_experiment("wue-rent-selected-features-aug")
model_names = ["baseline-rent", "lasso", "ridge", "xgb", "rf", "elasticnet", "linear"]

for model_name in model_names:
    # Close a run that a failed earlier iteration may have left open.
    mlflow.end_run()
    print("Starting with model: ", model_name)
    complete_from_extracted_with_augemented_training_data(
        df.copy(),
        model_name=model_name,
        feature_set=feature_set_selected,
        n_added_fake_train_data=120,
        augmentation_epochs=170,
    )
    print("Done with model: ", model_name)
    print("---------------------------------------------------------------------------")