In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import numpy as np
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [None]:
# Load the preprocessed rental listings. Forward slashes keep the relative path
# valid on Windows as well as on Linux/macOS (a backslash is a separator on Windows only).
data = pd.read_excel('data/flats_to_rent_wue_preprocessed_0407.xlsx')

# Drop every row that lacks a construction year, a price (the target) or a room
# count, then renumber the rows 0..n-1 so positional and label indexing agree.
data = (
    data
    .dropna(subset=['ConstructionYear', 'Object_price', 'Rooms'])
    .reset_index(drop=True)
)

# Title, Object_currency, Url, EstateType and DistributionType were candidates for
# removal and are currently kept.
# NOTE(review): if Title/Url are (near-)unique per listing, one-hot encoding them
# later yields roughly one column per row -- confirm whether they should be dropped.

# Target and feature matrix.
y = data['Object_price']
X = data.drop('Object_price', axis=1)

In [None]:
# Two-stage hold-out: the first call reserves 20 % of the rows for testing, the
# second takes 20 % of the remainder for validation (about 64 % / 16 % / 20 % overall).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, random_state=0
)
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, train_size=0.8, random_state=0
)


def _columns_with_dtype(frame, dtype_name):
    """List the columns of ``frame`` whose dtype equals ``dtype_name``."""
    return [column for column in frame.columns if frame[column].dtype == dtype_name]


# Feature groups are derived purely from the pandas dtype of each training column.
# The int64 group is named "bin"; presumably these are 0/1 flags -- TODO confirm.
num_cols = _columns_with_dtype(train_X, 'float64')
cat_cols = _columns_with_dtype(train_X, 'object')
bin_cols = _columns_with_dtype(train_X, 'int64')

In [None]:
# handle_unknown='ignore' lets validation/test rows contain categories that were
# not present when the encoder was fitted.
encoder = OneHotEncoder(handle_unknown='ignore')

print (train_X[cat_cols])

# The category vocabulary is learned from the training split only; the other
# splits are merely transformed with it.
X_train_categorical_xgb = encoder.fit_transform(train_X[cat_cols]).toarray()
X_valid_categorical_xgb, X_test_categorical_xgb = (
    encoder.transform(split[cat_cols]).toarray() for split in (val_X, test_X)
)


def _assemble_features(frame, one_hot_block):
    """Place int64 columns, float64 columns and the one-hot block side by side."""
    return np.concatenate([frame[bin_cols], frame[num_cols], one_hot_block], axis=1)


X_train_processed = _assemble_features(train_X, X_train_categorical_xgb)
X_valid_processed = _assemble_features(val_X, X_valid_categorical_xgb)
X_test_processed = _assemble_features(test_X, X_test_categorical_xgb)


print (X_train_processed.shape)

In [None]:
# Point MLflow at the local tracking server and select the experiment up front.
# Runs started before set_experiment() is called are stored in MLflow's default
# experiment, so doing it here keeps the baseline next to the model runs.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("flats_to_rent")

# Basic Baseline


Refresh function for the baseline: scrapes the current average rent per m² in Würzburg.


In [86]:
import requests
from bs4 import BeautifulSoup

def scrape_rental_prices(timeout=10):
    """Scrape the average rent per square metre for Würzburg from wohnungsboerse.net.

    The page embeds its figures in an inline ``<script>`` that mentions ``pdfData``;
    the price is read from that script's ``avg_rent_price: '<number> €/m2'`` entry.

    Args:
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        The price as a float (e.g. ``11.21``), or ``0`` when either the script
        tag or the ``avg_rent_price`` entry cannot be found.

    Raises:
        requests.exceptions.RequestException: On connection problems, a timeout
            or an HTTP error status.
        ValueError: If the extracted text cannot be converted to a number.
    """
    url = "https://www.wohnungsboerse.net/mietspiegel-Wuerzburg/2772"
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # `string=` replaces the deprecated `text=` keyword. The filter is also called
    # for script tags without inline content, whose string is None.
    script_tag = soup.find(
        'script',
        string=lambda content: content is not None and 'pdfData' in content,
    )
    if not script_tag:
        print('The script tag containing the rental price was not found.')
        return 0

    raw_price = _extract_avg_rent_price(script_tag.string)
    if raw_price is None:
        print('The avg_rent_price entry was not found in the script tag.')
        return 0

    print('The average rental price in Würzburg is:', raw_price)
    # German number format: '.' groups thousands and ',' is the decimal mark.
    normalized = raw_price.replace('€/m2', '').replace('.', '').replace(',', '.').strip()
    rental_price = float(normalized)
    print(f"Extrcated rental price as float: {rental_price}")
    return rental_price


def _extract_avg_rent_price(script_content):
    """Return the quoted ``avg_rent_price`` value from the script text, or None if absent."""
    _, marker, remainder = script_content.partition("avg_rent_price: '")
    if not marker:
        return None
    raw_value, terminator, _ = remainder.partition("',")
    return raw_value if terminator else None

# Run the scraper once so the cell output shows the price the page reports right now.
rental_price = scrape_rental_prices()
print(rental_price)




The average rental price in Würzburg is: 11,21 €/m2
11.21


  script_tag = soup.find('script', text=lambda text: 'pdfData' in text)


In [87]:

# Naive baseline: predicted rent = living space x city-wide average rent per m2.
avg_price_per_sqm_buy = 4070.62  # not used by the rent baseline in this cell
avg_price_per_sqm_rent = scrape_rental_prices()

# scrape_rental_prices() returns 0 when it cannot find the price on the page.
# Multiplying by 0 would log an all-zero "baseline", so stop here instead.
if avg_price_per_sqm_rent <= 0:
    raise ValueError(
        "No usable average rent per m2 was scraped; the baseline cannot be computed."
    )

mlflow.end_run()  # close any run left active by an earlier cell

# presumably LivingSpace is given in m2 -- TODO confirm against the data set
baseline_preds = val_X['LivingSpace'] * avg_price_per_sqm_rent
baseline_mae = mean_absolute_error(val_y, baseline_preds)
baseline_r2 = r2_score(val_y, baseline_preds)
baseline_mse = mean_squared_error(val_y, baseline_preds)

# Log baseline metrics to MLflow
with mlflow.start_run(run_name="baseline_with_avg_price_per_sqm_rent"):
    # The price is scraped live and changes over time; record the value used so
    # the logged metrics can be reproduced later.
    mlflow.log_param("avg_price_per_sqm_rent", avg_price_per_sqm_rent)
    mlflow.log_metric("mse", baseline_mse)
    mlflow.log_metric("mae", baseline_mae)
    mlflow.log_metric("r2", baseline_r2)

print(f"Baseline Mae: {baseline_mae}")
print(f"Baseline R2 Score: {baseline_r2}")

  script_tag = soup.find('script', text=lambda text: 'pdfData' in text)


The average rental price in Würzburg is: 11,21 €/m2
Baseline Mae: 295.6022818181818
Baseline R2 Score: 0.3567048041389107


# XGBoost

In [None]:
# Select the experiment before any run is started; a run that is already active
# stays in whatever experiment was current when it began.
mlflow.set_experiment("flats_to_rent")
mlflow.xgboost.autolog()
mlflow.end_run()  # close any run left active by an earlier cell

early_stopping_rounds = 19

# A single run for this model. Opening a bare mlflow.start_run() first and then a
# second, non-nested start_run() fails because a run is already active.
with mlflow.start_run(run_name="xgboost-regression", description="XGBoost regression with early stopping. All features used.") as run:
    mlflow.log_param("used_features", "all")
    mlflow.log_param("early stopping rounds", early_stopping_rounds)

    # XGBoost uses the LAST entry of eval_metric for early stopping. 'logloss' is a
    # classification metric and is meaningless for price regression, so it is
    # left out; early stopping now follows the validation MAE.
    model = xgb.XGBRegressor(eval_metric=['rmse', 'mae'], early_stopping_rounds=early_stopping_rounds)
    model.fit(X=X_train_processed,
            y=train_y,
            eval_set=[(X_valid_processed, val_y)],
            verbose=True)
    mlflow.xgboost.log_model(model, "xgboost")

    preds = model.predict(X_valid_processed)
    mlflow.log_metric("mae", mean_absolute_error(val_y, preds))
    mlflow.log_metric("mse", mean_squared_error(val_y , preds))
    # Logged as well so this run is comparable with the baseline and Ridge runs.
    mlflow.log_metric("r2", r2_score(val_y, preds))
    # The context manager ends the run on exit; no explicit end_run() is needed.

## Linear Regression


In [None]:
# mlflow.sklearn is the only module this cell needs that the notebook's first
# cell does not already import.
import mlflow.sklearn

mlflow.end_run()  # close any run left active by an earlier cell
# scikit-learn autologging stays off: model and metrics are logged explicitly below.
mlflow.sklearn.autolog(disable=True)

ridge_alpha = 0.7  # L2 regularisation strength passed to Ridge

# NOTE(review): Ridge rejects NaN inputs, and only ConstructionYear, Object_price
# and Rooms are NaN-filtered at load time -- confirm the other numeric columns
# are complete.
with mlflow.start_run(run_name="ridge-regression", description="Basic ridge Regression with scikit-learn. All features used.") as run:
    # Autologging is disabled, so the hyperparameter has to be recorded by hand.
    mlflow.log_param("alpha", ridge_alpha)

    model = Ridge(alpha=ridge_alpha)
    model.fit(X_train_processed, train_y)
    preds = model.predict(X_valid_processed)

    mlflow.sklearn.log_model(model, "ridge-regression-model")
    mlflow.log_metric("mse", mean_squared_error(val_y, preds))
    mlflow.log_metric("mae", mean_absolute_error(val_y, preds))
    mlflow.log_metric("r2", r2_score(val_y, preds))
    # The context manager ends the run on exit; no explicit end_run() is needed.