In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from sklearn.linear_model import ElasticNet
import requests
from bs4 import BeautifulSoup
from ydata_profiling import ProfileReport


In [3]:
# Load the pre-processed rental listings. Forward slashes are used so the
# relative path resolves on Windows as well as on Linux/macOS.
df = pd.read_excel('data/flats_to_rent_wue_preprocessed_0407.xlsx')
# Build the profiling report object for the freshly loaded frame.
profile = ProfileReport(df, title="Profiling Report")

In [4]:
# Write the profiling report to an HTML file in the working directory.
profile.to_file("your_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Preprocessing & Feature Engineering


In [None]:
def preprocess_data(df, plot_corr=True, corr_threshold=0.20):
    """Clean the listings frame and build the model matrix.

    Steps, in order:
      1. Replace the literal string ``""`` with NaN and drop every row that
         contains a NaN.
      2. Coerce ``LivingSpace`` and ``Rooms`` to numbers (unparseable values
         become NaN) and drop rows missing ``ConstructionYear``,
         ``Object_price``, ``Rooms`` or ``LivingSpace``.
      3. Drop the ``Unnamed: 0`` column if present, cast ``ZipCode`` to str
         and ``LivingSpace`` to float, and reset the index.
      4. Keep numeric columns whose absolute correlation with
         ``Object_price`` exceeds ``corr_threshold`` plus all string/object
         columns, then one-hot encode the latter.

    The input frame is not modified, so the function can be called
    repeatedly on the same frame.

    Args:
        df: raw listings DataFrame.
        plot_corr: when True (default) draw a seaborn heatmap of the numeric
            correlation matrix on the current matplotlib axes.
        corr_threshold: minimum absolute correlation with ``Object_price``
            for a numeric column to be kept (default 0.20).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the one-hot encoded feature frame
        and ``y`` is the ``Object_price`` series.
    """
    # replace()/dropna() return new objects, leaving the caller's frame intact.
    data = df.replace('""', np.nan).dropna()
    data = data.assign(
        LivingSpace=pd.to_numeric(data["LivingSpace"], errors="coerce"),
        Rooms=pd.to_numeric(data["Rooms"], errors="coerce"),
    )
    data = data.dropna(subset=["ConstructionYear", "Object_price", "Rooms", "LivingSpace"])
    # errors="ignore" keeps the function usable on frames without the column.
    data = data.drop(columns="Unnamed: 0", errors="ignore")
    data = data.astype({"ZipCode": str, "LivingSpace": float}).reset_index(drop=True)

    # keep only the most promising features
    # Correlation is computed once, on numeric/bool columns only: calling
    # corr() on a frame with string columns raises on recent pandas versions.
    corr = data.select_dtypes(include=["number", "bool"]).corr()
    if plot_corr:
        sns.heatmap(corr, cmap="RdBu")
    price_corr = corr["Object_price"]
    important_num_cols = list(price_corr[price_corr.abs() > corr_threshold].index)
    # Treat both object and dedicated string dtypes as categorical.
    cat_cols = [
        col for col in data.columns
        if pd.api.types.is_object_dtype(data[col].dtype)
        or pd.api.types.is_string_dtype(data[col].dtype)
    ]
    data = data[important_num_cols + cat_cols]

    # one-hot encoding
    y = data["Object_price"]
    X = pd.get_dummies(data.drop(columns="Object_price"), columns=cat_cols)

    return X, y


# Re-load the listings and run the preprocessing. Forward slashes keep the
# relative path valid on Windows as well as on Linux/macOS.
df = pd.read_excel('data/flats_to_rent_wue_preprocessed_0407.xlsx')
X, y = preprocess_data(df)
# Quick sanity check of the resulting feature matrix and target sizes.
print(X.shape)
print(y.shape)

# Exploratory Data Analysis

In [None]:
# Correlation is only defined for numeric data; restricting the frame to
# numeric/bool columns avoids the error recent pandas versions raise when
# corr() meets string columns.
numeric_df = df.select_dtypes(include=["number", "bool"])
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), cmap="RdBu", ax=ax)
ax.set_title("Correlations between variables", size=15)
plt.show()

# Data Split


In [None]:
def data_split(X, y, train_size=0.8, random_state=42):
    """Split features and target into train, validation and test parts.

    Two consecutive ``train_test_split`` calls are made with the same
    ``train_size`` and ``random_state``: the first separates the test part,
    the second carves the validation part out of the remaining training data.

    Args:
        X: feature matrix.
        y: target values aligned with ``X``.
        train_size: fraction kept on the training side of each split.
        random_state: seed forwarded to both splits.

    Returns:
        tuple: ``(train_X, test_X, train_y, test_y, val_X, val_y)``.
    """
    split_kwargs = {"train_size": train_size, "random_state": random_state}
    # First cut: hold out the test part.
    remaining_X, test_X, remaining_y, test_y = train_test_split(X, y, **split_kwargs)
    # Second cut: separate validation data from what is left.
    train_X, val_X, train_y, val_y = train_test_split(remaining_X, remaining_y, **split_kwargs)
    return train_X, test_X, train_y, test_y, val_X, val_y

# Create the train / validation / test partitions with the default settings.
(
    train_X, test_X,
    train_y, test_y,
    val_X, val_y,
) = data_split(X, y)

# Train Different Models

## Baseline 

In [35]:
def scrape_avf_rental_prices():
    """Scrape the average rental price per square metre for Würzburg.

    Downloads the wohnungsboerse.net rent-index page, looks for the inline
    ``<script>`` whose text contains ``pdfData`` and reads the value that
    follows ``avg_rent_price: '`` up to the next ``',``. The value is then
    converted from German number formatting (e.g. ``"1.234,56 €/m2"``) to a
    float.

    Returns:
        float: the parsed price, or ``0`` when the script tag or the
        ``avg_rent_price`` entry cannot be found.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the extracted text cannot be converted to a float.
    """
    url = "https://www.wohnungsboerse.net/mietspiegel-Wuerzburg/2772"
    # A timeout prevents the notebook from hanging on an unresponsive server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Script tags without inline content have a string of None, so guard
    # before the substring test; `string=` is the current name of `text=`.
    script_tag = soup.find(
        'script',
        string=lambda content: content is not None and 'pdfData' in content,
    )
    rental_price = 0
    if script_tag is None:
        print('The script tag containing the rental price was not found.')
        return rental_price

    script_content = script_tag.string
    key = 'avg_rent_price: '
    key_index = script_content.find(key)
    if key_index == -1:
        # Without this check a missing key would yield an arbitrary slice.
        print('The avg_rent_price entry was not found in the script tag.')
        return rental_price

    start_index = key_index + len(key) + 1  # +1 skips the opening quote
    end_index = script_content.find('\',', start_index)
    if end_index == -1:
        print('The avg_rent_price entry was not found in the script tag.')
        return rental_price

    rental_price = script_content[start_index:end_index]
    print('The average rental price in Würzburg is:', rental_price)
    # German format: drop the unit and thousands dots, use '.' as decimal mark.
    rental_price = rental_price.replace('€/m2', '').replace('.', '').replace(',', '.')
    rental_price = float(rental_price.strip())
    print(f"Extrcated rental price as float: {rental_price}")
    return rental_price

In [37]:
def baseline_avg_rent_price(val_X, val_y, runname="baseline"):
    """Evaluate a naive baseline: living space times the average price per m².

    The average price is obtained from ``scrape_avf_rental_prices()``. MAE,
    MSE and R² of the resulting predictions against ``val_y`` are logged to a
    new MLflow run and printed.

    Args:
        val_X: validation features; must contain a ``LivingSpace`` column.
        val_y: true prices for the validation rows.
        runname: name of the MLflow run that receives the metrics.

    Raises:
        ValueError: if the scraper returned no usable price (it returns 0
            when the price cannot be found), since the baseline would then
            predict 0 for every listing.
    """
    # The scraper defined in this notebook is scrape_avf_rental_prices.
    avg_price_per_sqm_rent = scrape_avf_rental_prices()
    if not avg_price_per_sqm_rent:
        raise ValueError("No average rental price available; cannot compute the baseline.")
    # Close any run that is still active so start_run() below opens a fresh one.
    mlflow.end_run()

    baseline_preds = val_X['LivingSpace'] * avg_price_per_sqm_rent
    baseline_mae = mean_absolute_error(val_y, baseline_preds)
    baseline_r2 = r2_score(val_y, baseline_preds)
    baseline_mse = mean_squared_error(val_y, baseline_preds)

    with mlflow.start_run(run_name=runname):
        mlflow.log_metric("mse", baseline_mse)
        mlflow.log_metric("mae", baseline_mae)
        mlflow.log_metric("r2", baseline_r2)

    print(f"Baseline Mae: {baseline_mae}")
    print(f"Baseline MSE: {baseline_mse}")    
    print(f"Baseline R2 Score: {baseline_r2}")

## Linear-Regression

## XGBRegressor


In [None]:
# Send all MLflow runs of this notebook to the tracking server on localhost.
MLFLOW_TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
def train_and_eval_xgb(train_X, train_y, val_X, val_y, test_X, test_y, early_stopping_rounds=30, max_depth=6, n_estimators=1000):
    """Fit an XGBRegressor and log its validation metrics to MLflow.

    The model is trained on the training data with the validation data passed
    as ``eval_set``; ``rmse`` and ``mae`` are used as evaluation metrics.
    MAE, MSE and R² on the validation data are logged to the MLflow run.

    Args:
        train_X, train_y: training features and target.
        val_X, val_y: validation features and target.
        test_X, test_y: accepted for signature compatibility; not used here.
        early_stopping_rounds: forwarded to ``XGBRegressor``.
        max_depth: forwarded to ``XGBRegressor``.
        n_estimators: forwarded to ``XGBRegressor``.

    Returns:
        tuple: ``(model, mae, mse, r2)`` with the metrics computed on the
        validation data.
    """
    mlflow.xgboost.autolog()
    hyperparams = {
        "early_stopping_rounds": early_stopping_rounds,
        "max_depth": max_depth,
        "n_estimators": n_estimators,
    }
    with mlflow.start_run():
        model = xgb.XGBRegressor(eval_metric=['rmse', 'mae'], random_state=42, **hyperparams)
        for param_name, param_value in hyperparams.items():
            mlflow.log_param(param_name, param_value)
        model.fit(X=train_X, y=train_y, eval_set=[(val_X, val_y)], verbose=True)

        val_preds = model.predict(val_X)
        mae = mean_absolute_error(val_y, val_preds)
        mse = mean_squared_error(val_y, val_preds)
        r2 = r2_score(val_y, val_preds)
        for metric_name, metric_value in (("mae", mae), ("mse", mse), ("r2", r2)):
            mlflow.log_metric(metric_name, metric_value)
    return model, mae, mse, r2

# Train and evaluate the XGBoost model with the settings spelled out explicitly.
train_and_eval_xgb(
    train_X=train_X,
    train_y=train_y,
    val_X=val_X,
    val_y=val_y,
    test_X=test_X,
    test_y=test_y,
    early_stopping_rounds=30,
    max_depth=6,
    n_estimators=1000,
)


## Random Forest

In [None]:
def train_and_eval_rf(train_X, train_y, val_X, val_y, n_estimators=50, random_state=0):
    """Fit a RandomForestRegressor and log train/validation metrics to MLflow.

    Args:
        train_X, train_y: training features and target.
        val_X, val_y: validation features and target.
        n_estimators: forwarded to ``RandomForestRegressor``.
        random_state: forwarded to ``RandomForestRegressor``.

    Returns:
        tuple: ``(model, mae, mse, r2, mae_train, mse_train, r2_train)``;
        the first three metrics are computed on the validation data, the
        last three on the training data.
    """
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        model.fit(train_X, train_y)

        # Metrics on the data the model was fitted on.
        train_preds = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, train_preds)
        mse_train = mean_squared_error(train_y, train_preds)
        r2_train = r2_score(train_y, train_preds)

        # Metrics on the validation data.
        val_preds = model.predict(val_X)
        mae = mean_absolute_error(val_y, val_preds)
        mse = mean_squared_error(val_y, val_preds)
        r2 = r2_score(val_y, val_preds)

        logged_metrics = (
            ("mae", mae),
            ("mse", mse),
            ("r2", r2),
            ("mae_train", mae_train),
            ("mse_train", mse_train),
            ("r2_train", r2_train),
        )
        for metric_name, metric_value in logged_metrics:
            mlflow.log_metric(metric_name, metric_value)
    return model, mae, mse, r2, mae_train, mse_train, r2_train
    
# Train and evaluate the random forest on the train/validation split.
train_and_eval_rf(
    train_X=train_X,
    train_y=train_y,
    val_X=val_X,
    val_y=val_y,
    n_estimators=50,
    random_state=0,
)

## ElasticNet

In [29]:
def train_and_eval_elasticnet(train_X, train_y, val_X, val_y, max_iter=30):
    """Fit an ElasticNet model and log train/validation metrics to MLflow.

    Args:
        train_X, train_y: training features and target.
        val_X, val_y: validation features and target.
        max_iter: maximum number of solver iterations forwarded to
            ``ElasticNet``. Defaults to 30, the value previously hard-coded
            here; raise it if the solver reports that it did not converge.

    Returns:
        tuple: ``(model, mae, mse, r2, mae_train, mse_train, r2_train)``;
        the first three metrics are computed on the validation data, the
        last three on the training data.
    """
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        model = ElasticNet(max_iter=max_iter)
        model.fit(train_X, train_y)

        # Metrics on the data the model was fitted on.
        pred_train = model.predict(train_X)
        mae_train = mean_absolute_error(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)
        r2_train = r2_score(train_y, pred_train)

        # Metrics on the validation data.
        preds = model.predict(val_X)
        mae = mean_absolute_error(val_y, preds)
        mse = mean_squared_error(val_y, preds)
        r2 = r2_score(val_y, preds)

        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae_train", mae_train)
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("r2_train", r2_train)
        return model, mae, mse, r2, mae_train, mse_train, r2_train

# Train and evaluate the ElasticNet model on the train/validation split.
train_and_eval_elasticnet(
    train_X=train_X,
    train_y=train_y,
    val_X=val_X,
    val_y=val_y,
)



(ElasticNet(max_iter=30),
 240.95171039123125,
 118514.00451704123,
 0.4971844010433488,
 195.29754000458772,
 54045.200002599486,
 0.8402515028199589)

In [None]:
# Hold-out evaluation of the random forest: fit on the 80% portion and score
# on the remaining 20%.
# The split is stored under its own names. Reusing train_X / train_y here
# would replace the training data from the data-split cell with a set that
# also contains the validation rows, so re-running a validation cell
# afterwards would evaluate on data the model was fitted on.
trainval_X, holdout_X, trainval_y, holdout_y = train_test_split(
    X, y, train_size=0.8, random_state=42
)
rf_model = RandomForestRegressor(n_estimators=50, random_state=0)
rf_model.fit(trainval_X, trainval_y)
preds_test_rf = rf_model.predict(holdout_X)
score = mean_absolute_error(holdout_y, preds_test_rf)
print("MAE: {}".format(score))

In [None]:
# Hold-out evaluation of ElasticNet: fit on the 80% portion and score on the
# remaining 20%.
# The split is stored under its own names. Reusing train_X / train_y here
# would replace the training data from the data-split cell with a set that
# also contains the validation rows, so re-running a validation cell
# afterwards would evaluate on data the model was fitted on.
trainval_X, holdout_X, trainval_y, holdout_y = train_test_split(
    X, y, train_size=0.8, random_state=42
)
elastic_net = ElasticNet(max_iter=30)
elastic_net.fit(trainval_X, trainval_y)
test_preds = elastic_net.predict(holdout_X)
test_score = mean_absolute_error(holdout_y, test_preds)
print("MAE auf Testdaten:", test_score)