In [None]:
from enum import Enum
from preprocessing_methods import *
from apify_scrap import *
import datetime
import mlflow
import pickle
from model_functions import *
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
class ImmoWeltUrls(Enum):
    """Immowelt result-list URLs for Wuerzburg (search radius parameter ``r=10``).

    Each member's value is the complete listing URL for one combination of
    offer type (buy / rent) and property type (flats / houses).
    """

    BUY_FLATS_WUE_10km = (
        "https://www.immowelt.de/liste/wuerzburg/wohnungen/kaufen"
        "?d=true&r=10&sd=DESC&sf=RELEVANCE&sp=1"
    )
    # The price range (pmi / pma) avoids "consulting" offers without a named price.
    BUY_HOUSES_WUE_10km = (
        "https://www.immowelt.de/liste/wuerzburg/haeuser/kaufen"
        "?d=true&pma=10000000&pmi=10&r=10&sd=DESC&sf=RELEVANCE&sp=1"
    )
    RENT_FLATS_WUE_10km = (
        "https://www.immowelt.de/liste/wuerzburg/wohnungen/mieten"
        "?d=true&r=10&sd=DESC&sf=RELEVANCE&sp=1"
    )
    RENT_HOUSES_WUE_10km = (
        "https://www.immowelt.de/liste/wuerzburg/haeuser/mieten"
        "?d=true&r=10&sd=DESC&sf=RELEVANCE&sp=1"
    )


def getFeatureSetApp():
    """Return the column names that make up the app's feature set.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.

    Returns:
        list[str]: "Object_price" and three further core columns, followed by
        the equipment / property-attribute column names.
    """
    # Core columns first (same order as before).
    core_columns = ["Object_price", "LivingSpace", "ZipCode", "Rooms"]

    # Attribute columns; spelling is kept exactly as-is because the names are
    # used as column keys elsewhere.
    attribute_columns = [
        "altbau_(bis_1945)",
        "balkon",
        "barriefrei",
        "dachgeschoss",
        "einbaukueche",
        "neubau",
        "parkett",
        "stellplatz",
        "bad/wc_getrennt",
        "personenaufzug",
        "garten",
        "garage",
        "renoviert",
        "terrasse",
        "wanne",
        "zentralheizung",
        "abstellraum",
        "ferne",
        "fussbodenheizung",
        "gartennutzung",
        "kelleranteil",
    ]

    return core_columns + attribute_columns

In [None]:
import pandas as pd

# Load the persisted train / validation / test splits (X = features, y = target).
X_train = pd.read_excel("data/X_train.xlsx")
y_train = pd.read_excel("data/y_train.xlsx")
X_val = pd.read_excel("data/X_val.xlsx")
y_val = pd.read_excel("data/y_val.xlsx")
X_test = pd.read_excel("data/X_test.xlsx")
# Fixed: the test target used to be read from X_test.xlsx, which made
# test.xlsx contain the feature columns twice and no target at all.
y_test = pd.read_excel("data/y_test.xlsx")

# Save each split as one combined workbook (features and target side by side).
train = pd.concat([X_train, y_train], axis=1)
train.to_excel("data/train.xlsx", index=False)

val = pd.concat([X_val, y_val], axis=1)
val.to_excel("data/val.xlsx", index=False)

test = pd.concat([X_test, y_test], axis=1)
test.to_excel("data/test.xlsx", index=False)


In [None]:
def _regression_metrics(y_true, y_pred, suffix=""):
    """Score predictions and return the metrics keyed by their logging name.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned row by row with ``y_true``.
        suffix: Text appended to every metric name (e.g. ``"_test"``).

    Returns:
        dict: ``{"mae<suffix>": ..., "mse<suffix>": ..., "r2<suffix>": ...}``.
    """
    return {
        f"mae{suffix}": mean_absolute_error(y_true, y_pred),
        f"mse{suffix}": mean_squared_error(y_true, y_pred),
        f"r2{suffix}": r2_score(y_true, y_pred),
    }


def trigger_retraining_with_added_data(
    url,
    feature_set,
    limit=3,
    model_list=("xgb", "lasso", "ridge", "rf", "elasticnet", "linear"),
):
    """Merge new listings into the "recent" training set and retrain models.

    Steps: read ``data/retrain_train_data.xlsx``, preprocess it, append it to
    the stored ``X_train_recent`` / ``y_train_recent`` data (dropping duplicate
    rows), overwrite those two workbooks, then train every model named in
    ``model_list``, log its metrics to MLflow and pickle it to
    ``models/<model_name>.pkl``.

    Args:
        url: Listing URL. While scraping is disabled it is only printed.
        feature_set: Column names forwarded to ``preprocess_data_for_model``.
        limit: Intended scrape limit; unused while scraping is disabled.
        model_list: Iterable of model names. Supported: ``"xgb"``, ``"lasso"``,
            ``"ridge"``, ``"rf"``, ``"elasticnet"``, ``"linear"`` and
            ``"baseline-rent"``. Unknown names are reported and skipped.

    Returns:
        pandas.DataFrame: One row per evaluated entry of ``model_list`` with
        the columns ``model``, validation metrics (``mae``, ``mse``, ``r2``)
        and their ``*_test`` / ``*_train`` counterparts.
    """
    # Local imports: `os` is not imported at file level, and importing the
    # datetime class here works regardless of what the name `datetime` is
    # bound to globally (the file does `import datetime`, i.e. the module).
    import os
    from datetime import datetime as _datetime

    print(url)
    # Scraping is currently switched off; re-enable the two lines below to
    # fetch fresh listings before retraining.
    # retrain_data = get_dataset_items(url, limit)
    # write_data_to_excel(retrain_data, "data/retrain_train_data.xlsx")
    print("Scraping is disabled; using existing data/retrain_train_data.xlsx.")

    new_df = pd.read_excel(r"data/retrain_train_data.xlsx")
    new_df = preprocess_data(new_df)
    print("Done with raw preprocessing.")

    # TODO: help from Felix needed for the section below.
    new_df.to_excel("data/retrain_train_data_preprocessed.xlsx", index=False)
    df_new = preprocess_data_for_model(new_df, feature_set)

    X_train_recent = pd.read_excel("data/X_train_recent.xlsx")
    y_train_recent = pd.read_excel("data/y_train_recent.xlsx")
    X_val = pd.read_excel("data/X_val.xlsx")
    y_val = pd.read_excel("data/y_val.xlsx")
    X_test = pd.read_excel("data/X_test.xlsx")
    # Fixed: the test target was previously read from X_test.xlsx.
    y_test = pd.read_excel("data/y_test.xlsx")

    # Append the new rows to the stored training data; deduplicate first and
    # renumber afterwards so the resulting index has no gaps.
    train_recent = pd.concat([X_train_recent, y_train_recent], axis=1)
    train_recent = (
        pd.concat([train_recent, df_new], axis=0)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    X_train_recent = train_recent.drop(["Object_price"], axis=1)
    y_train_recent = train_recent["Object_price"]
    X_train_recent.to_excel("data/X_train_recent.xlsx", index=False)
    y_train_recent.to_excel("data/y_train_recent.xlsx", index=False)
    print("Retraining data successfully added to training data.")

    now = _datetime.now()
    # NOTE(review): `mlflow_set_experiment` is not defined in this notebook;
    # presumably it comes from one of the star imports. If it does not exist,
    # `mlflow.set_experiment` is the MLflow call - confirm.
    mlflow_set_experiment(f"retrain-{now.strftime('%m-%d-%H-%M-%S')}")

    # Rows are collected in a list and turned into a DataFrame once at the
    # end (DataFrame.append no longer exists in pandas 2.x).
    result_rows = []

    for model_name in model_list:
        if model_name == "xgb":
            mlflow.xgboost.autolog()
        else:
            mlflow.sklearn.autolog()

        if model_name == "baseline-rent":
            # NOTE(review): the average price is derived from the validation
            # split and then scored on that same split - confirm intended.
            avg_price = baseline_rent(X_val, y_val)
            baseline_metrics = {
                **_regression_metrics(y_val, X_val["LivingSpace"] * avg_price),
                **_regression_metrics(
                    y_test, X_test["LivingSpace"] * avg_price, "_test"
                ),
                # Fixed: train metrics used to compare the validation
                # predictions against the training target.
                **_regression_metrics(
                    y_train_recent,
                    X_train_recent["LivingSpace"] * avg_price,
                    "_train",
                ),
            }
            for metric_name, metric_value in baseline_metrics.items():
                mlflow.log_metric(metric_name, metric_value)
            result_rows.append({"model": model_name, **baseline_metrics})
            print(f"Evaluating and Logging {model_name} model to MLFlow done.")
            mlflow.end_run()
            # The baseline has no fitted estimator, so nothing is pickled.
            continue

        # Every model is trained on the refreshed training data (previously
        # rf / xgb / elasticnet / linear used the stale global X_train).
        if model_name == "lasso":
            model = train_and_eval_lasso(X_train_recent, y_train_recent, X_val, y_val)
        elif model_name == "ridge":
            model = train_and_eval_ridge(X_train_recent, y_train_recent, X_val, y_val)
        elif model_name == "rf":
            model = train_and_eval_rf(X_train_recent, y_train_recent, X_val, y_val)
        elif model_name == "xgb":
            model = train_and_eval_xgb(X_train_recent, y_train_recent, X_val, y_val)
        elif model_name == "elasticnet":
            model = train_and_eval_elasticnet(
                X_train_recent, y_train_recent, X_val, y_val
            )
        elif model_name == "linear":
            model = train_and_eval_linear(X_train_recent, y_train_recent, X_val, y_val)
        else:
            print(
                "Model not found. Model_name must be 'lasso', 'ridge', 'rf', 'xgb', 'elasticnet', 'linear' or 'baseline-rent', or configure the pipeline manually."
            )
            # Skip instead of evaluating a missing (or previous) model.
            continue

        print(f"Training {model_name} model done...")
        print(f"Evaluating and Logging {model_name} model to MLFlow...")

        pred_train = model.predict(X_train_recent)
        pred_val = model.predict(X_val)
        pred_test = model.predict(X_test)

        # Unsuffixed metric names refer to the validation split.
        metrics = {
            **_regression_metrics(y_val, pred_val),
            **_regression_metrics(y_test, pred_test, "_test"),
            **_regression_metrics(y_train_recent, pred_train, "_train"),
        }
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        result_rows.append({"model": model_name, **metrics})

        # Make sure the target directory exists before writing the pickle.
        os.makedirs("models", exist_ok=True)
        with open(f"models/{model_name}.pkl", "wb") as file:
            pickle.dump(model, file)
        print(f"Model {model_name} saved locally as pickle file.")
        print("Done with train")
        mlflow.end_run()

    # Returned only after all requested models were processed (the return
    # used to sit inside the loop and stopped after the first model).
    return pd.DataFrame(
        result_rows,
        columns=[
            "model",
            "mae",
            "mse",
            "r2",
            "r2_test",
            "mae_test",
            "mse_test",
            "mae_train",
            "mse_train",
            "r2_train",
        ],
    )
    
# Retrain on the rental-flat listings. The URL is taken from ImmoWeltUrls
# (same value as the literal used before) so it is defined in one place only.
trigger_retraining_with_added_data(
    url=ImmoWeltUrls.RENT_FLATS_WUE_10km.value,
    feature_set=getFeatureSetApp(),
)
