# Find best regression augmentation parameters

This notebook runs experiments to find the best augmentation parameters for the regression models and evaluates the results.

In [1]:
import os
import subprocess

import mlflow
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from cross_validate_regression import cross_validate_regression_model

In [2]:
# load_dotenv() comes from python-dotenv; it is expected to populate the
# process environment from a local .env file before the lookup below.
load_dotenv()

# Hand the configured tracking location to MLflow. If MLFLOW_TRACKING_URL is
# not set, None is passed through unchanged.
mlflow_tracking_url = os.getenv("MLFLOW_TRACKING_URL")
mlflow.set_tracking_uri(mlflow_tracking_url)

In [3]:
# Merged raw dataset; the path is relative to the working directory at the
# time this cell runs.
df_raw_merged = pd.read_parquet("./data/my_datasets/raw_merged.parquet")

# Distinct values of the device_uuid column, later handed to the
# cross-validation helper.
all_device_uuids = df_raw_merged.loc[:, "device_uuid"].unique()

In [4]:
def load_exp_results(exp_name, run_path="/home/nkuechen/Documents/Thesis"):
    """Export all runs of an MLflow experiment to CSV and load them.

    Side effect: the process working directory is changed to ``run_path``
    and stays changed after the call; the CSV path used below is relative
    to that directory.

    Args:
        exp_name: Name of the MLflow experiment to export.
        run_path: Directory to switch into before exporting. The default is
            the path used by the original author; pass your own project
            root when running on another machine.

    Returns:
        A ``pandas.DataFrame`` read from the exported CSV, or ``None`` when
        no experiment named ``exp_name`` exists.

    Raises:
        subprocess.CalledProcessError: If the ``mlflow experiments csv``
            command exits with a non-zero status.
    """
    os.chdir(run_path)
    print(os.getcwd())

    experiment = mlflow.get_experiment_by_name(exp_name)
    if experiment is None:
        print(f"Could not find experiment {exp_name}")
        return None

    output_csv = f"code/thesis_code/data/aug_effect/{exp_name}.csv"
    # Pass the arguments as a list (no shell) so experiment names containing
    # spaces or shell metacharacters cannot break or alter the command, and
    # fail loudly instead of silently reading a stale CSV from an earlier run.
    subprocess.run(
        [
            "mlflow",
            "experiments",
            "csv",
            "-o",
            output_csv,
            "-x",
            str(experiment.experiment_id),
        ],
        check=True,
    )
    return pd.read_csv(output_csv)

## Einfluss von Augmentationsparametern auf die `mdt`

In [5]:
# Column names handed to the cross-validation helper, grouped for readability.
# The flattened order below is identical to the original single list.

# Direct measurements and the battery delta.
_measurement_columns = [
    "battery_level_percent",
    "radio_level_percent",
    "air_temperature",
    "battery_diff",
]

# Rolling medians over windows of 5 and 50 (per the column names).
_rolling_median_columns = [
    "battery_level_percent_rolling_median_5",
    "battery_diff_rolling_median_5",
    "battery_level_percent_rolling_median_50",
    "battery_diff_rolling_median_50",
]

# NOTE(review): these look like one-hot encoded version identifiers -- confirm.
_version_columns = ["v01.49", "v01.66", "v01.70", "1.0"]

# NOTE(review): these look like one-hot encoded device model numbers -- confirm.
_model_columns = ["0572 2620", "0572 2622", "0572 2623"]

features = [
    *_measurement_columns,
    *_rolling_median_columns,
    *_version_columns,
    *_model_columns,
    # NOTE(review): presumably the regression target; verify how the
    # cross-validation helper treats this column.
    "label",
]

In [6]:
# Each sweep: (augmentation parameter, boolean flag that enables it, values to try).
param_sweeps = [
    ("max_noise", "add_noise", [0, 10, 20]),
    ("max_noise_temperature", "add_noise_temperature", [0, 5]),
    ("random_max_time_warp_percent", "random_warp_status_times", [0, 0.5, 1]),
]

# Model classes in the order they are evaluated, each with its constructor call.
model_builders = {
    "LinearRegression": lambda: LinearRegression(n_jobs=8),
    "DecisionTreeRegressor": lambda: DecisionTreeRegressor(),
    "XGBRegressor": lambda: XGBRegressor(),
}

# Maps experiment name -> sweep definition; filled for every sweep, including
# the ones that are skipped, because the evaluation cells iterate over it.
reg_experiments = {}
for param, param_bool, param_values in param_sweeps:
    REG_EXP = f"reg_{param}"
    reg_experiments[REG_EXP] = (param, param_bool, param_values)

    # An existing MLflow experiment of that name means the sweep already ran.
    if mlflow.get_experiment_by_name(REG_EXP) is not None:
        print(f"Skipping {REG_EXP} because the experiment already exists.")
        continue

    # Five repetitions, each with n_dev=63.
    for n_dev in [63] * 5:
        for model_class, build_model in model_builders.items():
            # The same estimator object is passed for every value of the sweep.
            model = build_model()
            for param_value in param_values:
                train_df_params = {
                    param_bool: True,
                    param: param_value,
                }
                cross_validate_regression_model(
                    raw_merged_df=df_raw_merged,
                    model=model,
                    n_dev=n_dev,
                    pred_hor=1,
                    n_aug=1,
                    train_df_params=train_df_params,
                    features=features,
                    all_device_uuids=all_device_uuids,
                    by_metric="metrics.mdt",
                    mlflow_experiment=REG_EXP,
                )

Skipping reg_max_noise because the experiment already exists.
Skipping reg_max_noise_temperature because the experiment already exists.
Skipping reg_random_max_time_warp_percent because the experiment already exists.


In [7]:
# Show the experiment-name -> (parameter, enabling flag, values) mapping
# collected by the sweep cell, as a quick sanity check.
print(reg_experiments)

{'reg_max_noise': ('max_noise', 'add_noise', [0, 10, 20]), 'reg_max_noise_temperature': ('max_noise_temperature', 'add_noise_temperature', [0, 5]), 'reg_random_max_time_warp_percent': ('random_max_time_warp_percent', 'random_warp_status_times', [0, 0.5, 1])}


In [15]:
# Human-readable (German) description for each augmentation parameter.
column_to_description = dict(
    max_noise="Maximales Jittering auf Batteriewerte",
    max_noise_temperature="Maximales Jittering auf Lufttemperatur",
    random_max_time_warp_percent="Maximales Jittering auf Messpunktabstände",
)

# Display name for each augmentation parameter; the values already carry
# <i>...</i> markup for use in plot titles and axis labels.
params_to_better_params = dict(
    max_noise="<i>max_jittering_battery_level</i>",
    max_noise_temperature="<i>max_jittering_air_temperature</i>",
    random_max_time_warp_percent="<i>max_jittering_measurement_interval</i>",
)

In [33]:
# For every augmentation sweep: export its runs, draw a box plot of MDT per
# parameter value and model class, and print the per-group medians.
for reg_exp, (param, param_bool, param_values) in reg_experiments.items():
    reg_exp_results = load_exp_results(reg_exp)
    if reg_exp_results is None:
        # load_exp_results has already reported the missing experiment.
        continue

    param_column = f"params.{param}"
    # The label already contains <i>...</i> markup, so it is not wrapped
    # again in the title (the original produced nested <i><i>...</i></i>).
    param_label = params_to_better_params[param]

    try:
        fig = px.box(
            reg_exp_results,
            x=param_column,
            y="metrics.mdt",
            color="params.model_class",
            title=f"Einfluss des Parameters {param_label} auf die MDT.",
            width=600,
            height=600,
        )
        fig.update_yaxes(title="MDT")
        fig.update_xaxes(title=param_label)
        fig.update_layout(legend_title="Modelltyp")
        fig.show()

        # Median MDT as {model class: {parameter value: median}}.
        medians = {
            model_class: {
                param_value: param_group["metrics.mdt"].median()
                for param_value, param_group in model_group.groupby(by=param_column)
            }
            for model_class, model_group in reg_exp_results.groupby(
                by="params.model_class"
            )
        }
        print(medians)

        # Rows: model classes, columns: parameter values.
        medians_df = pd.DataFrame(medians).T
        # Scope the one-decimal float format to this print instead of changing
        # the pandas display options for the rest of the notebook.
        with pd.option_context("display.float_format", "{:.1f}".format):
            print(medians_df)

    except ValueError:
        # Add context to the message, then let the original error surface.
        print(f"{reg_exp} does not seem to be finished.")
        raise

/home/nkuechen/Documents/Thesis
Experiment with ID 907516279037110457 has been exported as a CSV to file: code/thesis_code/data/aug_effect/reg_max_noise.csv.


{'DecisionTreeRegressor': {0: 39.77072447804155, 10: 40.49281898656898, 20: 40.65698712825729}, 'LinearRegression': {0: 39.64587355212355, 10: 42.28440825883354, 20: 41.64832121491911}, 'XGBRegressor': {0: 35.57647450356363, 10: 34.83665966386555, 20: 39.303858992558176}}
                        0    10   20
DecisionTreeRegressor 39.8 40.5 40.7
LinearRegression      39.6 42.3 41.6
XGBRegressor          35.6 34.8 39.3
/home/nkuechen/Documents/Thesis
Experiment with ID 809829503745210599 has been exported as a CSV to file: code/thesis_code/data/aug_effect/reg_max_noise_temperature.csv.


{'DecisionTreeRegressor': {0: 41.20805583305584, 5: 40.15080450784965}, 'LinearRegression': {0: 40.38803231652069, 5: 40.26538894174176}, 'XGBRegressor': {0: 35.58075998075998, 5: 36.176890166028095}}
                         0    5
DecisionTreeRegressor 41.2 40.2
LinearRegression      40.4 40.3
XGBRegressor          35.6 36.2
/home/nkuechen/Documents/Thesis
Experiment with ID 267473491601564638 has been exported as a CSV to file: code/thesis_code/data/aug_effect/reg_random_max_time_warp_percent.csv.


{'DecisionTreeRegressor': {0.0: 40.15993753118072, 0.5: 41.08961131810308, 1.0: 42.19875016504918}, 'LinearRegression': {0.0: 40.515867003367006, 0.5: 42.10963108189495, 1.0: 41.56926256613757}, 'XGBRegressor': {0.0: 35.95459550957739, 0.5: 36.86746594155131, 1.0: 37.55167752274607}}
                       0.0  0.5  1.0
DecisionTreeRegressor 40.2 41.1 42.2
LinearRegression      40.5 42.1 41.6
XGBRegressor          36.0 36.9 37.6


In [None]:
def _augmentation_params(max_noise, max_noise_temperature, random_max_time_warp_percent):
    """Build one training-data parameter dict with all three augmentations enabled.

    Each magnitude is paired with its enabling flag set to True; the key
    order matches the hand-written dictionaries this helper replaces.
    """
    return {
        "max_noise": max_noise,
        "add_noise": True,
        "max_noise_temperature": max_noise_temperature,
        "add_noise_temperature": True,
        "random_max_time_warp_percent": random_max_time_warp_percent,
        "random_warp_status_times": True,
    }


# Augmentation settings used per model class in the n_aug sweep.
best_params = {
    "LinearRegression": _augmentation_params(20, 0, 1.0),
    "DecisionTreeRegressor": _augmentation_params(10, 0, 0.5),
    "XGBRegressor": _augmentation_params(20, 5, 1.0),
}

# MLflow experiment that collects every run of this sweep (loop-invariant,
# so it is set once instead of on every outer iteration).
EXP_NAME = "reg_best_n_aug"

# One factory per supported model class. Looking the class up here means an
# unexpected key in best_params raises KeyError instead of silently reusing
# the estimator left over from the previous iteration.
model_factories = {
    "LinearRegression": lambda: LinearRegression(n_jobs=8),
    "DecisionTreeRegressor": lambda: DecisionTreeRegressor(),
    "XGBRegressor": lambda: XGBRegressor(),
}

for n_dev in [10, 20, 40]:
    # Five repetitions per configuration.
    for repetition in range(5):
        for model_class, train_df_params in best_params.items():
            for n_aug in [1, 3, 5, 10]:
                # A fresh estimator for every single run.
                model = model_factories[model_class]()
                cross_validate_regression_model(
                    raw_merged_df=df_raw_merged,
                    model=model,
                    n_dev=n_dev,
                    pred_hor=1,
                    features=features,
                    n_aug=n_aug,
                    train_df_params=train_df_params,
                    all_device_uuids=all_device_uuids,
                    by_metric="metrics.mdt",
                    mlflow_experiment=EXP_NAME,
                )

/home/nkuechen/Documents/Thesis/mlruns
!! Creating new split...
###Split 1/4!
Dropped 0 rows in 0 cycles.
29 cycles left.
Dropped 1517 rows in 6 cycles.
37 cycles left.
mdt=41.027027027027025; med_dt=21.0; std=42.64317895022898
###Split 2/4!
Dropped 628 rows in 2 cycles.
13 cycles left.
Dropped 955 rows in 9 cycles.
49 cycles left.
mdt=37.04081632653061; med_dt=22.0; std=39.13883843656506
###Split 3/4!
Dropped 495 rows in 2 cycles.
20 cycles left.
Dropped 1410 rows in 9 cycles.
35 cycles left.
mdt=47.34285714285714; med_dt=31.0; std=44.0553393491648
###Split 4/4!
Dropped 863 rows in 2 cycles.
14 cycles left.
Dropped 322 rows in 5 cycles.
32 cycles left.
mdt=35.03125; med_dt=22.5; std=35.54148196268927
/home/nkuechen/Documents/Thesis/mlruns
!! Creating new split...
###Split 1/4!
Dropped 0 rows in 0 cycles.
27 cycles left.
Dropped 1463 rows in 9 cycles.
40 cycles left.
mdt=27.291666666666668; med_dt=19.0; std=22.63124530132818
###Split 2/4!
Dropped 0 rows in 0 cycles.
29 cycles left.
D

In [30]:
# Export the n_aug sweep and, for each n_dev value, draw a box plot of MDT
# per n_aug and model class and print the per-group medians.
EXP_NAME = "reg_best_n_aug"
reg_exp_results = load_exp_results(EXP_NAME)
if reg_exp_results is not None:
    try:
        for n_dev, n_dev_group in reg_exp_results.groupby(by="params.n_dev"):
            # Build a new frame instead of assigning to / sorting the groupby
            # slice in place, which is the pattern that triggers
            # SettingWithCopyWarning. The string copy of n_aug makes plotly
            # treat the x axis as categorical; sorting by the numeric column
            # keeps the categories in ascending order.
            n_dev_runs = n_dev_group.assign(
                Augmentationsanteil=n_dev_group["params.n_aug"].astype(str)
            ).sort_values(by="params.n_aug")
            print(n_dev)
            fig = px.box(
                n_dev_runs,
                x="Augmentationsanteil",
                y="metrics.mdt",
                color="params.model_class",
                title=f"Einfluss des Augmentationsanteils auf die MDT ({n_dev} Trainingsgeräte).",
                width=1000,
                height=600,
            )
            fig.update_yaxes(title="MDT")
            fig.update_layout(legend_title="Modelltyp")
            fig.show()

            # Median MDT as {model class: {n_aug: median}}.
            medians = {
                model_class: {
                    param_value: param_group["metrics.mdt"].median()
                    for param_value, param_group in model_group.groupby(
                        by="params.n_aug"
                    )
                }
                for model_class, model_group in n_dev_runs.groupby(
                    by="params.model_class"
                )
            }
            print(medians)

            # Rows: model classes, columns: n_aug values.
            medians_df = pd.DataFrame(medians).T
            # Scope the one-decimal float format to this print instead of
            # changing the pandas display options for the rest of the notebook.
            with pd.option_context("display.float_format", "{:.1f}".format):
                print(medians_df)

    except ValueError:
        # Add context to the message, then let the original error surface.
        print(f"{EXP_NAME} does not seem to be finished.")
        raise

/home/nkuechen/Documents/Thesis
Experiment with ID 397025654456766034 has been exported as a CSV to file: code/thesis_code/data/aug_effect/reg_best_n_aug.csv.
10


{'DecisionTreeRegressor': {1: 40.14879123692488, 3: 37.22439872788711, 5: 39.18798672622202, 10: 38.83487448009507}, 'LinearRegression': {1: 40.1104876241037, 3: 40.82748314825521, 5: 39.1446920946921, 10: 36.63231035657506}, 'XGBRegressor': {1: 33.78362855461134, 3: 35.10179122574956, 5: 39.32321441480518, 10: 36.17763037615979}}
                        1    3    5    10
DecisionTreeRegressor 40.1 37.2 39.2 38.8
LinearRegression      40.1 40.8 39.1 36.6
XGBRegressor          33.8 35.1 39.3 36.2
20


{'DecisionTreeRegressor': {1: 41.23205388381283, 3: 38.76062779879136, 5: 39.808306513112825, 10: 39.55857907166047}, 'LinearRegression': {1: 42.62668950736917, 3: 40.7453202661536, 5: 40.3874440330855, 10: 38.29556623931624}, 'XGBRegressor': {1: 37.25011363636364, 3: 37.14435513673318, 5: 38.343813649490805, 10: 37.24205173747857}}
                        1    3    5    10
DecisionTreeRegressor 41.2 38.8 39.8 39.6
LinearRegression      42.6 40.7 40.4 38.3
XGBRegressor          37.3 37.1 38.3 37.2
40


{'DecisionTreeRegressor': {1: 42.51075409265064, 3: 41.029348530920856, 5: 39.24079482677618, 10: 38.87556022408964}, 'LinearRegression': {1: 44.238411007161005, 3: 40.92522806190716, 5: 38.51840512070776, 10: 38.68167850121108}, 'XGBRegressor': {1: 39.93310582786389, 3: 39.16195878562495, 5: 40.43190345905804, 10: 39.56659883007247}}
                        1    3    5    10
DecisionTreeRegressor 42.5 41.0 39.2 38.9
LinearRegression      44.2 40.9 38.5 38.7
XGBRegressor          39.9 39.2 40.4 39.6
63


{'DecisionTreeRegressor': {1: 42.228750944822366, 3: 39.325163906142166, 5: 38.01100185984135, 10: 37.551867041498376}, 'LinearRegression': {1: 42.85456032331032, 3: 40.26500711697722, 5: 38.57652474695082, 10: 36.73352573352573}, 'XGBRegressor': {1: 39.86637856897145, 3: 39.39952468702469, 5: 38.95437448726922, 10: 39.58585761085761}}
                        1    3    5    10
DecisionTreeRegressor 42.2 39.3 38.0 37.6
LinearRegression      42.9 40.3 38.6 36.7
XGBRegressor          39.9 39.4 39.0 39.6
