# Regression Cross Validation Evaluation

This notebook evaluates the cross-validation results of the best regression models, comparing runs trained with and without data augmentation.

In [1]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import utilities.latex_figures as latex_figs

In [2]:
# Load the cross-validation results and order them by "params.n_dev".
# The column is cast to str before sorting, so the ordering is lexicographic
# (presumably the cast is there so plotly treats the values as categories --
# TODO confirm).
cv_df = (
    pd.read_csv("./data/cross_validation/reg_cross_validation.csv")
    .assign(**{"params.n_dev": lambda frame: frame["params.n_dev"].astype(str)})
    .sort_values(by="params.n_dev")
)

In [3]:
# Metric column name -> label used in plot titles, axis titles and captions.
metric_name_to_metric = dict(
    [
        ("metrics.mdt", "MDT"),
        ("metrics.med_dt", "MedDT"),
    ]
)

# One row per model class: (class name, display name, abbreviation).
# Both lookup dictionaries below are derived from this table so the class
# names only have to be written once.
_MODEL_TABLE = (
    ("LinearRegression", "Lineare Regression", "lr"),
    ("DecisionTreeRegressor", "Decision Tree Regression", "dt"),
    ("XGBRegressor", "XGBoosting Regression", "xgb"),
)

# Class name -> display name used in figure titles.
model_class_to_model = {
    class_name: display_name for class_name, display_name, _abbr in _MODEL_TABLE
}

# Class name -> abbreviation used in file names, LaTeX labels and \gls keys.
model_class_to_abbr = {
    class_name: abbr for class_name, _display_name, abbr in _MODEL_TABLE
}

In [4]:
def _best_row(rows: pd.DataFrame, by_metric: str) -> "pd.Series | None":
    """Return the row of ``rows`` with the best ``by_metric`` value, or ``None`` if ``rows`` is empty."""
    if rows.empty:
        return None
    # Only "metrics.ibs" is ranked ascending (smallest value first); every other
    # metric is ranked descending, so the largest value is taken as the best.
    return rows.sort_values(by=by_metric, ascending=by_metric == "metrics.ibs").iloc[0]


def calculate_and_print_improvements(group: pd.DataFrame, by_metric: str) -> None:
    r"""Print LaTeX table rows comparing the best augmented and unaugmented run.

    For every distinct ``params.n_dev`` value in ``group`` two table rows are
    printed, each followed by ``\hline``:

    1. the best run with ``Augmentation == "Ja"``: ``n_dev``, its
       ``params.n_aug``, the metric value and, in parentheses, the relative
       change in percent versus the best unaugmented run;
    2. the best run with ``Augmentation == "Nein"``: ``n_dev``, its
       ``params.n_aug`` and the metric value.

    Every ``.`` in a row is replaced by ``,`` before printing.

    An ``n_dev`` value that has no augmented or no unaugmented run cannot be
    compared. Instead of failing with an ``IndexError``, such a value is
    skipped and a LaTeX comment line (starting with ``%``) is printed so the
    gap stays visible in the output.

    Args:
        group: Frame with the columns ``params.n_dev``, ``params.n_aug``,
            ``Augmentation`` (values ``"Ja"`` / ``"Nein"``) and ``by_metric``.
        by_metric: Name of the metric column used to pick and compare runs.
    """
    for n_dev, sub_group in group.groupby(by="params.n_dev"):
        best_with = _best_row(sub_group[sub_group["Augmentation"] == "Ja"], by_metric)
        best_with_out = _best_row(sub_group[sub_group["Augmentation"] == "Nein"], by_metric)
        if best_with is None or best_with_out is None:
            # "%" starts a LaTeX comment, so this line is harmless inside a table.
            print(f"% n_dev={n_dev}: skipped, needs runs with and without augmentation")
            continue

        best_with_value = best_with[by_metric]
        best_with_out_value = best_with_out[by_metric]
        improvement = best_with_value - best_with_out_value
        improvement_percent = (improvement / best_with_out_value) * 100
        # The "." -> "," replacement acts on the whole row (presumably to get
        # German decimal commas -- it would also hit dots in n_dev / n_aug).
        with_str = f"{n_dev} & {best_with['params.n_aug']} & {best_with_value:.1f} ({'+' if improvement_percent > 0 else ''}{improvement_percent:.1f} \\%) \\\\".replace(".", ",")
        with_out_str = f"{n_dev} & {best_with_out['params.n_aug']} & {best_with_out_value:.1f} \\\\".replace(".", ",")
        print(with_str)
        print("\\hline")
        print(with_out_str)
        print("\\hline")

In [5]:
# For every metric: print the improvement rows per model class, draw one box
# plot per model class (with vs. without augmentation) and collect the plots in
# a LaTeX subfigure grid that is saved and written out as a .tex file.
for by_metric, metric_group in cv_df.groupby(by="params.by_metric"):
    print(by_metric)
    subfig_grid = latex_figs.LatexSubfigureGrid(
        caption=f"Vergleich des \\gls{{cv}}-{metric_name_to_metric[by_metric]}s der besten Modelle.",
        label=f"comp_reg_cv_{by_metric.split('.')[1]}",
    )

    for model_class, model_group in metric_group.groupby(by="params.model_class"):
        # Build a new frame instead of writing into the groupby sub-frame: adding
        # a column to / sorting such a slice in place is the chained-assignment
        # pattern that pandas warns about (SettingWithCopyWarning).
        # "Ja" marks runs with params.n_aug != 0, "Nein" runs with params.n_aug == 0.
        model_group = model_group.assign(
            Augmentation=(model_group["params.n_aug"] != 0).map(
                {True: "Ja", False: "Nein"}
            )
        ).sort_values(by=["Augmentation", "params.n_dev"])

        print(model_class)
        calculate_and_print_improvements(model_group, by_metric)

        fig = px.box(
            model_group,
            x="params.n_dev",
            y=by_metric,
            color="Augmentation",
            points="all",
            title=f"Vergleich des Cross-Validation {metric_name_to_metric[by_metric]}s der besten {model_class_to_model[model_class]}-Modelle mit und ohne Augmentation.",
            width=1000,
            height=600,
            # Fixed y-range per metric so the plots of all model classes share a scale.
            range_y=[0, 50] if by_metric == "metrics.mdt" else [0, 35],
        )
        fig.update_layout(
            xaxis_title="Anzahl der Trainingsgeräte",
            yaxis_title=metric_name_to_metric[by_metric],
        )
        fig.show()

        # One subfigure per model class, followed by a newline in the grid.
        subfig = latex_figs.LatexSubfigure(
            f"resources/figures/regression_cv/{by_metric.split('.')[1]}_{model_class_to_abbr[model_class]}_cv.png",
            fig,
            caption=f"Vergleich des \\gls{{cv}}-\\gls{{{by_metric.split('.')[1]}}}s der besten \\gls{{{model_class_to_abbr[model_class]}}}-Modelle.",
            label=f"comp_reg_cv_{by_metric.split('.')[1]}_{model_class_to_abbr[model_class]}",
        )
        subfig_grid.add_subfigure(subfig)
        subfig_grid.add_newline()
    subfig_grid.save_figure()
    subfig_grid.write_latex_code_to_file(f"comp_reg_cv_{by_metric.split('.')[1]}.tex")

metrics.mdt
DecisionTreeRegressor
10 & 1 & 41,7 (+4,8 \%) \\
\hline
10 & 0 & 39,8 \\
\hline
20 & 1 & 40,8 (+1,2 \%) \\
\hline
20 & 0 & 40,3 \\
\hline
40 & 1 & 42,2 (+2,5 \%) \\
\hline
40 & 0 & 41,1 \\
\hline
63 & 1 & 43,2 (+0,4 \%) \\
\hline
63 & 0 & 43,0 \\
\hline


LinearRegression
10 & 1 & 44,3 (+45,9 \%) \\
\hline
10 & 0 & 30,3 \\
\hline
20 & 1 & 44,4 (+44,3 \%) \\
\hline
20 & 0 & 30,8 \\
\hline
40 & 1 & 45,1 (+33,9 \%) \\
\hline
40 & 0 & 33,7 \\
\hline
63 & 1 & 44,0 (+37,3 \%) \\
\hline
63 & 0 & 32,0 \\
\hline


XGBRegressor
10 & 5 & 39,2 (+1,5 \%) \\
\hline
10 & 0 & 38,7 \\
\hline
20 & 1 & 39,4 (+5,5 \%) \\
\hline
20 & 0 & 37,3 \\
\hline
40 & 1 & 40,6 (+12,5 \%) \\
\hline
40 & 0 & 36,1 \\
\hline
63 & 1 & 40,3 (+2,4 \%) \\
\hline
63 & 0 & 39,3 \\
\hline


Saving subfigure to "/home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/resources/figures/regression_cv/mdt_dt_cv.png"...
Done!
Saving subfigure to "/home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/resources/figures/regression_cv/mdt_lr_cv.png"...
Done!
Saving subfigure to "/home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/resources/figures/regression_cv/mdt_xgb_cv.png"...
Done!
Writing latex code to /home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/comp_reg_cv_mdt.tex
Done!
metrics.med_dt
DecisionTreeRegressor
10 & 1 & 24,1 (-9,0 \%) \\
\hline
10 & 0 & 26,5 \\
\hline
20 & 1 & 27,4 (+10,6 \%) \\
\hline
20 & 0 & 24,8 \\
\hline
40 & 1 & 27,5 (+4,3 \%) \\
\hline
40 & 0 & 26,4 \\
\hline
63 & 1 & 26,2 (+2,4 \%) \\
\hline
63 & 0 & 25,6 \\
\hline


LinearRegression
10 & 1 & 26,9 (-5,3 \%) \\
\hline
10 & 0 & 28,4 \\
\hline
20 & 1 & 25,5 (-0,5 \%) \\
\hline
20 & 0 & 25,6 \\
\hline
40 & 1 & 27,5 (-6,4 \%) \\
\hline
40 & 0 & 29,4 \\
\hline
63 & 1 & 26,1 (-8,3 \%) \\
\hline
63 & 0 & 28,5 \\
\hline


XGBRegressor
10 & 1 & 23,1 (+9,5 \%) \\
\hline
10 & 0 & 21,1 \\
\hline
20 & 1 & 24,0 (+2,7 \%) \\
\hline
20 & 0 & 23,4 \\
\hline
40 & 5 & 26,9 (+0,5 \%) \\
\hline
40 & 0 & 26,8 \\
\hline
63 & 1 & 27,5 (+4,3 \%) \\
\hline
63 & 0 & 26,4 \\
\hline


Saving subfigure to "/home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/resources/figures/regression_cv/med_dt_dt_cv.png"...
Done!
Saving subfigure to "/home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/resources/figures/regression_cv/med_dt_lr_cv.png"...
Done!
Saving subfigure to "/home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/resources/figures/regression_cv/med_dt_xgb_cv.png"...
Done!
Writing latex code to /home/nkuechen/Documents/Thesis/latex/Bachelor Thesis/comp_reg_cv_med_dt.tex
Done!


In [6]:
# Group the runs by device count, augmentation count and model class.
grouped = cv_df.groupby(["params.n_dev", "params.n_aug", "params.model_class"])

# Bar fill pattern per model class; the same mapping is used for both metrics.
pattern_styles = {
    "LinearRegression": "",
    "DecisionTreeRegressor": "x",
    "XGBRegressor": ".",
}

for metric in ["metrics.mdt", "metrics.med_dt"]:
    # Largest metric value within every (n_dev, n_aug, model_class) group.
    best_metric = grouped[metric].max().reset_index()

    # One grouped bar chart per metric.
    fig = go.Figure()

    for model_class in pattern_styles:
        is_model = best_metric["params.model_class"] == model_class
        is_unaugmented = best_metric["params.n_aug"] == 0

        # NOTE: despite the "min_" prefix, both frames hold the per-group maxima.
        min_metrics_aug_0 = best_metric[is_unaugmented & is_model]
        min_metrics_aug_not_0 = best_metric[~is_unaugmented & is_model]

        # Collapse all augmented settings to one maximum per device count.
        min_metrics_aug_not_0 = (
            min_metrics_aug_not_0.groupby(["params.n_dev", "params.model_class"])
            .max()
            .reset_index()
        )

        # Bars for params.n_aug == 0 (kept out of the legend).
        fig.add_trace(
            go.Bar(
                x=min_metrics_aug_0["params.n_dev"],
                y=min_metrics_aug_0[metric],
                name=f"{model_class} params.n_aug == 0",
                marker={
                    "color": "red",
                    "pattern": {"shape": pattern_styles[model_class]},
                },
                showlegend=False,
            )
        )

        # Bars for params.n_aug != 0 (kept out of the legend).
        fig.add_trace(
            go.Bar(
                x=min_metrics_aug_not_0["params.n_dev"],
                y=min_metrics_aug_not_0[metric],
                name=f"{model_class} params.n_aug != 0",
                marker={
                    "color": "blue",
                    "pattern": {"shape": pattern_styles[model_class]},
                },
                showlegend=False,
            )
        )

    # Empty dummy traces that only exist to produce legend entries.
    # Colour legend: unaugmented vs. augmented.
    for legend_name, legend_colour in (("Unaugmentiert", "red"), ("Augmentiert", "blue")):
        fig.add_trace(
            go.Bar(
                x=[None],
                y=[None],
                name=legend_name,
                marker={"color": legend_colour},
            )
        )

    # Pattern legend: one entry per model class.
    for model_class, pattern in pattern_styles.items():
        fig.add_trace(
            go.Bar(
                x=[None],
                y=[None],
                name=f"Modelltyp {model_class}",
                marker={"color": "white", "pattern": {"shape": pattern}},
            )
        )

    # Layout: title, grouped bars, legend fonts, figure size and bar spacing.
    fig.update_layout(
        title={
            "text": f"Vergleich der besten {'MDT' if 'mdt' in metric else 'MedDT'} Werte mit der Menge an Trainingsgeräten und dem Modelltyp",
            "font": {"size": 24},
        },
        xaxis_title="params.n_dev",
        yaxis_title=metric,
        barmode="group",
        legend_title={"text": "Legende", "font": {"size": 18}},
        legend={"font": {"size": 16}},
        width=1200,
        height=600,
        bargap=0.1,
        bargroupgap=0.1,
    )
    # These axis titles replace the ones set in update_layout above.
    fig.update_xaxes(
        title={"text": "Anzahl an Trainingsgeräten", "font": {"size": 20}},
    )
    y_axis_label = "MDT in Tagen" if metric == "metrics.mdt" else "MedDT in Tagen"
    fig.update_yaxes(title={"text": y_axis_label, "font": {"size": 20}})

    # Show the plot.
    fig.show()