In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Put the sibling ``../src`` and ``../examples`` directories at the front of the
# module search path. Presumably the project modules imported just below live
# there -- confirm against the repository layout. The paths are relative, so the
# notebook must be started from its own directory.
sys.path.insert(0, os.path.join("..", "src"))
sys.path.insert(0, os.path.join("..", "examples"))

# Project-local helpers; they resolve only after the sys.path tweaks above.
from augmentation import TransformData
from data_generators import get_data_from_file
from tools import experiment, run_model_for_raw_and_augmented_data, smape

import warnings

# Silences every warning for the rest of the session, including pandas and
# scikit-learn diagnostics -- comment this out when debugging.
warnings.filterwarnings("ignore")

In [2]:
# Grid search over tabgan augmentation settings.
#
# For every data set df_01 .. df_09 and every combination of
# (gen_x_times, is_post_process, pregeneration_frac) the project helper
# `experiment` is called once and its two returned scores (raw data vs.
# augmented data) are recorded. Scores are then normalised per data set and the
# combinations are ranked; `res` ends up as the concatenation of the per-data-set
# tables.
#
# NOTE(review): the output saved with this cell shows tabgan failing with
# "'numpy.float64' object cannot be interpreted as an integer", followed by an
# AttributeError on `pred_augm`. Confirm which tabgan argument must be an integer
# before trusting the sweep results.
model = RandomForestRegressor(n_estimators=200, random_state=42)

nums = ["01", "02", "03", "04", "05", "06", "07", "08", "09"]

experiments = []

for num in nums:
    filename = os.path.join("..", "examples", "data", f"df_{num}.csv")
    exp = get_data_from_file(filename)
    experiments.append(exp)

# One row per data set: the loaded frame and its train/test split descriptor.
experiments = pd.DataFrame(experiments, columns=["df", "train_test_split"])

# Build the grids with linspace + rounding instead of np.arange: with a
# fractional step, arange may carry floating-point error into the grid values
# and can even change the number of points (see the numpy.arange docs). The
# intended grids are 1.1..1.4 in steps of 0.1 and 1.2..2.7 in steps of 0.3.
gen_x_times_possible = np.round(np.linspace(1.1, 1.4, num=4), 1)
is_post_process_possible = [True, False]
pregeneration_frac_possible = np.round(np.linspace(1.2, 2.7, num=6), 1)

metric_columns = ["raw_data_mape", "augmented_data_mape"]

res = []
for i, row in experiments.iterrows():
    # The data set is the same for every parameter combination, so unpack it once.
    df, train_test_split = row["df"], row["train_test_split"]

    pivot_result_table = []
    for gen_x_times in gen_x_times_possible:
        for is_post_process in is_post_process_possible:
            for pregeneration_frac in pregeneration_frac_possible:

                tabgan_params = {
                    "gen_x_times": gen_x_times,
                    "is_post_process": is_post_process,
                    "pregeneration_frac": pregeneration_frac,
                    "adversarial_model_params": {"random_state": 42},
                }

                result_raw_data, result_augmented_data = experiment(
                    model=model,
                    df=df,
                    train_test_split=train_test_split,
                    augm=False,
                    tabgan=True,
                    tabgan_params=tabgan_params,
                )
                pivot_result_table.append(
                    [
                        i,
                        gen_x_times,
                        is_post_process,
                        pregeneration_frac,
                        result_raw_data,
                        result_augmented_data,
                    ]
                )

    pivot_result_table = pd.DataFrame(
        data=pivot_result_table,
        columns=[
            "experiment",
            "gen_x_times",
            "is_post_process",
            "pregeneration_frac",
            "raw_data_mape",
            "augmented_data_mape",
        ],
    )

    # Divide each score column by its own mean within this data set, so that
    # scores from different data sets are on a comparable scale.
    pivot_result_table[metric_columns] /= pivot_result_table[metric_columns].mean()

    # "exp rang" is the position of a parameter combination after sorting by the
    # augmented-data score in ascending order (0 = lowest score for this data set).
    pivot_result_table = pivot_result_table.sort_values("augmented_data_mape")
    pivot_result_table["exp rang"] = range(len(pivot_result_table))
    res.append(pivot_result_table)

res = pd.concat(res)
print("done")

Fitting CTGAN transformers for each column:   0%|          | 0/13 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

Tabgan error: 'numpy.float64' object cannot be interpreted as an integer. Tabgun prediction = -1



AttributeError: 'DataFrame' object has no attribute 'pred_augm'

In [None]:
# Keep an untouched copy of the full sweep table: later cells rebind `res`
# to filtered / aggregated versions.
res_copy = res.copy()
# Display the full table as the cell output.
res

In [None]:
# Keep only the runs that were executed with post-processing switched on.
post_processed_mask = res["is_post_process"] == True
res = res.loc[post_processed_mask]

In [None]:
# Average every column per (gen_x_times, pregeneration_frac) pair.
# The capitalised duplicates of the two key columns are added first: the
# originals become the index after groupby, while the duplicates (constant
# within each group, so their mean is the key itself) stay available as
# ordinary columns.
res = (
    res.assign(
        Gen_x_times=res["gen_x_times"],
        Pregeneration_frac=res["pregeneration_frac"],
    )
    .groupby(["gen_x_times", "pregeneration_frac"])
    .mean()
)
res

In [None]:
# Heatmap of the "exp rang" column laid out on a
# Gen_x_times (rows) x Pregeneration_frac (columns) grid.
# Expects `res` to carry the columns "exp rang", "Gen_x_times" and
# "Pregeneration_frac".
rank_by_params = res[["exp rang", "Gen_x_times", "Pregeneration_frac"]]
piv = rank_by_params.pivot(
    index="Gen_x_times",
    columns="Pregeneration_frac",
    values="exp rang",
)

plt.figure(figsize=(12, 8))
sns.heatmap(piv, annot=True)

In [None]:
# tabgan settings used for the final evaluation cells below.
tabgan_best_params = dict(
    gen_x_times=1.4,
    is_post_process=True,
    pregeneration_frac=1.5,
    adversarial_model_params=dict(random_state=42),
)

In [None]:
# Reload data sets 07-09 only and rebuild `experiments` from them.
records = []
for file_id in ("07", "08", "09"):
    csv_path = os.path.join("..", "examples", "data", f"df_{file_id}.csv")
    loaded = get_data_from_file(csv_path)
    # Tag the first element of the loaded pair with the file number it came from.
    loaded[0]["num"] = file_id
    records.append(loaded)

experiments = pd.DataFrame(records, columns=["df", "train_test_split"])

In [None]:
# Final evaluation with `tabgan_best_params`.
#
# For every row of `experiments` the project helper
# `run_model_for_raw_and_augmented_data` is called; the frame it returns is
# read through its `y`, `pred_raw` and `pred_augm` columns. Two results are
# produced:
#   * `res` -- one row per experiment with the smape of the raw-data and the
#              augmented-data predictions (the columns keep their "*_mape"
#              names even though the metric used here is `smape`);
#   * `rr`  -- all per-row predictions stacked together, tagged with the
#              experiment index in the "exp" column (used for plotting later).
res = []
rr = []

for i, row in experiments.iterrows():
    print(f"experiment {i} in processing...")
    df, train_test_split = row["df"], row["train_test_split"]

    e = run_model_for_raw_and_augmented_data(
        model, df, train_test_split, tabgan=True, tabgan_params=tabgan_best_params
    )

    # Drop rows without a target value. The explicit copy makes the column
    # assignment below operate on an independent frame instead of on a
    # filtered slice of the helper's result.
    e = e[~np.isnan(e.y)].copy()
    e["exp"] = i
    rr.append(e)
    result_raw_data = smape(e.y, e.pred_raw)
    result_augmented_data = smape(e.y, e.pred_augm)

    res.append([result_raw_data, result_augmented_data])

res = pd.DataFrame(
    data=res,
    columns=["raw_data_mape", "augmented_data_mape"],
)
rr = pd.concat(rr)
print("done")

In [None]:
# Display the current `res` table as the cell output.
res

In [None]:
# One figure per experiment: raw-data predictions, tabgan-augmented
# predictions and the true target, all plotted against the "time" column.
# Each entry: (column in rr, marker, marker size, legend label).
series_specs = [
    ("pred_raw", "o", 4, "pred_raw"),
    ("pred_augm", "o", 5, "pred_tabgan"),
    ("y", "v", 5, "y_true"),
]

for exp_id in rr["exp"].unique():
    subset = rr.loc[rr["exp"] == exp_id]
    plt.figure(figsize=(15, 6))
    for column, marker, size, label in series_specs:
        plt.plot(subset["time"], subset[column], marker, markersize=size, label=label)
    plt.legend()
    plt.show()