# Synthetic data generation for smart service systems

In this notebook, we demonstrate how to generate synthetic data using parts of the WaterBench dataset as test sensors. 

In [1]:
# stdlib
import sys
import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader

log.add(sink=sys.stderr, level="INFO")
warnings.filterwarnings("ignore")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from synthcity.utils.datasets.time_series.google_stocks import GoogleStocksDataloader
import requests
import pandas as pd
import numpy as np
import io
from sklearn.preprocessing import MinMaxScaler
import janitor

# URL = "https://raw.githubusercontent.com/PacktPublishing/Learning-Pandas-Second-Edition/master/data/goog.csv"
# s = requests.get(URL, timeout=5).content
df = pd.read_csv("./data/637_train_x.csv")
df_datetime = df['datetime']
df = df.select_columns('datetime', slice('et(t-12)', 'p(t+0)'))
dfy = pd.read_csv("./data/637_train_y.csv")
dfy = dfy.select_columns(slice('q(t+0)', 'q(t+11)'))
df = pd.concat([df, dfy], axis=1)
df = df.iloc[0:70,:]
df = pd.DataFrame(df.values[::-1], columns=df.columns)

T = (
    pd.to_datetime(df["datetime"], infer_datetime_format=True)
    .astype(np.int64)
    .astype(np.float64)
    / 10**9
)
T = pd.Series(MinMaxScaler().fit_transform(T.values.reshape(-1, 1)).squeeze())
old_columns = df.columns
df = df.drop(columns=["datetime"])
#df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)

# Build dataset
dataX = []
dataT = []
outcome = []

seq_len = 10

# Cut data by sequence length
for i in range(0, len(df) - seq_len - 1):
    df_seq = df.loc[i : i + seq_len - 1]
    horizons = T.loc[i : i + seq_len - 1]
    out = df["q(t-1)"].loc[i + seq_len]

    dataX.append(df_seq)
    dataT.append(horizons.values.tolist())
    outcome.append(out)

# Mix Data (to make it similar to i.i.d)
idx = np.random.permutation(len(dataX))

temporal_data = []
observation_times = []
for i in range(len(dataX)):
    temporal_data.append(dataX[idx[i]])
    observation_times.append(dataT[idx[i]])

outcome = pd.DataFrame(outcome, columns=["q_next"])
static_data = pd.DataFrame(np.zeros((len(temporal_data), 0)))

loader = TimeSeriesDataLoader(
        temporal_data=temporal_data,
        observation_times=observation_times,
        static_data=static_data,
        outcome=outcome,
    )
loader.dataframe()


Unnamed: 0,seq_id,seq_time_id,seq_temporal_et(t-1),seq_temporal_et(t-10),seq_temporal_et(t-11),seq_temporal_et(t-12),seq_temporal_et(t-2),seq_temporal_et(t-3),seq_temporal_et(t-4),seq_temporal_et(t-5),...,seq_temporal_q(t-12),seq_temporal_q(t-2),seq_temporal_q(t-3),seq_temporal_q(t-4),seq_temporal_q(t-5),seq_temporal_q(t-6),seq_temporal_q(t-7),seq_temporal_q(t-8),seq_temporal_q(t-9),seq_out_q_next
0,0,0.550725,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,237.00,234.00,234.00,234.00,234.00,234.00,234.75,234.00,234.75,236.25
1,0,0.536232,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,237.00,234.00,234.00,234.00,234.00,234.75,234.00,234.75,237.00,236.25
2,0,0.521739,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,235.50,234.00,234.00,234.00,234.75,234.00,234.75,237.00,237.00,236.25
3,0,0.507246,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,235.50,234.00,234.00,234.75,234.00,234.75,237.00,237.00,237.00,236.25
4,0,0.492754,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,234.00,234.00,234.75,234.00,234.75,237.00,237.00,237.00,237.00,236.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,58,0.086957,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,231.00,238.00,237.00,234.75,234.75,234.00,234.00,231.75,231.00,234.00
586,58,0.072464,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,231.00,237.00,234.75,234.75,234.00,234.00,231.75,231.00,231.00,234.00
587,58,0.057971,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,234.00,234.75,234.75,234.00,234.00,231.75,231.00,231.00,230.00,234.00
588,58,0.043478,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,234.75,234.75,234.00,234.00,231.75,231.00,231.00,230.00,231.00,234.00


In [3]:
syn_model = Plugins().get("timegan")

syn_model.fit(loader)

[2023-10-05T12:32:06.260412+0000][42077][CRITICAL] module disabled: /home/azureuser/Repos/envs/synthcity/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
[2023-10-05T12:32:06.260412+0000][42077][CRITICAL] module disabled: /home/azureuser/Repos/envs/synthcity/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
distutils Version classes are deprecated. Use packaging.version instead.
distutils Version classes are deprecated. Use packaging.version instead.
[2023-10-05T12:32:06.542300+0000][42077][CRITICAL] load failed: /home/azureuser/Repos/envs/synthcity/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so: undefined symbol: _ZNK3c107SymBool10guard_boolEPKcl


<synthcity.plugins.time_series.plugin_timegan.TimeGANPlugin at 0x7f8c4fb43d90>

In [31]:
syn_df = syn_model.generate(count=30000).dataframe()
syn_df

Unnamed: 0,seq_id,seq_time_id,seq_temporal_et(t-1),seq_temporal_et(t-10),seq_temporal_et(t-11),seq_temporal_et(t-12),seq_temporal_et(t-2),seq_temporal_et(t-3),seq_temporal_et(t-4),seq_temporal_et(t-5),...,seq_temporal_q(t-12),seq_temporal_q(t-2),seq_temporal_q(t-3),seq_temporal_q(t-4),seq_temporal_q(t-5),seq_temporal_q(t-6),seq_temporal_q(t-7),seq_temporal_q(t-8),seq_temporal_q(t-9),seq_out_q_next
0,0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,237.00,233.25,232.50,233.25,233.25,233.25,233.25,233.25,233.25,233.25
1,0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,237.00,236.25,236.25,236.25,233.25,233.25,236.25,236.25,237.00,233.25
2,0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,233.25,233.25,234.00,233.25,236.25,233.25,233.25,234.00,234.00,233.25
3,0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,233.25,236.25,236.25,236.25,236.25,233.25,236.25,237.00,234.00,233.25
4,0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,233.25,233.25,236.25,236.25,234.00,233.25,233.25,233.25,237.00,233.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179907,29999,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,234.00,236.25,233.25,236.25,236.25,236.25,236.25,236.25,233.25,233.25
179908,29999,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,233.25,233.25,234.00,236.25,236.25,236.25,233.25,234.00,237.00,233.25
179909,29999,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,237.00,236.25,236.25,233.25,236.25,236.25,236.25,233.25,237.00,233.25
179910,29999,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,233.25,233.25,236.25,236.25,234.00,233.25,233.25,233.25,233.25,233.25


In [30]:
# save synthetic data to csv file
dates = df_datetime[:syn_df.index.size]
syn_df = pd.concat([dates,syn_df], axis=1)
syn_df.columns = syn_df.columns.str.replace('seq_temporal_', '')
syn_df.set_index('datetime')
syn_df = syn_df[old_columns]
syn_df_x = syn_df.select_columns('datetime', slice('et(t-12)', 'p(t+0)'))
syn_df_x.to_csv('637_synthetic_x.csv', index=False)
syn_df_y = syn_df.select_columns('datetime', slice('q(t+0)', 'q(t+11)'))
syn_df_y.to_csv('637_synthetic_y.csv', index=False)


In [None]:
# synthcity absolute
from synthcity.benchmark import Benchmarks

score = Benchmarks.evaluate(
    [
        (f"test_{model}", model, {})
        for model in ["timegan"]
    ],
    loader,
    synthetic_size=1000,
    repeats=2,
    task_type="time_series",  # time_series_survival or time_series
)

Benchmarks.print(score)