In [None]:
import os
import pandas as pd
import numpy as np
from tgan.model import TGANModel
from tensorflow import keras  # type: ignore
import tensorflow as tf

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# One seed for the whole notebook: it is fed to NumPy and TensorFlow here and
# is reused as `random_state` by the PCA / t-SNE estimators further down.
RAND_SEED = 48

# The two seeding calls are independent of each other.
tf.random.set_random_seed(RAND_SEED)  # TF 1.x-style seeding API
np.random.seed(RAND_SEED)             # NumPy's legacy global RNG

In [None]:
class TGANTrainer():
    """Train a TGAN model on a univariate series cut into fixed-length windows.

    The trainer loads a CSV, reshapes one of its columns into rows of
    ``seq_len`` consecutive values, fits a ``TGANModel`` on those rows, draws
    synthetic rows, and writes figures / CSV dumps below the output directory.

    Attributes:
        name: Dataset name; used in the saved model's file name.
        data: DataFrame of shape (n_windows, seq_len) holding the real windows.
        out_paths: Dict with keys 'parent', 'models', 'figures', 'data'.
        continuous_columns: Column labels handed to TGAN as continuous columns.
        parameters: Keyword arguments forwarded to ``TGANModel``.
        real: Flattened real values, cached by the first ``save_data`` call.
        tgan: The fitted model, or None before ``run_epoch`` has been called.
        sample: Synthetic rows from the last ``run_epoch``, or None.
    """

    def __init__(self, seq_len, name, csv, out):
        """Load the data and prepare the output directory tree.

        Args:
            seq_len: Number of consecutive values per training row; also used
                as TGAN's ``z_dim``.
            name: Dataset name used when naming the saved model.
            csv: Path of the input CSV file.
            out: Output directory; sub-directories are created beneath it.
        """
        self.name = name
        self.data = self.load_data(csv, seq_len)
        self.out_paths = self.prep_out_dir(out)

        # TGAN documents `continuous_columns` as a list, so hand it a plain
        # list rather than the pandas Index object.
        self.continuous_columns = list(self.data.columns)
        self.parameters = {
            # These parameters are tuned for each dataset separately
            'gpu': '0',
            'batch_size': 128,
            'num_dis_layers': 3,
            'num_dis_hidden': 300,
            'z_dim': seq_len,
            'output': self.out_paths['parent'],
            'steps_per_epoch': 100,
        }
        self.real = None
        # Populated by run_epoch(); defined here so that calling save_model()
        # too early fails with a clear message instead of an AttributeError.
        self.tgan = None
        self.sample = None

    def run_epoch(self, max_epoch):
        """Fit a fresh TGAN model, sample from it and write all artefacts.

        Args:
            max_epoch: Value assigned to the model's ``max_epoch`` attribute
                before fitting.
        """
        self.tgan = TGANModel(self.continuous_columns, **self.parameters)
        self.tgan.max_epoch = max_epoch

        # create and fit model
        self.tgan.fit(self.data)

        # Request 1.5x the number of real rows and keep at most len(data) of
        # them (presumably a safety margin in case fewer rows come back).
        self.sample = self.tgan.sample(
            int(len(self.data) * 1.5))[:len(self.data)]

        # PCA & t-SNE figures
        self.pca_tsne()

        # raw real / synthetic dumps
        self.save_data()

    def prep_out_dir(self, path):
        """Create the 'models', 'figures' and 'data' folders under ``path``.

        Missing parent directories are created as well, and existing
        directories are left untouched.

        Args:
            path: Parent output directory.

        Returns:
            Dict mapping 'parent' to ``path`` and each sub-directory name to
            its full path.
        """
        print('Preparing output directory')

        paths = {'parent': path}
        for sub_dir in ('models', 'figures', 'data'):
            sub_path = os.path.join(path, sub_dir)
            # makedirs(exist_ok=True) avoids the exists()/mkdir() race and
            # also works when `path` itself does not exist yet.
            os.makedirs(sub_path, exist_ok=True)
            paths[sub_dir] = sub_path

        return paths

    def load_data(self, csv, seq_len):
        """Read the CSV and cut its second column into rows of ``seq_len``.

        Trailing values that do not fill a complete row are dropped.

        Args:
            csv: Path of a CSV file with exactly two columns.
            seq_len: Positive number of values per row.

        Returns:
            DataFrame of shape (len(series) // seq_len, seq_len) with integer
            column labels 0..seq_len-1.

        Raises:
            ValueError: If ``seq_len`` is not positive, or the series holds
                fewer than ``seq_len`` values.
        """
        if seq_len <= 0:
            raise ValueError(f'seq_len must be positive, got {seq_len}')

        data = pd.read_csv(csv)

        # # ita_teleco & china_unicom
        data.columns = [0, 1]
        x = data[1].to_numpy()

        # Alternative loaders used for other datasets:
        # # Filtered_Grid_01
        # x = data['Internet_Activity'].rolling(10).mean().dropna().to_numpy()
        # # 1_40bucks data
        # data.columns = [0]
        # x = data[0].rolling(15).mean().dropna().to_numpy()

        n_windows = len(x) // seq_len
        if n_windows == 0:
            raise ValueError(
                f'{csv} holds {len(x)} values, fewer than seq_len={seq_len}')

        dataX = x[:n_windows * seq_len].reshape(n_windows, seq_len)
        # dataX = MinMaxScaler().fit_transform(dataX)

        return pd.DataFrame(dataX, columns=list(range(seq_len)))

    def save_fig(self, data, sample, out):
        """Write the 2-D points as CSV files and an overlaid scatter plot.

        Produces ``<out>_data.csv``, ``<out>_sample.csv`` and ``<out>.png`` in
        the figures directory.

        Args:
            data: Array of shape (n, 2) with the projected real points.
            sample: Array of shape (m, 2) with the projected synthetic points.
            out: File-name stem, also used as the plot title.
        """
        def out_path(suffix=''):
            stem = f'{out}_{suffix}' if suffix else out
            return os.path.join(self.out_paths['figures'], stem)

        print(f'Saving {out}...')

        # Explicit (label, array) pairs: an identity test such as
        # `d is data` mislabels the files when both arguments are the same
        # object.
        for label, points in (('data', data), ('sample', sample)):
            pd.DataFrame(points).to_csv(
                f'{out_path(label)}.csv', index=False, header=False)

        plt.clf()
        plt.scatter(data[:, 0], data[:, 1], alpha=0.2, label="Original")
        plt.scatter(sample[:, 0], sample[:, 1], alpha=0.2, label="Synthetic")
        plt.title(out)
        plt.legend()
        plt.savefig(f'{out_path()}.png', dpi=300)

        print('done')

    def save_model(self, epoch):
        """Save the fitted model as ``<name>_model.h5`` in the models folder.

        Args:
            epoch: Unused; kept so existing callers keep working.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if getattr(self, 'tgan', None) is None:
            raise RuntimeError(
                'No trained model to save; call run_epoch() first.')

        print('Saving model...')
        self.tgan.save(os.path.join(
            self.out_paths['models'], f'{self.name}_model.h5'), force=True)
        print('done')

    def pca_tsne(self):
        """Project real and synthetic rows to 2-D with PCA and t-SNE.

        The scaler and the PCA are fitted on the real data only and then
        applied to both sets, and t-SNE embeds both sets in a single run.
        Fitting each set independently would place the two point clouds in
        unrelated coordinate systems, making the overlaid scatter plots
        impossible to compare.
        """
        real_rows = self.data.to_numpy()
        fake_rows = self.sample.to_numpy()

        # scale both sets with the statistics of the real data
        scaler = StandardScaler().fit(real_rows)
        scaled_data = scaler.transform(real_rows)
        scaled_sample = scaler.transform(fake_rows)

        print('Calculating PCA and t-SNE')
        # pca: principal axes of the real data, shared by both sets
        pca = PCA(n_components=2, random_state=RAND_SEED).fit(scaled_data)
        data_x = pca.transform(scaled_data)
        sample_x = pca.transform(scaled_sample)

        self.save_fig(data_x, sample_x, 'PCA')

        # tsne: t-SNE has no transform() for new points, so embed real and
        # synthetic points together and split the result afterwards. As
        # before, the embedding is computed from the 2-component PCA output.
        joint_tsne = TSNE(
            n_components=2, random_state=RAND_SEED).fit_transform(
                np.vstack([data_x, sample_x]))
        data_tsne = joint_tsne[:len(data_x)]
        sample_tsne = joint_tsne[len(data_x):]

        self.save_fig(data_tsne, sample_tsne, 'tSNE')

    def save_data(self):
        """Dump the flattened real and synthetic series as CSV files.

        Writes ``real.csv`` (first call only), ``fake.csv`` and
        ``ts_real_fake.csv`` into the data directory. The last file pairs
        both series with an arbitrary 10-minute time axis for plotting.
        """
        def out(x):
            return os.path.join(self.out_paths['data'], f'{x}.csv')

        if self.real is None:
            # The real series never changes, so it is written only once.
            self.real = self.data.to_numpy().flatten()
            self.real.tofile(out('real'), sep='\n')

        fake = self.sample.to_numpy().flatten()
        fake.tofile(out('fake'), sep='\n')

        # for visualization purposes
        # date and time are arbitrary
        # Only the overlapping prefix of both series is written.
        n_rows = min(len(self.real), len(fake))
        start_date = pd.to_datetime('2021-11-01 00:00:00')
        # '10min' replaces the alias '10T', which pandas >= 2.2 deprecates.
        timeframe = pd.date_range(
            start=start_date, periods=n_rows, freq='10min')

        df = pd.DataFrame({
            'ts': timeframe,
            'real': self.real[:n_rows],
            'fake': fake[:n_rows],
        })
        df.to_csv(out('ts_real_fake'), index=False)


In [None]:
# --- Experiment configuration -------------------------------------------
csv = 'ita_teleco.csv'   # input file, looked up inside the 'data' folder
seq_len = 6              # values per training row

# Dataset name: everything before the first dot of the file name.
name = csv.partition('.')[0]

# --- Paths ---------------------------------------------------------------
csv_path = os.path.join('data', csv)
out_path = f'out/{name}'
os.makedirs(out_path, exist_ok=True)

# --- Train for one epoch, then persist the model ---------------------------
trainer = TGANTrainer(seq_len=seq_len, name=name, csv=csv_path, out=out_path)
trainer.run_epoch(max_epoch=1)

# save_model() ignores its argument; the file name comes from trainer.name.
trainer.save_model(name)