In [17]:
from os import path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.preprocessing.timeseries import processed_stock
from ydata_synthetic.synthesizers.timeseries import TimeGAN


data = pd.read_csv('C:/Users/raghu/CS6140/PEMS-BAY.csv')

# Convert timestamp to datetime and set as index
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data.rename(columns={'Timestamp': 'timestamp'}, inplace=True)
data.set_index('timestamp', inplace=True)

data = data.interpolate(method='time')
print("original data shape:", data.shape)
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

print("scaled_data shape:", scaled_data.shape)
print(scaled_data)

original data shape: (52116, 325)
scaled_data shape: (52116, 325)
[[0.8880597  0.93729373 0.90309556 ... 0.94896552 0.88432836 0.91520861]
 [0.89054726 0.93234323 0.90444145 ... 0.94344828 0.88059701 0.90713324]
 [0.89054726 0.9339934  0.89905787 ... 0.94344828 0.87686567 0.91386272]
 ...
 [0.8880597  0.92244224 0.87079408 ... 0.94344828 0.89054726 0.89636608]
 [0.89800995 0.91584158 0.86944818 ... 0.94758621 0.89054726 0.92059219]
 [0.88930348 0.91089109 0.87483176 ... 0.94758621 0.89054726 0.91520861]]


In [18]:
seq_len = 100  # Use appropriate sequence length
n_seq = scaled_data.shape[1]  # Number of features

# Reshape the data for TimeGAN (samples, timesteps, features)
reshaped_data = []
for i in range(len(scaled_data) - seq_len):
    reshaped_data.append(scaled_data[i:i + seq_len])
reshaped_data = np.array(reshaped_data)

print("Reshaped data shape:", reshaped_data.shape)

Reshaped data shape: (52016, 100, 325)


In [19]:
print(reshaped_data)

[[[0.8880597  0.93729373 0.90309556 ... 0.94896552 0.88432836 0.91520861]
  [0.89054726 0.93234323 0.90444145 ... 0.94344828 0.88059701 0.90713324]
  [0.89054726 0.9339934  0.89905787 ... 0.94344828 0.87686567 0.91386272]
  ...
  [0.89179104 0.93729373 0.90174966 ... 0.93931034 0.89303483 0.9179004 ]
  [0.89303483 0.94389439 0.89502019 ... 0.93931034 0.89552239 0.92328398]
  [0.89427861 0.93894389 0.89636608 ... 0.93793103 0.89179104 0.91655451]]

 [[0.89054726 0.93234323 0.90444145 ... 0.94344828 0.88059701 0.90713324]
  [0.89054726 0.9339934  0.89905787 ... 0.94344828 0.87686567 0.91386272]
  [0.88432836 0.93234323 0.90040377 ... 0.94344828 0.88059701 0.90982503]
  ...
  [0.89303483 0.94389439 0.89502019 ... 0.93931034 0.89552239 0.92328398]
  [0.89427861 0.93894389 0.89636608 ... 0.93793103 0.89179104 0.91655451]
  [0.8818408  0.93729373 0.88694482 ... 0.93793103 0.89054726 0.91251682]]

 [[0.89054726 0.9339934  0.89905787 ... 0.94344828 0.87686567 0.91386272]
  [0.88432836 0.932343

In [None]:
# Specific parameters for TimeGAN
# hidden_dim = 24
# gamma = 1
# noise_dim = 32
# dim = 128
# batch_size = 128
# log_step = 100
# learning_rate = 5e-4
# epochs = 10

hidden_dim = 64
gamma = 1
noise_dim = 128
dim = 256
batch_size = 32
log_step = 100
learning_rate = 1e-4
epochs = 30

# Define model parameters using ModelParameters class from ydata_synthetic
gan_args = ModelParameters(
    batch_size=batch_size,
    lr=learning_rate,
    noise_dim=noise_dim,
    layers_dim=dim
)

# Define the training parameters using TrainParameters from the library
train_args = TrainParameters(
    epochs=epochs
)


# Initialize the TimeGAN synthesizer
synthesizer = TimeGAN(
    model_parameters=gan_args,
    hidden_dim=hidden_dim,
    seq_len=seq_len,
    n_seq=n_seq,
    gamma=gamma
)

# Fit the synthesizer to the PeMS-Bay dataset
synthesizer.train(reshaped_data, train_args.epochs)

# Generate synthetic data
synthetic_data = synthesizer.sample(100)  # Generate 100 samples

# Reshape the synthetic data back to original shape if needed
synthetic_data = synthetic_data.reshape(-1, n_seq)
synthetic_data = scaler.inverse_transform(synthetic_data)

print(synthetic_data.shape)

Emddeding network training:   3%|▎         | 1/30 [01:32<44:28, 92.03s/it]

In [None]:
print(type(synthetic_data))
print(synthetic_data)

In [None]:
# Save timestamps from the original data
timestamps = data.index

# Downsample the data to every 15 minutes
data = data.resample('5T').mean()

# Handle missing values
data = data.interpolate(method='time')

# Save sensor IDs
sensor_ids = data.columns

In [None]:
# Generate correct timestamps
start_timestamp = timestamps[0]
end_timestamp = start_timestamp + pd.Timedelta(minutes=5 * (synthetic_data.shape[0] - 1))
full_timestamps = pd.date_range(start=start_timestamp, end=end_timestamp, freq='5T')

# Ensure the length of full_timestamps matches the synthetic_data
assert len(full_timestamps) == synthetic_data.shape[0], "The length of generated timestamps must match the synthetic data"

# Add the 'timestamp' column
synthetic_data_df = pd.DataFrame(synthetic_data, columns=sensor_ids)
synthetic_data_df['timestamp'] = full_timestamps

# Ensure 'timestamp' is the first column
synthetic_data_df = synthetic_data_df[['timestamp'] + sensor_ids.tolist()]

# Print the head of the DataFrame
print(synthetic_data_df.head())

# Write the DataFrame to a CSV file
output_file_path = 'synthetic_data_timegan_02.csv'
synthetic_data_df.to_csv(output_file_path, index=False)

print(f"Synthetic data has been written to {output_file_path}")
