In [1]:
from sdv.metadata import SingleTableMetadata
from sdv.sampling import Condition
from sdv.sequential import PARSynthesizer
import pandas as pd
import numpy as np
import warnings
# Install a blanket "ignore" filter: no category is given, so every warning
# raised after this point in the session is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries
# imported above — consider narrowing it to a specific category.
warnings.filterwarnings("ignore")


In [2]:
# Path to the raw power-consumption CSV, relative to the working directory.
# NOTE(review): a later cell reads "../../DataSet/powerconsumption.csv" —
# confirm which relative path is the right one for this notebook.
file = "../DataSet/powerconsumption.csv"

# Read the comma-separated file; the first column (position 0) is parsed
# as datetimes.
data = pd.read_csv(
    file,
    parse_dates=[0],
    sep=',',
)

# Print the dtype of every column.
print(data.dtypes)


Datetime                  datetime64[ns]
Temperature                      float64
Humidity                         float64
WindSpeed                        float64
GeneralDiffuseFlows              float64
DiffuseFlows                     float64
PowerConsumption_Zone1           float64
PowerConsumption_Zone2           float64
PowerConsumption_Zone3           float64
dtype: object


In [3]:
import pandas as pd
from sdv.metadata import SingleTableMetadata

# Row identifier: consecutive integers 1..len(data), one per row.
data['Sequence_ID'] = range(1, len(data) + 1)

# Sequence identifier: the integer group number of each row's 'Datetime' value.
# NOTE(review): if every 'Datetime' value is distinct this produces one
# sequence per row (each sequence has length 1), and no sequence index is
# declared on the metadata below — confirm this is what the sequential
# model is meant to see.
data['Sequence_Key'] = data.groupby('Datetime').ngroup()

# Start from auto-detected column types for the augmented frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Override the detected type of 'Datetime' with an explicit datetime sdtype.
metadata.update_column(column_name="Datetime", sdtype="datetime")

# 'Sequence_ID': mark as an id column, then register it as the primary key.
metadata.update_column('Sequence_ID', sdtype='id')
metadata.set_primary_key('Sequence_ID')

# 'Sequence_Key': mark as an id column, then register it as the sequence key.
metadata.update_column('Sequence_Key', sdtype='id')
metadata.set_sequence_key(column_name='Sequence_Key')

# Print the resulting metadata for a visual check.
print(metadata)


{
    "sequence_key": "Sequence_Key",
    "primary_key": "Sequence_ID",
    "columns": {
        "Datetime": {
            "sdtype": "datetime"
        },
        "Temperature": {
            "sdtype": "numerical"
        },
        "Humidity": {
            "sdtype": "numerical"
        },
        "WindSpeed": {
            "sdtype": "numerical"
        },
        "GeneralDiffuseFlows": {
            "sdtype": "numerical"
        },
        "DiffuseFlows": {
            "sdtype": "numerical"
        },
        "PowerConsumption_Zone1": {
            "sdtype": "numerical"
        },
        "PowerConsumption_Zone2": {
            "sdtype": "numerical"
        },
        "PowerConsumption_Zone3": {
            "sdtype": "numerical"
        },
        "Sequence_ID": {
            "sdtype": "id"
        },
        "Sequence_Key": {
            "sdtype": "id"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}


In [4]:
# Show the metadata object as this cell's output.
metadata


{
    "sequence_key": "Sequence_Key",
    "primary_key": "Sequence_ID",
    "columns": {
        "Datetime": {
            "sdtype": "datetime"
        },
        "Temperature": {
            "sdtype": "numerical"
        },
        "Humidity": {
            "sdtype": "numerical"
        },
        "WindSpeed": {
            "sdtype": "numerical"
        },
        "GeneralDiffuseFlows": {
            "sdtype": "numerical"
        },
        "DiffuseFlows": {
            "sdtype": "numerical"
        },
        "PowerConsumption_Zone1": {
            "sdtype": "numerical"
        },
        "PowerConsumption_Zone2": {
            "sdtype": "numerical"
        },
        "PowerConsumption_Zone3": {
            "sdtype": "numerical"
        },
        "Sequence_ID": {
            "sdtype": "id"
        },
        "Sequence_Key": {
            "sdtype": "id"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [5]:
# Column names handed to the (currently commented-out) PARSynthesizer cell
# as its `context_columns` argument.
context_columns = [
    'Temperature',
    'Humidity',
    'WindSpeed',
]


In [6]:
# custom_synthesizer = PARSynthesizer(
#     metadata,
#     epochs=1,
#     context_columns=context_columns,
#     enforce_min_max_values=True,
#     verbose=True
# )

# custom_synthesizer.fit(data)


In [7]:
# from sdv.tabular import CTGAN

# # Initialize and fit the CTGAN model
# model = CTGAN()
# model.fit(df)

# # Generate synthetic data
# synthetic_data = model.sample()

# # Optional: Convert back Datetime to original format if necessary
# synthetic_data['Datetime'] = pd.to_datetime(synthetic_data['Datetime'], unit='s')


In [1]:
import pandas as pd
import numpy as np

# Read the power-consumption CSV; the "Datetime" column is parsed as timestamps.
# NOTE(review): an earlier cell reads "../DataSet/powerconsumption.csv" —
# confirm which relative path is the right one for this notebook.
df = pd.read_csv(
    "../../DataSet/powerconsumption.csv",
    parse_dates=["Datetime"],
)

# Add small random noise to each numeric column to generate synthetic data
def add_noise(df, noise_level=0.02, random_state=None):
    """Return a copy of ``df`` with Gaussian noise added to every numeric column.

    For each numeric column, zero-mean normal noise with standard deviation
    ``noise_level * column.std()`` is added, and the absolute value of the
    result is stored so that no value is negative. Non-numeric columns are
    carried over unchanged. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to perturb.
    noise_level : float, default 0.02
        Noise standard deviation expressed as a fraction of each column's
        own standard deviation.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) used to draw the noise, for reproducible output.
        When ``None`` the global ``np.random`` state is used, so the result
        differs between runs unless ``np.random.seed`` was called.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and column order as ``df``.
    """
    # Work on a private copy so the caller's frame is never mutated.
    noisy = df.copy()

    # Both the legacy module and a Generator expose a compatible ``normal``.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    for col in noisy.select_dtypes(include=[np.number]).columns:
        scale = noise_level * noisy[col].std()
        # std() is NaN for columns with fewer than two values; a NaN scale
        # would turn the whole column into NaN, so fall back to "no noise".
        if not np.isfinite(scale):
            scale = 0.0
        noise = rng.normal(0, scale, size=len(noisy))
        # Reflect negatives back to positive so values stay non-negative.
        noisy[col] = np.abs(noisy[col] + noise)
    return noisy
# Build the synthetic rows by passing a copy of the loaded frame to add_noise
synthetic_data = add_noise(df.copy())

# Re-stamp the synthetic rows so they continue the series: the first stamp is
# 10 minutes after the latest original timestamp, followed by one stamp every
# 10 minutes, with as many stamps as there are original rows
synthetic_data["Datetime"] = pd.date_range(
    start=df["Datetime"].max() + pd.Timedelta(minutes=10),
    periods=len(df),
    freq="10min",
)

# Stack the original rows followed by the synthetic rows under a fresh index
df_expanded = pd.concat(
    [df, synthetic_data],
    ignore_index=True,
)

# Write the combined frame to disk without the index column
df_expanded.to_csv("expanded_dataset.csv", index=False)

print("New dataset saved as 'expanded_dataset.csv'.")


New dataset saved as 'expanded_dataset.csv'.


In [2]:
df_expanded.tail()


Unnamed: 0,Datetime,Temperature,Humidity,WindSpeed,GeneralDiffuseFlows,DiffuseFlows,PowerConsumption_Zone1,PowerConsumption_Zone2,PowerConsumption_Zone3
104827,2018-12-29 23:10:00,6.995109,72.525194,0.155856,1.012306,3.481106,30964.569795,26854.13665,14920.861107
104828,2018-12-29 23:20:00,6.978757,72.920152,0.119492,2.180571,1.526512,30547.636276,26242.977958,14147.422884
104829,2018-12-29 23:30:00,6.916151,72.527927,0.08662,1.751282,2.689484,29682.071637,25376.004416,13673.383865
104830,2018-12-29 23:40:00,6.827798,73.084556,0.150214,7.989211,3.949462,28905.990071,24680.230401,13426.476605
104831,2018-12-29 23:50:00,6.580144,73.956875,0.135392,6.715635,1.436543,28436.654103,23989.440259,13321.555341


In [9]:
synthetic_data.tail()


Unnamed: 0,Datetime,Temperature,Humidity,WindSpeed,GeneralDiffuseFlows,DiffuseFlows,PowerConsumption_Zone1,PowerConsumption_Zone2,PowerConsumption_Zone3
52411,2018-12-29 23:10:00,7.0607,72.005969,0.07623,-4.023478,3.5347,31018.769888,26724.773464,14792.110257
52412,2018-12-29 23:20:00,6.959632,72.066226,0.072165,0.207339,-1.486588,30423.954944,26206.261514,14392.69095
52413,2018-12-29 23:30:00,6.997115,72.680981,0.175463,1.804606,2.061184,29645.071662,25377.148519,13838.338513
52414,2018-12-29 23:40:00,6.908843,73.155584,-0.004116,2.225365,-2.339974,28858.070425,24637.427282,13504.3222
52415,2018-12-29 23:50:00,6.628463,74.486987,0.106163,-3.427215,2.786137,28166.170838,24083.270569,13410.525328
