In [58]:
#Packages and Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

from sklearn.preprocessing import MinMaxScaler

In [59]:
#Load dataset 
df = pd.read_csv("data.csv")

df.head()
print("Original Shape: {}".format(df.shape))

Original Shape: (174, 452)


In [60]:
#Create metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)
metadata.save_to_json("darwin_metadata_700.json")

#Train TVAE model
synthesizer = TVAESynthesizer(metadata, epochs = 500)
synthesizer.fit(df)

#Generate synthetic rows
synthetic_data = synthesizer.sample(700)
print("Synthetic Data Shape: ", synthetic_data.shape)




Synthetic Data Shape:  (700, 452)


In [61]:
#Combine real + synthetic data
df_extend = pd.concat([df, synthetic_data])
print("Extended dataset shape: ", df_extend)

Extended dataset shape:                  ID  air_time1  disp_index1  gmrt_in_air1  gmrt_on_paper1  \
0             id_1       5160     0.000013    120.804174       86.853334   
1             id_2      51980     0.000016    115.318238       83.448681   
2             id_3       2600     0.000010    229.933997      172.761858   
3             id_4       2130     0.000010    369.403342      183.193104   
4             id_5       2310     0.000007    257.997131      111.275889   
..             ...        ...          ...           ...             ...   
695  sdv-id-WLEXsv       6160     0.000002    283.284183      153.728763   
696  sdv-id-GOfdKS       2228     0.000002    290.385030      154.784786   
697  sdv-id-cbuKnJ       5366     0.000013    240.910721      151.717560   
698  sdv-id-rVzTeU       1802     0.000028    310.105671      113.755988   
699  sdv-id-crsCgI       7601     0.000006    273.814408      164.421736   

     max_x_extension1  max_y_extension1  mean_acc_in_air1  mea

In [62]:
#Define features and targets
X = df_extend.drop(["class", "ID"], axis=1).values
y = df_extend["class"]

#Check missing values before
print("Missing Data Count: {}".format(pd.DataFrame(X).isnull().sum().sum()))

#Artificially adding missing values (AI Assisted)
rng = np.random.default_rng(42)
mask = rng.random(X.shape) < 0.01
X_missing = X.astype(float)
X_missing[mask] = np.nan

#Check missing values after
print("Missing Data Count: {}".format(pd.DataFrame(X_missing).isnull().sum().sum()))

Missing Data Count: 0
Missing Data Count: 3979


In [63]:
#Rebuild the DF with missing values
df_extend_missing = pd.DataFrame(X_missing, columns=df_extend.drop(["class", "ID"], axis=1).columns)
df_extend_missing["class"] = y.values
df_extend_missing["ID"] = df_extend["ID"].values 

#Save to CSV
df_extend_missing.to_csv("data_extended_withMissing_800.csv", index=False)

In [66]:
# Make sure both datasets have the same columns
assert all(df.columns == df_extend.columns), "Columns must match!"

# Calculate mean for each feature
original_means = df.drop(["class", "ID"], axis=1).mean()
synthetic_means = df_extend.drop(["class", "ID"], axis=1).mean()

# Combine into a comparison table
comparison = pd.DataFrame({
    "Original Mean": original_means,
    "Synthetic Mean": synthetic_means,
    "Difference": synthetic_means - original_means
})

print(comparison)


                  Original Mean  Synthetic Mean    Difference
air_time1           5664.166667     3985.169336 -1.678997e+03
disp_index1            0.000010        0.000010  5.219972e-07
gmrt_in_air1         297.666685      244.887936 -5.277875e+01
gmrt_on_paper1       200.504413      181.763707 -1.874071e+01
max_x_extension1    1977.965517     1704.236842 -2.737287e+02
...                         ...             ...           ...
num_of_pendown25      85.839080       82.197941 -3.641140e+00
paper_time25       43109.712644    39952.294050 -3.157419e+03
pressure_mean25     1629.585962     1654.998803  2.541284e+01
pressure_var25    163061.767360   155677.087005 -7.384680e+03
total_time25      164203.327586   103482.596110 -6.072073e+04

[450 rows x 3 columns]
