In [None]:
# link & mount to google drive to import data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

**Load in data**

#### Old con dataframe

In [None]:
# Load the pre-processed per-class CSV files from the mounted Google Drive into pandas DataFrames.
# The GAN generates samples for one class at a time, so each class is kept in its own DataFrame;
# the generated samples are meant to be joined back together once every class has been produced.
# NOTE(review): the class list of this dataset has six entries, but only the Botnet and BruteForce files are loaded here.

d_Botnet = pd.read_csv('/content/drive/My Drive/Colab Notebooks/progettoTesi/Dataset2017/improved/Pre-processed_versions/35/d_Botnet35.csv')
d_Bruteforce = pd.read_csv('/content/drive/My Drive/Colab Notebooks/progettoTesi/Dataset2017/improved/Pre-processed_versions/35/d_BruteForce35.csv')

**Drop label category from each dataframe**

In [None]:
# The one-hot label columns are redundant here: each DataFrame already holds a single class,
# so the label is constant within it.
LABEL_COLUMNS = ['L_BENIGN', 'L_Infiltration_Portscan', 'L_DDoS', 'L_DoS', 'L_BruteForce', 'L_Botnet']

dfBotnet = d_Botnet.drop(columns=LABEL_COLUMNS)
# Fixed: this used to drop the labels from d_Botnet a second time, so dfBruteforce
# silently contained the Botnet rows instead of the BruteForce ones.
dfBruteforce = d_Bruteforce.drop(columns=LABEL_COLUMNS)

In [None]:
df = dfBotnet #dfBruteforce

In [None]:
pd.options.display.float_format = '{:,.20f}'.format

```
BENIGN                   1594422
Infiltration_Portscan     230853
DoS                       171620
DDoS                       95133
BruteForce                  7005
Botnet                       736
```


## **Define model architecture**

model architecture adapted from https://github.com/ydataai/ydata-synthetic - a model for generating credit card data

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
# Allow matplotlib images to render immediately.
%matplotlib inline

import seaborn as sns

from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU, BatchNormalization
from tensorflow.keras import Model

In [None]:
from tensorflow import keras
!pip install keras-tuner --upgrade
import keras_tuner as kt

Collecting keras-tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.6 kt-legacy-1.0.5


In [None]:
!pip install sdv
from sdv.evaluation.single_table import get_column_plot
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

Collecting sdv
  Downloading sdv-1.9.0-py2.py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.0/140.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.34.17-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.34.17-py3-none-any.whl (11.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.9,>=0.8 (from sdv)
  Downloading ctgan-0.8.0-py2.py3-none-any.whl (27 kB)
Collecting deepecho<0.6,>=0.5 (from sdv)
 

Modello GAN

In [None]:
def plot_history(d1_hist, d2_hist, g_hist, a1_hist, a2_hist):
    """Draw the recorded GAN training curves in two stacked panels.

    Top panel: discriminator loss on real data, discriminator loss on fake data
    and generator loss. Bottom panel: discriminator accuracy on real and on fake
    data. Every argument is a sequence of recorded values.
    """
    panels = (
        # (subplot position, [(series, legend label), ...])
        (1, [(d1_hist, 'd-loss-real'), (d2_hist, 'd-loss-fake'), (g_hist, 'g-loss')]),
        (2, [(a1_hist, 'acc-real'), (a2_hist, 'acc-fake')]),
    )
    for position, curves in panels:
        plt.subplot(2, 1, position)
        for series, label in curves:
            plt.plot(series, label=label)
        plt.legend()

    plt.show()

In [None]:
alpha_leakyRelu = 0.01 #@param
momentumBN = 0.8 #@param
dropoutRate = 0.1 #@param

In [None]:
# Full GAN model (generator + discriminator) with a Keras Tuner search over the learning rates
class GANTuner():
    """GAN for tabular data whose two learning rates can be tuned with Keras Tuner.

    ``gan_args`` is ``[batch_size, noise_dim, data_dim, layers_dim]``. The object
    owns three Keras models:

    * ``generator``      -- maps a noise vector to a synthetic record;
    * ``discriminator``  -- scores a record as real (1) or fake (0);
    * ``combined``       -- generator followed by the frozen discriminator, used
      to update the generator only.

    They are built (with the default learning rates) as soon as the object is
    created, so ``train`` and ``print_architecture`` work without running a
    search. ``run_tuner`` rebuilds them with the best learning rates it found.
    """

    # Candidate learning rates explored for both optimizers.
    LEARNING_RATES = [0.001, 0.0001, 0.00001]

    def __init__(self, gan_args):
        [self.batch_size, self.noise_dim, self.data_dim, layers_dim] = gan_args
        self.layers_dim = layers_dim

        self.hyperparameters = kt.HyperParameters()
        self.hyperparameters.Choice('lr_d', self.LEARNING_RATES, default=0.001)
        self.hyperparameters.Choice('lr_g', self.LEARNING_RATES, default=0.001)

        # Creates self.generator / self.discriminator / self.combined with the defaults.
        self.build_hypermodel(self.hyperparameters)

        self.tuner = kt.RandomSearch(
            self._make_hypermodel(),
            # The objective is the value returned by the custom fit loop; its name is not a
            # built-in Keras metric, so the optimisation direction must be given explicitly.
            objective=kt.Objective('generator_loss', direction='min'),
            hyperparameters=self.hyperparameters,
            max_trials=5,  # Set this to the desired number of trials
            directory='my_tuning_directory',
            project_name='gan_tuning',
            # Start from a clean state: results (and failures) of earlier runs made with a
            # different model definition must not be reloaded into this search.
            overwrite=True)

    def _make_hypermodel(self):
        """Return a ``kt.HyperModel`` that builds and trains this GAN for one trial.

        A GAN cannot be trained with a plain ``model.fit`` call, so ``fit`` is
        overridden to run the alternating discriminator/generator loop.
        """
        owner = self

        class _GANHyperModel(kt.HyperModel):
            def build(self, hp):
                return owner.build_hypermodel(hp)

            def fit(self, hp, model, data, epochs=1, **kwargs):
                # kwargs carries the tuner's Keras callbacks; they are not used by
                # the custom loop. The returned float is the trial's objective value.
                return owner._fit_trial(data, epochs)

        return _GANHyperModel()

    def build_hypermodel(self, hp):
        """Build and compile fresh networks for the learning rates selected in ``hp``.

        The new models replace ``self.generator``, ``self.discriminator`` and
        ``self.combined``; the combined model is returned (Keras Tuner requires
        the build function to return a Keras model).
        """
        lr_d = hp.Choice('lr_d', self.LEARNING_RATES, default=0.001)
        lr_g = hp.Choice('lr_g', self.LEARNING_RATES, default=0.001)

        generator = Generator(self.batch_size).\
            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
        discriminator = Discriminator(self.batch_size).\
            build_model(input_shape=(self.data_dim,), dim=self.layers_dim)

        # Compile the discriminator while it is still trainable: train_on_batch on it
        # then returns [loss, accuracy], which is what train() unpacks.
        discriminator.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(lr_d, 0.5),
            metrics=['accuracy'])

        # Combined model: noise -> generator -> discriminator. The discriminator is frozen
        # here so that training the combined model only updates the generator.
        input_noise = Input(shape=(self.noise_dim,))
        generated_data = generator(input_noise)
        discriminator.trainable = False
        validity = discriminator(generated_data)
        combined = Model(inputs=input_noise, outputs=validity)
        combined.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(lr_g, 0.5))

        self.generator = generator
        self.discriminator = discriminator
        self.combined = combined
        return combined

    def run_tuner(self, data, steps=200):
        """Search the learning rates on ``data`` and rebuild the GAN with the best ones.

        Each trial trains freshly built networks for ``steps`` adversarial
        iterations. Returns the best ``HyperParameters`` found; after the call
        the networks are untrained and compiled with those learning rates,
        ready for ``train``.
        """
        self.tuner.search(data, epochs=steps)
        best_hp = self.tuner.get_best_hyperparameters(num_trials=1)[0]
        self.build_hypermodel(best_hp)
        return best_hp

    def _fit_trial(self, data, steps, tail=10):
        """Train the current networks for ``steps`` iterations (one tuner trial).

        Returns the mean generator loss over the last ``tail`` iterations.
        NOTE(review): GAN losses are only a rough proxy for sample quality; swap
        in a distribution-distance metric here if one is cheap enough to compute.
        """
        valid = np.ones((self.batch_size, 1))
        fake = np.zeros((self.batch_size, 1))
        g_losses = [self._train_step(data, step, valid, fake)[2] for step in range(steps)]
        return float(np.mean(g_losses[-tail:]))

    def _train_step(self, data, step, valid, fake):
        """Run one discriminator update (real + fake batch) and one generator update.

        Returns ``(d_loss_real, d_loss_fake, g_loss, d_acc_real, d_acc_fake)``.
        """
        # seed=step walks through the shuffled data batch by batch; with the default
        # seed every iteration would train on the very same real rows.
        batch_data = self.get_data_batch(data, self.batch_size, seed=step)
        noise = tf.random.normal((self.batch_size, self.noise_dim))

        # Generate a batch of new samples (verbose=0: no progress bar per iteration)
        gen_data = self.generator.predict(noise, verbose=0)

        # Train the discriminator
        d_loss_real, d_acc1 = self.discriminator.train_on_batch(batch_data, valid)
        d_loss_fake, d_acc2 = self.discriminator.train_on_batch(gen_data, fake)

        # Train the generator (to have the discriminator label samples as valid)
        noise = tf.random.normal((self.batch_size, self.noise_dim))
        g_loss = self.combined.train_on_batch(noise, valid)

        return d_loss_real, d_loss_fake, g_loss, d_acc1, d_acc2

    def get_data_batch(self, train, batch_size, seed=0):
        """Return batch number ``seed`` of ``train`` as a ``(batch_size, n_features)`` array.

        Consecutive ``seed`` values walk through a shuffled copy of the rows and
        wrap around at the end; the shuffle order changes every time the walk
        has consumed ``len(train)`` rows. The same ``seed`` always yields the
        same batch. Raises ``ValueError`` if ``train`` is empty.
        """
        n_rows = len(train)
        if n_rows == 0:
            raise ValueError('cannot draw a batch from an empty dataset')
        offset = batch_size * seed
        start_i = offset % n_rows
        shuffle_seed = offset // n_rows
        # A private RandomState gives the same permutation as seeding the global
        # generator, without resetting the global NumPy random state as a side effect.
        order = np.random.RandomState(shuffle_seed).choice(n_rows, size=n_rows, replace=False)
        # Modular indexing wraps past the end of the data for any batch size.
        positions = order[(start_i + np.arange(batch_size)) % n_rows]
        x = train.iloc[positions].values
        return np.reshape(x, (batch_size, -1))

    def print_architecture(self):
        """Print the Keras summaries of the generator, discriminator and combined model."""
        print("Generator Architecture:")
        self.generator.summary()
        print("\nDiscriminator Architecture:")
        self.discriminator.summary()
        print("\nCombined Model Architecture:")
        self.combined.summary()
        print("\n====================================================================================\n")

    def train(self, data, train_arguments):
        """Train the GAN on ``data`` and plot the loss/accuracy history.

        ``train_arguments`` is ``[cache_prefix, epochs, sample_interval]``:
        ``epochs`` is the number of adversarial iterations (one batch each) and
        ``sample_interval`` is how often a progress line is printed;
        ``cache_prefix`` is currently unused.
        """
        [_cache_prefix, epochs, sample_interval] = train_arguments
        log_every = sample_interval if sample_interval and sample_interval > 0 else 1

        # Adversarial ground truths
        valid = np.ones((self.batch_size, 1))
        fake = np.zeros((self.batch_size, 1))

        d1_hist, d2_hist, g_hist, a1_hist, a2_hist = list(), list(), list(), list(), list()

        for epoch in range(epochs):
            d_loss_real, d_loss_fake, g_loss, d_acc1, d_acc2 = self._train_step(data, epoch, valid, fake)

            # The loss values are recorded to monitor the training dynamics only: on their
            # own they are not a reliable measure of the quality of the generated samples.
            d1_hist.append(d_loss_real)
            d2_hist.append(d_loss_fake)
            g_hist.append(g_loss)
            a1_hist.append(d_acc1)
            a2_hist.append(d_acc2)
            # Print the progress every `sample_interval` iterations (and on the last one)
            if epoch % log_every == 0 or epoch == epochs - 1:
                print('==>> %d, dRLoss=%.3f, dFLoss=%.3f gLoss=%.3f, accR=%d, accF=%d' % (epoch, d_loss_real, d_loss_fake, g_loss, int(100*d_acc1), int(100*d_acc2)))
        plot_history(d1_hist, d2_hist, g_hist, a1_hist, a2_hist)


# DISCRIMINATOR
class Discriminator():
    """Builder for the discriminator network of the GAN."""

    def __init__(self,batch_size):
        self.batch_size=batch_size

    def build_model(self, input_shape, dim):
        """Return a Keras model mapping a record of shape ``input_shape`` to one sigmoid score.

        The hidden stack is Dense(4*dim) -> Dense(2*dim) -> Dense(dim), each followed
        by a LeakyReLU; dropout is applied after the first two stages only.
        """
        record_input = Input(shape=input_shape, batch_size=self.batch_size)
        hidden = record_input
        for width_factor in (4, 2):
            hidden = Dense(dim * width_factor)(hidden)
            hidden = LeakyReLU(alpha=alpha_leakyRelu)(hidden)
            hidden = Dropout(dropoutRate)(hidden)
        # Last hidden stage: no dropout before the output unit.
        hidden = Dense(dim)(hidden)
        hidden = LeakyReLU(alpha=alpha_leakyRelu)(hidden)
        score = Dense(1, activation='sigmoid')(hidden)
        return Model(inputs=record_input, outputs=score)

In [None]:
# INITIAL GENERATOR


class Generator():
    """Builder for the generator network of the GAN."""

    def __init__(self, batch_size):
        self.batch_size = batch_size

    def build_model(self, input_shape, dim, data_dim):
        """Return a Keras model mapping a noise vector to a record with ``data_dim`` features.

        Three Dense stages of width dim, 2*dim and 4*dim, each followed by a
        LeakyReLU and batch normalisation, feed a sigmoid output layer.
        """
        noise_input = Input(shape=input_shape, batch_size=self.batch_size)
        hidden = noise_input
        for width_factor in (1, 2, 4):
            hidden = Dense(dim * width_factor)(hidden)
            hidden = LeakyReLU(alpha=alpha_leakyRelu)(hidden)
            hidden = BatchNormalization(momentum=momentumBN)(hidden)
        # The sigmoid keeps every generated feature inside [0, 1].
        record = Dense(data_dim, activation='sigmoid')(hidden)
        return Model(inputs=noise_input, outputs=record)

Batch Normalization (BN) and dropout are techniques that serve different purposes:

* Batch Normalization (BN):
> BN is often used to normalize the activations in a layer, which can help with training stability and speed up convergence. It is particularly useful in deep neural networks.
In the generator, BN can help with avoiding mode collapse and ensuring that the generator learns a diverse set of samples.



* Dropout:
> Dropout is a regularization technique where randomly selected neurons are ignored during training. It helps prevent overfitting by introducing noise and reducing reliance on specific neurons.
In the discriminator, dropout can help prevent the discriminator from becoming too specialized to the training data and encourage the learning of more robust features.


While BN and dropout are commonly used in GANs, they are not mandatory.

## Training & Generazione

**Generating class data**

```
BENIGN                   1594422
Infiltration_Portscan     230853
DoS                       171620
DDoS                       95133
BruteForce                  7005
Botnet                       736
```




```
Given that your original data has 35 features
--> you might want to experiment with sizes around 35 or slightly larger <--
to ensure that the input noise vector captures additional variations while respecting the underlying structure of the data
```



In [None]:
data_columns = df.columns

# GAN hyperparameters
batch_size = 256 #@param
lr_d=0.0001 #@param
lr_g=0.0003 #@param
noise_dim = 16  #@param

# Number of features, taken from the data so it cannot drift out of sync with the
# selected DataFrame (35 for the pre-processed dataset used here).
data_dim = len(data_columns)
layers_dim=64 #@param


# training parameters
epochs = 8000 #@param
log_step = 100

In [None]:
# Constructor arguments for the GAN. The last two entries used to be `input_shape` and `dim`,
# names that are never defined in this notebook; the values they stand for are the number of
# features and the base layer width defined in the hyperparameter cell.
# NOTE(review): confirm the expected order against the signature of the GAN class.
generator_parameters = [batch_size, lr_d, lr_g, noise_dim, data_dim, layers_dim]
# Arguments for train(): [cache prefix, number of training iterations, logging interval]
training_parameters = ['', epochs, log_step]

#### Keras Tuner

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
gan_args = [batch_size, noise_dim, data_dim, layers_dim]

gan_tuner = GANTuner(gan_args)
gan_tuner.run_tuner(df)

Reloading Tuner from my_tuning_directory/gan_tuning/tuner0.json

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
0.001             |0.001             |lr_d
0.0001            |1e-05             |lr_g



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/base_tuner.py", line 273, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/base_tuner.py", line 238, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 232, in _build_and_fit_model
    model = self._try_build(hp)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 164, in _try_build
    model = self._build_hypermodel(hp)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 155, in _build_hypermodel
    model

RuntimeError: Number of consecutive failures exceeded the limit of 3.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/base_tuner.py", line 273, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/base_tuner.py", line 238, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 232, in _build_and_fit_model
    model = self._try_build(hp)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 164, in _try_build
    model = self._build_hypermodel(hp)
  File "/usr/local/lib/python3.10/dist-packages/keras_tuner/src/engine/tuner.py", line 155, in _build_hypermodel
    model = self.hypermodel.build(hp)
  File "<ipython-input-24-a2fa11951c82>", line 41, in build_hypermodel
    combined_output = self.combined(input_noise)
AttributeError: 'GANTuner' object has no attribute 'combined'


## Training

In [None]:
# NOTE(review): `GAN` is not defined in any cell visible in this notebook (only `GANTuner` is);
# on a fresh kernel this line raises NameError unless the class is defined or imported first.
synthesizer = GAN(generator_parameters)
#synthesizer.print_architecture()

In [None]:
synthesizer.train(df, training_parameters)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
==>> 208, dRLoss=0.562, dFLoss=0.502 gLoss=2.877, accR=96, accF=78
==>> 209, dRLoss=0.564, dFLoss=0.495 gLoss=2.936, accR=93, accF=76
==>> 210, dRLoss=0.564, dFLoss=0.520 gLoss=2.608, accR=95, accF=75
==>> 211, dRLoss=0.557, dFLoss=0.515 gLoss=2.426, accR=95, accF=76
==>> 212, dRLoss=0.569, dFLoss=0.512 gLoss=2.791, accR=94, accF=77
==>> 213, dRLoss=0.573, dFLoss=0.555 gLoss=2.884, accR=91, accF=72
==>> 214, dRLoss=0.562, dFLoss=0.532 gLoss=2.698, accR=94, accF=77
==>> 215, dRLoss=0.562, dFLoss=0.516 gLoss=2.549, accR=93, accF=75
==>> 216, dRLoss=0.570, dFLoss=0.506 gLoss=2.744, accR=92, accF=80
==>> 217, dRLoss=0.560, dFLoss=0.522 gLoss=2.622, accR=93, accF=80
==>> 218, dRLoss=0.565, dFLoss=0.500 gLoss=2.635, accR=93, accF=82
==>> 219, dRLoss=0.561, dFLoss=0.525 gLoss=2.566, accR=93, accF=74
==>> 220, dRLoss=0.557, dFLoss=0.496 gLoss=2.556, accR=94, accF=82
==>> 221, dRLoss=0.551, dFLoss=0.544 gLoss=2.570, accR=95, accF=7

In [None]:
models = {'GAN': ['GAN', False, synthesizer.generator]}



```
By setting a seed, you can ensure that the randomness introduced
by the use of random number generators is consistent across different runs of your code,
which can help make your results more reliable and reproducible
```



In [None]:
np.random.seed(3) # fixed seed so the generated sample is reproducible

num_rows_gen = 1000 # number of synthetic rows to generate
# The width of the generator input is the noise dimension
# (`input_noise`, used here before, is not defined at notebook level).
noise = np.random.normal(0, 1, (num_rows_gen, noise_dim))

[model_name, with_class, generator_model] = models['GAN']

X = generator_model.predict(noise)

In [None]:
# Create a dataframe with generated samples, so can be used for evaluation later
gen_samples = pd.DataFrame(X, columns=data_columns)

In [None]:
print(df.shape)
print(gen_samples.shape)

In [None]:
df.head()

In [None]:
gen_samples.head()

# Valutazione

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
sns.heatmap(df.corr(), annot=False, ax=ax[0], cmap="Blues")
sns.heatmap(gen_samples.corr(), annot=False, ax=ax[1], cmap="Blues")
ax[0].set_title("Dati reali")
ax[1].set_title("Dati generati")

In [None]:
real_data = df.to_numpy()
generated_data = gen_samples.to_numpy()

In [None]:
from scipy.stats import wasserstein_distance

# Flatten the arrays for 1D distribution
real_flat = real_data.flatten()
generated_flat = generated_data.flatten()

# Compute Wasserstein distance
w_distance = wasserstein_distance(real_flat, generated_flat)

print('Wasserstein Distance delle distribuzioni:', w_distance)



```
Interpretation can depend on the scale and nature of your data, but in general:
## Closer to 0: Indicates a smaller difference between distributions.
```



In [None]:
real = df['FlowDuration'].to_numpy()
gen = gen_samples['FlowDuration'].to_numpy()

w_dist_feature = wasserstein_distance(real, gen)
print("Wasserstein Distance per la feature FlowDuration:", w_dist_feature)



```
# Wasserstein Distance for BwdPacketLengthStd feature: 0.0008046525194385199
# Wasserstein Distance for FlowDuration feature: 0.0003101878491190463
# Wasserstein Distance for PacketLengthVariance feature: 0.023556970948095415
# Wasserstein Distance for DstPort feature: 0.0016220070848744499
```



In [None]:
from scipy.stats import ks_2samp


column_to_check = 0

# Perform the KS test for the selected column
ks_statistic, ks_p_value = ks_2samp(real_data[:, column_to_check], generated_data[:, column_to_check])

# Print the results
print(f"KS Statistic: {ks_statistic}")
print(f"P-value: {ks_p_value}")

# Interpret the results
#A higher alpha means you are more lenient in terms of what level of evidence you require to reject the null hypothesis.
alpha = 0.1  # significance level
if ks_p_value > alpha:
    print("Le distribuzioni sono simili (fail to reject the null hypothesis)")
else:
    print("Le distribuzioni sono differenti (reject the null hypothesis)")

In [None]:
# Choose the column you want to check
# 0 = FlowDuration
# 1 = BwdPacketLengthStd
# 2 = PacketLengthVariance
# 3 = DstPort

In [None]:
column_to_compare = 0

# Extract the selected column for each array
real_column = real_data[:, column_to_compare]
generated_column = generated_data[:, column_to_compare]

# Plot the KDE for both distributions
sns.kdeplot(real_column, label='Dati reali', fill=True)
sns.kdeplot(generated_column, label='Dati generati', fill=True)

# Add labels and legend
plt.title(f'Confronto distribuzioni della feature: {df.columns[column_to_compare]}')
plt.xlabel('Valori')
plt.ylabel('Densità')
plt.legend()
#salva in Drive
#images_dir = '/content/drive/MyDrive/Colab Notebooks/progettoTesi/grafici_pdf'
#plt.savefig(f"{images_dir}/distributionFeatureFlowDuration_genWGAN_t2.pdf", dpi=200, format='pdf') ###!!!!!!!!!!!!!! CAMBIARE TENTATIVI CON N°
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

real_data_first_column = real_data[:, 0]
gen_data_first_column = generated_data[:, 0]

# mean_squared_error needs two arrays of the same length: subsample whichever side is
# larger down to the common size. Sampling only the generated side fails (replace=False)
# as soon as the real class has more rows than the generated sample.
common_size = min(len(real_data_first_column), len(gen_data_first_column))
downsampled_gen_data_first_column = np.random.choice(gen_data_first_column, size=(common_size,), replace=False)
if len(real_data_first_column) > common_size:
    real_sample_first_column = np.random.choice(real_data_first_column, size=(common_size,), replace=False)
else:
    real_sample_first_column = real_data_first_column

# Calculate the Mean Squared Error
# NOTE(review): real and generated rows are not paired, so this value depends on the random
# matching above; treat it as a rough indication only.
mse = mean_squared_error(real_sample_first_column, downsampled_gen_data_first_column)

print(f"Mean Squared Error: {mse}")



```
# 0 --> Mean Squared Error: 2.803826646413654e-07
# 1 --> Mean Squared Error: 3.4297804631933104e-06
# 2 --> Mean Squared Error: 0.011631663888692856
# 3 --> Mean Squared Error: 4.149306732870173e-06
```





```
Interpretation:

*   A smaller MSE indicates that the generated data is closer to the real data in terms of the selected feature.
*  Closer to 0 suggests better performance for that specific feature.


```



In [None]:
def gaussian_kernel(x, y, sigma=1.0):
    """Gaussian (RBF) kernel ``exp(-||x - y||^2 / (2 * sigma^2))`` between two samples.

    ``x`` and ``y`` may be scalars or arrays of the same shape.
    """
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-np.sum(diff * diff) / (2 * (sigma ** 2)))


def _mean_gaussian_kernel(a, b, sigma=1.0, chunk_rows=1024):
    """Mean of the Gaussian kernel over all pairs (row of ``a``, row of ``b``), vectorised."""
    a = a.reshape(len(a), -1)
    b = b.reshape(len(b), -1)
    b_sq = np.sum(b * b, axis=1)
    total = 0.0
    # Work on slices of `a` so the pairwise matrix never exceeds chunk_rows x len(b).
    for start in range(0, len(a), chunk_rows):
        block = a[start:start + chunk_rows]
        sq_dist = np.sum(block * block, axis=1)[:, None] + b_sq[None, :] - 2.0 * (block @ b.T)
        np.maximum(sq_dist, 0.0, out=sq_dist)  # rounding can leave tiny negative distances
        total += np.exp(-sq_dist / (2.0 * sigma ** 2)).sum()
    return total / (len(a) * len(b))


def _mean_kernel(a, b, kernel):
    """Mean of an arbitrary ``kernel(x, y)`` over all pairs (element of ``a``, element of ``b``)."""
    total = 0.0
    for x in a:
        for y in b:
            total += kernel(x, y)
    return total / (len(a) * len(b))


def mmd(real_data, generated_data, kernel=gaussian_kernel):
    """Squared Maximum Mean Discrepancy between two samples (biased estimator).

    Computes ``mean k(x, x') + mean k(y, y') - 2 * mean k(x, y)``, where each mean
    runs over all pairs of the corresponding samples. The result is non-negative
    and is 0 when the two samples are identical; larger values mean the two
    distributions differ more.

    Each of the three sums is normalised by its own number of pairs. Dividing the
    combined sum by a single factor, as done previously, gives a wrong (and, for
    samples of similar size, sign-flipped) value.

    Parameters
    ----------
    real_data, generated_data : sequences or arrays of samples (first axis = samples).
    kernel : callable ``kernel(x, y) -> float``. The default Gaussian kernel is
        evaluated in vectorised form; any other kernel is evaluated pair by pair.

    Raises
    ------
    ValueError if either sample is empty.
    """
    m = len(real_data)
    n = len(generated_data)
    if m == 0 or n == 0:
        raise ValueError('mmd needs at least one sample on each side')

    if kernel is gaussian_kernel:
        real = np.asarray(real_data, dtype=float)
        generated = np.asarray(generated_data, dtype=float)
        k_rr = _mean_gaussian_kernel(real, real)
        k_gg = _mean_gaussian_kernel(generated, generated)
        k_rg = _mean_gaussian_kernel(real, generated)
    else:
        k_rr = _mean_kernel(real_data, real_data, kernel)
        k_gg = _mean_kernel(generated_data, generated_data, kernel)
        k_rg = _mean_kernel(real_data, generated_data, kernel)

    # Clip the tiny negative values that floating-point rounding can produce.
    return max(float(k_rr + k_gg - 2.0 * k_rg), 0.0)


mmd_statistic = mmd(real_data[:, 0], generated_data[:, 0])

print("MMD:", mmd_statistic)



```
you would want the MMD to be close to zero, as it implies that the generated samples are very similar to the real data.

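Negative MMD: a squared MMD is non-negative in theory. Only an unbiased estimator can dip slightly below zero, when the two samples are almost identical;
a clearly negative value points to a normalisation problem in the computation, so verify the estimator before reading anything into the magnitude.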
```



In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
ax[0].scatter(df.iloc[:, 0], df.iloc[:, 1])
ax[1].scatter(gen_samples.iloc[:, 0], gen_samples.iloc[:, 1])
ax[0].set_title("Dati reali")
ax[1].set_title("Dati generati")

# Vecchi metodi valutazione
*   https://github.com/sdv-dev/SDV
*   https://docs.sdv.dev/sdv



In [None]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

In [None]:
quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=gen_samples,
    metadata=metadata)

In [None]:
diagnostic_report = run_diagnostic(
    real_data=df,
    synthetic_data=gen_samples,
    metadata=metadata)