<a href="https://colab.research.google.com/github/MAY2704/ML_usecases/blob/main/Test_data_generate/Synthetic_test_data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup: import the required libraries and define the hyperparameters**
# **🛫**

In [None]:
#!pip install tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Hyperparameters (adjust as needed)
latent_dim = 100  # Length of the random noise vector fed to the generator
batch_size = 32  # Number of transactions per training batch
epochs = 10  # Number of training epochs passed to train_gan

# **Load the sample transaction data and preprocess it for GAN training**

In [None]:



# Local Test Data (replace with your actual data)
# Sample transactions used as the "real" data the GAN learns from.
# Each record has an integer amount, a category ("debit" or "credit"),
# an ISO-formatted date string and an anonymized customer id.
local_test_data = [
    {"amount": 100, "category": "debit", "date": "2024-06-15", "customer_id": "anonymized_id1"},
    {"amount": 50, "category": "credit", "date": "2024-06-12", "customer_id": "anonymized_id2"},
    {"amount": 70, "category": "credit", "date": "2024-06-13", "customer_id": "anonymized_id3"},
    # Category was " debit" (leading space), which one-hot encoding treated as a third level.
    {"amount": 55, "category": "debit", "date": "2024-06-17", "customer_id": "anonymized_id4"},
    {"amount": 45, "category": "credit", "date": "2024-06-08", "customer_id": "anonymized_id5"},
    {"amount": 56, "category": "credit", "date": "2024-06-09", "customer_id": "anonymized_id6"},
    {"amount": 90, "category": "credit", "date": "2024-06-11", "customer_id": "anonymized_id7"},
    {"amount": 189, "category": "credit", "date": "2024-06-14", "customer_id": "anonymized_id8"},
    {"amount": 1000, "category": "credit", "date": "2024-06-15", "customer_id": "anonymized_id9"},
    {"amount": 978, "category": "credit", "date": "2024-06-11", "customer_id": "anonymized_id10"},
    {"amount": 45, "category": "credit", "date": "2024-06-19", "customer_id": "anonymized_id11"},
    {"amount": 123, "category": "credit", "date": "2024-06-20", "customer_id": "anonymized_id12"},
    {"amount": 78, "category": "debit", "date": "2024-06-21", "customer_id": "anonymized_id13"},
    {"amount": 33, "category": "credit", "date": "2024-06-22", "customer_id": "anonymized_id14"},
    {"amount": 87, "category": "credit", "date": "2024-06-17", "customer_id": "anonymized_id15"},
    {"amount": 11, "category": "debit", "date": "2024-06-16", "customer_id": "anonymized_id16"},
    {"amount": 15, "category": "credit", "date": "2024-06-15", "customer_id": "anonymized_id17"},
    {"amount": 19, "category": "debit", "date": "2024-06-14", "customer_id": "anonymized_id18"},
    {"amount": 20, "category": "credit", "date": "2024-06-12", "customer_id": "anonymized_id19"},
    {"amount": 21, "category": "debit", "date": "2024-06-12", "customer_id": "anonymized_id20"},
    {"amount": 78, "category": "credit", "date": "2024-06-15", "customer_id": "anonymized_id21"},
    {"amount": 45, "category": "credit", "date": "2024-06-11", "customer_id": "anonymized_id22"},
    {"amount": 18, "category": "credit", "date": "2024-06-03", "customer_id": "anonymized_id23"},
    {"amount": 156, "category": "credit", "date": "2024-06-10", "customer_id": "anonymized_id24"},
    {"amount": 198, "category": "debit", "date": "2024-06-21", "customer_id": "anonymized_id25"},
    {"amount": 189, "category": "credit", "date": "2024-06-22", "customer_id": "anonymized_id26"},
    {"amount": 100, "category": "credit", "date": "2024-06-12", "customer_id": "anonymized_id27"},
    {"amount": 198, "category": "debit", "date": "2024-06-11", "customer_id": "anonymized_id28"},
    {"amount": 10, "category": "credit", "date": "2024-06-15", "customer_id": "anonymized_id29"},
    {"amount": 76, "category": "credit", "date": "2024-06-14", "customer_id": "anonymized_id30"},
    {"amount": 34, "category": "credit", "date": "2024-06-09", "customer_id": "anonymized_id31"},
    {"amount": 39, "category": "debit", "date": "2024-06-08", "customer_id": "anonymized_id32"}
]
def load_and_preprocess_data(local_data):
    """Turn raw transaction records into a numeric, standardized feature frame.

    Processing order:
      1. Cap every numeric (non-boolean) column to the 1.5 * IQR fences.
      2. One-hot encode every object-dtype column, dropping the first level
         of each (``drop_first=True``).
      3. Standardize all resulting non-object columns with ``StandardScaler``
         (this includes the dummy columns created in step 2).

    Missing values are not imputed here.

    Args:
        local_data: Anything ``pd.DataFrame`` accepts, e.g. a list of dicts.

    Returns:
        pd.DataFrame: one row per input record, all columns treated as
        features. The fitted scaler is local to this call and is not
        returned, so the exact scaling parameters are not available to
        callers afterwards.
    """
    # Kept as a function-scope import so the function stays self-contained.
    from sklearn.preprocessing import StandardScaler

    data = pd.DataFrame(local_data)

    # Outlier capping. Series.clip upcasts the column when a fractional fence
    # is applied; writing a float fence into an integer column through .loc is
    # a dtype-incompatible assignment that newer pandas versions reject.
    for col in data.columns:
        column = data[col]
        if not pd.api.types.is_numeric_dtype(column):
            continue
        if pd.api.types.is_bool_dtype(column):
            # Flags are not magnitudes; IQR capping could overwrite them.
            continue
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        data[col] = column.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

    # One-hot encode the categorical (object-dtype) columns.
    categorical_cols = [col for col in data.columns if data[col].dtype == 'object']
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Standardize every remaining non-object column; the mask is computed once.
    columns_to_scale = data.columns[data.dtypes != 'object']
    scaler = StandardScaler()
    data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

    # All columns are used as features for GAN training.
    return data

# Build the GAN training frame from the sample records (capped, one-hot encoded, standardized)
real_transactions = load_and_preprocess_data(local_test_data)






# **Define the generator model to generate synthetic test data**

In [None]:

# Define the Generator Network
def build_generator():
  """Build the generator: noise vector in, one synthetic feature row out.

  Reads the module-level ``latent_dim`` for the input width and
  ``real_transactions.shape[1]`` for the output width, so the preprocessing
  cell must have been run first.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model.
  """
  model = tf.keras.Sequential([
      layers.Dense(128, activation='relu', input_shape=(latent_dim,)),
      layers.Dense(256, activation='relu'),
      layers.Dense(512, activation='relu'),
      # Linear output: the training features are standardized (zero mean, so
      # they include negative values), which a sigmoid confined to (0, 1)
      # could never reproduce.
      layers.Dense(real_transactions.shape[1], activation='linear'),
  ])
  return model


generator = build_generator()  # Module-level generator instance used by gan_model and train_gan



# **Define discriminator model to detect issues with generated test data**

In [None]:
# Define the Discriminator Network
def build_discriminator():
    """Build and compile the real-vs-synthetic classifier.

    The input width is taken from the module-level ``real_transactions``
    frame; the single sigmoid unit outputs the estimated probability that a
    row is real.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy
        loss and the Adam optimizer.
    """
    feature_count = real_transactions.shape[1]
    stack = [
        layers.Dense(256, activation='relu', input_shape=(feature_count,)),
        layers.Dropout(0.3),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ]
    classifier = tf.keras.Sequential(stack)
    classifier.compile(optimizer='adam', loss='binary_crossentropy')
    return classifier


# Module-level discriminator instance; stacked into gan_model and trained directly in train_gan
discriminator = build_discriminator()



# **Train the generator and the discriminator against each other: the discriminator learns to spot unrealistic generated test data while the generator learns to fool it**

# **Generator vs Discriminator 🤜🤛**

In [None]:
# Combined model used to train the generator: noise -> generator -> discriminator.
# The discriminator was already compiled inside build_discriminator(); it is
# marked non-trainable here, before gan_model is compiled, so that training
# gan_model is meant to update only the generator's weights.
# NOTE(review): this relies on Keras honouring the `trainable` value seen at
# compile time for each compiled model - confirm for the installed version.
discriminator.trainable = False
gan_model = tf.keras.Sequential([generator, discriminator])
gan_model.compile(loss='binary_crossentropy', optimizer='adam')

def generate_batches(data, batch_size):
  """Cut ``data`` into consecutive slices of at most ``batch_size`` items.

  Args:
    data: Any object supporting ``len()`` and slicing.
    batch_size: Step between slice starts; the final slice may be shorter.

  Returns:
    A list of slices in their original order (empty when ``data`` is empty).
  """
  slice_starts = range(0, len(data), batch_size)
  return [data[start:start + batch_size] for start in slice_starts]
def train_gan(epochs, num_samples=1000):
  """Train the GAN, then draw synthetic samples from the generator.

  Uses the module-level ``generator``, ``discriminator``, ``gan_model``,
  ``real_transactions``, ``batch_size`` and ``latent_dim``.

  For every batch of real rows the discriminator is trained once on the real
  rows (label 1) and once on freshly generated rows (label 0); the generator
  is then trained through ``gan_model`` to make the discriminator answer 1.

  Args:
    epochs: Number of full passes over ``real_transactions``.
    num_samples: Number of synthetic rows to generate after training.

  Returns:
    Array of shape ``(num_samples, n_features)`` from ``generator.predict``.
  """
  # Noise vector dimension matching latent space
  noise_dim = latent_dim

  for epoch in range(epochs):
    for real_transactions_batch in generate_batches(real_transactions, batch_size):
      # The last batch can be shorter than batch_size, so labels and noise
      # are sized from the batch itself.
      current_batch_size = len(real_transactions_batch)
      real_batch = np.asarray(real_transactions_batch, dtype="float32")

      # Generate synthetic transactions from random noise
      noise = tf.random.normal(shape=(current_batch_size, noise_dim))
      generated_transactions = generator(noise)

      # Train the discriminator: real rows are labelled 1, generated rows 0
      real_labels = tf.ones((current_batch_size, 1))
      fake_labels = tf.zeros((current_batch_size, 1))
      discriminator_loss_real = discriminator.train_on_batch(real_batch, real_labels)
      discriminator_loss_fake = discriminator.train_on_batch(generated_transactions, fake_labels)
      discriminator_loss = 0.5 * (discriminator_loss_real + discriminator_loss_fake)

      # Train the generator: reward it when the discriminator outputs "real"
      noise = tf.random.normal(shape=(current_batch_size, noise_dim))
      gan_loss = gan_model.train_on_batch(noise, tf.ones((current_batch_size, 1)))

      print(f'Epoch: {epoch}, Discriminator Loss: {discriminator_loss}, GAN Loss: {gan_loss}')

  # Sample only after ALL epochs have run, using the same noise distribution
  # (standard normal) that the generator saw during training.
  sampling_noise = tf.random.normal(shape=(num_samples, noise_dim))
  synthetic_samples = generator.predict(sampling_noise)
  return synthetic_samples


# Train the GAN and wrap the generated rows in a DataFrame (default integer column labels)
synthetic_samples = train_gan(epochs)
synthetic_samples_df = pd.DataFrame(synthetic_samples)

# Define a function to reverse the scaling
def reverse_scaling(scaled_data, original_data):
    """Undo standardization using statistics learned from ``original_data``.

    A fresh ``StandardScaler`` is fitted on ``original_data`` and its inverse
    transform is applied to ``scaled_data``.

    Args:
        scaled_data: Standardized values with the same number of columns as
            ``original_data``.
        original_data: Reference data in ORIGINAL (unscaled) units. If it is
            itself already standardized, the fitted mean/scale are
            approximately 0/1 and this function changes almost nothing.

    Returns:
        The rescaled values. When ``original_data`` is a DataFrame the result
        is a DataFrame carrying its column labels (and the row index of
        ``scaled_data`` if it has one), so that column-name based steps such
        as ``reverse_encoding`` can consume it; otherwise the raw array from
        ``inverse_transform`` is returned.
    """
    scaler = StandardScaler()
    # Only the fitted statistics are needed; the transformed copy was discarded.
    scaler.fit(original_data)
    restored = scaler.inverse_transform(scaled_data)

    if isinstance(original_data, pd.DataFrame):
        return pd.DataFrame(
            restored,
            columns=original_data.columns,
            index=getattr(scaled_data, "index", None),
        )
    return restored


def reverse_encoding(encoded_data, original_data):
    """Collapse one-hot dummy columns back into single categorical columns.

    For every object-dtype column ``col`` of ``original_data``, the columns of
    ``encoded_data`` named ``"<col>_<level>"`` are replaced by one column
    ``col`` that holds, per row, the ``<level>`` whose dummy value is largest.

    A level that has no dummy column (for example one removed by
    ``drop_first=True`` at encoding time) can never be produced.

    Args:
        encoded_data: DataFrame containing the dummy columns. Not modified.
        original_data: DataFrame whose object-dtype columns name the
            categorical features to restore.

    Returns:
        A new DataFrame with the dummies collapsed. If ``original_data`` has
        no object-dtype columns, ``encoded_data`` is returned as-is.
    """
    categorical_cols = [col for col in original_data.columns if original_data[col].dtype == 'object']
    if not categorical_cols:
        return encoded_data

    # Work on a copy so the caller's frame is left untouched.
    decoded = encoded_data.copy()
    for col in categorical_cols:
        # Match on "<col>_" so that unrelated columns sharing a prefix
        # (e.g. "cat" vs "category_score") are not swallowed.
        prefix = f"{col}_"
        encoded_cols = [c for c in decoded.columns if str(c).startswith(prefix)]
        if not encoded_cols:
            # Nothing to decode for this feature; idxmax on zero columns would fail.
            continue
        winners = decoded[encoded_cols].idxmax(axis=1)
        # Strip only the leading prefix, never later occurrences inside the level name.
        decoded[col] = winners.str[len(prefix):]
        decoded = decoded.drop(columns=encoded_cols)
    return decoded
# Define a function to do the postprocessing
def postprocess_data(synthetic_samples_df, real_transactions):
    """Run generated samples through the inverse of the preprocessing steps.

    Applies ``reverse_scaling`` and then ``reverse_encoding``, handing
    ``real_transactions`` to both as the reference frame.

    NOTE(review): both helpers derive everything from ``real_transactions``.
    If that frame is already standardized and one-hot encoded, the scaling
    step is close to a no-op and there are no object-dtype columns left to
    decode - confirm what callers pass here.

    Args:
        synthetic_samples_df: Generated samples, one row per sample.
        real_transactions: Reference frame forwarded to both helpers.

    Returns:
        Whatever ``reverse_encoding`` returns for the rescaled samples.
    """
    # Step 1: undo the scaling
    unscaled_samples = reverse_scaling(synthetic_samples_df, real_transactions)
    # Step 2: undo the one-hot encoding and hand the result straight back
    return reverse_encoding(unscaled_samples, real_transactions)
# Post-process the generated samples, using the preprocessed training frame as the reference
postprocessed_samples = postprocess_data(synthetic_samples_df, real_transactions)




real_transactions_batch: (32, 48)
real_labels: (32, 1)
fake_labels: (32, 1)
Epoch: 0, Discriminator Loss: 0.7802577018737793, GAN Loss: 0.5898131728172302


# **Run the model and print synthetic samples**

In [None]:
# Report completion, then print the post-processed synthetic samples
print("Synthetic test data samples are created")

print(postprocessed_samples)

Synthetic test data samples are created
[[0.5774429  0.5150787  0.47595927 ... 0.5540758  0.460086   0.50050735]
 [0.56590027 0.53782755 0.47319967 ... 0.5671035  0.48615906 0.4954995 ]
 [0.5383592  0.50552714 0.48761857 ... 0.561852   0.46817887 0.501293  ]
 ...
 [0.5504726  0.50230396 0.49552804 ... 0.5517026  0.46639284 0.48651636]
 [0.5503055  0.5111661  0.49693635 ... 0.54041886 0.47098148 0.48282596]
 [0.55840707 0.5270952  0.4649615  ... 0.5473233  0.48966175 0.4898357 ]]
