In [1]:
import pandas as pd
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
import time
from tensorflow.keras import layers, Model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

2024-04-25 15:16:40.233206: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Create TableGan Model 

In [2]:
df = pd.read_csv("com_salary.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909 entries, 0 to 908
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              909 non-null    object 
 1   OttoneuID         909 non-null    int64  
 2   FG MajorLeagueID  851 non-null    float64
 3   FG MinorLeagueID  902 non-null    object 
 4   MLB Org           896 non-null    object 
 5   Position(s)       909 non-null    object 
 6   Avg Salary        909 non-null    float64
 7   Median Salary     909 non-null    float64
 8   Min Salary        909 non-null    int64  
 9   Max Salary        909 non-null    int64  
 10  Last 10           909 non-null    float64
 11  Roster%           909 non-null    float64
 12  Team              853 non-null    object 
 13  POS               909 non-null    object 
 14  ADP               909 non-null    float64
 15  rPTS              909 non-null    float64
 16  PTS               909 non-null    float64
 1

In [4]:
df.head()

Unnamed: 0,Name,OttoneuID,FG MajorLeagueID,FG MinorLeagueID,MLB Org,Position(s),Avg Salary,Median Salary,Min Salary,Max Salary,...,POS,ADP,rPTS,PTS,aPOS,Dollars,Adjusted,Cost,PlayerId,value
0,Juan Soto,23717,20123.0,sa906282,NYY,OF,60.31,60.0,34,84,...,OF/DH,10.57,1132.535865,53.429347,21.885623,76.314969,112.280793,,20123,0.0
1,Mookie Betts,18276,13611.0,sa597889,LAD,2B/SS/OF,58.35,58.0,38,87,...,2B/SS/OF,5.03,1036.172159,40.095221,14.934803,56.030024,82.308998,,13611,0.0
2,Shohei Ohtani,33600,19755.0,,LAD,Util/SP,56.13,56.0,32,88,...,P/DH,12.73,1038.276653,40.386426,12.178202,53.564628,75.049219,54.0,19755,-0.435372
3,Aaron Judge,18312,15640.0,sa549847,NYY,OF,54.17,54.5,33,70,...,OF/DH,11.16,1066.255227,44.257902,21.885623,67.143525,93.070498,50.0,15640,17.143525
4,Freddie Freeman,5680,5361.0,sa390291,LAD,1B,50.76,51.0,27,74,...,1B,8.81,1062.235872,43.701733,13.809086,58.510819,81.663561,47.0,5361,11.510819


In [5]:
selected_df= df[['ADP', 'Avg Salary', 'Median Salary', 'Min Salary', 'Max Salary', 'rPTS', 'PTS', 'Dollars']]


In [6]:
selected_df.head()

Unnamed: 0,ADP,Avg Salary,Median Salary,Min Salary,Max Salary,rPTS,PTS,Dollars
0,10.57,60.31,60.0,34,84,1132.535865,53.429347,76.314969
1,5.03,58.35,58.0,38,87,1036.172159,40.095221,56.030024
2,12.73,56.13,56.0,32,88,1038.276653,40.386426,53.564628
3,11.16,54.17,54.5,33,70,1066.255227,44.257902,67.143525
4,8.81,50.76,51.0,27,74,1062.235872,43.701733,58.510819


## Step 1: Define the Generator
The Generator's role in a GAN is to create synthetic data that is indistinguishable from real data. It learns to do this through the adversarial process with the Discriminator. Here, we define a simple neural network model for the


In [None]:
# Define the Generator
class Generator(Model):
    def __init__(self, z_dim):
        super(Generator, self).__init__()
        self.dense1 = layers.Dense(64, activation='relu')
        self.out = layers.Dense(z_dim, activation='tanh')

    def call(self, x):
        x = self.dense1(x)
        return self.out(x)



## Step 2: Define the Discriminator
The Discriminator acts as a classifier that tries to distinguish real data from fake data produced by the Generator. This class is also defined with a simple architecture, consisting of one hidden layer.


In [None]:
# Define the Discriminator
class Discriminator(Model):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.dense1 = layers.Dense(64, activation='relu')
        self.out = layers.Dense(1, activation='sigmoid')

    def call(self, x):
        x = self.dense1(x)
        return self.out(x)

## Step 3: Define the TableGAN Model
Here we integrate the Generator and Discriminator into a complete GAN model. The TableGAN class manages the training loop where both models are trained in an adversarial setup. This structure includes methods for compiling the model and defining the training step, utilizing TensorFlow's capabilities.


In [None]:
# Define the TableGAN model
class TableGAN(Model):
    def __init__(self, input_dim, z_dim):
        super(TableGAN, self).__init__()
        self.generator = Generator(z_dim)
        self.discriminator = Discriminator(input_dim)

    def compile(self, g_optimizer, d_optimizer, loss_function):
        super(TableGAN, self).compile()
        self.g_optimizer = g_optimizer
        self.d_optimizer = d_optimizer
        self.loss_function = loss_function

    def train_step(self, real_data):
        batch_size = tf.shape(real_data)[0]
        z_dim = self.generator.layers[-1].output_shape[-1]

        random_latent_vectors = tf.random.normal(shape=(batch_size, z_dim))

        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_data = self.generator(random_latent_vectors)
            real_output = self.discriminator(real_data)
            fake_output = self.discriminator(generated_data)

            gen_loss = self.loss_function(tf.ones_like(fake_output), fake_output)
            real_loss = self.loss_function(tf.ones_like(real_output), real_output)
            fake_loss = self.loss_function(tf.zeros_like(fake_output), fake_output)
            disc_loss = real_loss + fake_loss

        gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

        self.g_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
        self.d_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))

        return {'gen_loss': gen_loss, 'disc_loss': disc_loss}

## Step 4: Data Loading and Preprocessing
Before training the model, it's necessary to load and preprocess your data. This step involves reading a CSV file, detecting numerical and categorical columns, and applying appropriate transformations such as scaling for numerical data and one-hot encoding for categorical data.


In [None]:
# Load and preprocess the data
data = pd.read_csv('path_to_your_dataset.csv')
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])
processed_data = preprocessor.fit_transform(data)


)



## Step 5: Initialize and Train the TableGAN Model
Finally, initialize the TableGAN model with dimensions derived from the preprocessed data, compile it with chosen optimizers and loss function, and train the model. Here, we specify the learning rates, the loss function, and the training parameters such as the number of epochs and batch size.


In [1]:
# Initialize the model and compile
input_dim = processed_data.shape[1]
z_dim = 50
tablegan = TableGAN(input_dim=input_dim, z_dim=z_dim)
tablegan.compile(
    g_optimizer=Adam(1e-4),
    d_optimizer=Adam(1e-4),
    loss_function=BinaryCrossentropy(from_logits=True)


# Train the model
epochs = 20
batch_size = 16

# Start timing
start_time = time.time()


tablegan.fit(processed_data, epochs=epochs, batch_size=batch_size)


# End timing
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"Training completed in {elapsed_time:.2f} seconds.")

SyntaxError: incomplete input (2227514722.py, line 8)

## Step 6: Generate Synthetic Data
Using the trained generator, we will generate synthetic data by feeding it random noise. This data is intended to mimic the statistical properties of the training dataset.


In [None]:
# Generate synthetic data
random_latent_vectors = tf.random.normal(shape=(1000, z_dim))  # Generate 1000 samples
synthetic_data = tablegan.generator(random_latent_vectors).numpy()


## Step 4: Convert to DataFrame
Convert the synthetic data into a pandas DataFrame. This step is necessary to prepare the data for easy export and use in other systems.


In [None]:
# Convert to DataFrame
synthetic_df = pd.DataFrame(synthetic_data, columns=[f"feature_{i+1}" for i in range(synthetic_data.shape[1])])


## Step 5: Save to CSV
Save the synthetic data to a CSV file. This file can be used for further analysis, sharing, or as input to other analytical tools and systems.


In [None]:
# Write to CSV
synthetic_df.to_csv('synthetic_data.csv', index=False)

print("Synthetic data written to synthetic_data.csv successfully!")
