In [1]:
import pandas as pd
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers, Model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

2024-04-22 14:17:19.576629: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Create TableGAN Model

In [5]:
# Read com_salary.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv("com_salary.csv")

In [6]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909 entries, 0 to 908
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              909 non-null    object 
 1   OttoneuID         909 non-null    int64  
 2   FG MajorLeagueID  851 non-null    float64
 3   FG MinorLeagueID  902 non-null    object 
 4   MLB Org           896 non-null    object 
 5   Position(s)       909 non-null    object 
 6   Avg Salary        909 non-null    float64
 7   Median Salary     909 non-null    float64
 8   Min Salary        909 non-null    int64  
 9   Max Salary        909 non-null    int64  
 10  Last 10           909 non-null    float64
 11  Roster%           909 non-null    float64
 12  Team              853 non-null    object 
 13  POS               909 non-null    object 
 14  ADP               909 non-null    float64
 15  rPTS              909 non-null    float64
 16  PTS               909 non-null    float64
 1

In [7]:
# Preview the first rows of the raw table to sanity-check the parsed values.
df.head()

Unnamed: 0,Name,OttoneuID,FG MajorLeagueID,FG MinorLeagueID,MLB Org,Position(s),Avg Salary,Median Salary,Min Salary,Max Salary,...,POS,ADP,rPTS,PTS,aPOS,Dollars,Adjusted,Cost,PlayerId,value
0,Juan Soto,23717,20123.0,sa906282,NYY,OF,60.31,60.0,34,84,...,OF/DH,10.57,1132.535865,53.429347,21.885623,76.314969,112.280793,,20123,0.0
1,Mookie Betts,18276,13611.0,sa597889,LAD,2B/SS/OF,58.35,58.0,38,87,...,2B/SS/OF,5.03,1036.172159,40.095221,14.934803,56.030024,82.308998,,13611,0.0
2,Shohei Ohtani,33600,19755.0,,LAD,Util/SP,56.13,56.0,32,88,...,P/DH,12.73,1038.276653,40.386426,12.178202,53.564628,75.049219,54.0,19755,-0.435372
3,Aaron Judge,18312,15640.0,sa549847,NYY,OF,54.17,54.5,33,70,...,OF/DH,11.16,1066.255227,44.257902,21.885623,67.143525,93.070498,50.0,15640,17.143525
4,Freddie Freeman,5680,5361.0,sa390291,LAD,1B,50.76,51.0,27,74,...,1B,8.81,1062.235872,43.701733,13.809086,58.510819,81.663561,47.0,5361,11.510819


In [8]:
# Keep only the ADP, salary, points and dollar-value columns; everything
# else in the raw table (names, IDs, teams, positions, ...) is left out.
selected_df = df.loc[:, [
    'ADP',
    'Avg Salary',
    'Median Salary',
    'Min Salary',
    'Max Salary',
    'rPTS',
    'PTS',
    'Dollars',
]]


In [9]:
# Preview the first rows of the column subset that is passed on to preprocessing.
selected_df.head()

Unnamed: 0,ADP,Avg Salary,Median Salary,Min Salary,Max Salary,rPTS,PTS,Dollars
0,10.57,60.31,60.0,34,84,1132.535865,53.429347,76.314969
1,5.03,58.35,58.0,38,87,1036.172159,40.095221,56.030024
2,12.73,56.13,56.0,32,88,1038.276653,40.386426,53.564628
3,11.16,54.17,54.5,33,70,1066.255227,44.257902,67.143525
4,8.81,50.76,51.0,27,74,1062.235872,43.701733,58.510819


Define Model Components
We’ll define two main components: the Generator and the Discriminator, which will be used to build the GAN architecture. Below is a simplified setup:

In [17]:
class Generator(Model):
    """Fully connected generator that maps noise vectors to synthetic table rows.

    Args:
        z_dim: Width of the final Dense layer when ``output_dim`` is not given.
            Despite the name, this does NOT fix the width of the input noise
            vector: none of the layers declares an input shape, so Keras
            infers it the first time the model is called.
        output_dim: Optional explicit width of the final Dense layer, i.e. the
            number of values in one generated row. Defaults to ``z_dim`` so
            existing ``Generator(z_dim)`` calls behave exactly as before.
    """

    def __init__(self, z_dim, output_dim=None):
        super().__init__()
        if output_dim is None:
            output_dim = z_dim
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        # The output width must equal the number of columns in the real data,
        # otherwise the discriminator cannot score real and generated rows alike.
        # tanh bounds every generated value to [-1, 1].
        # NOTE(review): the real data is standardised rather than scaled to
        # [-1, 1] -- confirm that this output range is intended.
        self.out = layers.Dense(output_dim, activation='tanh')

    def call(self, x):
        """Pass a batch of noise vectors through both hidden layers and the output layer."""
        hidden = self.dense1(x)
        hidden = self.dense2(hidden)
        return self.out(hidden)

class Discriminator(Model):
    """Binary classifier: two 128-unit ReLU layers followed by one sigmoid unit.

    Args:
        input_dim: Accepted by the constructor but currently unused; none of
            the layers is given an explicit input shape.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        # The sigmoid makes the output a probability in (0, 1), not a raw logit.
        self.out = layers.Dense(1, activation='sigmoid')

    def call(self, x):
        """Return one sigmoid score per input row."""
        features = self.dense2(self.dense1(x))
        return self.out(features)

Automate Data Preprocessing
Before integrating these models into a GAN framework, ensure your data is correctly preprocessed. Here’s how you can automate the detection of numerical and categorical columns and preprocess them:

In [19]:
# Detect columns. 'number' matches every numeric dtype (int32, float32, ...),
# not only the 64-bit ones, so no numeric column is silently dropped.
numerical_cols = selected_df.select_dtypes(include='number').columns.tolist()
categorical_cols = selected_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Set up preprocessing: standardise numeric columns, one-hot encode categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    # OneHotEncoder produces sparse matrices; a threshold of 0 makes the
    # combined result always a dense array, which the Keras models need.
    sparse_threshold=0,
)

processed_data = preprocessor.fit_transform(selected_df)

Set Up the TableGAN Model
After preprocessing, combine the two components into a single GAN model, making sure the input and output dimensions line up: the generator's output width must equal the number of preprocessed feature columns.

In [21]:
class TableGAN(Model):
    """GAN that pairs a Generator with a Discriminator and trains them jointly.

    Args:
        input_dim: Number of feature columns in one (preprocessed) real row.
            Generated rows are produced with the same width.
        z_dim: Length of the random noise vector fed to the generator.
    """

    def __init__(self, input_dim, z_dim):
        super().__init__()
        self.data_dim = input_dim
        self.latent_dim = z_dim
        # Generator's constructor argument sets the width of its OUTPUT layer
        # (its input width is inferred by Keras on first call), so it has to
        # receive the data width. Passing the latent width here would make the
        # generated rows a different width from the real rows.
        self.generator = Generator(input_dim)
        self.discriminator = Discriminator(input_dim=input_dim)

    def compile(self, g_optimizer, d_optimizer, loss_function):
        """Store one optimizer per sub-network and the shared adversarial loss.

        Args:
            g_optimizer: Optimizer applied to the generator's variables.
            d_optimizer: Optimizer applied to the discriminator's variables.
            loss_function: Callable ``loss(y_true, y_pred)`` used for both networks.
        """
        super().compile()
        self.g_optimizer = g_optimizer
        self.d_optimizer = d_optimizer
        self.loss_function = loss_function

    def train_step(self, real_data):
        """Run one simultaneous update of generator and discriminator.

        Args:
            real_data: Batch of real rows, or the tuple ``fit()`` passes in,
                whose first element is that batch.

        Returns:
            Dict with the scalar ``gen_loss`` and ``disc_loss`` of this step.
        """
        # fit() wraps the batch in a tuple; only the features are used here.
        if isinstance(real_data, (tuple, list)):
            real_data = real_data[0]

        batch_size = tf.shape(real_data)[0]
        # Sample noise with the latent width stored at construction time
        # instead of reading it back from a layer's output shape.
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))

        # Two tapes so that each network's gradients are taken independently.
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_data = self.generator(random_latent_vectors)
            real_output = self.discriminator(real_data)
            fake_output = self.discriminator(generated_data)

            # Generator is rewarded when fakes are scored as real (label 1).
            gen_loss = self.loss_function(tf.ones_like(fake_output), fake_output)
            # Discriminator should score real rows as 1 and fakes as 0.
            real_loss = self.loss_function(tf.ones_like(real_output), real_output)
            fake_loss = self.loss_function(tf.zeros_like(fake_output), fake_output)
            disc_loss = real_loss + fake_loss

        gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

        self.g_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
        self.d_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))

        return {'gen_loss': gen_loss, 'disc_loss': disc_loss}

# Initialize and compile your model
input_dim = processed_data.shape[1]  # Number of feature columns after preprocessing
z_dim = 100  # Dimensionality of the generator's input space
tablegan = TableGAN(input_dim=input_dim, z_dim=z_dim)
tablegan.compile(
    g_optimizer=tf.keras.optimizers.Adam(1e-4),
    d_optimizer=tf.keras.optimizers.Adam(1e-4),
    # The Discriminator ends in a sigmoid, so it emits probabilities rather
    # than logits; from_logits=True would apply a second sigmoid inside the loss.
    loss_function=tf.keras.losses.BinaryCrossentropy(from_logits=False)
)
