In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras import layers, Model

Create TableGAN Model

In [5]:
class Generator(Model):
    """Maps latent noise vectors to synthetic table rows.

    The network is two 128-unit ReLU layers followed by a tanh output layer.
    The width of the *input* is not fixed here; Keras infers it from the
    first tensor the model is called with.

    Args:
        z_dim: Latent dimension. For backward compatibility it is also used
            as the output width when ``output_dim`` is not given (the
            original behaviour of this class).
        output_dim: Number of features in a generated row. Pass this when
            the feature space and the latent space have different widths.

    Note:
        The tanh output is bounded to [-1, 1], so real rows fed to the
        discriminator should be scaled to a comparable range.
    """

    def __init__(self, z_dim, output_dim=None):
        super(Generator, self).__init__()
        # Fall back to z_dim so existing `Generator(z_dim)` callers get the
        # exact same architecture as before.
        out_units = z_dim if output_dim is None else output_dim
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.out = layers.Dense(out_units, activation='tanh')

    def call(self, x):
        """Run a batch of latent vectors through the network."""
        x = self.dense1(x)
        x = self.dense2(x)
        return self.out(x)

class Discriminator(Model):
    """Scores table rows with a single sigmoid unit.

    Architecture: Dense(128, relu) -> Dense(128, relu) -> Dense(1, sigmoid).

    Args:
        input_dim: Accepted for interface symmetry; the layers below do not
            read it, and the input width is inferred on the first call.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.dense1 = layers.Dense(units=128, activation='relu')
        self.dense2 = layers.Dense(units=128, activation='relu')
        self.out = layers.Dense(units=1, activation='sigmoid')

    def call(self, x):
        """Return the sigmoid score for each row in the batch ``x``."""
        hidden = self.dense2(self.dense1(x))
        return self.out(hidden)

In [6]:
class TableGAN(Model):
    """Couples a Generator and a Discriminator and trains them adversarially.

    Args:
        input_dim: Number of features in one data row; this is the width the
            generator must produce and the discriminator consumes.
        z_dim: Width of the latent noise vector sampled for the generator.
    """

    def __init__(self, input_dim, z_dim):
        super(TableGAN, self).__init__()
        # Keep the latent width explicitly. A subclassed Keras model has no
        # defined `input_shape` before it has been called, so the width
        # cannot be read back from the generator inside `train_step`.
        self.z_dim = z_dim
        # The generator's first constructor argument sets the width of its
        # output layer, so it must receive the feature dimension; otherwise
        # generated rows and real rows reach the discriminator with
        # different widths whenever z_dim != input_dim.
        self.generator = Generator(input_dim)
        self.discriminator = Discriminator(input_dim)

    def compile(self, g_optimizer, d_optimizer, loss_function):
        """Store the two optimizers and the loss used by ``train_step``.

        Args:
            g_optimizer: Optimizer applied to the generator's variables.
            d_optimizer: Optimizer applied to the discriminator's variables.
            loss_function: Callable ``loss(y_true, y_pred)`` applied to the
                discriminator's outputs.
        """
        super(TableGAN, self).compile()
        self.g_optimizer = g_optimizer
        self.d_optimizer = d_optimizer
        self.loss_function = loss_function

    def train_step(self, real_data):
        """Run one adversarial update on a batch of real rows.

        Args:
            real_data: Batch of real rows, or a tuple whose first element is
                that batch (as ``fit`` may supply).

        Returns:
            Dict with the generator loss (``gen_loss``) and the summed
            real + fake discriminator loss (``disc_loss``).
        """
        # `fit` can wrap the features in a tuple such as `(x,)`.
        if isinstance(real_data, tuple):
            real_data = real_data[0]

        batch_size = tf.shape(real_data)[0]
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.z_dim))

        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_data = self.generator(random_latent_vectors)
            real_output = self.discriminator(real_data)
            fake_output = self.discriminator(generated_data)

            # Generator is rewarded when fakes are scored as real (label 1).
            gen_loss = self.loss_function(tf.ones_like(fake_output), fake_output)
            # Discriminator should score real rows 1 and generated rows 0.
            real_loss = self.loss_function(tf.ones_like(real_output), real_output)
            fake_loss = self.loss_function(tf.zeros_like(fake_output), fake_output)
            disc_loss = real_loss + fake_loss

        # Each tape differentiates its own loss w.r.t. its own network only.
        gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

        # Apply gradients
        self.g_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
        self.d_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))

        return {'gen_loss': gen_loss, 'disc_loss': disc_loss}


In [7]:
### Now we load the sample CSV and proceed to step 3 via source

In [8]:
# Load the salary table from a CSV file in the notebook's working directory.
df = pd.read_csv("com_salary.csv")

In [9]:
# Column dtypes and non-null counts: shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909 entries, 0 to 908
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              909 non-null    object 
 1   OttoneuID         909 non-null    int64  
 2   FG MajorLeagueID  851 non-null    float64
 3   FG MinorLeagueID  902 non-null    object 
 4   MLB Org           896 non-null    object 
 5   Position(s)       909 non-null    object 
 6   Avg Salary        909 non-null    float64
 7   Median Salary     909 non-null    float64
 8   Min Salary        909 non-null    int64  
 9   Max Salary        909 non-null    int64  
 10  Last 10           909 non-null    float64
 11  Roster%           909 non-null    float64
 12  Team              853 non-null    object 
 13  POS               909 non-null    object 
 14  ADP               909 non-null    float64
 15  rPTS              909 non-null    float64
 16  PTS               909 non-null    float64
 1

In [10]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,Name,OttoneuID,FG MajorLeagueID,FG MinorLeagueID,MLB Org,Position(s),Avg Salary,Median Salary,Min Salary,Max Salary,...,POS,ADP,rPTS,PTS,aPOS,Dollars,Adjusted,Cost,PlayerId,value
0,Juan Soto,23717,20123.0,sa906282,NYY,OF,60.31,60.0,34,84,...,OF/DH,10.57,1132.535865,53.429347,21.885623,76.314969,112.280793,,20123,0.0
1,Mookie Betts,18276,13611.0,sa597889,LAD,2B/SS/OF,58.35,58.0,38,87,...,2B/SS/OF,5.03,1036.172159,40.095221,14.934803,56.030024,82.308998,,13611,0.0
2,Shohei Ohtani,33600,19755.0,,LAD,Util/SP,56.13,56.0,32,88,...,P/DH,12.73,1038.276653,40.386426,12.178202,53.564628,75.049219,54.0,19755,-0.435372
3,Aaron Judge,18312,15640.0,sa549847,NYY,OF,54.17,54.5,33,70,...,OF/DH,11.16,1066.255227,44.257902,21.885623,67.143525,93.070498,50.0,15640,17.143525
4,Freddie Freeman,5680,5361.0,sa390291,LAD,1B,50.76,51.0,27,74,...,1B,8.81,1062.235872,43.701733,13.809086,58.510819,81.663561,47.0,5361,11.510819


In [11]:
# Numeric columns kept for modelling; named once so the selection is easy to audit.
FEATURE_COLUMNS = [
    'ADP', 'Avg Salary', 'Median Salary', 'Min Salary',
    'Max Salary', 'rPTS', 'PTS', 'Dollars',
]
# Explicit copy: later writes to `selected_df` neither touch `df` nor raise
# SettingWithCopyWarning.
selected_df = df[FEATURE_COLUMNS].copy()


In [12]:
# Preview the selected feature columns.
selected_df.head()

Unnamed: 0,ADP,Avg Salary,Median Salary,Min Salary,Max Salary,rPTS,PTS,Dollars
0,10.57,60.31,60.0,34,84,1132.535865,53.429347,76.314969
1,5.03,58.35,58.0,38,87,1036.172159,40.095221,56.030024
2,12.73,56.13,56.0,32,88,1038.276653,40.386426,53.564628
3,11.16,54.17,54.5,33,70,1066.255227,44.257902,67.143525
4,8.81,50.76,51.0,27,74,1062.235872,43.701733,58.510819



# Automatically detect numerical and categorical columns.
# Operates on `selected_df`: the name `data` that this cell referred to is not
# defined in any cell above, so the cell failed with NameError on a fresh kernel.
numerical_cols = selected_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = selected_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Set up the preprocessing pipeline: standardize numeric columns,
# one-hot encode categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Fit the transformers and transform the selected features in one pass
processed_data = preprocessor.fit_transform(selected_df)

# Run first to detect column types.
# Uses `selected_df`; `data` is not defined in the cells above.
numerical_cols = selected_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = selected_df.select_dtypes(include=['object', 'category']).columns.tolist()



# Column-wise preprocessing built from the previously detected name lists:
# `numerical_cols` are standardized, `categorical_cols` are one-hot encoded.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(), categorical_cols),
])

In [15]:
# Split the selected columns by dtype: 64-bit ints/floats vs. object/category.
numerical_cols = list(selected_df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(selected_df.select_dtypes(include=['object', 'category']).columns)

In [16]:
# Show which columns were classified as numeric.
numerical_cols

['ADP',
 'Avg Salary',
 'Median Salary',
 'Min Salary',
 'Max Salary',
 'rPTS',
 'PTS',
 'Dollars']