In [1]:
import pandas as pd
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
import time
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


ModuleNotFoundError: No module named 'tensorflow'

Create TableGan Model 

In [2]:
df = pd.read_csv("com_salary.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909 entries, 0 to 908
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              909 non-null    object 
 1   OttoneuID         909 non-null    int64  
 2   FG MajorLeagueID  851 non-null    float64
 3   FG MinorLeagueID  902 non-null    object 
 4   MLB Org           896 non-null    object 
 5   Position(s)       909 non-null    object 
 6   Avg Salary        909 non-null    float64
 7   Median Salary     909 non-null    float64
 8   Min Salary        909 non-null    int64  
 9   Max Salary        909 non-null    int64  
 10  Last 10           909 non-null    float64
 11  Roster%           909 non-null    float64
 12  Team              853 non-null    object 
 13  POS               909 non-null    object 
 14  ADP               909 non-null    float64
 15  rPTS              909 non-null    float64
 16  PTS               909 non-null    float64
 1

In [4]:
df.head()

Unnamed: 0,Name,OttoneuID,FG MajorLeagueID,FG MinorLeagueID,MLB Org,Position(s),Avg Salary,Median Salary,Min Salary,Max Salary,...,POS,ADP,rPTS,PTS,aPOS,Dollars,Adjusted,Cost,PlayerId,value
0,Juan Soto,23717,20123.0,sa906282,NYY,OF,60.31,60.0,34,84,...,OF/DH,10.57,1132.535865,53.429347,21.885623,76.314969,112.280793,,20123,0.0
1,Mookie Betts,18276,13611.0,sa597889,LAD,2B/SS/OF,58.35,58.0,38,87,...,2B/SS/OF,5.03,1036.172159,40.095221,14.934803,56.030024,82.308998,,13611,0.0
2,Shohei Ohtani,33600,19755.0,,LAD,Util/SP,56.13,56.0,32,88,...,P/DH,12.73,1038.276653,40.386426,12.178202,53.564628,75.049219,54.0,19755,-0.435372
3,Aaron Judge,18312,15640.0,sa549847,NYY,OF,54.17,54.5,33,70,...,OF/DH,11.16,1066.255227,44.257902,21.885623,67.143525,93.070498,50.0,15640,17.143525
4,Freddie Freeman,5680,5361.0,sa390291,LAD,1B,50.76,51.0,27,74,...,1B,8.81,1062.235872,43.701733,13.809086,58.510819,81.663561,47.0,5361,11.510819


In [6]:
orig_df = df


In [7]:
# Determine the number of features
number_of_features = orig_df.shape[1]  # Assumes no target variable included; adjust if necessary

## Step 1: Define the Generator
The Generator's role in a GAN is to create synthetic data that is indistinguishable from real data. It learns to do this through the adversarial process with the Discriminator. Here, we define a simple neural network model for the


In [8]:
# Define the Generator
class Generator(Model):
    def __init__(self, z_dim, output_dim):
        super(Generator, self).__init__()
        self.dense1 = Dense(128, activation='relu')
        self.out = Dense(output_dim, activation='tanh')

    def call(self, inputs):
        x = self.dense1(inputs)
        return self.out(x)




## Step 2: Define the Discriminator
The Discriminator acts as a classifier that tries to distinguish real data from fake data produced by the Generator. This class is also defined with a simple architecture, consisting of one hidden layer.


In [9]:
# Define the Discriminator
class Discriminator(Model):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.dense1 = Dense(64, activation='relu')
        self.out = Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.dense1(inputs)
        return self.out(x)


## Step 3: Define the TableGAN Model
Here we integrate the Generator and Discriminator into a complete GAN model. The TableGAN class manages the training loop where both models are trained in an adversarial setup. This structure includes methods for compiling the model and defining the training step, utilizing TensorFlow's capabilities.


In [11]:
# Define the TableGAN model
class TableGAN(Model):
    def __init__(self, input_dim, z_dim):
        super(TableGAN, self).__init__()
        self.generator = Generator(z_dim, input_dim)
        self.discriminator = Discriminator(input_dim)
        self.z_dim = z_dim

    def compile(self, g_optimizer, d_optimizer, loss_function):
        super(TableGAN, self).compile()
        self.g_optimizer = g_optimizer
        self.d_optimizer = d_optimizer
        self.loss_function = loss_function

    def train_step(self, real_data):
        batch_size = tf.shape(real_data)[0]
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.z_dim))
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_data = self.generator(random_latent_vectors)
            real_output = self.discriminator(real_data)
            fake_output = self.discriminator(generated_data)
            gen_loss = self.loss_function(tf.ones_like(fake_output), fake_output)
            real_loss = self.loss_function(tf.ones_like(real_output), real_output)
            fake_loss = self.loss_function(tf.zeros_like(fake_output), fake_output)
            disc_loss = real_loss + fake_loss
        gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)
        self.g_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
        self.d_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))
        return {'gen_loss': gen_loss, 'disc_loss': disc_loss}

# Setup and initialization
input_dim = orig_df.shape[1]  # Number of features from the original DataFrame
z_dim = 100  # Latent dimension for the generator
table_gan = TableGAN(input_dim=input_dim, z_dim=z_dim)
table_gan.compile(
    g_optimizer=Adam(1e-4),
    d_optimizer=Adam(1e-4),
    loss_function=BinaryCrossentropy(from_logits=True)
)

# Example use case: simulate real data and train
real_data = np.random.normal(size=(10, input_dim))  # Simulate real data
table_gan.train_step(real_data)


  output, from_logits = _get_logits(


{'gen_loss': <tf.Tensor: shape=(), dtype=float32, numpy=0.9087286>,
 'disc_loss': <tf.Tensor: shape=(), dtype=float32, numpy=1.6692452>}

## Step 4: Data Loading and Preprocessing
Before training the model, it's necessary to load and preprocess your data. This step involves reading a CSV file, detecting numerical and categorical columns, and applying appropriate transformations such as scaling for numerical data and one-hot encoding for categorical data.


In [12]:
# Load and preprocess the data
data = orig_df  # Ensure orig_df is your DataFrame loaded previously

# Selecting numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

# Define preprocessing for numerical columns: impute missing values then scale
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical columns: impute missing values then encode
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Create a ColumnTransformer to apply the above transformations appropriately
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols)
    ])

# Apply transformations to your data
processed_data = preprocessor.fit_transform(data)






## Step 5: Initialize and Train the TableGAN Model
Finally, initialize the TableGAN model with dimensions derived from the preprocessed data, compile it with chosen optimizers and loss function, and train the model. Here, we specify the learning rates, the loss function, and the training parameters such as the number of epochs and batch size.


In [15]:
# Function to generate synthetic data
def generate_synthetic_data(model, num_samples, z_dim):
    random_latent_vectors = tf.random.normal(shape=(num_samples, z_dim))
    synthetic_data = model.generator(random_latent_vectors).numpy()
    return synthetic_data


In [20]:
# User input for the number of synthetic samples
num_samples = 1000  # Define number of synthetic examples you want

# Generate synthetic data
synthetic_data = generate_synthetic_data(model=table_gan, num_samples=num_samples, z_dim=100)

# Convert the synthetic data to a pandas DataFrame with original column names
synthetic_df = pd.DataFrame(synthetic_data, columns=original_df.columns)
print("Generated synthetic data shape:", synthetic_data.shape)
print("Synthetic DataFrame head:", synthetic_df.head())

# Display the DataFrame
print("Shape of synthetic DataFrame:", synthetic_df.shape)
print(synthetic_df.head())

SyntaxError: positional argument follows keyword argument (177836967.py, line 7)

## Step 5: Save to CSV
Save the synthetic data to a CSV file. This file can be used for further analysis, sharing, or as input to other analytical tools and systems.


In [None]:
# Write to CSV
synthetic_df.to_csv('synthetic_data.csv', index=False)

print("Synthetic data written to synthetic_data.csv successfully!")


Step 5 Load Datasets: Original and Synthetic

In [None]:
Orig_data = orig_df
synthetic_data = synthetic_df


print("Original Data Columns:", Orig_data.columns.tolist())
print("Synthetic Data Columns:", synthetic_data.columns.tolist())


In [None]:
##Define Evaluation Functions
# Evaluation metrics implementation

# Fidelity Metrics
# Fidelity Metrics
def evaluate_fidelity(real_data, synthetic_data, continuous_columns, categorical_columns):
    ks_results = {col: ks_2samp(real_data[col], synthetic_data[col]).statistic for col in continuous_columns}
    chi_squared_results = {col: chi2_contingency(pd.crosstab(real_data[col], synthetic_data[col]))[:2] for col in categorical_columns}
    return {'KS Test': ks_results, 'Chi-Squared Test': chi_squared_results}

# Utility Metrics
def evaluate_predictive_performance(real_data, synthetic_data, target_column, test_size=0.3, random_state=42):
    X_real = real_data.drop(columns=[target_column])
    y_real = real_data[target_column]
    X_synthetic = synthetic_data.drop(columns=[target_column])
    y_synthetic = synthetic_data[target_column]
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_synthetic, y_synthetic)
    predictions = model.predict(X_real)
    return {
        'Accuracy': accuracy_score(y_real, predictions),
        'ROC AUC': roc_auc_score(y_real, model.predict_proba(X_real)[:, 1]),
        'F1 Score': f1_score(y_real, predictions)
    }

def informativeness_test(real_data, synthetic_data, test_size=0.3, random_state=42):
    real_data['is_real'] = 1
    synthetic_data['is_real'] = 0
    combined_data = pd.concat([real_data, synthetic_data], ignore_index=True)
    X = combined_data.drop(columns=['is_real'])
    y = combined_data['is_real']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, predictions),
        'ROC AUC': roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1]),
        'F1 Score': f1_score(y_test, predictions)
    

In [None]:
# Identify columns to apply metrics
continuous_columns = original_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = original_df.select_dtypes(include=['object']).columns.tolist()


In [None]:
# Evaluate fidelity
fidelity_results = evaluate_fidelity(original_df, synthetic_df, continuous_columns, categorical_columns)
print("Fidelity Results:", fidelity_results)

# Assuming the target column is correctly named and exists in your DataFrame
predictive_performance_results = evaluate_predictive_performance(original_df, synthetic_df, 'target_column')
print("Predictive Performance Results:", predictive_performance_results)

# Informativeness test
informativeness_results = informativeness_test(original_df, synthetic_df)
print("Informativeness Results:", informativeness_results)


# Evaluate predictive performance
predictive_performance_results = evaluate_predictive_performance(real_data, synthetic_data, 'target_column')
print("Predictive Performance Results:", predictive_performance_results)
