In [3]:
import os

# Set environment variables to control OpenMP and MKL.
# os.environ only changes the environment of the *current* kernel process:
# the values are lost on a kernel restart, and they can only influence
# libraries that are imported after this cell has run. Run this cell first
# in every fresh kernel, before the import cell.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['KMP_INIT_AT_FORK'] = 'FALSE'
os.environ['MKL_THREADING_LAYER'] = 'GNU'

# Confirm the settings and give the reader the correct next step
# (a restart would discard what was just set).
print("Environment variables set for this kernel session. Run the remaining cells now; "
      "after any kernel restart, re-run this cell before the imports.")


Environment variables set. Please restart the Jupyter kernel now.


In [1]:
import pandas as pd
import numpy as np
from ctgan import CTGAN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import ks_2samp, chi2_contingency
import time

# Read the source table from disk
original_df = pd.read_csv('com_salary.csv')  # Update with the actual path to your dataset

# Split the columns by dtype: 64-bit numeric columns on one side,
# object / category columns on the other.
_numeric_view = original_df.select_dtypes(include=['int64', 'float64'])
_categorical_view = original_df.select_dtypes(include=['object', 'category'])
numeric_features = _numeric_view.columns.tolist()
categorical_features = _categorical_view.columns.tolist()

# Show both lists so the split can be checked by eye
print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Numeric features: ['OttoneuID', 'FG MajorLeagueID', 'Avg Salary', 'Median Salary', 'Min Salary', 'Max Salary', 'Last 10', 'Roster%', 'ADP', 'rPTS', 'PTS', 'aPOS', 'Dollars', 'Adjusted', 'Cost', 'value']
Categorical features: ['Name', 'FG MinorLeagueID', 'MLB Org', 'Position(s)', 'Team', 'POS', 'PlayerId']


In [2]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit + transform, timing only the fit_transform call itself.
start_time = time.time()
prepared_data = preprocessor.fit_transform(original_df)
end_time = time.time()
preprocessing_time = end_time - start_time

# A result exposing `toarray` is treated as sparse and densified.
_densify = getattr(prepared_data, "toarray", None)
if _densify is not None:
    prepared_data = _densify()

print(f"Shape of prepared_data: {prepared_data.shape}")

# Output column labels: the numeric names as they are, followed by the
# names the fitted one-hot encoder reports for the categorical columns.
numeric_column_names = numeric_features
fitted_encoder = preprocessor.named_transformers_['cat']['encoder']
categorical_column_names = fitted_encoder.get_feature_names_out(categorical_features)
all_column_names = [*numeric_column_names, *categorical_column_names]

print(f"Number of columns in prepared_data: {prepared_data.shape[1]}")
print(f"Number of column names: {len(all_column_names)}")

# Refuse to build the frame if labels and data disagree in width.
if prepared_data.shape[1] != len(all_column_names):
    raise ValueError("Mismatch between prepared data columns and column names.")
prepared_df = pd.DataFrame(prepared_data, columns=all_column_names)

print(f"Data preprocessing completed in {preprocessing_time:.2f} seconds.")


Shape of prepared_data: (909, 2901)
Number of columns in prepared_data: 2901
Number of column names: 2901
Data preprocessing completed in 0.11 seconds.


In [None]:
# Build the CTGAN model
epochs = 2  # Adjust the number of epochs as needed
ctgan = CTGAN(epochs=epochs)

# The one-hot encoded column names are handed to CTGAN as its discrete columns.
discrete_columns = list(categorical_column_names)

# Fit on the prepared table and record how long training took.
start_time = time.time()
ctgan.fit(prepared_df, discrete_columns)
end_time = time.time()
training_time = end_time - start_time

# Function to generate synthetic data and measure time taken
def generate_synthetic_data(ctgan_model, num_samples, columns):
    """Sample rows from a fitted model and time the sampling call.

    Parameters
    ----------
    ctgan_model : object
        Any object exposing ``sample(num_samples)``.
    num_samples : int
        Number of rows requested from the model.
    columns : list
        Column labels for the returned frame. They are passed to the
        ``pd.DataFrame`` constructor together with the sampled data.

    Returns
    -------
    tuple
        ``(synthetic_df, elapsed_time)`` where ``elapsed_time`` is the
        duration of the ``sample`` call in seconds (float).
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    started = time.perf_counter()
    synthetic_data = ctgan_model.sample(num_samples)
    elapsed_time = time.perf_counter() - started

    # Convert synthetic data to a pandas DataFrame with appropriate column names
    synthetic_df = pd.DataFrame(synthetic_data, columns=columns)

    return synthetic_df, elapsed_time

# Size of the synthetic table: every original row count plus a surplus
additional_rows = 1000  # Example: add 1000 more rows
num_samples = additional_rows + len(original_df)

# Destination of the CSV export
output_file = 'synthetic_data_ctgan.csv'

# Sample from the trained model, labelling the result with the prepared column names
synthetic_df, generation_time = generate_synthetic_data(
    ctgan_model=ctgan,
    num_samples=num_samples,
    columns=all_column_names,
)

# Persist the synthetic rows without the index column
synthetic_df.to_csv(output_file, index=False)

# Report what was written and how long sampling / training took
for status_line in (
    f"Synthetic data written to '{output_file}' successfully with {num_samples} samples.",
    f"Synthetic data generation completed in {generation_time:.2f} seconds.",
    f"CTGAN model training completed in {training_time:.2f} seconds.",
):
    print(status_line)

# Quick look at the result
print("Generated synthetic data shape:", synthetic_df.shape)
print("Sample of synthetic data:")
print(synthetic_df.head())


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux

In [None]:
def evaluate_fidelity(real_data, synthetic_data, continuous_columns, categorical_columns):
    """Compare per-column distributions of a real and a synthetic table.

    Parameters
    ----------
    real_data, synthetic_data : pd.DataFrame
        Tables to compare; both must contain every listed column.
        They may have different numbers of rows.
    continuous_columns : iterable
        Columns compared with the two-sample Kolmogorov-Smirnov test.
    categorical_columns : iterable
        Columns compared with a chi-squared test on category counts.

    Returns
    -------
    dict
        ``{'KS Test': {col: statistic},
           'Chi-Squared Test': {col: (chi2, p_value)}}``
    """
    # KS statistic per continuous column. Missing values are dropped first so
    # they cannot turn the statistic into NaN.
    ks_results = {
        col: ks_2samp(real_data[col].dropna(), synthetic_data[col].dropna()).statistic
        for col in continuous_columns
    }

    chi_squared_results = {}
    for col in categorical_columns:
        # One row per category, one column per source. A category seen in only
        # one of the two tables gets a count of 0 in the other.
        counts = pd.concat(
            [real_data[col].value_counts(), synthetic_data[col].value_counts()],
            axis=1,
            keys=['real', 'synthetic'],
        ).fillna(0)
        # Categories with no observations at all (possible with the 'category'
        # dtype) would give zero expected frequencies, which the test rejects.
        counts = counts[counts.sum(axis=1) > 0]
        # Test the (source x category) table. Cross-tabulating the two columns
        # against each other would instead pair rows by index position.
        chi2, p_value = chi2_contingency(counts.T.to_numpy())[:2]
        chi_squared_results[col] = (chi2, p_value)

    return {'KS Test': ks_results, 'Chi-Squared Test': chi_squared_results}

def evaluate_predictive_performance(real_data, synthetic_data, target_column, test_size=0.3, random_state=42):
    """Fit a random forest on the synthetic rows and score it on the real rows.

    Parameters
    ----------
    real_data, synthetic_data : pd.DataFrame
        Tables that both contain ``target_column``.
    target_column : str
        Label column; every other column is used as a feature.
    test_size : float
        Accepted but not used by this function.
    random_state : int
        Seed forwarded to the classifier.

    Returns
    -------
    dict
        Scores under the keys 'Accuracy', 'ROC AUC' and 'F1 Score'.
    """
    # Separate features from labels in each table.
    label_selector = [target_column]
    X_real = real_data.drop(columns=label_selector)
    y_real = real_data[target_column]
    X_synthetic = synthetic_data.drop(columns=label_selector)
    y_synthetic = synthetic_data[target_column]

    # Train on synthetic data only.
    forest = RandomForestClassifier(n_estimators=100, random_state=random_state)
    forest.fit(X_synthetic, y_synthetic)

    # Score against the real data.
    predicted_labels = forest.predict(X_real)
    scores = {}
    scores['Accuracy'] = accuracy_score(y_real, predicted_labels)
    # Column 1 of predict_proba is used as the score - presumably the positive
    # class of a binary target; TODO confirm for the chosen target column.
    scores['ROC AUC'] = roc_auc_score(y_real, forest.predict_proba(X_real)[:, 1])
    scores['F1 Score'] = f1_score(y_real, predicted_labels)
    return scores

def informativeness_test(real_data, synthetic_data, test_size=0.3, random_state=42):
    """Train a classifier to tell real rows from synthetic rows.

    Rows of ``real_data`` are labelled 1 and rows of ``synthetic_data`` are
    labelled 0. The labelled rows are pooled, split into train and test
    parts, and a random forest is scored on the held-out part.

    Parameters
    ----------
    real_data, synthetic_data : pd.DataFrame
        Tables to pool. Neither frame is modified.
    test_size : float
        Fraction of the pooled rows held out for scoring.
    random_state : int
        Seed for both the split and the classifier.

    Returns
    -------
    dict
        Scores under the keys 'Accuracy', 'ROC AUC' and 'F1 Score'.
    """
    # assign() returns new frames, so the label column never appears on the
    # caller's DataFrames (writing real_data['is_real'] = 1 would add it there).
    labelled_real = real_data.assign(is_real=1)
    labelled_synthetic = synthetic_data.assign(is_real=0)

    combined_data = pd.concat([labelled_real, labelled_synthetic], ignore_index=True)
    X = combined_data.drop(columns=['is_real'])
    y = combined_data['is_real']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, predictions),
        # Column 1 of predict_proba corresponds to the larger label, i.e. is_real == 1.
        'ROC AUC': roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1]),
        'F1 Score': f1_score(y_test, predictions)
    }


In [None]:
def evaluate_synthetic_data(original_df, synthetic_df, target_column):
    """Run the fidelity, predictive-performance and informativeness checks and print each result.

    Parameters
    ----------
    original_df : pd.DataFrame
        Reference table. Its dtypes decide which columns are treated as
        continuous (int64 / float64) or categorical (object / category).
    synthetic_df : pd.DataFrame
        Synthetic table to evaluate against ``original_df``.
    target_column : str
        Label column for the predictive-performance check. That check is
        skipped with a message when the column is not in both tables.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Identify columns to apply metrics
    continuous_columns = original_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = original_df.select_dtypes(include=['object', 'category']).columns.tolist()

    # A column can only be compared if the synthetic table has it too.
    # Report and skip the rest instead of failing with a KeyError.
    missing_columns = [
        col for col in continuous_columns + categorical_columns
        if col not in synthetic_df.columns
    ]
    if missing_columns:
        print(f"Warning: {len(missing_columns)} column(s) not present in the synthetic data "
              f"are skipped in the fidelity check (first few: {missing_columns[:10]}).")
        continuous_columns = [col for col in continuous_columns if col in synthetic_df.columns]
        categorical_columns = [col for col in categorical_columns if col in synthetic_df.columns]

    # Evaluate fidelity
    fidelity_results = evaluate_fidelity(original_df, synthetic_df, continuous_columns, categorical_columns)
    print("Fidelity Results:", fidelity_results)

    # Evaluate predictive performance (the target must exist in both tables)
    if target_column in original_df.columns and target_column in synthetic_df.columns:
        predictive_performance_results = evaluate_predictive_performance(original_df, synthetic_df, target_column)
        print("Predictive Performance Results:", predictive_performance_results)
    else:
        print(f"Error: Target column '{target_column}' does not exist in the DataFrame.")

    # Evaluate informativeness on copies, so that nothing the test adds to its
    # inputs can leak into the caller's frames.
    informativeness_results = informativeness_test(original_df.copy(), synthetic_df.copy())
    print("Informativeness Results:", informativeness_results)

# Specify your target column
# It has to be a column of the prepared (scaled / one-hot encoded) table,
# because that is the schema the synthetic data is generated in.
target_column = 'your_target_column'  # Replace with your actual target column name

# Compare like with like: CTGAN was fitted on prepared_df and synthetic_df is
# labelled with all_column_names, so the reference table must be prepared_df.
# The raw original_df has different columns (e.g. the un-encoded text columns)
# and un-scaled numeric values.
evaluate_synthetic_data(prepared_df, synthetic_df, target_column)
