In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.visualization import get_column_plot,set_plotly_config
import mlflow
from sdmetrics.single_table import BinaryAdaBoostClassifier, CategoricalCAP, CategoricalEnsemble, NumericalSVR
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the dataset from a CSV file (the path is relative to the working directory)
dataset_path = 'Dataset/bar_pass_prediction.csv'
original_data = pd.read_csv(dataset_path)
# Make a copy of the dataset so original_data stays as loaded while data_full is modified later
data_full = original_data.copy()

TO DISCUSS: WE SHOULD AGREE TOGETHER ON WHICH COLUMNS WE WANT TO KEEP AND DO ALL FURTHER SDG MODELING WITH THE SAME COLUMNS

In [3]:
# List of columns to keep
columns_to_keep = [
    'pass_bar',  'ugpa', 'decile1', 'decile1b',
    'decile3', 'lsat', 'grad', 'fulltime', 'fam_inc',
    'tier', 'race1', 'gender'
]

# Convert 'grad' to binary where 'Y' is 1 and anything else is 0.
# The comparison reads from the untouched original_data rather than from
# data_full itself: once data_full['grad'] holds 0/1, comparing it with 'Y'
# again would turn every value into 0, so re-running this cell used to
# corrupt the column.
data_full['grad'] = np.where(original_data['grad'] == 'Y', 1, 0)

# .copy() makes df an independent frame, so later assignments to df cannot
# trigger SettingWithCopyWarning or be confused with writes to data_full.
df = data_full[columns_to_keep].copy()

In [4]:
# Overview of the selected frame: size, column names and missing values per column
print(df.shape, df.columns, df.isnull().sum(), sep="\n")
display(df.head())

(22407, 12)
Index(['pass_bar', 'ugpa', 'decile1', 'decile1b', 'decile3', 'lsat', 'grad',
       'fulltime', 'fam_inc', 'tier', 'race1', 'gender'],
      dtype='object')
pass_bar       0
ugpa           0
decile1     1092
decile1b    1604
decile3     1604
lsat           0
grad           0
fulltime      34
fam_inc      289
tier          96
race1         16
gender         5
dtype: int64


Unnamed: 0,pass_bar,ugpa,decile1,decile1b,decile3,lsat,grad,fulltime,fam_inc,tier,race1,gender
0,1,3.5,10.0,10.0,10.0,44.0,1,1.0,5.0,4.0,white,female
1,1,3.5,5.0,5.0,4.0,29.0,1,1.0,4.0,2.0,white,female
2,1,3.5,3.0,3.0,2.0,36.0,1,1.0,1.0,3.0,white,male
3,1,3.5,7.0,7.0,4.0,39.0,1,1.0,4.0,3.0,white,male
4,1,3.5,9.0,9.0,8.0,48.0,1,1.0,4.0,5.0,white,male


In [5]:
# Drop every row that contains at least one missing value and report how many were removed
initial_row_count = len(df)
df = df.dropna()
removed_rows = initial_row_count - len(df)
print(f"Removed {removed_rows} rows with missing values.")

# Confirm the cleaned frame: size, columns, remaining missing values, first rows
print(df.shape, df.columns, df.isnull().sum(), sep="\n")
display(df.head())

Removed 1895 rows with missing values.
(20512, 12)
Index(['pass_bar', 'ugpa', 'decile1', 'decile1b', 'decile3', 'lsat', 'grad',
       'fulltime', 'fam_inc', 'tier', 'race1', 'gender'],
      dtype='object')
pass_bar    0
ugpa        0
decile1     0
decile1b    0
decile3     0
lsat        0
grad        0
fulltime    0
fam_inc     0
tier        0
race1       0
gender      0
dtype: int64


Unnamed: 0,pass_bar,ugpa,decile1,decile1b,decile3,lsat,grad,fulltime,fam_inc,tier,race1,gender
0,1,3.5,10.0,10.0,10.0,44.0,1,1.0,5.0,4.0,white,female
1,1,3.5,5.0,5.0,4.0,29.0,1,1.0,4.0,2.0,white,female
2,1,3.5,3.0,3.0,2.0,36.0,1,1.0,1.0,3.0,white,male
3,1,3.5,7.0,7.0,4.0,39.0,1,1.0,4.0,3.0,white,male
4,1,3.5,9.0,9.0,8.0,48.0,1,1.0,4.0,5.0,white,male


In [6]:
# Check the dtype of every column (numeric vs. object) before deciding how to treat each one
print(df.dtypes)


pass_bar      int64
ugpa        float64
decile1     float64
decile1b    float64
decile3     float64
lsat        float64
grad          int32
fulltime    float64
fam_inc     float64
tier        float64
race1        object
gender       object
dtype: object


In [7]:
# Which columns are categorical? Select the object-dtype columns.
# Note: categorical_columns is a DataFrame here (not a list of names), hence the .columns below.
categorical_columns = df.select_dtypes(include=['object'])
print(categorical_columns.columns)

Index(['race1', 'gender'], dtype='object')


In [8]:
# For now the ordinal categorical variables are treated as numerical
numerical_columns = [
    'pass_bar',
    'ugpa',
    'decile1',
    'decile1b',
    'decile3',
    'lsat',
    'grad',
    'fulltime',
    'fam_inc',
    'tier',
]

REMARK: Columns: fulltime, fam_inc, tier, pass_bar, grad are ordinal categorical variables

TO DISCUSS: Is it acceptable to treat them as numerical columns in general? Do we maybe want to one-hot encode them? Certain SDG models might deal better with ordinal categorical variables than others. Which test would be appropriate to assess the quality of synthetic data, bearing in mind we have a few categorical, many ordinal categorical and a few numerical columns?

Be aware that including ordinal categorical variables in a correlation heatmap, like the one you're generating with Seaborn's heatmap function, can be appropriate if the ordinal variables are encoded numerically in a way that reflects their order. This is because correlation calculations require numerical input, and ordinal variables can be thought of as numeric in terms of their ranking or ordering.

Alternative Statistics: If you're concerned about the limitations of using Pearson's correlation with ordinal data, consider using alternative statistics that are designed for ordinal data, such as Spearman's rank correlation coefficient, which is based on the ranks of the data rather than their raw values and can be a better choice for ordinal data. Which other tests???

In [9]:
# This modified function creates synthetic bootstrap samples from the original dataset, 
# including both numerical and categorical columns.

def create_bootstrap_samples(df, sample_size=None, n_samples=1000, random_state=None):
    """
    Creates synthetic bootstrap samples from the original dataset, including both
    numerical and categorical columns.

    Whole rows are drawn with replacement, so every value combination in the
    result also occurs in ``df``.

    Args:
        df: A pandas DataFrame containing the data.
        sample_size: The desired size of each bootstrap sample (default: original data size).
        n_samples: The number of bootstrap samples to generate (default: 1000).
        random_state: Optional seed (int) or ``numpy.random.Generator`` used for all
            draws. ``None`` (default) gives a different result on every call.

    Returns:
        A pandas DataFrame containing all bootstrap samples concatenated together,
        with a fresh 0..N-1 index. If ``n_samples`` is 0 the result is an empty
        frame with the columns of ``df``.
    """
    # If sample_size is not provided, use the length of the DataFrame
    if sample_size is None:
        sample_size = len(df)

    # One generator is shared by all draws. Passing the same integer seed to
    # every df.sample call would produce n_samples identical samples.
    rng = np.random.default_rng(random_state)

    # Collect the samples first and concatenate once. Concatenating inside the
    # loop copies the growing frame on every iteration (quadratic cost).
    samples = [
        df.sample(n=sample_size, replace=True, random_state=rng)
        for _ in range(n_samples)
    ]

    # pd.concat raises on an empty list, so handle "no samples" explicitly
    if not samples:
        return df.iloc[0:0].reset_index(drop=True)

    # ignore_index avoids duplicate indices from concatenation
    return pd.concat(samples, ignore_index=True)




In [10]:
# Build the bootstrapped dataset with the function defaults: 1000 samples, each the size of df
bootstrapped_df = create_bootstrap_samples(df) #runtime 7-8min for n_samples = 1000

In [11]:
# Sanity-check the concatenated bootstrap frame: size, columns, missing values, first rows
print(bootstrapped_df.shape, bootstrapped_df.columns, bootstrapped_df.isnull().sum(), sep="\n")
bootstrapped_df.head()

(20512000, 12)
Index(['pass_bar', 'ugpa', 'decile1', 'decile1b', 'decile3', 'lsat', 'grad',
       'fulltime', 'fam_inc', 'tier', 'race1', 'gender'],
      dtype='object')
pass_bar    0
ugpa        0
decile1     0
decile1b    0
decile3     0
lsat        0
grad        0
fulltime    0
fam_inc     0
tier        0
race1       0
gender      0
dtype: int64


Unnamed: 0,pass_bar,ugpa,decile1,decile1b,decile3,lsat,grad,fulltime,fam_inc,tier,race1,gender
0,1,3.8,7.0,7.0,7.0,43.0,1,1.0,4.0,6.0,white,female
1,1,3.2,6.0,5.0,7.0,33.0,1,1.0,4.0,3.0,white,female
2,1,3.5,3.0,2.0,4.0,38.0,1,1.0,3.0,4.0,white,male
3,1,2.9,10.0,10.0,10.0,41.0,1,1.0,4.0,5.0,white,male
4,1,2.5,6.0,6.0,4.0,40.0,1,2.0,3.0,3.0,white,male


In [12]:
# Same checks on the cleaned original frame, for side-by-side comparison with bootstrapped_df
print(df.shape, df.columns, df.isnull().sum(), sep="\n")
df.head()

(20512, 12)
Index(['pass_bar', 'ugpa', 'decile1', 'decile1b', 'decile3', 'lsat', 'grad',
       'fulltime', 'fam_inc', 'tier', 'race1', 'gender'],
      dtype='object')
pass_bar    0
ugpa        0
decile1     0
decile1b    0
decile3     0
lsat        0
grad        0
fulltime    0
fam_inc     0
tier        0
race1       0
gender      0
dtype: int64


Unnamed: 0,pass_bar,ugpa,decile1,decile1b,decile3,lsat,grad,fulltime,fam_inc,tier,race1,gender
0,1,3.5,10.0,10.0,10.0,44.0,1,1.0,5.0,4.0,white,female
1,1,3.5,5.0,5.0,4.0,29.0,1,1.0,4.0,2.0,white,female
2,1,3.5,3.0,3.0,2.0,36.0,1,1.0,1.0,3.0,white,male
3,1,3.5,7.0,7.0,4.0,39.0,1,1.0,4.0,3.0,white,male
4,1,3.5,9.0,9.0,8.0,48.0,1,1.0,4.0,5.0,white,male


TO DISCUSS: WHICH TESTS DO WE RUN TO COMPARE QUALITY OF BOOTSTRAPPED SYNTHETIC DATA WITH GAUSSIAN COPULA SYNTHETIC DATA? SHALL WE RESEARCH THIS FOR NEXT WEEK?

In [14]:
from scipy.stats import ks_2samp, chi2_contingency

def evaluate_bootstrapped_data(original_data, bootstrapped_data, numerical_columns, categorical_columns):
    """
    Evaluates the bootstrapped data by comparing it to the original data,
    including both numerical and categorical columns.

    For every numerical column the mean, the standard deviation and a
    two-sample Kolmogorov-Smirnov test are reported. For every categorical
    column the category proportions and a Chi-squared test on the category
    counts are reported. Finally the Pearson correlation matrices of the
    numerical columns are compared.

    Args:
        original_data: A pandas DataFrame containing the original data.
        bootstrapped_data: A pandas DataFrame containing the bootstrapped data.
        numerical_columns: A list of names of numerical columns to evaluate.
        categorical_columns: A list of names of categorical columns to evaluate.

    Returns:
        A dictionary with three keys:
            'numerical': column -> dict with 'original_mean', 'bootstrapped_mean',
                'original_std', 'bootstrapped_std', 'ks_statistic', 'ks_pvalue'.
            'categorical': column -> dict with 'original_proportions',
                'bootstrapped_proportions', 'chi2_statistic', 'chi2_pvalue'.
            'correlation_difference': mean absolute difference between the two
                correlation matrices, taken over the entries that are defined
                in both (NaN if there are none).
    """
    evaluation_results = {'numerical': {}, 'categorical': {}}

    # Evaluate numerical columns
    for column in numerical_columns:
        original_column_data = original_data[column]
        bootstrapped_column_data = bootstrapped_data[column]

        # Perform a Kolmogorov-Smirnov test
        ks_statistic, ks_pvalue = ks_2samp(original_column_data, bootstrapped_column_data)

        evaluation_results['numerical'][column] = {
            'original_mean': original_column_data.mean(),
            'bootstrapped_mean': bootstrapped_column_data.mean(),
            'original_std': original_column_data.std(),
            'bootstrapped_std': bootstrapped_column_data.std(),
            'ks_statistic': ks_statistic,
            'ks_pvalue': ks_pvalue
        }

    # Evaluate categorical columns
    for column in categorical_columns:
        # Raw counts are needed for the test; proportions are derived from them
        original_counts = original_data[column].value_counts()
        bootstrapped_counts = bootstrapped_data[column].value_counts()

        # The Chi-squared test works on observed frequencies. Feeding it
        # normalised proportions (each column summing to 1) discards the sample
        # sizes and yields a near-zero statistic with p ~ 1 regardless of how
        # different the distributions are.
        contingency_table = pd.concat(
            [original_counts, bootstrapped_counts],
            axis=1,
            keys=['original', 'bootstrapped'],
        ).fillna(0)  # a category missing from one dataset has count 0 there
        chi2_statistic, chi2_pvalue, _, _ = chi2_contingency(contingency_table.to_numpy())

        evaluation_results['categorical'][column] = {
            'original_proportions': (original_counts / original_counts.sum()).to_dict(),
            'bootstrapped_proportions': (bootstrapped_counts / bootstrapped_counts.sum()).to_dict(),
            'chi2_statistic': chi2_statistic,
            'chi2_pvalue': chi2_pvalue
        }

    # Evaluate correlations
    original_corr = original_data[numerical_columns].corr()
    bootstrapped_corr = bootstrapped_data[numerical_columns].corr()
    abs_diff = (bootstrapped_corr - original_corr).abs().to_numpy()

    # A constant column has undefined (NaN) correlations. Average only over the
    # defined entries; dividing the NaN-skipping sum by n**2 would understate
    # the difference.
    defined = ~np.isnan(abs_diff)
    corr_diff = abs_diff[defined].mean() if defined.any() else float('nan')

    evaluation_results['correlation_difference'] = corr_diff

    return evaluation_results


In [15]:
# Columns used for the evaluation. Unlike the earlier numerical list, 'pass_bar' is not included here.
numerical_columns = [
    'ugpa', 'decile1', 'decile1b', 'decile3', 'lsat',
    'grad', 'fulltime', 'fam_inc', 'tier',
]
categorical_columns = ['race1', 'gender']

# Compare the cleaned original frame (df) with the bootstrapped frame
evaluation_results = evaluate_bootstrapped_data(
    original_data=df,
    bootstrapped_data=bootstrapped_df,
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
)

In [16]:
def print_evaluation_results(evaluation_results):
    """
    Writes a human-readable report of the evaluation metrics to stdout.

    Args:
        evaluation_results: A dictionary with 'numerical' and 'categorical'
            sub-dictionaries (column name -> metrics) and a
            'correlation_difference' value.
    """

    def _numeric_section(column, metrics):
        # Means and standard deviations use two decimals, KS figures use four
        print(f"--- {column} ---")
        print(f"Original Mean: {metrics['original_mean']:.2f}")
        print(f"Bootstrapped Mean: {metrics['bootstrapped_mean']:.2f}")
        print(f"Original Std Dev: {metrics['original_std']:.2f}")
        print(f"Bootstrapped Std Dev: {metrics['bootstrapped_std']:.2f}")
        print(f"KS Statistic: {metrics['ks_statistic']:.4f}")
        print(f"KS P-Value: {metrics['ks_pvalue']:.4f}")
        print("")

    def _categorical_section(column, metrics):
        # Proportion dictionaries are printed as-is, test figures with four decimals
        print(f"--- {column} ---")
        print(f"Original Proportions: {metrics['original_proportions']}")
        print(f"Bootstrapped Proportions: {metrics['bootstrapped_proportions']}")
        print(f"Chi2 Statistic: {metrics['chi2_statistic']:.4f}")
        print(f"Chi2 P-Value: {metrics['chi2_pvalue']:.4f}")
        print("")

    # Section 1: numerical columns
    print("Numerical Columns Evaluation:\n")
    for column_name, column_metrics in evaluation_results['numerical'].items():
        _numeric_section(column_name, column_metrics)

    # Section 2: categorical columns
    print("Categorical Columns Evaluation:\n")
    for column_name, column_metrics in evaluation_results['categorical'].items():
        _categorical_section(column_name, column_metrics)

    # Section 3: correlation structure
    print("Correlation Structure Evaluation:")
    print(f"Total Absolute Difference in Correlation: {evaluation_results['correlation_difference']:.4f}")
    print("")

# Print the report for the evaluation_results dictionary computed earlier
print_evaluation_results(evaluation_results)


Numerical Columns Evaluation:

--- ugpa ---
Original Mean: 3.21
Bootstrapped Mean: 3.21
Original Std Dev: 0.40
Bootstrapped Std Dev: 0.40
KS Statistic: 0.0001
KS P-Value: 1.0000

--- decile1 ---
Original Mean: 5.72
Bootstrapped Mean: 5.72
Original Std Dev: 2.78
Bootstrapped Std Dev: 2.78
KS Statistic: 0.0001
KS P-Value: 1.0000

--- decile1b ---
Original Mean: 5.53
Bootstrapped Mean: 5.53
Original Std Dev: 2.85
Bootstrapped Std Dev: 2.85
KS Statistic: 0.0001
KS P-Value: 1.0000

--- decile3 ---
Original Mean: 5.53
Bootstrapped Mean: 5.53
Original Std Dev: 2.85
Bootstrapped Std Dev: 2.85
KS Statistic: 0.0001
KS P-Value: 1.0000

--- lsat ---
Original Mean: 36.71
Bootstrapped Mean: 36.71
Original Std Dev: 5.38
Bootstrapped Std Dev: 5.38
KS Statistic: 0.0001
KS P-Value: 1.0000

--- grad ---
Original Mean: 1.00
Bootstrapped Mean: 1.00
Original Std Dev: 0.00
Bootstrapped Std Dev: 0.00
KS Statistic: 0.0000
KS P-Value: 1.0000

--- fulltime ---
Original Mean: 1.07
Bootstrapped Mean: 1.07
Original

In [None]:
# (Stray scratch input removed: the bare name raised NameError and stopped "Run All" at this cell.)

In [None]:
def plot_data_and_bootstrapped(df, bootstrapped_data, numerical_columns):
    """
    Plots the distribution of bootstrapped data with the original mean for comparison.

    One subplot is drawn per column. Each shows a histogram (with KDE curve) of
    the bootstrapped values, a shaded band between the 2.5% and 97.5% quantiles
    of those values, and a vertical line at the mean of the original column.
    The band is a percentile interval of the bootstrapped values themselves,
    not an interval for the mean.

    Args:
        df: A pandas DataFrame containing the original data.
        bootstrapped_data: A pandas DataFrame containing the bootstrapped data.
        numerical_columns: A list of names of numerical columns to plot.
            Nothing is drawn if the list is empty.

    Returns:
        None. The figure is shown with plt.show().
    """
    num_plots = len(numerical_columns)
    # plt.subplots cannot create a grid with zero rows
    if num_plots == 0:
        return

    # squeeze=False always returns a 2-D array of Axes. Without it a single
    # column yields a bare Axes object and indexing it fails.
    fig, axes = plt.subplots(num_plots, 1, figsize=(10, 5 * num_plots), squeeze=False)

    for ax, column in zip(axes[:, 0], numerical_columns):
        # Bounds of the central 95% of the bootstrapped values
        ci_lower_boot = bootstrapped_data[column].quantile(0.025)
        ci_higher_boot = bootstrapped_data[column].quantile(0.975)

        # Calculate the original mean
        original_mean = df[column].mean()

        # Plot the bootstrapped data distribution
        sns.histplot(bootstrapped_data[column], kde=True, bins=30, ax=ax, color='blue', label='Bootstrapped Distribution')

        # Shade the interval area
        ax.axvspan(ci_lower_boot, ci_higher_boot, alpha=0.2, color='red', label='95% CI')

        # Add a line for the original mean
        ax.axvline(x=original_mean, color='green', lw=2, label='Original Mean')

        # Add titles and labels
        ax.set_title(f'Comparison of Bootstrapped and Original Mean for {column}')
        ax.set_xlabel('Value')
        # histplot is called with its default statistic, which is a count per
        # bin, so the axis is labelled accordingly (it previously said 'Density')
        ax.set_ylabel('Count')

        # Add the legend
        ax.legend()

    plt.tight_layout()
    plt.show()