In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.visualization import get_column_plot,set_plotly_config
import mlflow
from sdmetrics.single_table import BinaryAdaBoostClassifier, CategoricalCAP, CategoricalEnsemble, NumericalSVR
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

 

ImportError: cannot import name 'set_plotly_config' from 'sdmetrics.visualization' (C:\Users\20181588\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sdmetrics\visualization.py)

In [None]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("admissions")


In [None]:
admissions_df = pd.read_csv('../data/bar_pass_prediction.csv')
print(admissions_df.shape)
print(admissions_df.columns)
print(admissions_df.isnull().sum())
display(admissions_df.head())

In [None]:
admissions_sub = admissions_df[['ID','sex', 'race',
       'lsat', 'ugpa', 'DOB_yr', 'grad', 'bar1', 'bar1_yr',
       'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'age', 'gender', 'Dropout',
       'bar_passed','gpa', 'index6040', 'indxgrp',
       'indxgrp2' ]]


In [None]:
# Remove all rows with missing values
admissions_sub_cleaned = admissions_sub.dropna()
print(admissions_sub_cleaned.shape)
print(admissions_sub_cleaned.isnull().sum())

In [None]:
# Select only the categorical columns
categorical_columns = admissions_sub_cleaned.select_dtypes(include=['object'])

# Loop through each categorical column and print value counts
for column in categorical_columns.columns:
    print(f"Value counts for column '{column}':")
    print(admissions_sub_cleaned[column].value_counts())
    print()

In [None]:
f,ax = plt.subplots(figsize=(20, 20))
sns.heatmap(admissions_sub_cleaned.corr(numeric_only=True), annot=True, linewidths=.5, fmt= '.2f',ax=ax)
plt.show()

In [None]:
admissions_sub_cleaned.dtypes

In [None]:
# Define columns to change their data types to categorical
update_categorical_columns = ['sex', 'race', 'bar1_yr', 'bar2_yr', 'fulltime', 'fam_inc']

# Change data types of categorical columns
admissions_sub_cleaned[update_categorical_columns] = admissions_sub_cleaned[update_categorical_columns].astype('category')


In [None]:
admissions_sub_cleaned.dtypes

In [None]:
# Select numerical columns
numerical_columns = ['lsat', 'ugpa', 'DOB_yr', 'age', 'gpa', 'index6040']

# Select categorical or boolean columns
categorical_columns = ['sex', 'race', 'grad', 'bar1', 'bar1_yr', 'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'gender', 'Dropout', 'bar_passed', 'indxgrp', 'indxgrp2']

# Plot histograms for numerical columns
for column in numerical_columns:
    plt.figure(figsize=(8, 5))
    sns.histplot(admissions_sub_cleaned[column].dropna(), kde=True)
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

# Plot countplots for categorical or boolean columns
for column in categorical_columns:
    plt.figure(figsize=(8, 5))
    if column == 'bar_passed':  # Special handling for boolean column
        sns.countplot(x=column, data=admissions_sub_cleaned)
    else:
        sns.countplot(x=column, data=admissions_sub_cleaned, order=admissions_sub_cleaned[column].value_counts().index)
    plt.title(f'Countplot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.show()

In [None]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(admissions_sub_cleaned)

In [None]:
print('Auto detected data:\n')
metadata.visualize()

In [None]:
metadata.validate()
#metadata.save_to_json('admissions_metadata.json')

# Gaussian Copula

## Produce the synthetic data using a gaussian copula

In [None]:
synthesizer = GaussianCopulaSynthesizer(metadata,  default_distribution="gaussian_kde")
synthesizer.fit(admissions_sub_cleaned)

In [None]:
synthetic_data_GC = synthesizer.sample(num_rows=2000)
synthetic_data_GC.head()

## Evaluate the produced synthetic data 

In [None]:
diagnostic = run_diagnostic(
    real_data=admissions_sub_cleaned,
    synthetic_data=synthetic_data_GC,
    metadata=metadata
)

In [None]:
quality_report = evaluate_quality(
    admissions_sub_cleaned,
    synthetic_data_GC,
    metadata
)

In [None]:
quality_report.get_details('Column Shapes')

In [None]:
fig = get_column_plot(
    real_data=admissions_sub_cleaned,
    synthetic_data=synthetic_data_GC,
    column_name='bar_passed'
)

fig.show()

In [None]:
fig = get_column_plot(
    real_data=admissions_sub_cleaned,
    synthetic_data=synthetic_data_GC,
    column_name='fam_inc',
    plot_type = 'distplot'
)

fig.show()

In [None]:
fig = get_column_plot(
    real_data=admissions_sub_cleaned,
    synthetic_data=synthetic_data_GC,
    column_name='fulltime'
)

fig.show()

In [None]:
# Create subplots for side-by-side comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot correlation heatmap for the real dataset
sns.heatmap(admissions_sub_cleaned.corr(numeric_only=True), cmap='coolwarm', annot=True, fmt=".2f", ax=axes[0])
axes[0].set_title('Real Dataset Correlation Heatmap')

# Plot correlation heatmap for the synthetic dataset
sns.heatmap(synthetic_data_GC.corr(numeric_only=True), cmap='coolwarm', annot=True, fmt=".2f", ax=axes[1])
axes[1].set_title('Synthetic Dataset Correlation Heatmap')

plt.tight_layout()
plt.show()

In [None]:
BinaryAdaBoostClassifier.compute(admissions_sub_cleaned, synthetic_data_GC, target = 'bar_passed')

In [None]:
score = CategoricalCAP.compute(
    real_data=admissions_sub_cleaned,
    synthetic_data=synthetic_data_GC,
    key_fields=['age', 'fam_inc'],
    sensitive_fields=['indxgrp']
)
score

# SMOTE

In [None]:
admissions_sub_cleaned.columns = admissions_sub_cleaned.columns.astype(str)


# Detect categorical columns automatically
categorical_columns = admissions_sub_cleaned.select_dtypes(include=['object', 'category']).columns.tolist()



# Encode categorical columns using one-hot encoding
encoder = OneHotEncoder(drop='first', sparse=False)  # drop='first' to avoid multicollinearity
X_categorical_encoded = encoder.fit_transform(admissions_sub_cleaned[categorical_columns])
X_categorical_encoded_df = pd.DataFrame(X_categorical_encoded, columns=encoder.get_feature_names_out())

# Select numerical columns
numerical_columns = admissions_sub_cleaned.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Concatenate encoded categorical columns and numerical columns
X = pd.concat([admissions_sub_cleaned[numerical_columns].reset_index(), X_categorical_encoded_df.reset_index()], axis=1)
X.columns = X.columns.astype(str)

# Target variable
y = admissions_sub_cleaned['bar_passed']

# Apply SMOTE to generate synthetic data
smote = SMOTE(sampling_strategy = 'all')
X_synthetic, y_synthetic = smote.fit_resample(X, y)

# Concatenate synthetic features and target variable into a DataFrame
synthetic_data_SMOTE = pd.concat([pd.DataFrame(X_synthetic, columns=X.columns).reset_index(), pd.Series(y_synthetic, name='bar_passed').reset_index()], axis=1)
synthetic_data_SMOTE = synthetic_data_SMOTE.loc[:,~synthetic_data_SMOTE.columns.duplicated()].copy()
real_data_SMOTE = pd.concat([X.reset_index(), pd.Series(y, name='bar_passed').reset_index()], axis=1)
real_data_SMOTE = real_data_SMOTE.loc[:,~real_data_SMOTE.columns.duplicated()].copy()




In [None]:
print(X.shape)
print(y.shape)

In [None]:
print(real_data_SMOTE.shape)
display(real_data_SMOTE.head())

In [None]:
print(synthetic_data_SMOTE.shape)
display(synthetic_data_SMOTE.head())

In [None]:
admissions_sub_cleaned.columns.values

In [None]:
real_data_SMOTE.head()

In [None]:
metadata_SMOTE = SingleTableMetadata()
metadata_SMOTE.detect_from_dataframe(real_data_SMOTE)

In [None]:
diagnostic_SMOTE = run_diagnostic(
    real_data=real_data_SMOTE,
    synthetic_data=synthetic_data_SMOTE, 
    metadata = metadata_SMOTE
)

In [None]:
quality_report_SMOTE = evaluate_quality(
    real_data_SMOTE,
    synthetic_data_SMOTE,
     metadata_SMOTE
)

In [None]:
quality_report_SMOTE.get_details('Column Shapes')

In [None]:
fig = get_column_plot(
    real_data=real_data_SMOTE,
    synthetic_data=synthetic_data_SMOTE,
    column_name='bar_passed'
)

fig.show()

In [None]:
fig = get_column_plot(
    real_data=real_data_SMOTE,
    synthetic_data=synthetic_data_SMOTE,
    column_name='ugpa'
)

fig.show()