# ⭐️ Synthetic Data Experiment for Raw and Manual Bias Correction Modelling

Import all the required packages.

In [1]:
import sys

# Make the project-local helper packages importable from this notebook.
sys.path.extend(
    [
        "./preprocessing_utils",
        "./feature_selection_utils",
        "./visual_utils",
        "./experiment_utils",
    ]
)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import experiments_utils
import feature_selection
import preprocessing

# Notebook-wide pandas configuration.
# Silences the chained-assignment (SettingWithCopy) warning for every later cell.
pd.set_option("mode.chained_assignment", None)
# Lift the display limits so frames are rendered without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## 📂 Prepare Datasets

The synthetic data is generated in [endometriosis_synthetic_data_generation.ipynb](https://colab.research.google.com/drive/1DVFFCmSvpkftDJpjBzm3cgwbuZAqhNPc#scrollTo=qPn9QCcywSA5).

Load datasets.

In [3]:
# Real survey data.
endo_data_path = experiments_utils.ENDO_DATA_PREDICTION_PATH

# Synthetic data generated by TVAE.
endo_tvae_model_selected_synth_path = (
    "./synthetic_data/tvae_selected_features_exp_10000_synthetic_data.csv"
)

# Synthetic data generated by CTGAN.
endo_ctgan_model_selected_synth_path = (
    "./synthetic_data/ctgan_selected_features_exp_10000_synthetic_data.csv"
)

In [4]:
# Read the real dataset and both synthetic datasets, in that order.
(
    df_endo_real,
    df_endo_tvae_model_selected_synth,
    df_endo_ctgan_model_selected_synth,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        endo_data_path,
        endo_tvae_model_selected_synth_path,
        endo_ctgan_model_selected_synth_path,
    )
)

📌 Features used for prediction.

In [5]:
# Column names used as model inputs throughout this notebook: they select the
# predictors from both synthetic training sets and from the real test data.
selected_prediction_cols_model_selected = [
    "pelvic_pain_frequency_between_periods",
    "deep_vaginal_pain_during_intercourse",
    "painful_bowel_movements",
    "unable_to_cope_with_pain",
    "experienced_infertility",
    "family_history_endometriosis_prediction",
    "pelvic_pain_worst",
]

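Prepare the test sets from the real data: the entire real dataset, and a 30% hold-out subset of it. The models are trained on the synthetic data only.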

In [6]:
# Keep only the columns listed in ALL_AVAILABLE_FEATURES.
df_endo_real = df_endo_real.loc[:, experiments_utils.ALL_AVAILABLE_FEATURES]

In [7]:
# The complete real dataset is the first evaluation set.
X_test_all_features = df_endo_real
y_test_all = df_endo_real["has_endometriosis"]
X_test_all = X_test_all_features[experiments_utils.ALL_AVAILABLE_FEATURES]

# Keep only the 30% partition of a seeded split of the real data; the 70%
# partition is discarded. Later sections call this subset the "external dataset".
# NOTE(review): presumably this reproduces the hold-out split used when the
# synthetic data was generated -- confirm against the generation notebook.
_, X_test_real_subset, _, y_test_real_subset = train_test_split(
    X_test_all,
    y_test_all,
    test_size=0.3,
    random_state=42,
)

## 🧐 Overview of Synthetic Data

In [8]:
# Synthetic training sets, keyed by the generator that produced them.
datasets = {
    "tvae_model_selected": df_endo_tvae_model_selected_synth,
    "ctgan_model_selected": df_endo_ctgan_model_selected_synth,
}

# Predictor columns for each entry of `datasets`, in the same order.
preds_cols = [
    selected_prediction_cols_model_selected,
    selected_prediction_cols_model_selected,
]

# Fail loudly if the two collections drift apart; zip() below would otherwise
# silently skip the unmatched entries.
if len(preds_cols) != len(datasets):
    raise ValueError(
        f"Expected one predictor list per dataset, got {len(preds_cols)} lists "
        f"for {len(datasets)} datasets."
    )

X_train, y_train = {}, {}

# Pair each dataset with its predictor columns directly instead of keeping a
# hand-maintained index in step with the dict iteration.
for (name, df), cols in zip(datasets.items(), preds_cols):
    X_train_all = df[cols]
    # impute_features receives the synthetic predictors and the same columns of
    # the real data; only its first return value is kept as the training matrix.
    X_train[name], _ = preprocessing.impute_features(X_train_all, X_test_all[cols])
    y_train[name] = df["has_endometriosis"]

    print(f"Dataset: {name.upper()}")
    print(
        f"There are {len(y_train[name])} train and {len(y_test_all)} test data samples."
    )
    print(
        f"Train dataset contains {y_train[name].sum()} diagnosed with endometriosis participants, "
        f"and test dataset - {y_test_all.sum()}.\n"
    )

Dataset: TVAE_MODEL_SELECTED
There are 10000 train and 552 test data samples.
Train dataset contains 4181 diagnosed with endometriosis participants, and test dataset - 229.

Dataset: CTGAN_MODEL_SELECTED
There are 10000 train and 552 test data samples.
Train dataset contains 3982 diagnosed with endometriosis participants, and test dataset - 229.



## 🏋️‍♀️ Model Training

Here, we compare models trained on data from the TVAE and CTGAN synthetic data generators. An analysis of this comparison is provided in Section 6.4.1.

### Logistic Regression

In [9]:
# Logistic regression on the TVAE-generated training data.
(
    lr_model_tvae_model_selected,
    lr_val_folds_tvae_model_selected,
) = feature_selection.run_logistic_regression(
    X_train["tvae_model_selected"],
    y_train["tvae_model_selected"],
    disp=True,
)

Best Hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Avg F1 Score: 0.8599306780762959


In [10]:
# Logistic regression reaches an average F1 score of 0 on the CTGAN data, and
# the call is therefore allowed to fail without stopping the notebook.
# The result names are pre-set to None so that a failed run leaves them in a
# well-defined state rather than undefined or stale from an earlier execution.
lr_model_ctgan_model_selected = None
lr_val_folds_ctgan_model_selected = None
try:
    lr_model_ctgan_model_selected, lr_val_folds_ctgan_model_selected = (
        feature_selection.run_logistic_regression(
            X_train["ctgan_model_selected"], y_train["ctgan_model_selected"], disp=True
        )
    )
except Exception as exc:
    # Catch Exception rather than using a bare `except`, so that a kernel
    # interrupt (KeyboardInterrupt) still stops the cell, and report the cause
    # instead of discarding it.
    print(
        "Logistic regression on CTGAN data did not produce a model: "
        f"{type(exc).__name__}: {exc}"
    )

Best Hyperparameters: None
Avg F1 Score: 0


### Random Forest

In [20]:
# Random forest on the TVAE-generated training data.
(
    rf_model_tvae_model_selected,
    rf_val_folds_tvae_model_selected,
) = feature_selection.run_rf(
    X_train["tvae_model_selected"],
    y_train["tvae_model_selected"],
    disp=True,
)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 300}
Avg F1 Score: 0.8626976798356363


In [21]:
# Random forest on the CTGAN-generated training data.
(
    rf_model_ctgan_model_selected,
    rf_val_folds_ctgan_model_selected,
) = feature_selection.run_rf(
    X_train["ctgan_model_selected"],
    y_train["ctgan_model_selected"],
    disp=True,
)

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Avg F1 Score: 0.2770882770405222


### XGBoost

In [11]:
# XGBoost on the TVAE-generated training data. This fitted model is the one
# evaluated on the real data in the later sections.
(
    xgboost_model_tvae_model_selected,
    xgb_val_folds_tvae_model_selected,
) = feature_selection.run_xgb(
    X_train["tvae_model_selected"],
    y_train["tvae_model_selected"],
    disp=True,
)

Best Hyperparameters: {'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 1}
Avg F1 Score: 0.8635386229834905


In [12]:
# XGBoost on the CTGAN-generated training data.
(
    xgboost_model_ctgan_model_selected,
    xgb_val_folds_ctgan_model_selected,
) = feature_selection.run_xgb(
    X_train["ctgan_model_selected"],
    y_train["ctgan_model_selected"],
    disp=True,
)

Best Hyperparameters: {'colsample_bytree': 1, 'gamma': 0.05, 'learning_rate': 0.2, 'n_estimators': 300, 'subsample': 0.5}
Avg F1 Score: 0.3269858852678279


### AdaBoost

In [24]:
# AdaBoost on the TVAE-generated training data.
(
    ada_model_tvae_model_selected,
    ada_val_folds_tvae_model_selected,
) = feature_selection.run_ada(
    X_train["tvae_model_selected"],
    y_train["tvae_model_selected"],
    disp=True,
)

Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 0.2, 'n_estimators': 100}
Avg F1 Score: 0.8617205429931307


In [25]:
# AdaBoost on the CTGAN-generated training data.
(
    ada_model_ctgan_model_selected,
    ada_val_folds_ctgan_model_selected,
) = feature_selection.run_ada(
    X_train["ctgan_model_selected"],
    y_train["ctgan_model_selected"],
    disp=True,
)

Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 50}
Avg F1 Score: 0.0


### MLP

In [166]:
# MLP on the TVAE-generated training data; the second return value is not used.
mlp_model_tvae_model_selected, _ = feature_selection.run_mlp(
    X_train["tvae_model_selected"],
    y_train["tvae_model_selected"],
    disp=True,
)

Best Hyperparameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__batch_size': 128, 'mlp__beta_1': 0.9, 'mlp__beta_2': 0.999, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate_init': 0.1, 'mlp__max_iter': 500, 'mlp__solver': 'adam'}
Avg F1 Score: 0.8632697737612123


In [168]:
# MLP on the CTGAN-generated training data; the second return value is not used.
mlp_model_ctgan_model_selected, _ = feature_selection.run_mlp(
    X_train["ctgan_model_selected"],
    y_train["ctgan_model_selected"],
    disp=True,
)

Best Hyperparameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__batch_size': 32, 'mlp__beta_1': 0.95, 'mlp__beta_2': 0.9999, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate_init': 0.1, 'mlp__max_iter': 500, 'mlp__solver': 'adam'}
Avg F1 Score: 0.2431281326945219


## 🎯 Non-Causal Modelling Evaluation

This modeling approach is described in Section 5.3 (Non-Causal Correction).

The results are discussed in Section 6.4.2.

### On entire real dataset

In [13]:
# Take the predictor columns of the entire real dataset and impute their
# missing values with the TVAE training data as the reference; only the second
# return value (the real data) is kept.
_, X_test_tvae_all = preprocessing.impute_features(
    X_train["tvae_model_selected"],
    X_test_all[selected_prediction_cols_model_selected],
)

In [14]:
# Score the XGBoost model trained on TVAE data against the entire real dataset;
# the returned predictions are reused by the manual bias correction section.
y_test_pred_all = feature_selection.evaluate_model_performance(
    xgboost_model_tvae_model_selected,
    X_test_tvae_all,
    y_test_all,
)

Confusion Matrix:
[[268  55]
 [ 47 182]]
Accuracy: 0.8152
Recall: 0.7948
Specificity: 0.8297
F1-Score: 0.7811


### On external dataset

In [15]:
# Take the predictor columns of the external dataset (the 30% subset of the
# real data) and impute their missing values with the TVAE training data as
# the reference; only the second return value is kept.
_, X_test_tvae_real_subset = preprocessing.impute_features(
    X_train["tvae_model_selected"], X_test_real_subset[selected_prediction_cols_model_selected]
)

In [16]:
# Score the XGBoost model trained on TVAE data against the external dataset;
# the returned predictions are reused by the manual bias correction section.
y_test_pred_real_subset = feature_selection.evaluate_model_performance(
    xgboost_model_tvae_model_selected,
    X_test_tvae_real_subset,
    y_test_real_subset,
)

Confusion Matrix:
[[88 14]
 [ 9 55]]
Accuracy: 0.8614
Recall: 0.8594
Specificity: 0.8627
F1-Score: 0.8271


## 🛠️ Manual Bias Correction Modelling

This modeling approach is described in Section 5.4 (Manual Bias Correction).

The results are discussed in Section 6.4.3.

### On entire real dataset

In [17]:
# Attach the actual and predicted endometriosis labels to the real data so that
# False Positives and False Negatives can be selected.
# `assign` returns a new frame, so X_test_all itself is never modified. The
# previous `pd.DataFrame(X_test_all, ...)` call does not copy DataFrame input by
# default, which left the new columns at risk of touching the shared test frame.
misclassified = X_test_all.assign(
    Actual=y_test_all.values,
    Predicted=y_test_pred_all,
)

In [18]:
# False positive: labelled 0 but predicted 1.
false_positives = misclassified.loc[
    misclassified["Actual"].eq(0) & misclassified["Predicted"].eq(1)
]
# False negative: labelled 1 but predicted 0.
false_negatives = misclassified.loc[
    misclassified["Actual"].eq(1) & misclassified["Predicted"].eq(0)
]

#### False Positives

In [19]:
# Number of false positives before any exclusion criterion is applied.
print("False Positives: {} cases".format(len(false_positives)))

False Positives: 55 cases


In [20]:
# Summarise the exclusion criteria among the false positives; the returned
# value is used in the next cell to match `all_conditions_mentioned` entries.
false_positives_cond_mentioned = experiments_utils.false_positives_exclusion_criteria_counter(
    false_positives
)

21 participants reported another complaint: ['Uterine fibroids', 'Suspected endometriosis, awaiting surgery', 'Uterine fibroids, PCOS ', 'Polycystic ovarian syndrome', 'Uterine fibroids', 'Uterine fibroids, cysts, adenoymeosis', 'suspected endo, been told by three doctors but still on waitlist', 'PCOS', 'Adenomyosis', 'Uterine fibroids, Pcos', 'Uterine fibroids', 'None of the above, ', 'Uterine fibroids', 'Waiting for diagnosis of endo', 'Uterine fibroids', 'Uterine fibroids, PCOS ', "Suspected uterine fibroids and endometriosis. my mother was diagnosed with fibroids and I have multiple relatives with endometriosis and PCOS but I haven't been able to seek diagnosis due to other health issues", 'Uterine fibroids, Adenomyosis ', 'Awaiting diagnosis', 'Uterine fibroids', 'Ovarian cysts and possible endometriosis yet to be confirmed']

12 use hormonal contraception.

17 use hormonal treatments for pain relief.

17 use prescribed painkillers.



In [21]:
# Drop false positives whose `all_conditions_mentioned` entry is one of the
# complaints collected in the previous cell.
reported_other_complaint = false_positives["all_conditions_mentioned"].isin(
    false_positives_cond_mentioned
)
false_positives = false_positives.loc[~reported_other_complaint]
print(
    f"After removing those who reported other gynaecological complaint, there are {len(false_positives)} cases of false positives left."
)

After removing those who reported other gynaecological complaint, there are 34 cases of false positives left.


In [22]:
# Keep only false positives whose `takes_hormones_for_pain` flag is 0.
false_positives = false_positives.loc[false_positives["takes_hormones_for_pain"].eq(0)]
print(
    f"After removing those who take hormonal treatments for pain relief, there are {len(false_positives)} cases of false positives left."
)

After removing those who take hormonal treatments for pain relief, there are 25 cases of false positives left.


In [23]:
# Keep only false positives whose `takes_hormones_only_for_contracep` flag is 0.
false_positives = false_positives.loc[
    false_positives["takes_hormones_only_for_contracep"].eq(0)
]
print(
    f"After removing those who take hormonal contraception, there are {len(false_positives)} cases of false positives left."
)

After removing those who take hormonal contraception, there are 19 cases of false positives left.


#### False Negatives

In [24]:
# Number of false negatives before any exclusion criterion is applied.
print("False Negatives: {} cases".format(len(false_negatives)))

False Negatives: 47 cases


In [25]:
# Summarise the exclusion criteria among the false negatives (judging by the
# cell output: other complaints, hormone use, age group, pregnancy, painkillers).
experiments_utils.false_negatives_exclusion_criteria_counter(false_negatives)

Reported another complaint: 'I am currently being treated for my PCOS and had surgery for endometriosis. Additionally, I am currently breastfeeding and have experienced a decrease in symptoms compared to 2 years ago before I got pregnant.'

9 use hormonal contraception.

16 use hormonal treatments for pain relief.

3 participants are aged 45-54.

16 participants were pregnant.

5 use prescribed painkillers.



In [26]:
# Keep only false negatives whose `takes_hormones_for_pain` flag is 0.
false_negatives = false_negatives.loc[false_negatives["takes_hormones_for_pain"].eq(0)]
print(
    f"After removing those who take hormonal treatments for pain relief, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who take hormonal treatments for pain relief, there are 31 cases of false negatives left.


In [27]:
# Keep only false negatives whose `takes_hormones_only_for_contracep` flag is 0.
false_negatives = false_negatives.loc[
    false_negatives["takes_hormones_only_for_contracep"].eq(0)
]
print(
    f"After removing those who take hormonal contraception, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who take hormonal contraception, there are 22 cases of false negatives left.


In [28]:
# Free-text answers that describe another gynaecological complaint. They are
# matched exactly against `suggestions_questions`, so the text must stay
# character-for-character identical to the survey response.
# NOTE(review): this embeds a verbatim participant response in the notebook --
# confirm that this is acceptable to share.
false_negatives_cond_mentioned = [
    "I am currently being treated for my PCOS and had surgery for endometriosis. Additionally, I am currently breastfeeding and have experienced a decrease in symptoms compared to 2 years ago before I got pregnant.",
]
reported_other_complaint = false_negatives["suggestions_questions"].isin(
    false_negatives_cond_mentioned
)
false_negatives = false_negatives.loc[~reported_other_complaint]
print(
    f"After removing those who reported other gynaecological complaint, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who reported other gynaecological complaint, there are 21 cases of false negatives left.


In [29]:
# Drop false negatives flagged with `age_45_54` == 1 (rows with any other
# value, including missing ones, are kept).
false_negatives = false_negatives.loc[false_negatives["age_45_54"].ne(1)]
print(
    f"After removing those aged 45-54, there are {len(false_negatives)} cases of false negatives left."
)

After removing those aged 45-54, there are 18 cases of false negatives left.


In [30]:
# Drop false negatives flagged with `was_pregnant` == 1 (rows with any other
# value, including missing ones, are kept).
false_negatives = false_negatives.loc[false_negatives["was_pregnant"].ne(1)]
print(
    f"After removing those who were pregnant, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who were pregnant, there are 12 cases of false negatives left.


#### Evaluation

Hence, we reduced the number of false positives from 55 to 19 and false negatives from 47 to 12.

The resulting metrics:

* Confusion Matrix:

  ```
  [[268  19]
   [ 12 182]]
  ```

* Accuracy: 0.9356
* Recall: 0.9381
* Specificity: 0.9338
* F1-Score: 0.9215

### On external dataset

In [31]:
# Attach the actual and predicted labels to the external dataset so that False
# Positives and False Negatives can be selected.
# `assign` returns a new frame, so the X_test_real_subset split is never
# modified. The previous `pd.DataFrame(X_test_real_subset, ...)` call does not
# copy DataFrame input by default, so the new columns were being set on data
# shared with that split.
misclassified = X_test_real_subset.assign(
    Actual=y_test_real_subset.values,
    Predicted=y_test_pred_real_subset,
)

In [32]:
# False positive: labelled 0 but predicted 1.
false_positives = misclassified.loc[
    misclassified["Actual"].eq(0) & misclassified["Predicted"].eq(1)
]
# False negative: labelled 1 but predicted 0.
false_negatives = misclassified.loc[
    misclassified["Actual"].eq(1) & misclassified["Predicted"].eq(0)
]

#### False Positives

In [33]:
# Number of false positives in the external dataset before any exclusion.
print("False Positives: {} cases".format(len(false_positives)))

False Positives: 14 cases


In [34]:
# Summarise the exclusion criteria among the false positives; the returned
# value is used in the next cell to match `all_conditions_mentioned` entries.
false_positives_cond_mentioned = experiments_utils.false_positives_exclusion_criteria_counter(
    false_positives
)

6 participants reported another complaint: ['Waiting for diagnosis of endo', 'PCOS', 'suspected endo, been told by three doctors but still on waitlist', 'Uterine fibroids', 'Uterine fibroids, cysts, adenoymeosis', 'Uterine fibroids']

5 use hormonal contraception.

4 use hormonal treatments for pain relief.

5 use prescribed painkillers.



In [35]:
# Drop false positives whose `all_conditions_mentioned` entry is one of the
# complaints collected in the previous cell.
reported_other_complaint = false_positives["all_conditions_mentioned"].isin(
    false_positives_cond_mentioned
)
false_positives = false_positives.loc[~reported_other_complaint]
print(
    f"After removing those who reported other gynaecological complaint, there are {len(false_positives)} cases of false positives left."
)

After removing those who reported other gynaecological complaint, there are 8 cases of false positives left.


In [36]:
# Keep only false positives whose `takes_hormones_for_pain` flag is 0.
# Note: the message below originally said "false negatives" by mistake; the
# cell was re-run after the correction, so its execution order may not match
# the surrounding cells.
false_positives = false_positives.loc[false_positives["takes_hormones_for_pain"].eq(0)]
print(
    f"After removing those who take hormonal treatments for pain relief, there are {len(false_positives)} cases of false positives left."
)

After removing those who take hormonal treatments for pain relief, there are 6 cases of false positives left.


In [37]:
# Keep only false positives whose `takes_hormones_only_for_contracep` flag is 0.
false_positives = false_positives.loc[
    false_positives["takes_hormones_only_for_contracep"].eq(0)
]
print(
    f"After removing those who take hormonal contraception, there are {len(false_positives)} cases of false positives left."
)

After removing those who take hormonal contraception, there are 4 cases of false positives left.


#### False Negatives

In [38]:
# Number of false negatives in the external dataset before any exclusion.
print("False Negatives: {} cases".format(len(false_negatives)))

False Negatives: 9 cases


In [39]:
# Summarise the exclusion criteria among the false negatives (judging by the
# cell output: other complaints, hormone use, age group, pregnancy, painkillers).
experiments_utils.false_negatives_exclusion_criteria_counter(false_negatives)

Reported another complaint: 'nan'

2 use hormonal contraception.

4 use hormonal treatments for pain relief.

0 participants are aged 45-54.

5 participants were pregnant.

0 use prescribed painkillers.



In [40]:
# Keep only false negatives whose `takes_hormones_for_pain` flag is 0.
false_negatives = false_negatives.loc[false_negatives["takes_hormones_for_pain"].eq(0)]
print(
    f"After removing those who take hormonal treatments for pain relief, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who take hormonal treatments for pain relief, there are 5 cases of false negatives left.


In [41]:
# Drop false negatives flagged with `takes_hormones_only_for_contracep` == 1.
# NOTE(review): the other contraception filters in this notebook keep rows
# where the flag `== 0`; `!= 1` additionally keeps rows where the flag is
# missing. Confirm which rule is intended.
false_negatives = false_negatives.loc[
    false_negatives["takes_hormones_only_for_contracep"].ne(1)
]
print(
    f"After removing those who take hormonal contraception, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who take hormonal contraception, there are 3 cases of false negatives left.


In [42]:
# Free-text answers that describe another gynaecological complaint. They are
# matched exactly against `suggestions_questions`, so the text must stay
# character-for-character identical to the survey response.
# NOTE(review): this embeds a verbatim participant response in the notebook --
# confirm that this is acceptable to share.
false_negatives_cond_mentioned = [
    "I have not been formally diagnosed with endometriosis, but often I have been told by specialists here in the US I just must deal with the pain, the bleeding, etc. I've been to so many so-called specialists who tell me I just need to get over it. My current gynecologist thinks I have adenomyosis. I cannot take hormonal BC as it makes me violently depressed and angry, and doesn't help anyway. Thank you Kateryna! I hope you are staying safe!"
]
reported_other_complaint = false_negatives["suggestions_questions"].isin(
    false_negatives_cond_mentioned
)
false_negatives = false_negatives.loc[~reported_other_complaint]
print(
    f"After removing those who reported other gynaecological complaint, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who reported other gynaecological complaint, there are 2 cases of false negatives left.


In [43]:
# Drop false negatives flagged with `was_pregnant` == 1 (rows with any other
# value, including missing ones, are kept).
false_negatives = false_negatives.loc[false_negatives["was_pregnant"].ne(1)]
print(
    f"After removing those who were pregnant, there are {len(false_negatives)} cases of false negatives left."
)

After removing those who were pregnant, there are 1 cases of false negatives left.


#### Evaluation

Hence, we reduced the number of false positives from 14 to 4 and false negatives from 9 to 1.

The resulting metrics:

* Confusion Matrix:

  ```
  [[88  4]
   [ 1 55]]
  ```

* Accuracy: 0.9662
* Recall: 0.9821
* Specificity: 0.9565
* F1-Score: 0.9565