# ⭐️ Data Preprocessing

Import all the required packages.

In [38]:
import sys

# Extend the module search path with the project's helper directories so the
# local modules imported in the next cell can be resolved.
for _utils_dir in (
    "./experiment_utils",
    "./preprocessing_utils",
    "./feature_selection_utils",
):
    sys.path.append(_utils_dir)

In [39]:
import pandas as pd
import numpy as np
import preprocessing
import feature_selection
import experiments_utils

# Silence pandas' chained-assignment (SettingWithCopy) warnings for this notebook.
pd.set_option("mode.chained_assignment", None)

# Show every row and column when a frame is displayed (no truncation).
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Seed NumPy's global RNG so the random sampling further down is reproducible.
SEED = 42
np.random.seed(SEED)

## Data Preparation

Load dataset.

In [40]:
# Read the raw survey export and discard the "Позначка часу" (timestamp) column
# in a single chained expression.
df_endo = pd.read_csv(experiments_utils.ENDO_DATA_GFORMS_PATH).drop(
    columns=["Позначка часу"]
)

Rename columns.

In [41]:
# Delegate column renaming to the project helper; the cells below address columns
# by the resulting names (e.g. "period_duration").
# NOTE(review): the actual old-name -> new-name mapping lives in
# preprocessing.rename_columns and is not visible here.
df_endo = preprocessing.rename_columns(df_endo)

In [42]:
# Report the size of the dataset before any filtering.
n_observations, n_features = df_endo.shape
print(
    f"Initial dataset comprises {n_observations} observations and {n_features} features."
)

Initial dataset comprises 572 observations and 76 features.


Remove the participants who do not experience bleeding.

In [43]:
# Keep every row whose reported period duration differs from the "no bleeding"
# answer (rows with a missing answer compare as different and are kept).
has_bleeding = df_endo["period_duration"] != "0 days (no bleeding)"
df_endo = df_endo.loc[has_bleeding]
n_with_bleeding = df_endo.shape[0]
print(
    f"Dataset comprises {n_with_bleeding} after dropping observations where women experiences no bleeding."
)

Dataset comprises 561 after dropping observations where women experiences no bleeding.


Remove participants who filled in less than 75% of the survey.

In [44]:
# A participant is kept only if they answered at least this share of the columns.
MIN_ANSWERED_FRACTION = 1 - 0.25

# `DataFrame.dropna(thresh=...)` is documented to take an integer number of
# required non-NA values, so round the fractional column count up. Because the
# per-row count is an integer, `count >= x` and `count >= ceil(x)` keep exactly
# the same rows as the previous float threshold did.
min_answered_columns = int(np.ceil(len(df_endo.columns) * MIN_ANSWERED_FRACTION))

# NOTE(review): the threshold counts every column, including the optional
# free-text and family-history fields that the next section does not treat as
# missing - confirm that this is intended.
df_endo = df_endo.dropna(thresh=min_answered_columns)
print(
    f"Dataset comprises {df_endo.shape[0]} after dropping observations where participant did not fill in at least 25% of the survey."
)

Dataset comprises 552 after dropping observations where participant did not fill in at least 25% of the survey.


Calculate the proportion of missing values.  

We do not count an unanswered "suggestions and questions" field as a missing value.

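If a participant does not indicate the presence of a condition in their family, we assume that they either know it is not present in their family or do not know whether it actually exists. In both cases the blank answer is not treated as a missing value, so the family-history columns are excluded from the calculation as well.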

In [45]:
# Columns whose blanks are not counted as missing answers: the free-text field
# plus every family-history indicator.
excluded_columns = [
    "suggestions_questions",
    *(
        f"family_history_{condition}"
        for condition in (
            "endometriosis",
            "fibroids",
            "pcos",
            "infertility",
            "heavy_bleeding",
            "pelvic_pain",
        )
    ),
]

# Share of empty cells (in percent) across all remaining columns.
df_filtered = df_endo.drop(columns=excluded_columns)
n_missing_cells = df_filtered.isna().sum().sum()
missing_ratio = n_missing_cells / df_filtered.size * 100
print(f"Missing ratio: {missing_ratio:.2f}%.")

Missing ratio: 0.55%.


## Feature preprocessing

In [46]:
# preprocess columns
# All feature preprocessing is delegated to the project helper; later cells rely on
# it leaving "all_conditions_mentioned" and "family_history_endometriosis" available.
# NOTE(review): the recorded output of this cell shows a pandas FutureWarning about
# downcasting in `replace`; it presumably originates inside
# preprocessing.preprocess_gyn_data - confirm and address it there.
df_endo = preprocessing.preprocess_gyn_data(df_endo)

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


In [47]:
# Derive one 0/1 indicator column per diagnosis from the free-form list of
# conditions each participant mentioned; a missing list counts as "not mentioned".
conditions_mentioned = df_endo["all_conditions_mentioned"]
for flag_column, condition_name in (
    ("has_endometriosis", "Endometriosis"),
    ("has_fibroids", "Uterine fibroids"),
):
    df_endo[flag_column] = conditions_mentioned.str.contains(
        condition_name, na=False
    ).astype(int)

In [48]:
# Count participants with and without the endometriosis flag.
n_endo = int((df_endo["has_endometriosis"] == 1).sum())
n_no_endo = int((df_endo["has_endometriosis"] == 0).sum())
print(
    f"Dataset comprises a total of {len(df_endo)} participants: {n_endo} diagnosed with endometriosis and {n_no_endo} not."
)

Dataset comprises a total of 552 participants: 229 diagnosed with endometriosis and 323 not.


In [49]:
# Count participants with and without the uterine-fibroids flag.
n_fibroids = int((df_endo["has_fibroids"] == 1).sum())
n_no_fibroids = int((df_endo["has_fibroids"] == 0).sum())
print(
    f"Dataset comprises a total of {len(df_endo)} participants: {n_fibroids} diagnosed with uterine fibroids and {n_no_fibroids} not."
)

Dataset comprises a total of 552 participants: 98 diagnosed with uterine fibroids and 454 not.


In [50]:
# A participant has both conditions when both 0/1 indicators are set.
endo_flag = df_endo["has_endometriosis"]
fibroid_flag = df_endo["has_fibroids"]
df_endo["has_endo_and_ufib"] = endo_flag & fibroid_flag

n_both = int((df_endo["has_endo_and_ufib"] == 1).sum())
print(
    f"Dataset comprises a total of {len(df_endo)} participants: {n_both} diagnosed with both endoemtriosis and uterine fibroids."
)

Dataset comprises a total of 552 participants: 50 diagnosed with both endoemtriosis and uterine fibroids.


In [51]:
# save prepared data
# Writes the preprocessed frame, including the has_* indicator columns added above,
# as a UTF-8 CSV without the index column.
df_endo.to_csv(
    experiments_utils.ENDO_DATA_PREDICTION_PATH, encoding="utf-8", index=False
)

## Positive-Unlabeled Learning for Estimating Family History of Endometriosis

The methodology of code cells below is described in Section 5.2 (Handling Bias in Family History of Endometriosis) and the results are discussed in Section 6.1 (Handling Uncertainty in Family History of Endometriosis).


If family_history_endometriosis = 1, we can confidently assume the person is aware of the condition in their family. However, if it is 0, it does not necessarily mean the condition is absent - it could indicate either a true absence or a lack of knowledge about its presence.

In [52]:
pu_prediction_features = experiments_utils.PU_PREDICTION_FEATURES
# Take an explicit copy: a prediction column is written into df_preds further
# down, and mutating a column subset of df_endo is exactly the pattern pandas
# flags with SettingWithCopyWarning (currently hidden by the global
# chained-assignment option). The copy makes df_preds an independent frame.
df_preds = df_endo[pu_prediction_features].copy()

### Selecting participants for training

In [53]:
# Features are every PU column except the family-history label itself.
label_column = "family_history_endometriosis"
X = df_preds.drop(columns=[label_column])
# 1 where a family history was reported, 0 for every other value.
y = np.where(df_preds[label_column] == 1, 1, 0)

# Positional locations (not index labels) of the reported positives.
positive_indices = np.where(y == 1)[0]
n_labeled_positives = int(0.6 * len(positive_indices))

# Draw 60% of the positives without replacement using NumPy's global RNG
# (seeded at the top of the notebook), then map positions back to index labels.
sampled_positions = np.random.choice(
    len(positive_indices), size=n_labeled_positives, replace=False
)
labeled_positive_indices = X.index[positive_indices][sampled_positions]
print(
    f"{len(labeled_positive_indices)} women with family history of endometriosis were selected for training."
)

85 women with family history of endometriosis were selected for training.


In [54]:
# Feature rows of the sampled positives, each paired with a label of 1.0.
X_labeled = X.loc[labeled_positive_indices]
y_labeled = np.ones(len(X_labeled))

In [55]:
# Everything that is not a reported positive is treated as unlabeled.
unlabeled_indices = np.flatnonzero(y != 1)
X_unlabeled = X.iloc[unlabeled_indices]
y_unlabeled = y[unlabeled_indices]

# Draw 20% of the unlabeled rows (by index label, without replacement) to act
# as negatives during training.
negative_size = int(0.20 * len(X_unlabeled))
negative_indices = np.random.choice(
    X_unlabeled.index, size=negative_size, replace=False
)

X_negatives = X_unlabeled.loc[negative_indices]
y_negatives = np.zeros(len(X_negatives))
print(
    f"{len(X_negatives)} women without family history of endometriosis were selected for training."
)

82 women without family history of endometriosis were selected for training.


In [56]:
# Index labels of the training rows: sampled positives first, then sampled negatives.
train_indices = np.hstack([labeled_positive_indices, negative_indices])

# Stack the two frames with pd.concat instead of np.vstack: going through a single
# NumPy array collapses all columns to one common dtype, whereas concat keeps each
# column's dtype (matching X_remaining_unlabeled, which is sliced straight from X)
# and carries the original index labels and column names along.
X_train = pd.concat([X_labeled, X_negatives])
assert list(X_train.index) == list(train_indices), "training rows out of order"

# Labels in the same order as the rows of X_train.
y_train = np.hstack([y_labeled, y_negatives])

In [57]:
# Unlabeled rows that were not drawn as training negatives; Index.difference
# returns the labels in sorted order.
remaining_indices = X_unlabeled.index.difference(negative_indices)
X_remaining_unlabeled = X_unlabeled.loc[remaining_indices, X.columns]

In [58]:
# X_train and X_remaining_unlabeled were already built with X's columns and the
# train / remaining index labels in the two preceding cells, so re-wrapping them in
# pd.DataFrame with the same axes was a no-op and is omitted.

# Select the labels of the remaining unlabeled rows by index label rather than by
# a positional mask, so they are guaranteed to follow the (sorted) row order of
# X_remaining_unlabeled even if X_unlabeled's index is not monotonically increasing.
y_remaining_unlabeled = (
    pd.Series(y_unlabeled, index=X_unlabeled.index).loc[remaining_indices].to_numpy()
)

In [59]:
# Run both feature frames through the project's imputation helper and rebind the
# names to the returned frames.
# NOTE(review): presumably the imputer is fitted on X_train only and then applied to
# X_remaining_unlabeled, and both results keep their index labels (the label-based
# assignment of predictions further down relies on that) - confirm in
# preprocessing.impute_features.
X_train, X_remaining_unlabeled = preprocessing.impute_features(
    X_train, X_remaining_unlabeled
)

### Model Training

#### Logistic Regression

In [60]:
# Fit the logistic-regression candidate on the PU training set. Only the first
# returned value is kept (as lr_model); the second is discarded. The best
# hyperparameters and average F1 in this cell's output are presumably printed
# because of disp=True.
lr_model, _ = feature_selection.run_logistic_regression(X_train, y_train, disp=True)

Best Hyperparameters: {'C': 0.01, 'l1_ratio': 0.5, 'penalty': 'elasticnet', 'solver': 'saga'}
Avg F1 Score: 0.6746666666666666


#### Random Forest

In [61]:
# Fit the random-forest candidate on the same training set; keep the first returned
# value as rf_model and discard the second. Output presumably enabled by disp=True.
rf_model, _ = feature_selection.run_rf(X_train, y_train, disp=True)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 300}
Avg F1 Score: 0.6882706235647412


#### XGBoost

In [62]:
# Fit the XGBoost candidate on the same training set; keep the first returned value
# as xgb_model and discard the second. Output presumably enabled by disp=True.
xgb_model, _ = feature_selection.run_xgb(X_train, y_train, disp=True)

Best Hyperparameters: {'colsample_bytree': 0.7, 'gamma': 0.05, 'learning_rate': 0.2, 'n_estimators': 100, 'subsample': 0.7}
Avg F1 Score: 0.6923147340889276


#### AdaBoost

In [63]:
# Fit the AdaBoost candidate on the same training set; keep the first returned value
# as ada_model and discard the second. Output presumably enabled by disp=True.
ada_model, _ = feature_selection.run_ada(X_train, y_train, disp=True)

Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 100}
Avg F1 Score: 0.6564323522278181


#### MLP

In [64]:
# Fit the MLP candidate on the same training set; keep the first returned value as
# mlp_model (used for the evaluation below) and discard the second.
# NOTE(review): this is the only trainer called with pu_learning=True - what the
# flag changes is defined in feature_selection.run_mlp; confirm that the other
# models are meant to run without it.
mlp_model, _ = feature_selection.run_mlp(X_train, y_train, disp=True, pu_learning=True)

Best Hyperparameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.0001, 'mlp__batch_size': 8, 'mlp__beta_1': 0.99, 'mlp__beta_2': 0.999, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (100, 100, 50), 'mlp__learning_rate_init': 0.001, 'mlp__max_iter': 500, 'mlp__solver': 'adam'}
Avg F1 Score: 0.7169092169092168


#### TabPFN

Please note that TabPFN is run in a separate notebook, [TabPFN_endometriosis_experiment.ipynb](https://colab.research.google.com/drive/1S9i1o-kvCWtUDNY7kDj0AAR88KAaJCEo#scrollTo=FXdTtXVeqzgD).

It achieves an average F1 score of 0.6399.

### Model Evaluation

We select the best-performing model from the model-training stage, the MLP, which achieved the highest average F1 score.

In [65]:
# Score the selected MLP on the unlabeled rows that were not used for training and
# keep its predictions for the "updated family history" column built below.
# y_remaining_unlabeled contains only zeros (it is sliced from rows where y != 1),
# so the reported recall and F1 are 0 by construction; accuracy and specificity
# both equal the share of these rows that the model predicts as negative.
# NOTE(review): the known positives that were not sampled for training are not part
# of this evaluation set, so the model is never scored on held-out positives.
family_hist_preds = feature_selection.evaluate_model_performance(
    mlp_model, X_remaining_unlabeled, y_remaining_unlabeled
)

Confusion Matrix:
[[181 147]
 [  0   0]]
Accuracy: 0.5518
Recall: 0.0000
Specificity: 0.5518
F1-Score: 0.0000


Setting updated family history of endometriosis.

In [66]:
prediction_column = "family_history_endometriosis_prediction"
# Unlabeled rows receive the model's predictions; training rows keep the labels
# they were trained with.
X_remaining_unlabeled[prediction_column] = family_hist_preds
X_train[prediction_column] = y_train

In [67]:
prediction_column = "family_history_endometriosis_prediction"
known_positive = df_preds["family_history_endometriosis"] == 1

# Start from "no family history" everywhere, overwrite the remaining unlabeled rows
# with the model's predictions (aligned on index labels), and finally force every
# self-reported positive to 1.
df_preds[prediction_column] = 0
df_preds.loc[remaining_indices, prediction_column] = X_remaining_unlabeled[
    prediction_column
]
df_preds.loc[known_positive, prediction_column] = 1

In [68]:
# Carry the updated family-history column over to the main frame (aligned on index).
prediction_column = "family_history_endometriosis_prediction"
df_endo[prediction_column] = df_preds[prediction_column]

In [69]:
# save prepared data with new feature - family history of endometriosis prediction
# This writes to the same ENDO_DATA_PREDICTION_PATH as the earlier save cell, so the
# file produced there is overwritten with the version that includes the
# "family_history_endometriosis_prediction" column.
df_endo.to_csv(
    experiments_utils.ENDO_DATA_PREDICTION_PATH, encoding="utf-8", index=False
)