## Machine Learning Approach to Identify risk factors for Antepartum Hemorrhage (APH) 

### Importing the necessary libraries

In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

In [90]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


### Reading the Input File

In [91]:
# Specify the file path
df = pd.read_excel(r"C:\Users\Manasa Madabhushi\Python_Projects\4501_AMANHI_With_USG.xlsx")
df.head(3)

Unnamed: 0,WHOWID,ORIG_ID,PARTICIPANT_ID,PW_AGE,PW_EDUCATION,PREV_SB,PREV_MIS,PREV_PTB,PREV_MULTIP,PREV_CS,...,DBP4,UDIP_PROT4,DEL_DATE,GAGEBRTH,TYPEDELIV,age_death_b1,age_death_b2,age_death_b3,APH,MAT_WEIGHT
0,20-016580,16580,AMANHIT-20916,36,10.0,1,2,0,1,0,...,69.0,0.0,2014-10-31,271.0,Normally through the vagina,,,,0.0,45.8
1,20-016683,16683,AMANHIT-22194,32,10.0,0,0,0,0,0,...,73.0,0.0,2015-01-06,274.0,Normally through the vagina,,,,0.0,
2,20-016685,16685,AMANHIT-22712,18,6.0,0,1,0,0,0,...,70.0,0.0,2015-01-31,290.0,Normally through the vagina,,,,0.0,68.0


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4501 entries, 0 to 4500
Data columns (total 51 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   WHOWID          4501 non-null   object        
 1   ORIG_ID         4501 non-null   int64         
 2   PARTICIPANT_ID  4501 non-null   object        
 3   PW_AGE          4501 non-null   int64         
 4   PW_EDUCATION    4485 non-null   float64       
 5   PREV_SB         4501 non-null   int64         
 6   PREV_MIS        4501 non-null   int64         
 7   PREV_PTB        4501 non-null   int64         
 8   PREV_MULTIP     4501 non-null   int64         
 9   PREV_CS         4501 non-null   int64         
 10  WEALTH_INDEX    4501 non-null   object        
 11  SINGLE_TWIN     4501 non-null   int64         
 12  GRAVIDITY       4501 non-null   int64         
 13  PARITY          4501 non-null   int64         
 14  LABOUR_HTN      4501 non-null   int64         
 15  LABO

### Drop the APH rows which are Empty

In [93]:
import pandas as pd

# Assuming your DataFrame is named df
print(df.isnull().sum())  # This will show the count of missing values in each column


WHOWID               0
ORIG_ID              0
PARTICIPANT_ID       0
PW_AGE               0
PW_EDUCATION        16
PREV_SB              0
PREV_MIS             0
PREV_PTB             0
PREV_MULTIP          0
PREV_CS              0
WEALTH_INDEX         0
SINGLE_TWIN          0
GRAVIDITY            0
PARITY               0
LABOUR_HTN           0
LABOUR_24            0
BIRTH_OUTCOME        0
BABY_SEX             0
BIRTH_WEIGHT       558
BABY_ID1             0
BIRTH_OUTCOME1       0
BABY_SEX1            0
BIRTH_WEIGHT1      558
BABY_ID2          4411
BIRTH_OUTCOME2    4411
BABY_SEX2         4411
BIRTH_WEIGHT2     4423
BABY_ID3          4499
BIRTH_OUTCOME3    4499
BABY_SEX3         4499
BIRTH_WEIGHT3     4499
SBP1               199
DBP1               199
UDIP_PROT1         199
SBP2               292
DBP2               292
UDIP_PROT2         292
SBP3               394
DBP3               394
UDIP_PROT3         394
SBP4               658
DBP4               658
UDIP_PROT4         658
DEL_DATE   

In [94]:
df = df.dropna(subset=['APH'])

### Replacing the -88 and -77 as NaN values

In [95]:
df = df.replace([-88, -77], np.nan)

In [96]:
df.columns

Index(['WHOWID', 'ORIG_ID', 'PARTICIPANT_ID', 'PW_AGE', 'PW_EDUCATION',
       'PREV_SB', 'PREV_MIS', 'PREV_PTB', 'PREV_MULTIP', 'PREV_CS',
       'WEALTH_INDEX', 'SINGLE_TWIN', 'GRAVIDITY', 'PARITY', 'LABOUR_HTN',
       'LABOUR_24', 'BIRTH_OUTCOME', 'BABY_SEX', 'BIRTH_WEIGHT', 'BABY_ID1',
       'BIRTH_OUTCOME1', 'BABY_SEX1', 'BIRTH_WEIGHT1', 'BABY_ID2',
       'BIRTH_OUTCOME2', 'BABY_SEX2', 'BIRTH_WEIGHT2', 'BABY_ID3',
       'BIRTH_OUTCOME3', 'BABY_SEX3', 'BIRTH_WEIGHT3', 'SBP1', 'DBP1',
       'UDIP_PROT1', 'SBP2', 'DBP2', 'UDIP_PROT2', 'SBP3', 'DBP3',
       'UDIP_PROT3', 'SBP4', 'DBP4', 'UDIP_PROT4', 'DEL_DATE', 'GAGEBRTH',
       'TYPEDELIV', 'age_death_b1', 'age_death_b2', 'age_death_b3', 'APH',
       'MAT_WEIGHT'],
      dtype='object')

### Drop the columns which are related to the Post Pregancies

In [97]:
df.drop(['ORIG_ID','PARTICIPANT_ID','BIRTH_OUTCOME','BABY_SEX','BIRTH_WEIGHT','LABOUR_HTN','GAGEBRTH','BABY_ID1','BABY_ID2','BABY_ID3','WHOWID','BIRTH_OUTCOME2','BIRTH_WEIGHT2','BIRTH_OUTCOME3','BABY_SEX3','BIRTH_WEIGHT3','WEALTH_INDEX','DEL_DATE','age_death_b1','age_death_b2','age_death_b3','BABY_SEX2','TYPEDELIV','BIRTH_OUTCOME1','BABY_SEX1','BIRTH_WEIGHT1'], axis=1, inplace=True)

## Data Preprocessing & Feature Engineering separately for the APH and Non-APH 
#### *Splitting the APH and Non-APH Cases to perform the Data Preprocessing 
#### *Filling the missing values
#### *Removal of Outliers Using the HDBSCAN and IQR 
#### *Feature Selection by the Mutual Information

In [98]:
df_nonAPH = df[df["APH"] == 0]
df_APH = df[df["APH"] == 1]

### Imputing the missing values Numerical & Categorical columns for APH and Non-APH case

In [99]:
import pandas as pd

# Columns to be imputed
numerical_columns = ['SBP1', 'DBP1', 'UDIP_PROT1', 'SBP2', 'DBP2', 'UDIP_PROT2', 
                     'SBP3', 'DBP3', 'UDIP_PROT3', 'SBP4', 'DBP4', 'UDIP_PROT4', 
                     'MAT_WEIGHT', 'GRAVIDITY', 'PARITY']

categorical_columns = ['PW_EDUCATION', 'LABOUR_24', 'SINGLE_TWIN', 'PREV_SB', 
                       'PREV_MIS', 'PREV_PTB', 'PREV_MULTIP', 'PREV_CS']

# Fill missing values in numerical columns using median
for col in numerical_columns:
    median_value_nonAPH = df_nonAPH[col].median()
    df_nonAPH[col].fillna(median_value_nonAPH, inplace=True)
    
    median_value_APH = df_APH[col].median()
    df_APH[col].fillna(median_value_APH, inplace=True)

# Fill missing values in categorical columns using mode
for col in categorical_columns:
    mode_value_nonAPH = df_nonAPH[col].mode()[0]  # Get the most frequent value
    df_nonAPH[col].fillna(mode_value_nonAPH, inplace=True)
    
    mode_value_APH = df_APH[col].mode()[0]  # Get the most frequent value
    df_APH[col].fillna(mode_value_APH, inplace=True)

# Check for remaining missing values
print("Missing values in df_nonAPH after imputation:")
print(df_nonAPH[numerical_columns + categorical_columns].isnull().sum())

print("\nMissing values in df_APH after imputation:")
print(df_APH[numerical_columns + categorical_columns].isnull().sum())


Missing values in df_nonAPH after imputation:
SBP1            0
DBP1            0
UDIP_PROT1      0
SBP2            0
DBP2            0
UDIP_PROT2      0
SBP3            0
DBP3            0
UDIP_PROT3      0
SBP4            0
DBP4            0
UDIP_PROT4      0
MAT_WEIGHT      0
GRAVIDITY       0
PARITY          0
PW_EDUCATION    0
LABOUR_24       0
SINGLE_TWIN     0
PREV_SB         0
PREV_MIS        0
PREV_PTB        0
PREV_MULTIP     0
PREV_CS         0
dtype: int64

Missing values in df_APH after imputation:
SBP1            0
DBP1            0
UDIP_PROT1      0
SBP2            0
DBP2            0
UDIP_PROT2      0
SBP3            0
DBP3            0
UDIP_PROT3      0
SBP4            0
DBP4            0
UDIP_PROT4      0
MAT_WEIGHT      0
GRAVIDITY       0
PARITY          0
PW_EDUCATION    0
LABOUR_24       0
SINGLE_TWIN     0
PREV_SB         0
PREV_MIS        0
PREV_PTB        0
PREV_MULTIP     0
PREV_CS         0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nonAPH[col].fillna(median_value_nonAPH, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_APH[col].fillna(median_value_APH, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nonAPH[col].fillna(median_value_nonAPH, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_

In [100]:
import pandas as pd

# Assuming train_set_nonAPH, train_set_APH, test_set_nonAPH, test_set_APH are pandas DataFrames

# Merge the train datasets
df = pd.concat([df_nonAPH, df_APH], axis=0)

# Reset the index (optional, to clean up any duplicate indices after concatenation)
df.reset_index(drop=True, inplace=True)



In [101]:
df['APH'].value_counts()

APH
0.0    4243
1.0     171
Name: count, dtype: int64

## One hot encoding 

In [102]:
# Apply one-hot encoding with pd.get_dummies
df = pd.get_dummies(df, columns=['LABOUR_24'], drop_first=True)


# Verify and explicitly convert the encoded column to integer
df['LABOUR_24_1.0'] = df['LABOUR_24_1.0'].astype(int)


## Outliers removal by HDBSCAN

In [103]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from hdbscan import HDBSCAN

# -----------------------------------------------------------------
# Step 1: Standardize the Data
# -----------------------------------------------------------------
# Assuming `df` is your dataframe and `APH` is the target column
X = df.drop(columns=['APH'])  # Exclude the target column for scaling
y = df['APH']  # Target column

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------------------------------------------
# Step 2: Apply HDBSCAN for Outlier Detection
# -----------------------------------------------------------------
hdbscan = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_epsilon=0.5)
cluster_labels = hdbscan.fit_predict(X_scaled)

# Identify outliers (labeled as -1 by HDBSCAN) 
non_outliers_mask = cluster_labels != -1

# -----------------------------------------------------------------
# Step 3: Create Cleaned Dataset
# -----------------------------------------------------------------
# Combine back the non-outlier data with the target column
X_cleaned = X[non_outliers_mask]
y_cleaned = y[non_outliers_mask]


# Create the cleaned DataFrame
df_cleaned = pd.concat([X_cleaned, y_cleaned], axis=1)

# -----------------------------------------------------------------
# Step 4: Check Class Distribution
# -----------------------------------------------------------------
class_distribution = df_cleaned['APH'].value_counts()
print("Class Distribution After Outlier Removal:")
print(class_distribution)



Class Distribution After Outlier Removal:
APH
0.0    3288
1.0     136
Name: count, dtype: int64


## Test Train Split

In [104]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = X_cleaned
y = y_cleaned

# Perform the stratified train-test split
X_train_Clustering, X_test_Clustering, y_train_Clustering, y_test_Clustering = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train_Clustering.shape}")
print(f"X_test shape: {X_test_Clustering.shape}")
print(f"y_train shape: {y_train_Clustering.shape}")
print(f"y_test shape: {y_test_Clustering.shape}")

# Check class distribution in the train and test splits
print("\nClass distribution in y_train:")
print(y_train_Clustering.value_counts(normalize=True))

print("\nClass distribution in y_test:")
print(y_test_Clustering.value_counts(normalize=True))


X_train shape: (2739, 24)
X_test shape: (685, 24)
y_train shape: (2739,)
y_test shape: (685,)

Class distribution in y_train:
APH
0.0    0.960204
1.0    0.039796
Name: proportion, dtype: float64

Class distribution in y_test:
APH
0.0    0.960584
1.0    0.039416
Name: proportion, dtype: float64


## Standarization 

In [105]:
from sklearn.preprocessing import StandardScaler

# Assuming X_train_Clustering and X_test_Clustering are your training and test data
scaler = StandardScaler()

# Fit the scaler to the training data and transform both train and test data
X_train_Clustering_scaled = scaler.fit_transform(X_train_Clustering)
X_test_Clustering_scaled = scaler.transform(X_test_Clustering)


In [106]:
import pandas as pd

# Convert the NumPy array to a DataFrame (assuming feature names are stored in a list)
X_train_Clustering_scaled_df = pd.DataFrame(X_train_Clustering_scaled, columns=X_train_Clustering.columns)
X_test_Clustering_scaled_df = pd.DataFrame(X_test_Clustering_scaled, columns=X_test_Clustering.columns)

# Now you can use .info() with the DataFrame
X_train_Clustering_scaled_df.info()
X_test_Clustering_scaled_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2739 entries, 0 to 2738
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PW_AGE         2739 non-null   float64
 1   PW_EDUCATION   2739 non-null   float64
 2   PREV_SB        2739 non-null   float64
 3   PREV_MIS       2739 non-null   float64
 4   PREV_PTB       2739 non-null   float64
 5   PREV_MULTIP    2739 non-null   float64
 6   PREV_CS        2739 non-null   float64
 7   SINGLE_TWIN    2739 non-null   float64
 8   GRAVIDITY      2739 non-null   float64
 9   PARITY         2739 non-null   float64
 10  SBP1           2739 non-null   float64
 11  DBP1           2739 non-null   float64
 12  UDIP_PROT1     2739 non-null   float64
 13  SBP2           2739 non-null   float64
 14  DBP2           2739 non-null   float64
 15  UDIP_PROT2     2739 non-null   float64
 16  SBP3           2739 non-null   float64
 17  DBP3           2739 non-null   float64
 18  UDIP_PRO

## Feature Selection by Mutual Information HDBSCAN

In [107]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Assuming X_train_Clustering_scaled and y_train_Clustering are already defined

# Step 1: Calculate Mutual Information between features and the target
mutual_info = mutual_info_classif(X_train_Clustering_scaled_df, y_train_Clustering)

# Step 2: Create a DataFrame for better visualization of feature importance
mutual_info_df = pd.DataFrame({
    'Feature': X_train_Clustering_scaled_df.columns,
    'Mutual Information': mutual_info
})

# Step 3: Sort the features by Mutual Information in descending order
mutual_info_df = mutual_info_df.sort_values(by='Mutual Information', ascending=False)

# Step 4: Display the results
print("Mutual Information Scores for Features:")
print(mutual_info_df)

# Optional: Select top N features based on Mutual Information
top_n_features = mutual_info_df['Feature'].head(5).values  # Selecting top 5 features (adjust as needed)

print("\nTop 5 Selected Features based on Mutual Information:")
print(top_n_features)

# You can now use these top features for further processing or training your model
X_train_selected = X_train_Clustering_scaled[:, mutual_info_df['Feature'].isin(top_n_features)]


Mutual Information Scores for Features:
          Feature  Mutual Information
16           SBP3            0.044910
20           DBP4            0.030206
22     MAT_WEIGHT            0.016817
17           DBP3            0.009511
14           DBP2            0.008991
19           SBP4            0.005250
3        PREV_MIS            0.005147
23  LABOUR_24_1.0            0.003165
4        PREV_PTB            0.002365
7     SINGLE_TWIN            0.001967
5     PREV_MULTIP            0.001290
8       GRAVIDITY            0.000967
11           DBP1            0.000816
0          PW_AGE            0.000351
13           SBP2            0.000192
15     UDIP_PROT2            0.000000
1    PW_EDUCATION            0.000000
10           SBP1            0.000000
18     UDIP_PROT3            0.000000
9          PARITY            0.000000
6         PREV_CS            0.000000
21     UDIP_PROT4            0.000000
2         PREV_SB            0.000000
12     UDIP_PROT1            0.000000

Top 5 Sel

### Ensembled Model With all the sampling for the Balanced bagging and all other --- HDBSACN

In [55]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, recall_score, f1_score
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier, BalancedBaggingClassifier, BalancedRandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Assuming X_train_Clustering_scaled_df, X_test_Clustering_scaled_df, y_train_Clustering, y_test_Clustering are already defined

# Select the top features (assuming they are in the dataset)
selected_features = ['SBP3', 'DBP4', 'MAT_WEIGHT', 'DBP2', 'DBP3']

# Select only the relevant features from X_train and X_test
X_train_selected = X_train_Clustering_scaled_df[selected_features]
X_test_selected = X_test_Clustering_scaled_df[selected_features]

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Define individual models
models = {
    'EasyEnsembleClassifier': EasyEnsembleClassifier(random_state=42),
    'RUSBoostClassifier': RUSBoostClassifier(random_state=42),
    'BalancedBaggingClassifier': BalancedBaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=10), 
        n_estimators=100, 
        random_state=42
    ),
    'BalancedRandomForestClassifier': BalancedRandomForestClassifier(
        n_estimators=100, 
        random_state=42
    )
}

# Create an ensemble model using Voting Classifier (Majority Voting)
ensemble_model = VotingClassifier(estimators=[
    ('easy_ensemble', models['EasyEnsembleClassifier']),
    ('rusboost', models['RUSBoostClassifier']),
    ('balanced_bagging', models['BalancedBaggingClassifier']),
    ('balanced_rf', models['BalancedRandomForestClassifier'])
], voting='soft')  # Use 'soft' voting for probability averaging

# Train the ensemble model
ensemble_model.fit(X_train_scaled, y_train_Clustering)

# Make predictions using the ensemble model
y_pred_train_ensemble = ensemble_model.predict(X_train_scaled)
y_pred_test_ensemble = ensemble_model.predict(X_test_scaled)

# Calculate recall and F1-score for the ensemble (class-wise)
recall_train_ensemble = recall_score(y_train_Clustering, y_pred_train_ensemble, average=None)  # No averaging, class-wise
recall_test_ensemble = recall_score(y_test_Clustering, y_pred_test_ensemble, average=None)  # No averaging, class-wise
f1_train_ensemble = f1_score(y_train_Clustering, y_pred_train_ensemble, average=None)  # No averaging, class-wise
f1_test_ensemble = f1_score(y_test_Clustering, y_pred_test_ensemble, average=None)  # No averaging, class-wise

print(f"\nEnsemble Model Recall on Train Data (Class-wise): {recall_train_ensemble}")
print(f"Ensemble Model Recall on Test Data (Class-wise): {recall_test_ensemble}")
print(f"Ensemble Model F1-score on Train Data (Class-wise): {f1_train_ensemble}")
print(f"Ensemble Model F1-score on Test Data (Class-wise): {f1_test_ensemble}")

# Confusion Matrix for Ensemble Test Data
cm = confusion_matrix(y_test_Clustering, y_pred_test_ensemble)

print("\nEnsemble Confusion Matrix (Test Data):")
print(cm)

print("\nEnsemble Classification Report (Test Data):")
print(classification_report(y_test_Clustering, y_pred_test_ensemble))

# ROC Curve and AUC for Ensemble
fpr, tpr, thresholds = roc_curve(y_test_Clustering, ensemble_model.predict_proba(X_test_scaled)[:, 1])
roc_auc = auc(fpr, tpr)
print(f"ROC AUC Score: {roc_auc}")

# Plot colored Confusion Matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, 
            xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.title(f"Ensemble Confusion Matrix")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Save Confusion Matrix plot
confusion_matrix_file = "ensemble_confusion_matrix.png"
plt.savefig(confusion_matrix_file)
plt.close()

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line (random classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.tight_layout()

# Save ROC Curve plot
roc_curve_file = "ensemble_roc_curve.png"
plt.savefig(roc_curve_file)
plt.close()

# Print file paths for downloading
print(f"\nConfusion Matrix plot saved at: {confusion_matrix_file}")
print(f"ROC Curve plot saved at: {roc_curve_file}")



Ensemble Model Recall on Train Data (Class-wise): [0.75285171 0.90825688]
Ensemble Model Recall on Test Data (Class-wise): [0.72036474 0.59259259]
Ensemble Model F1-score on Train Data (Class-wise): [0.85714286 0.23076923]
Ensemble Model F1-score on Test Data (Class-wise): [0.82939633 0.14096916]

Ensemble Confusion Matrix (Test Data):
[[474 184]
 [ 11  16]]

Ensemble Classification Report (Test Data):
              precision    recall  f1-score   support

         0.0       0.98      0.72      0.83       658
         1.0       0.08      0.59      0.14        27

    accuracy                           0.72       685
   macro avg       0.53      0.66      0.49       685
weighted avg       0.94      0.72      0.80       685

ROC AUC Score: 0.7253743104806935

Confusion Matrix plot saved at: ensemble_confusion_matrix.png
ROC Curve plot saved at: ensemble_roc_curve.png


### Ensembled Model With all the sampling for the Lightbgm and random forest --- HDBSACN

In [108]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, SMOTEN, ADASYN, BorderlineSMOTE, SVMSMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Select only the top features
selected_features = ['SBP3', 'DBP4', 'MAT_WEIGHT', 'DBP2', 'DBP3']
X_train_selected = X_train_Clustering_scaled_df[selected_features]
X_test_selected = X_test_Clustering_scaled_df[selected_features]

# Define oversampling methods to try
over_sampling_methods = {
    'RandomOverSampler': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, sampling_strategy=0.6),
    'SMOTENC': SMOTENC(categorical_features=[4], random_state=42, sampling_strategy=0.6),
    'SMOTEN': SMOTEN(random_state=42, sampling_strategy=0.6),
    'ADASYN': ADASYN(random_state=42, sampling_strategy=0.6),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=42, sampling_strategy=0.6),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy=0.6)
}

# Loop over each oversampling method
for name, sampler in over_sampling_methods.items():
    print(f"Using {name}...")

    # Define the base models for the ensemble
    model1 = LGBMClassifier(random_state=42, class_weight='balanced')
    model2 = RandomForestClassifier(random_state=42, class_weight='balanced')

    # Create an ensemble model using VotingClassifier (soft voting)
    ensemble_model = VotingClassifier(
        estimators=[
            ('lightgbm', model1),
            ('random_forest', model2)
        ],
        voting='soft'
    )

    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('ensemble', ensemble_model)
    ])

    # Define hyperparameter grid for tuning
    param_grid = {
        'ensemble__lightgbm__num_leaves': [31, 50],
        'ensemble__lightgbm__max_depth': [-1, 10],
        'ensemble__lightgbm__learning_rate': [0.01, 0.1],
        'ensemble__lightgbm__n_estimators': [100, 200],
        'ensemble__random_forest__n_estimators': [100, 250],
        'ensemble__random_forest__max_depth': [10, 20]
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='recall',
        cv=2,
        n_jobs=-1,
        verbose=1
    )

    # Train the model with hyperparameter tuning
    grid_search.fit(X_train_selected, y_train_Clustering)

    # Get the best model and parameters
    print("\nBest Parameters from GridSearchCV:")
    print(grid_search.best_params_)

    # Use the best model to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test_selected)
    y_prob = grid_search.best_estimator_.predict_proba(X_test_selected)[:, 1]  # Get probability for positive class

    # Metrics
    print("\nClassification Report:")
    print(classification_report(y_test_Clustering, y_pred))
    print("\n" + "="*80)

    # Calculate ROC curve and AUC
    fpr, tpr, thresholds = roc_curve(y_test_Clustering, y_prob)
    auc_score = roc_auc_score(y_test_Clustering, y_prob)
    
    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='b', label=f'ROC Curve (AUC = {auc_score:.2f})')
    plt.plot([0, 1], [0, 1], color='r', linestyle='--')  # Diagonal line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {name}')
    plt.legend(loc='lower right')
    plt.savefig(f'roc_curve_{name}_HDBSCAN.png')
    plt.close()

    print(f"ROC Curve plot for {name} saved as roc_curve_{name}_HDBSCAN.png")


Using RandomOverSampler...
Fitting 2 folds for each of 64 candidates, totalling 128 fits
[LightGBM] [Info] Number of positive: 2630, number of negative: 2630
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 436
[LightGBM] [Info] Number of data points in the train set: 5260, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Best Parameters from GridSearchCV:
{'ensemble__lightgbm__learning_rate': 0.01, 'ensemble__lightgbm__max_depth': -1, 'ensemble__lightgbm__n_estimators': 100, 'ensemble__lightgbm__num_leaves': 31, 'ensemble__random_forest__max_depth': 10, 'ensemble__random_forest__n_estimators': 250}

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98       658
         1.0       0.36      0.30      0.33        27

    a

### IQR

In [79]:
import pandas as pd

# Function to remove outliers per class using IQR
def remove_outliers_per_class_iqr(df, class_col):
    cleaned_dfs = []
    classes = df[class_col].unique()  # Get unique classes in the target column
    
    for cls in classes:
        # Filter data for the current class
        class_data = df[df[class_col] == cls]
        features = class_data.drop(columns=[class_col])  # Exclude the class column
        
        # Calculate the IQR for each feature
        Q1 = features.quantile(0.25)
        Q3 = features.quantile(0.75)
        IQR = Q3 - Q1
        
        # Define the bounds for outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Identify outliers and keep inliers (values within bounds)
        outliers = (features < lower_bound) | (features > upper_bound)
        
        # Keep only inliers (rows without outliers)
        class_clean = class_data[~outliers.any(axis=1)]  # Drop rows with any outliers
        cleaned_dfs.append(class_clean)
    
    # Combine cleaned data from all classes
    return pd.concat(cleaned_dfs)

# Apply the function to your DataFrame (replace 'APH' with your actual class column name)
df_clean = remove_outliers_per_class_iqr(df, class_col='APH')


In [15]:
df_clean['APH'].value_counts()

APH
0.0    1823
1.0      47
Name: count, dtype: int64

### IQR -- Test / Train Split

In [16]:
from sklearn.model_selection import train_test_split

# Define the target variable and features
target = 'APH'  # Replace 'APH' with the actual target column name if different
features = [col for col in df_clean.columns if col != target]  # All columns except the target

# Split the data into features (X) and target (y)
X = df_clean[features]
y = df_clean[target]

# Perform the stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Check class distribution in the train and test splits
print("\nClass distribution in y_train:")
print(y_train.value_counts(normalize=True))

print("\nClass distribution in y_test:")
print(y_test.value_counts(normalize=True))


X_train shape: (1496, 24)
X_test shape: (374, 24)
y_train shape: (1496,)
y_test shape: (374,)

Class distribution in y_train:
APH
0.0    0.974599
1.0    0.025401
Name: proportion, dtype: float64

Class distribution in y_test:
APH
0.0    0.975936
1.0    0.024064
Name: proportion, dtype: float64


### Selecting Best Features --- IQR Interpretability

In [80]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Assuming `X_train` is your feature matrix and `y_train` is your target variable
# Example (replace this with your actual data):
import numpy as np

# Compute Mutual Information
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)

# Create a DataFrame for easier manipulation and sorting
mi_df = pd.DataFrame({'Feature': X_train.columns, 'Mutual Information': mi_scores})
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)

# Display top features based on Mutual Information
print("Mutual Information Scores:")
print(mi_df)

# Select top N features (e.g., top 5)
top_features = mi_df.head(6)['Feature'].tolist()
print("\nTop Features based on Mutual Information:")
print(top_features)


Mutual Information Scores:
          Feature  Mutual Information
16           SBP3            0.053132
20           DBP4            0.038337
14           DBP2            0.016465
17           DBP3            0.010171
11           DBP1            0.008805
9          PARITY            0.008375
13           SBP2            0.008193
7     SINGLE_TWIN            0.007666
8       GRAVIDITY            0.006115
19           SBP4            0.005609
3        PREV_MIS            0.004824
22     MAT_WEIGHT            0.004798
5     PREV_MULTIP            0.003724
4        PREV_PTB            0.000893
10           SBP1            0.000399
21     UDIP_PROT4            0.000000
18     UDIP_PROT3            0.000000
0          PW_AGE            0.000000
12     UDIP_PROT1            0.000000
15     UDIP_PROT2            0.000000
1    PW_EDUCATION            0.000000
6         PREV_CS            0.000000
2         PREV_SB            0.000000
23  LABOUR_24_1.0            0.000000

Top Features based on 

### Random Forest with different Oversampling technique IQR

In [82]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, SMOTEN, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Assuming you already have X_train, X_test, y_train, y_test

# Select only the top features
selected_features = ['SBP3', 'DBP4', 'DBP2', 'DBP3', 'DBP1', 'PARITY', 'SBP2']
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Define oversampling methods to try
oversampling_methods = {
    'RandomOverSampler': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, sampling_strategy=0.8),
    'SMOTENC': SMOTENC(categorical_features=[4], random_state=42, sampling_strategy=0.8),  # Assume 'PARITY' is categorical
    'SMOTEN': SMOTEN(random_state=42, sampling_strategy=0.8),
    'ADASYN': ADASYN(random_state=42, sampling_strategy=0.8),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=42, sampling_strategy=0.8),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy=0.8)
}

# Loop over each oversampling method
for name, sampler in oversampling_methods.items():
    print(f"Using {name}...")

    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Scaling step
        ('smote', sampler),            # Different oversampling methods
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Random Forest with custom class weights
    ])

    # Define hyperparameter grid for tuning 'n_estimators'
    param_grid = {
        'rf__n_estimators': [100, 250, 500],
        'rf__max_depth': [100]  # Trying different values for max_depth
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='recall',        # Use recall to evaluate the performance
        cv=3,                    # Cross-validation with 3 folds
        n_jobs=-1,               # Use all CPU cores for computation
        verbose=2                # Show detailed progress
    )

    # Train the model with hyperparameter tuning
    grid_search.fit(X_train_selected, y_train)

    # Get the best model and parameters
    print("\nBest Parameters from GridSearchCV:")
    print(grid_search.best_params_)

    # Use the best model to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test_selected)

    # Metrics
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\n" + "="*80)  # Print a separator between different methods


Using RandomOverSampler...
Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best Parameters from GridSearchCV:
{'rf__max_depth': 100, 'rf__n_estimators': 100}

Confusion Matrix:
[[365   0]
 [  3   6]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       365
         1.0       1.00      0.67      0.80         9

    accuracy                           0.99       374
   macro avg       1.00      0.83      0.90       374
weighted avg       0.99      0.99      0.99       374


Using SMOTE...
Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best Parameters from GridSearchCV:
{'rf__max_depth': 100, 'rf__n_estimators': 250}

Confusion Matrix:
[[363   2]
 [  3   6]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       365
         1.0       0.75      0.67      0.71         9

    accuracy                           0.99     

### Ensembled model of the EasyEnsembleClassifier,RUSBoostClassifier,BalancedBaggingClassifier,BalancedRandomForestClassifier

In [35]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier, BalancedBaggingClassifier, BalancedRandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming X_train, X_test, y_train, y_test are already defined

# Select the top features (assuming they are in the dataset)
selected_features = ['SBP3', 'DBP4', 'DBP2', 'DBP3', 'DBP1', 'PARITY', 'SBP2']

# Select only the relevant features from X_train and X_test
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Define individual models
models = {
    'EasyEnsembleClassifier': EasyEnsembleClassifier(random_state=42),
    'RUSBoostClassifier': RUSBoostClassifier(random_state=42),
    'BalancedBaggingClassifier': BalancedBaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=10), 
        n_estimators=100, 
        random_state=42
    ),
    'BalancedRandomForestClassifier': BalancedRandomForestClassifier(
        n_estimators=100, 
        random_state=42
    )
}

# Create an ensemble model using Voting Classifier (Majority Voting)
ensemble_model = VotingClassifier(estimators=[
    ('easy_ensemble', models['EasyEnsembleClassifier']),
    ('rusboost', models['RUSBoostClassifier']),
    ('balanced_bagging', models['BalancedBaggingClassifier']),
    ('balanced_rf', models['BalancedRandomForestClassifier'])
], voting='soft')  # Use 'soft' voting for probability averaging

# Train the ensemble model
ensemble_model.fit(X_train_scaled, y_train)

# Make predictions using the ensemble model
y_pred_train_ensemble = ensemble_model.predict(X_train_scaled)
y_pred_test_ensemble = ensemble_model.predict(X_test_scaled)

# Classification Report
print("\nEnsemble Classification Report (Test Data):")
print(classification_report(y_test, y_pred_test_ensemble))

# Confusion Matrix for Ensemble Test Data
cm = confusion_matrix(y_test, y_pred_test_ensemble)
print("\nEnsemble Confusion Matrix (Test Data):")
print(cm)

# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, ensemble_model.predict_proba(X_test_scaled)[:, 1])
roc_auc = auc(fpr, tpr)
print(f"ROC AUC Score: {roc_auc}")

# Plotting Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.title("Ensemble Confusion Matrix")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Save Confusion Matrix plot
confusion_matrix_file = 'ensemble_confusion_matrix.png'
plt.savefig(confusion_matrix_file)
plt.close()

# Plotting ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line (random classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.tight_layout()

# Save ROC Curve plot
roc_curve_file = 'ensemble_roc_curve.png'
plt.savefig(roc_curve_file)
plt.close()

# Print file paths for downloading
print(f"\nConfusion Matrix plot saved at: {confusion_matrix_file}")
print(f"ROC Curve plot saved at: {roc_curve_file}")



Ensemble Classification Report (Test Data):
              precision    recall  f1-score   support

         0.0       0.99      0.77      0.87       365
         1.0       0.08      0.78      0.14         9

    accuracy                           0.77       374
   macro avg       0.53      0.77      0.50       374
weighted avg       0.97      0.77      0.85       374


Ensemble Confusion Matrix (Test Data):
[[281  84]
 [  2   7]]
ROC AUC Score: 0.8983257229832572

Confusion Matrix plot saved at: ensemble_confusion_matrix.png
ROC Curve plot saved at: ensemble_roc_curve.png


## Best Model All the sampling techniques with the lightbgm and Random Forest

In [28]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, SMOTEN, ADASYN, BorderlineSMOTE, SVMSMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Assuming you already have X_train, X_test, y_train, y_test

# Select only the top features
selected_features = ['SBP3', 'DBP4', 'DBP2', 'DBP3', 'DBP1', 'PARITY', 'SBP2']
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Define oversampling methods to try
oversampling_methods = {
    'RandomOverSampler': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, sampling_strategy=0.6),  # Reduced sampling strategy for faster processing
    'SMOTENC': SMOTENC(categorical_features=[4], random_state=42, sampling_strategy=0.6),
    'SMOTEN': SMOTEN(random_state=42, sampling_strategy=0.6),
    'ADASYN': ADASYN(random_state=42, sampling_strategy=0.6),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=42, sampling_strategy=0.6),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy=0.6)
}

# Loop over each oversampling method
for name, sampler in oversampling_methods.items():
    print(f"Using {name}...")

    # Define the base models for the ensemble
    model1 = LGBMClassifier(random_state=42, class_weight='balanced')
    model2 = RandomForestClassifier(random_state=42, class_weight='balanced')

    # Create an ensemble model using VotingClassifier (soft voting)
    ensemble_model = VotingClassifier(
        estimators=[
            ('lightgbm', model1),
            ('random_forest', model2)
        ],
        voting='soft'  # Soft voting: average predicted probabilities
    )

    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Scaling step
        ('smote', sampler),            # Different oversampling methods
        ('ensemble', ensemble_model)   # The ensemble model
    ])

    # Define hyperparameter grid for tuning ensemble model's parameters
    param_grid = {
        'ensemble__lightgbm__num_leaves': [31, 50],  # Reduced grid
        'ensemble__lightgbm__max_depth': [-1, 10],
        'ensemble__lightgbm__learning_rate': [0.01, 0.1],
        'ensemble__lightgbm__n_estimators': [100, 200],
        
        'ensemble__random_forest__n_estimators': [100, 250],  # Reduced grid
        'ensemble__random_forest__max_depth': [10, 20]
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='recall',        # Use recall to evaluate the performance
        cv=2,                    # Reduced cross-validation folds
        n_jobs=-1,               # Use all CPU cores for computation
        verbose=1                # Show detailed progress
    )

    # Train the model with hyperparameter tuning
    grid_search.fit(X_train_selected, y_train)

    # Get the best model and parameters
    print("\nBest Parameters from GridSearchCV:")
    print(grid_search.best_params_)

    # Use the best model to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test_selected)

    # Metrics
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\n" + "="*80)  # Print a separator between different methods


Using RandomOverSampler...
Fitting 2 folds for each of 64 candidates, totalling 128 fits
[LightGBM] [Info] Number of positive: 1458, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249
[LightGBM] [Info] Number of data points in the train set: 2916, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Best Parameters from GridSearchCV:
{'ensemble__lightgbm__learning_rate': 0.1, 'ensemble__lightgbm__max_depth': -1, 'ensemble__lightgbm__n_estimators': 100, 'ensemble__lightgbm__num_leaves': 31, 'ensemble__random_forest__max_depth': 10, 'ensemble__random_forest__n_estimators': 100}

Confusion Matrix:
[[363   2]
 [  3   6]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       365
         1.0       