In [None]:
II. Predictive  Analysis

In [153]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import LabelEncoder

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score,recall_score,f1_score
import xgboost as xgb


In [3]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [8]:
data= pd.read_excel("GDM_Python_Aug2025.xlsx")

In [163]:
df=data.copy()

### Early Risk Prediction of Gestational Diabetes Mellitus (GDM)

**Research Hypotheses:**

- **Null Hypothesis (H₀):** Gestational Diabetes Mellitus (GDM) is *not* significantly associated with adverse pregnancy outcomes or complications.
- **Alternative Hypothesis (H₁):** Gestational Diabetes Mellitus (GDM) *is* significantly associated with adverse pregnancy outcomes and complications.

In [25]:
mean_weeks = df['Gestational Age_V1'].mean() / 7
print(f"Mean gestational age_V1 in weeks: {mean_weeks:.2f}")

Mean gestational age_V1 in weeks: 13.59


##### The average first prenatal visit women is at  arround 14 weeks in our dataset.
##### Our objective is to evaluate the predictive power of the early indicators through rigorous statistical testing and modeling.

###  **Reasoning (Why is it important)**

* **Early identification** of at-risk women allows for **early lifestyle or therapeutic interventions**.
* Can **prevent GDM-related complications** (e.g., macrosomia, pre-eclampsia, stillbirth).
* Enables **personalized prenatal care** before routine screening (\~24–28 weeks).
* Reduces long-term maternal risk and poor neonatal outcomes.

###  **Chosen Parameters and Their Connection to Hypothesis**

| Feature                                           | Why it’s Included                                         |
| ------------------------------------------------- | --------------------------------------------------------- |
| `Age >30`                                         | Older age is a known risk factor for insulin resistance   |
| `BMI (kg/m2) V1`                                  | Obesity is strongly associated with GDM                   |
| `Previous GDM V1`                                 | Past GDM is a strong predictor of recurrence              |
| `      `                                          |           |
| `Smoking`                                         | Lifestyle factors influencing insulin/glucose metabolism  |
| `HighRisk 10`                                     | Pre-defined composite risk factor                         |
| `Chronic Illness V1`                              | Associated comorbidities may increase metabolic burden    |
|                |
| `V1 25OHD value`                                  | Vitamin D deficiency may contribute to insulin resistance |
| `Diagnosed with Vit D Deficiency`                 | Early marker of metabolic dysfunction                     |
| `systolic BP (mmHg) V1`, `diastolic BP (mmHg) V1` | Hypertension is associated with metabolic disease         |
| `Pulse (bpm) V1`                                  | May reflect underlying stress/metabolic state             |


### Correlation Check for GDM Diagnosis

Since the data isn't normally distributed, we used Spearman’s correlation to check how each clinical feature is related to the GDM diagnosis (Yes = 1, No = 0).

In [57]:
df1=data.copy()

In [65]:
# Map GDM to binary
df1['GDM_binary'] = df1['GDM Diagonised'].map({'Yes': 1, 'No': 0})

# Define the feature list
features = ['systolicBP_V1', 'diastolicBP_V1', 'PulseinV1', 'Platelet_V1', 
            'Calcium_V1', 'Albumin_V1', 'U Protein_V1', 'ALT_V1', 
            'U Albumin_V1', 'V1 CRP.1', 'V1 Creatinine.1', 'Hemoglobin_V1',
            'V1 PCR.1', 'BMIinV1']

# Check and filter valid features
valid_features = [f for f in features if f in df1.columns]

# Calculate Spearman correlation with GDM_binary
corr_spearman = df1[valid_features + ['GDM_binary']].corr(method='spearman')['GDM_binary'].drop('GDM_binary')

# Print sorted results
corr_spearman.sort_values(ascending=False)

V1 CRP.1           0.150546
PulseinV1          0.148724
BMIinV1            0.133116
Platelet_V1        0.112980
diastolicBP_V1     0.092703
Calcium_V1         0.091657
systolicBP_V1      0.088896
ALT_V1             0.084171
U Protein_V1       0.074699
V1 PCR.1           0.070136
Hemoglobin_V1      0.006601
V1 Creatinine.1   -0.019262
Albumin_V1        -0.113102
U Albumin_V1      -0.125247
Name: GDM_binary, dtype: float64

In [49]:
results = {}

for col in ['systolicBP_V1', 'diastolicBP_V1', 'PulseinV1','Platelet_V1', 'Calcium_V1', 'Albumin_V1', 'U Protein_V1', 'ALT_V1', 'U Albumin_V1','V1 CRP.1','V1 Creatinine.1', 'Hemoglobin_V1','V1 PCR.1','BMIinV1']:
    group_yes = df[df['GDM Diagonised'].str.lower() == 'yes'][col].dropna()
    group_no = df[df['GDM Diagonised'].str.lower() == 'no'][col].dropna()
    
    if len(group_yes) > 0 and len(group_no) > 0:
        stat, p = mannwhitneyu(group_yes, group_no, alternative='two-sided')
        results[col] = {
            'U-statistic': round(stat, 3),
            'p-value': round(p, 5),
            'Chance of randomness (%)': round(p * 100, 2)
        }
    else:
        results[col] = {
            'U-statistic': None,
            'p-value': None,
            'Chance of randomness (%)': None
        }

pd.DataFrame(results).T.sort_values('p-value')

Unnamed: 0,U-statistic,p-value,Chance of randomness (%)
PulseinV1,22091.5,0.00049,0.05
V1 CRP.1,17671.0,0.00071,0.07
BMIinV1,21627.0,0.0018,0.18
Platelet_V1,20978.5,0.00813,0.81
diastolicBP_V1,20417.5,0.02973,2.97
U Albumin_V1,4153.0,0.03243,3.24
systolicBP_V1,20304.5,0.03712,3.71
ALT_V1,17411.0,0.05499,5.5
Albumin_V1,4392.5,0.06228,6.23
U Protein_V1,19699.5,0.08097,8.1


- Pulse rate, CRP, BMI, and Platelet count show higher correlation and low p-values, which means they are consistently elevated in women diagnosed with GDM. These could be strong early indicators.
- Blood pressure values, ALT, and Albumin also show some difference, though not as strong—they may help as supporting features in prediction.
- Hemoglobin and Creatinine have very low correlation and high p-values, so they likely don’t relate to GDM in your data.
- The chance of randomness (based on p-value) is low for the top features, which means it’s unlikely that these differences happened by accident. That gives more confidence in using them for early GDM prediction


In [None]:
Checking for the Categorical values

In [88]:
# Define binary health markers and outcome
markers = ['Age_gt_30', 'Vit D Deficiency', 'Smoking 123','PreviousGDM10 V1','Nutritional counselling','HighRisk','Chronic Illness','GestationalHP','Took Vit D Supplements']
results = {}

for marker in markers:
    # Create contingency table
    contingency = pd.crosstab(df[marker], df['GDM Diagonised'])
    
    # Run chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency)
    results[marker] = {'Chi2': chi2, 'p-value': p}

# Convert to DataFrame for a cleaner view
correlation_df = pd.DataFrame(results).T
print(correlation_df.sort_values('p-value'))

                               Chi2        p-value
Nutritional counselling  512.854705  4.315348e-112
GestationalHP            495.051278  7.876842e-106
PreviousGDM10 V1          24.557809   4.648786e-06
Took Vit D Supplements    12.357633   1.488082e-02
Vit D Deficiency           8.240258   1.624242e-02
HighRisk                   4.013410   1.344309e-01
Age_gt_30                  3.405225   1.822069e-01
Smoking 123                5.992763   4.240013e-01
Chronic Illness            0.714893   6.994601e-01


- Use Nutritional Counselling, Previous GDM, and Vitamin D Deficiency confidently in your early prediction models.
- Keep Age and HighRisk flags as possibly relevant

In [203]:
selected_columns = ['systolicBP_V1', 'diastolicBP_V1', 'PulseinV1', 'BMIinV1', 
                    'Smoking 123', 'PreviousGDM10 V1', 'Chronic Illness', 'Age_gt_30',
                    'Platelet_V1', 'Albumin_V1', 'U Albumin_V1', 'U Protein_V1',
                    'ALT_V1', 'V1 CRP.1', 'Vit D Deficiency','HighRisk','GestationalHP', 
                    'Took Vit D Supplements', 'Nutritional counselling','GDM Diagonised']

df_final = df[selected_columns]


In [205]:
df_final.shape

(565, 20)

In [207]:
# Filter rows where GDM was diagnosed
gdm_positive_df = df_final [df_final['GDM Diagonised'] == 'Yes']


In [209]:
gdm_positive_df.isna().sum()

systolicBP_V1               0
diastolicBP_V1              0
PulseinV1                   0
BMIinV1                     0
Smoking 123                 0
PreviousGDM10 V1            0
Chronic Illness             0
Age_gt_30                   0
Platelet_V1                 0
Albumin_V1                 27
U Albumin_V1               34
U Protein_V1                0
ALT_V1                      7
V1 CRP.1                   11
Vit D Deficiency            0
HighRisk                    0
GestationalHP               0
Took Vit D Supplements      0
Nutritional counselling     0
GDM Diagonised              0
dtype: int64

In [211]:
gdm_positive_df.shape

(74, 20)

In [215]:
# Step 1: Get object-type columns
object_cols = df_final.select_dtypes(include='object').columns

# Step 2: Convert each to string, then apply label encoding to get integers
from sklearn.preprocessing import LabelEncoder

df_final = df_final.copy()  # Just to be safe
for col in object_cols:
    df_final.loc[:, col] = df_final[col].astype(str)
    le = LabelEncoder()
    df_final.loc[:, col] = le.fit_transform(df_final[col])

# Step 3: Confirm that the converted columns are now integers
int_cols = df_final.select_dtypes(include='int').columns.to_list()
print(int_cols)

['systolicBP_V1', 'diastolicBP_V1', 'PulseinV1', 'PreviousGDM10 V1', 'Chronic Illness', 'HighRisk']


In [217]:
from sklearn.preprocessing import LabelEncoder

# Columns to convert to integers
object_columns = [
    'Smoking 123',
    'Age_gt_30',
    'Vit D Deficiency',
    'GestationalHP',
    'Took Vit D Supplements',
    'Nutritional counselling',
    'GDM Diagonised'
]

# Dictionary to store label mappings
label_mappings = {}

# Apply encoding
for col in object_columns:
    df_final.loc[:, col] = df_final[col].astype(str)  # ensure all values are strings
    le = LabelEncoder()
    df_final.loc[:, col] = le.fit_transform(df_final[col])
    label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# Print the mappings (optional)
for col, mapping in label_mappings.items():
    print(f"Mapping for '{col}': {mapping}")

Mapping for 'Smoking 123': {'0': 0, '1': 1, '2': 2, '3': 3}
Mapping for 'Age_gt_30': {'0': 0, '1': 1}
Mapping for 'Vit D Deficiency': {'0': 0, '1': 1}
Mapping for 'GestationalHP': {'0': 0, '1': 1, '2': 2}
Mapping for 'Took Vit D Supplements': {'0': 0, '1': 1, '2': 2}
Mapping for 'Nutritional counselling': {'0': 0, '1': 1}
Mapping for 'GDM Diagonised': {'0': 0, '1': 1, '2': 2}


In [225]:
 df_final.head()

Unnamed: 0,systolicBP_V1,diastolicBP_V1,PulseinV1,BMIinV1,Smoking 123,PreviousGDM10 V1,Chronic Illness,Age_gt_30,Platelet_V1,Albumin_V1,U Albumin_V1,U Protein_V1,ALT_V1,V1 CRP.1,Vit D Deficiency,HighRisk,GestationalHP,Took Vit D Supplements,Nutritional counselling,GDM Diagonised
0,114,58,73,20.650699,3,0,0,1,203.0,,300.0,0.02,10.0,0.45,0,0,0,0,0,1
1,178,78,84,29.215625,3,0,0,1,233.0,,,0.09,12.0,0.1,0,0,0,0,1,2
2,123,62,79,26.063378,3,0,0,1,330.0,,,0.1,15.0,1.15,0,0,0,0,0,1
3,115,68,82,24.736333,2,0,0,0,253.0,,300.0,0.03,12.0,0.22,0,0,0,0,0,1
4,116,61,92,23.383904,2,0,0,1,217.0,,300.0,0.05,9.0,0.41,0,0,0,0,0,1


In [185]:
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 1. Select only numeric columns to avoid type issues
df_numeric = df_final.select_dtypes(include=['number'])

# 2. Drop any remaining NaNs just in case
df_numeric = df_numeric.dropna()

# 3. Add intercept term
X = add_constant(df_numeric)

# 4. Compute VIFs
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# 5. Display results
print(vif_data)

             Feature         VIF
0              const  460.285607
1      systolicBP_V1    1.474383
2     diastolicBP_V1    1.433343
3          PulseinV1    1.134141
4            BMIinV1    1.605460
5   PreviousGDM10 V1    2.350795
6    Chronic Illness    1.066114
7        Platelet_V1    1.065271
8         Albumin_V1    1.103865
9       U Albumin_V1    1.103513
10      U Protein_V1    1.088296
11            ALT_V1    1.044426
12          V1 CRP.1    1.486555
13          HighRisk    2.375579


In [231]:
df_final.isna().sum()

systolicBP_V1                0
diastolicBP_V1               0
PulseinV1                    0
BMIinV1                      0
Smoking 123                  0
PreviousGDM10 V1             0
Chronic Illness              0
Age_gt_30                    0
Platelet_V1                  1
Albumin_V1                 284
U Albumin_V1               264
U Protein_V1                 4
ALT_V1                      42
V1 CRP.1                    56
Vit D Deficiency             0
HighRisk                     0
GestationalHP                0
Took Vit D Supplements       0
Nutritional counselling      0
GDM Diagonised               0
dtype: int64

In [181]:
X.columns

Index(['const', 'systolicBP_V1', 'diastolicBP_V1', 'PulseinV1', 'BMIinV1',
       'PreviousGDM10 V1', 'Chronic Illness', 'Platelet_V1', 'Albumin_V1',
       'U Albumin_V1', 'U Protein_V1', 'ALT_V1', 'V1 CRP.1', 'HighRisk'],
      dtype='object')

In [183]:
df_final.columns

Index(['systolicBP_V1', 'diastolicBP_V1', 'PulseinV1', 'BMIinV1',
       'Smoking 123', 'PreviousGDM10 V1', 'Chronic Illness', 'Age_gt_30',
       'Platelet_V1', 'Albumin_V1', 'U Albumin_V1', 'U Protein_V1', 'ALT_V1',
       'V1 CRP.1', 'Vit D Deficiency', 'HighRisk', 'GestationalHP',
       'Took Vit D Supplements', 'Nutritional counselling', 'GDM Diagonised'],
      dtype='object')

In [187]:
vif_data 

Unnamed: 0,Feature,VIF
0,const,460.285607
1,systolicBP_V1,1.474383
2,diastolicBP_V1,1.433343
3,PulseinV1,1.134141
4,BMIinV1,1.60546
5,PreviousGDM10 V1,2.350795
6,Chronic Illness,1.066114
7,Platelet_V1,1.065271
8,Albumin_V1,1.103865
9,U Albumin_V1,1.103513


In [193]:
df_numeric.shape

(164, 13)

In [195]:
df_numeric.dtypes

systolicBP_V1         int64
diastolicBP_V1        int64
PulseinV1             int64
BMIinV1             float64
PreviousGDM10 V1      int64
Chronic Illness       int64
Platelet_V1         float64
Albumin_V1          float64
U Albumin_V1        float64
U Protein_V1        float64
ALT_V1              float64
V1 CRP.1            float64
HighRisk              int64
dtype: object

In [201]:
df_final.dtypes

systolicBP_V1                int64
diastolicBP_V1               int64
PulseinV1                    int64
BMIinV1                    float64
Smoking 123                 object
PreviousGDM10 V1             int64
Chronic Illness              int64
Age_gt_30                   object
Platelet_V1                float64
Albumin_V1                 float64
U Albumin_V1               float64
U Protein_V1               float64
ALT_V1                     float64
V1 CRP.1                   float64
Vit D Deficiency            object
HighRisk                     int64
GestationalHP               object
Took Vit D Supplements      object
Nutritional counselling     object
GDM Diagonised              object
dtype: object