In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

In [22]:
import joblib
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    PrecisionRecallDisplay,
    RocCurveDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [3]:
# Probe a few likely locations for the cleaned dataset and load the first hit

CANDIDATE_PATHS = [
    "../data/hypertension_cleaned.csv",
    "./data/hypertension_cleaned.csv",
    "./hypertension_cleaned.csv",
    "/mnt/data/hypertension_cleaned.csv"
]

# First candidate that exists on disk, or None when every candidate is missing
data_path = next(
    (candidate for candidate in CANDIDATE_PATHS if os.path.exists(candidate)),
    None,
)
if data_path is None:
    raise FileNotFoundError(f"Could not find hypertension_cleaned.csv in any of: {CANDIDATE_PATHS}")

data = pd.read_csv(data_path)
print(f"Loaded data from: {data_path} -> shape={data.shape}")

Loaded data from: ../data/hypertension_cleaned.csv -> shape=(8835, 7)


In [4]:
# Preview the first rows of the raw frame
data.head()

Unnamed: 0,Age,BMI,Systolic_BP,Diastolic_BP,Heart_Rate,Gender,Hypertension
0,83,16.6,156,63,91,Male,High
1,37,38.7,102,85,51,Male,High
2,73,26.4,92,102,50,Male,High
3,65,17.3,100,84,75,Female,High
4,85,25.7,177,64,93,Female,High


In [5]:
# Column dtypes and non-null counts of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8835 entries, 0 to 8834
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           8835 non-null   int64  
 1   BMI           8835 non-null   float64
 2   Systolic_BP   8835 non-null   int64  
 3   Diastolic_BP  8835 non-null   int64  
 4   Heart_Rate    8835 non-null   int64  
 5   Gender        8835 non-null   object 
 6   Hypertension  8835 non-null   object 
dtypes: float64(1), int64(4), object(2)
memory usage: 483.3+ KB


In [6]:
# (rows, columns) of the raw frame
data.shape

(8835, 7)

### Basic Data Cleaning & Quick Sanity Checks

In [7]:
# Work on an independent copy so the cleaning steps leave `data` as loaded
df = data.copy(deep=True)

In [8]:
# Encode the target column as binary: 'Low' -> 0, 'High' -> 1
assert 'Hypertension' in df.columns, 'Expected a "Hypertension" column'

# Normalise surrounding whitespace and letter case before looking labels up
target_labels = df['Hypertension'].str.strip().str.capitalize()
df['Hypertension'] = target_labels.map({'Low': 0, 'High': 1})

In [9]:
# Encode Gender as 1 (Male) / 0 (Female). Whitespace and letter case are
# normalised first, mirroring the Hypertension encoding, so values such as
# ' male' or 'FEMALE' are not turned into NaN by the mapping.
df['Gender'] = (
    df['Gender']
    .str.strip()
    .str.capitalize()
    .map({'Male': 1, 'Female': 0})
)

In [11]:
# Labels that the mapping did not recognise are NaN at this point; fail loudly
unmapped_target = df['Hypertension'].isna()
if unmapped_target.any():
    raise ValueError("Hypertension column contains unexpected values")

# Safe to cast now that no NaN remains
df['Hypertension'] = df['Hypertension'].astype(int)

In [12]:
# Gender values that the mapping did not recognise are NaN at this point; fail loudly
unmapped_gender = df['Gender'].isna()
if unmapped_gender.any():
    raise ValueError('Gender column contains unexpected values')

# Safe to cast now that no NaN remains
df['Gender'] = df['Gender'].astype(int)

In [13]:
# Preview the frame after encoding Gender and Hypertension as integers
df.head()

Unnamed: 0,Age,BMI,Systolic_BP,Diastolic_BP,Heart_Rate,Gender,Hypertension
0,83,16.6,156,63,91,1,1
1,37,38.7,102,85,51,1,1
2,73,26.4,92,102,50,1,1
3,65,17.3,100,84,75,0,1
4,85,25.7,177,64,93,0,1


In [14]:
# Absolute number of rows per target class
df['Hypertension'].value_counts()


Hypertension
1    6368
0    2467
Name: count, dtype: int64

In [15]:
# Separate predictors from the target, then bucket the predictors by dtype
X = df.drop(columns=['Hypertension'])
y = df['Hypertension']

numericals_cols = list(X.select_dtypes(include=['number']).columns)
categorical_cols = list(X.select_dtypes(exclude=['number']).columns)

In [16]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
# Shapes of the training and test feature matrices
X_train.shape, X_test.shape

((7068, 6), (1767, 6))

### Preprocessing

In [29]:
# Standardise the numeric columns; any column not listed is passed through as-is
numeric_scaler = ('num', StandardScaler(), numericals_cols)
preprocess = ColumnTransformer(transformers=[numeric_scaler], remainder='passthrough')

print("Preprocessing pipeline created")

Preprocessing pipeline created


In [19]:
# Report the class proportions and which columns fell into each dtype bucket
class_share = y.value_counts(normalize=True)
print("Target distribution :")
print(class_share)
print("\nNumeric columns:", numericals_cols)
print("Categorical columns:", categorical_cols)

Target distribution :
Hypertension
1    0.72077
0    0.27923
Name: proportion, dtype: float64

Numeric columns: ['Age', 'BMI', 'Systolic_BP', 'Diastolic_BP', 'Heart_Rate', 'Gender']
Categorical columns: []


In [None]:
# SMOTE is a sampler (it exposes fit_resample, not transform), so it has to sit
# in imblearn's Pipeline; scikit-learn's Pipeline only accepts transformers as
# intermediate steps.
# The final step is a bare XGBClassifier rather than `xgb_model`: that name is
# only defined further down the notebook, and it is itself a pipeline that
# already contains the preprocessing step, so the features would be scaled twice.
# No scale_pos_weight is set here because SMOTE oversamples the minority class
# during fit.
pipeline = ImbPipeline(steps=[
    # Cloned so fitting this pipeline does not refit the shared `preprocess` object
    ("preprocess", clone(preprocess)),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        n_jobs=-1
    ))
])


### Base Models

#### Model A: Random Forest

In [30]:
# Model A: Random Forest
rf_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',  # reweight classes to counter the imbalance
    random_state=42,
    n_jobs=-1,  # use every available CPU core
)

# Scaling followed by the classifier, wrapped as one estimator
rf_model = Pipeline(steps=[("preprocess", preprocess), ("clf", rf_classifier)])

In [32]:
# Fit the Random Forest pipeline (preprocessing + classifier) on the training split
rf_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,15
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
# Hard label predictions for the training and test splits
y_train_pred_rf, y_test_pred_rf = (
    rf_model.predict(split) for split in (X_train, X_test)
)
# Second predict_proba column on the test split (used for ROC-AUC)
y_test_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

#### Model B: XGBoost Model

In [36]:
# Ratio of negative (0) to positive (1) training samples, passed to XGBoost
# as scale_pos_weight to compensate for the class imbalance
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive
print(f"Scale pos weight: {scale_pos_weight:.2f}")

Scale pos weight: 0.39


In [37]:
# Model B: XGBoost
xgb_model = Pipeline(steps=[
    # Cloned so this pipeline fits its own scaler instead of sharing, and
    # refitting in place, the instance that rf_model also holds
    ("preprocess", clone(preprocess)),
    ("clf", XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,  # Handle imbalance (negatives / positives)
        random_state=42,
        eval_metric='logloss',
        # use_label_encoder is intentionally not passed: recent XGBoost releases
        # no longer use it and only warn about an unused parameter
        n_jobs=-1
    ))
])

In [38]:
# Fit the XGBoost pipeline (preprocessing + classifier) on the training split
xgb_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [43]:
# Hard label predictions for the training and test splits
y_train_pred_xgb, y_test_pred_xgb = (
    xgb_model.predict(split) for split in (X_train, X_test)
)
# Second predict_proba column on the test split (used for ROC-AUC)
y_test_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

### Evaluate base models

In [52]:
def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    """
    Print and return evaluation metrics for a binary (0/1) classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted hard labels, using the same encoding as ``y_true``.
    y_pred_proba : array-like or None
        Scores for class 1. Pass ``None`` to skip the ROC-AUC computation.
    model_name : str
        Heading printed above the results.

    Returns
    -------
    dict
        ``accuracy``, ``sensitivity``, ``specificity``, ``roc_auc`` (``None``
        when it was not computed) and ``confusion_matrix`` (2x2, rows are the
        true class and columns the predicted class, in label order 0, 1).
        ``sensitivity`` / ``specificity`` are NaN when the corresponding class
        has no samples in ``y_true``.
    """
    print(f"{model_name} - EVALUATION RESULTS")

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # Pin the label set so the report and the confusion matrix always cover
    # both classes, even when one of them is absent from y_true / y_pred.
    # Without this the 2x2 unpacking below fails on single-class input.
    labels = [0, 1]

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred,
                                labels=labels,
                                target_names=['0', '1'],
                                digits=4))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(f"\nTrue Negatives:  {cm[0,0]:4d}  |  False Positives: {cm[0,1]:4d}")
    print(f"False Negatives: {cm[1,0]:4d}  |  True Positives:  {cm[1,1]:4d}")

    # Additional Metrics
    tn, fp, fn, tp = cm.ravel()
    # Guard the denominators: a class with no true samples has no defined recall
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')  # Recall for positive class
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')  # Recall for negative class

    print(f"\nSensitivity (Recall): {sensitivity:.4f}")
    print(f"Specificity:{specificity:.4f}\n")

    # ROC-AUC Score (always bound, so the return statement needs no special case)
    roc_auc = None
    if y_pred_proba is not None:
        if len(np.unique(y_true)) < 2:
            # roc_auc_score raises when y_true contains a single class
            print("ROC-AUC Score: undefined (only one class present in y_true)")
        else:
            roc_auc = roc_auc_score(y_true, y_pred_proba)
            print(f"ROC-AUC Score:{roc_auc:.4f}")

    return {
        'accuracy': accuracy,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'roc_auc': roc_auc,
        'confusion_matrix': cm
    }

In [53]:
# Evaluate Random Forest on both splits. No probabilities are passed for the
# training split, so ROC-AUC is only computed for the test split.
rf_train_metrics = evaluate_model(
    y_true=y_train,
    y_pred=y_train_pred_rf,
    y_pred_proba=None,
    model_name="RANDOM FOREST - TRAINING SET",
)
rf_test_metrics = evaluate_model(
    y_true=y_test,
    y_pred=y_test_pred_rf,
    y_pred_proba=y_test_pred_proba_rf,
    model_name="RANDOM FOREST - TEST SET",
)

RANDOM FOREST - TRAINING SET - EVALUATION RESULTS

Accuracy: 0.9706 (97.06%)

Classification Report:
              precision    recall  f1-score   support

           0     0.9662    0.9271    0.9462      1974
           1     0.9722    0.9874    0.9797      5094

    accuracy                         0.9706      7068
   macro avg     0.9692    0.9572    0.9630      7068
weighted avg     0.9705    0.9706    0.9704      7068


True Negatives:  1830  |  False Positives:  144
False Negatives:   64  |  True Positives:  5030

Sensitivity (Recall): 0.9874
Specificity:0.9271

RANDOM FOREST - TEST SET - EVALUATION RESULTS

Accuracy: 0.6684 (66.84%)

Classification Report:
              precision    recall  f1-score   support

           0     0.2896    0.1298    0.1793       493
           1     0.7225    0.8768    0.7922      1274

    accuracy                         0.6684      1767
   macro avg     0.5061    0.5033    0.4857      1767
weighted avg     0.6017    0.6684    0.6212      1767




In [54]:
# Evaluate XGBoost on both splits. No probabilities are passed for the
# training split, so ROC-AUC is only computed for the test split.
xgb_train_metrics = evaluate_model(
    y_true=y_train,
    y_pred=y_train_pred_xgb,
    y_pred_proba=None,
    model_name="XGBOOST - TRAINING SET",
)
xgb_test_metrics = evaluate_model(
    y_true=y_test,
    y_pred=y_test_pred_xgb,
    y_pred_proba=y_test_pred_proba_xgb,
    model_name="XGBOOST - TEST SET",
)


XGBOOST - TRAINING SET - EVALUATION RESULTS

Accuracy: 0.9202 (92.02%)

Classification Report:
              precision    recall  f1-score   support

           0     0.7950    0.9625    0.8708      1974
           1     0.9842    0.9038    0.9423      5094

    accuracy                         0.9202      7068
   macro avg     0.8896    0.9332    0.9065      7068
weighted avg     0.9313    0.9202    0.9223      7068


True Negatives:  1900  |  False Positives:   74
False Negatives:  490  |  True Positives:  4604

Sensitivity (Recall): 0.9038
Specificity:0.9625

XGBOOST - TEST SET - EVALUATION RESULTS

Accuracy: 0.5863 (58.63%)

Classification Report:
              precision    recall  f1-score   support

           0     0.2990    0.3590    0.3263       493
           1     0.7311    0.6743    0.7015      1274

    accuracy                         0.5863      1767
   macro avg     0.5150    0.5166    0.5139      1767
weighted avg     0.6105    0.5863    0.5968      1767


True Negativ

### Model Comparison

In [55]:
# Side-by-side table of the test-set metrics for both models.
# metric_keys follows the same order as the display names in the 'Metric' column.
metric_keys = ['accuracy', 'sensitivity', 'specificity', 'roc_auc']

comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Sensitivity', 'Specificity', 'ROC-AUC'],
    'Random Forest': [rf_test_metrics[key] for key in metric_keys],
    'XGBoost': [xgb_test_metrics[key] for key in metric_keys],
})

print("\n", comparison_df.to_string(index=False))



      Metric  Random Forest  XGBoost
   Accuracy       0.668364 0.586304
Sensitivity       0.876766 0.674254
Specificity       0.129817 0.359026
    ROC-AUC       0.502726 0.508905


In [57]:
# Keep the model with the strictly higher test ROC-AUC; a tie goes to XGBoost
rf_wins = rf_test_metrics['roc_auc'] > xgb_test_metrics['roc_auc']

best_model_name = "Random Forest" if rf_wins else "XGBoost"
best_model = rf_model if rf_wins else xgb_model
print(best_model_name)

XGBoost
