In [3]:
import pandas as pd
import numpy as np
# Compatibility shim: recreate the `np.int` / `np.float` / `np.bool` aliases
# that recent NumPy releases no longer provide (accessing a missing one raises
# AttributeError). Presumably needed by a dependency imported below that still
# references them -- TODO confirm which one.
# An alias is only created when it is absent from the module namespace, so a
# NumPy build that exports its own `np.bool` scalar type is left untouched
# instead of being overwritten with the builtin.
for _alias, _builtin in (("int", int), ("float", float), ("bool", bool)):
    if _alias not in vars(np):
        setattr(np, _alias, _builtin)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, precision_recall_curve, auc, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
import category_encoders as ce
from boruta import BorutaPy
import shap

# Columns handled as categorical features: they are imputed with a constant
# and target-encoded further down in this script.
# NOTE(review): names such as 'pregout', 'fuout' and 'lb' look like they could
# describe outcomes rather than predictors -- confirm against the codebook
# that none of them leaks the target.
categorical_columns = [
    'site', 'cid', 'gasource', 'magecat',
    'medu_r2', 'meducat_r2', 'paritycat', 'wttiming',
    'priorcsec', 'anyanc', 'anctri', 'ancvisits',
    'ancvisitscat', 'vitcaliron', 'ttvaccine', 'hiv',
    'bpmeas', 'urinetest', 'anyus', 'lb',
    'bsex', 'multiple', 'bagmask', 'bathed',
    'antehem', 'posthem', 'hypertensive', 'transverse',
    'oblique', 'breech', 'malp', 'induction',
    'infdeliv', 'inffu', 'unplanhosp', 'hospcomp',
    'seizures', 'mantibiotics', 'corticosteroid', 'oxytocics',
    'bldtrans', 'dcsuction', 'magsulfate', 'hysterectomy',
    'episiotomy', 'rentown', 'waterimp', 'waternotimp',
    'water30min', 'sanitation', 'floormat', 'cookfuel',
    'bicycle', 'motorbike', 'vehicle', 'electricity',
    'television', 'refrigerator', 'computer', 'flipphone',
    'smartphone', 'pregout', 'fuout', 'ltfdeliv',
]

# Columns handled as numeric features: coerced to numbers, mean-imputed and
# standardised further down in this script.
numerical_columns = [
    'gaenrl',
    'mage',
    'schyears',
    'parity',
    'numfamily',
    'numrooms',
]

# Name of the label column.
target_variable = 'pretermalg'

# Every column requested from the CSV: all features followed by the label.
cols_to_read = [*categorical_columns, *numerical_columns, target_variable]

# Load only the modelling columns, all as text, then clean the label:
#   1. parse the label as a number (unparseable entries become NaN),
#   2. drop rows whose label is missing,
#   3. recode label value 2 to 0 (presumably 2 marks the negative class --
#      confirm against the codebook).
df = (
    pd.read_csv('data.csv', dtype=str, usecols=cols_to_read)
    .assign(pretermalg=lambda frame: pd.to_numeric(frame['pretermalg'], errors='coerce'))
    .dropna(subset=['pretermalg'])
    .assign(pretermalg=lambda frame: frame['pretermalg'].replace({2: 0}))
)

# Hold out 20% of the rows as the test set; the split is seeded so it is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[categorical_columns + numerical_columns],
    df[target_variable],
    random_state=1,
    test_size=0.2,
)

# Preprocessing. Every transformer is fitted on the training split only and
# then applied unchanged to the test split.

# The CSV is read as text, so turn the numeric columns into real numbers
# first; anything that does not parse becomes NaN and is imputed below.
for split_frame in (X_train, X_test):
    split_frame[numerical_columns] = split_frame[numerical_columns].apply(
        pd.to_numeric, errors='coerce'
    )

# Numeric gaps -> training-set column mean.
num_imputer = SimpleImputer(strategy='mean')
X_train[numerical_columns] = num_imputer.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = num_imputer.transform(X_test[numerical_columns])

# Categorical gaps -> the explicit placeholder level 'unknown'.
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
X_train[categorical_columns] = cat_imputer.fit_transform(X_train[categorical_columns])
X_test[categorical_columns] = cat_imputer.transform(X_test[categorical_columns])

# Target-encode the categorical columns using the training labels.
target_encoder = ce.TargetEncoder()
X_train_encoded = target_encoder.fit_transform(X_train[categorical_columns], y_train)
X_test_encoded = target_encoder.transform(X_test[categorical_columns])

# Standardise the numeric columns.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

# Final model matrices: encoded categorical columns first, scaled numeric
# columns after them (same column order for train and test).
X_train_final = np.concatenate([X_train_encoded, X_train_scaled], axis=1)
X_test_final = np.concatenate([X_test_encoded, X_test_scaled], axis=1)

# Feature selection: run Boruta on the training matrix, driven by a shallow,
# class-weighted random forest, then keep the same columns in both splits.
rf = RandomForestClassifier(max_depth=5, class_weight='balanced', n_jobs=-1)
feat_selector = BorutaPy(rf, random_state=1, n_estimators='auto')
feat_selector.fit(X_train_final, y_train)

# feat_selector.support_ is applied as a column mask to train and test alike,
# so both matrices end up with identical columns.
selected_features_train, selected_features_test = (
    matrix[:, feat_selector.support_] for matrix in (X_train_final, X_test_final)
)

# Model training, evaluation and explanation.
#
# Every model is trained and scored on the encoded, scaled and Boruta-filtered
# matrices (selected_features_train / selected_features_test). The raw
# X_train / X_test frames still hold the categorical columns as strings (the
# CSV is read with dtype=str and gaps are filled with 'unknown'), so fitting
# on them would bypass the preprocessing and feature selection entirely.

# X_train_final / X_test_final are built as [encoded categorical | scaled
# numerical], so applying the Boruta mask to that column order recovers the
# names of the surviving features.
selected_feature_names = [
    column
    for column, keep in zip(categorical_columns + numerical_columns, feat_selector.support_)
    if keep
]

# Maps model name -> test-set ROC AUC; used to pick the best model below.
results = {}

models = {
    "XGBClassifier": XGBClassifier(),
    "CatBoostClassifier": CatBoostClassifier(silent=True),
    "LGBMClassifier": LGBMClassifier(),
    "RandomForestClassifier": RandomForestClassifier()
}

for name, model in models.items():
    # Fit on the selected training features
    model.fit(selected_features_train, y_train)

    # Positive-class probabilities (for AUC / ROC) and hard class predictions
    # (for precision, recall and the confusion matrix)
    y_pred_prob = model.predict_proba(selected_features_test)[:, 1]
    y_pred = model.predict(selected_features_test)

    # Calculate metrics
    auc_score = roc_auc_score(y_test, y_pred_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Store results
    results[name] = auc_score

    # Print metrics
    print(f"Model: {name}")
    print(f"AUC: {auc_score}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    # ROC curve (one figure per model)
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc_score:.2f})")
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues')
    plt.title(f"Confusion Matrix for {name}")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Identify the best model by test-set AUC. It was already fitted in the loop
# above and is deliberately not refitted: none of the models has a fixed
# random_state, so a second fit could yield a different model from the one
# whose metrics were just reported.
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

# SHAP analysis on the same feature matrices the model was trained and
# evaluated on, labelled with the names of the selected features.
explainer = shap.Explainer(best_model, selected_features_train)
shap_values = explainer(selected_features_test)

shap.summary_plot(shap_values, selected_features_test, feature_names=selected_feature_names)

In the future `np.bool` will be defined as the corresponding NumPy scalar.


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations