In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from boruta import BorutaPy
from sklearn.impute import SimpleImputer
import category_encoders as ce

In [2]:
categorical_columns = ['site', 'cid', 'gasource', 'magecat', 'medu_r2', 'meducat_r2', 'paritycat', 'wttiming',
                        'priorcsec', 'anyanc', 'anctri', 'ancvisits', 'ancvisitscat', 'vitcaliron', 'ttvaccine', 'hiv',
                        'bpmeas', 'urinetest', 'anyus', 'lb', 'bsex', 'multiple', 'bagmask',
                        'bathed', 'antehem', 'posthem', 'hypertensive', 'transverse', 'oblique',
                        'breech', 'malp', 'induction', 'infdeliv', 'inffu', 'unplanhosp', 'hospcomp', 'seizures',
                        'mantibiotics', 'corticosteroid', 'oxytocics', 'bldtrans', 'dcsuction', 'magsulfate',
                        'hysterectomy', 'episiotomy', 'rentown', 'waterimp', 'waternotimp', 'water30min', 'sanitation',
                        'floormat', 'cookfuel', 'bicycle', 'motorbike', 'vehicle', 'electricity', 'television',
                        'refrigerator', 'computer', 'flipphone', 'smartphone', 'pregout', 'fuout', 'ltfdeliv']

numerical_columns = ['gaenrl', 'mage', 'schyears', 'parity', 'numfamily', 'numrooms']

target_variable = 'pretermalg'

cols_to_read = categorical_columns + numerical_columns + [target_variable]

df = pd.read_csv('data.csv', usecols=cols_to_read, dtype=str)
df['pretermalg'] = pd.to_numeric(df['pretermalg'], errors='coerce')
df = df.dropna(subset=['pretermalg'])
df['pretermalg'] = df['pretermalg'].replace({2: 0})


In [3]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[categorical_columns + numerical_columns], 
                                                    df[target_variable], 
                                                    test_size=0.2, 
                                                    random_state=1)

# Imputation for training data
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')

# Ensure all numerical columns are numeric and contain no non-numeric values
X_train[numerical_columns] = X_train[numerical_columns].apply(pd.to_numeric, errors='coerce')
X_test[numerical_columns] = X_test[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Apply imputers
X_train[numerical_columns] = num_imputer.fit_transform(X_train[numerical_columns])
X_train[categorical_columns] = cat_imputer.fit_transform(X_train[categorical_columns])


In [4]:
# One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat_encoded = encoder.fit_transform(X_train[categorical_columns])
X_test_cat_encoded = encoder.transform(X_test[categorical_columns])

# Combine encoded categorical data with numerical data
X_train_final = np.hstack([X_train_cat_encoded, X_train[numerical_columns]])
X_test_final = np.hstack([X_test_cat_encoded, X_test[numerical_columns]])

# Scaling
scaler = StandardScaler()
X_train_final_scaled = scaler.fit_transform(X_train_final)
X_test_final_scaled = scaler.transform(X_test_final)

In [5]:
# Preprocessing for test data (transform only)
# Apply numerical imputer
X_test[numerical_columns] = num_imputer.transform(X_test[numerical_columns])

# Apply categorical imputer
X_test[categorical_columns] = cat_imputer.transform(X_test[categorical_columns])

# Apply one-hot encoding to categorical columns
X_test_cat_encoded = encoder.transform(X_test[categorical_columns])

# Combine encoded categorical data with numerical data
X_test_final = np.hstack([X_test_cat_encoded, X_test[numerical_columns]])

# Scaling the combined data
X_test_final_scaled = scaler.transform(X_test_final)


In [6]:
# Feature selection
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
feat_selector.fit(X_train_final, y_train)
selected_features_train = X_train_final[:, feat_selector.support_]
selected_features_test = X_test_final[:, feat_selector.support_]


In [7]:
# Model Training and Evaluation
models = {
    "XGBClassifier": XGBClassifier(),
    "CatBoostClassifier": CatBoostClassifier(silent=True),
    "LGBMClassifier": LGBMClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
}

results = {}
for name, model in models.items():
    model.fit(selected_features_train, y_train)
    y_pred = model.predict_proba(selected_features_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred)
    results[name] = auc_score

# Model Comparison
best_model_name = max(results, key=results.get)
print(f"Best Model: {best_model_name} with AUC: {results[best_model_name]}")

[LightGBM] [Info] Number of positive: 16163, number of negative: 94479
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 110642, number of used features: 106
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.146084 -> initscore=-1.765653
[LightGBM] [Info] Start training from score -1.765653
Best Model: CatBoostClassifier with AUC: 0.7802893026922387
