In [1]:
#pandas==1.2.4
#matplotlib==3.3.4
#numpy==1.19.5

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from boruta import BorutaPy
from sklearn.impute import SimpleImputer
import category_encoders as ce

In [15]:
# Predictors treated as categorical levels. The order is significant: it fixes
# the column order of the feature matrix assembled from df[categorical_columns].
# NOTE(review): by name, 'pregout', 'fuout', 'ltfdeliv' and 'lb' look like
# outcome / follow-up fields -- confirm they are known at prediction time,
# otherwise they leak the target.
categorical_columns = [
    'site', 'cid', 'gasource', 'magecat', 'medu_r2', 'meducat_r2',
    'paritycat', 'wttiming', 'priorcsec', 'anyanc', 'anctri', 'ancvisits',
    'ancvisitscat', 'vitcaliron', 'ttvaccine', 'hiv', 'bpmeas', 'urinetest',
    'anyus', 'lb', 'bsex', 'multiple', 'bagmask', 'bathed',
    'antehem', 'posthem', 'hypertensive', 'transverse', 'oblique', 'breech',
    'malp', 'induction', 'infdeliv', 'inffu', 'unplanhosp', 'hospcomp',
    'seizures', 'mantibiotics', 'corticosteroid', 'oxytocics', 'bldtrans', 'dcsuction',
    'magsulfate', 'hysterectomy', 'episiotomy', 'rentown', 'waterimp', 'waternotimp',
    'water30min', 'sanitation', 'floormat', 'cookfuel', 'bicycle', 'motorbike',
    'vehicle', 'electricity', 'television', 'refrigerator', 'computer', 'flipphone',
    'smartphone', 'pregout', 'fuout', 'ltfdeliv',
]

# Predictors parsed as numbers (mean-imputed and standardised further down).
numerical_columns = [
    'gaenrl',
    'mage',
    'schyears',
    'parity',
    'numfamily',
    'numrooms',
]

# Column holding the label the models are trained to predict.
target_variable = 'pretermalg'

In [19]:
# Load only the modelling columns. Everything is read as text so that each
# dtype conversion below is explicit rather than inferred by read_csv.
cols_to_read = categorical_columns + numerical_columns + [target_variable]

df = pd.read_csv('data.csv', usecols=cols_to_read, dtype=str)

for col in categorical_columns:
    df[col] = df[col].astype('category')

# errors='coerce' turns unparseable entries into NaN instead of raising.
for col in numerical_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# The target is addressed through `target_variable` (not a hard-coded column
# name) so this cell stays in sync with `cols_to_read` if the target changes.
df[target_variable] = pd.to_numeric(df[target_variable], errors='coerce')
# Recode label 2 as 0, leaving other values untouched.
# NOTE(review): presumably 1 = preterm and 2 = not preterm -- confirm against the codebook.
df[target_variable] = df[target_variable].replace({2: 0})

In [None]:
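# Disabled alternative loader (kept for reference): coerces every column, categorical ones included, to numeric.
# cols_to_read = categorical_columns + numerical_columns + [target_variable]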

# df = pd.read_csv('data.csv', usecols=cols_to_read, dtype=str)

# for col in cols_to_read:
#     df[col] = pd.to_numeric(df[col], errors='coerce')

# df[target_variable] = pd.to_numeric(df[target_variable], errors='coerce')
# df['pretermalg'] = df['pretermalg'].replace({2: 0})

In [20]:
# Discard rows without a label *before* fitting the imputers, so rows that are
# thrown away cannot shift the column means used to fill the kept rows.
# .copy() yields an independent frame, so the column assignments below operate
# on the data itself rather than on a filtered view of the original frame.
df = df.dropna(subset=[target_variable]).copy()

# Numeric gaps are filled with the column mean.
num_imputer = SimpleImputer(strategy='mean')
df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])

# Categorical gaps become an explicit 'unknown' level.
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

# NOTE(review): the mean imputer is fitted on every labelled row, i.e. before
# the cross-validation split performed later in the notebook.

In [21]:
# Disabled alternative, kept for reference: one-hot encoding.
# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# encoded_categorical = encoder.fit_transform(df[categorical_columns])

# Active choice: target encoding. The categorical columns of `df` are
# overwritten in place with their encoded values, so re-running this cell
# without re-running the cells above would encode already-encoded data.
# NOTE(review): the encoder is fitted on all rows together with the label,
# before the cross-validation further down -- held-out folds therefore see
# encodings derived from their own targets, which likely inflates the AUC.
target_encoder = ce.TargetEncoder(cols=categorical_columns)
df[categorical_columns] = target_encoder.fit_transform(
    df[categorical_columns],
    df[target_variable],
)

In [22]:
# Standardise the numeric predictors; the fitted scaler is kept so the same
# transformation can be re-applied later.
scaler = StandardScaler().fit(df[numerical_columns])
normalized_numerical = scaler.transform(df[numerical_columns])

In [23]:
# Disabled variant that used the one-hot matrix instead:
# X = np.hstack([encoded_categorical, normalized_numerical])

# Feature matrix: encoded categorical columns first (in `categorical_columns`
# order), followed by the standardised numeric columns.
X = np.hstack((df[categorical_columns].to_numpy(), normalized_numerical))

# Label vector as a 1-D integer array.
y = df[target_variable].astype(int).to_numpy()

In [24]:
# Base estimator for Boruta: a shallow random forest that uses every core and
# weights the classes with class_weight='balanced'.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    max_depth=5,
)

# Boruta feature selection, seeded so the selection is repeatable.
# NOTE(review): the selector is fitted on all rows with their labels, before
# the cross-validation below, so the CV estimate is likely optimistic.
feat_selector = BorutaPy(
    rf,
    n_estimators='auto',
    random_state=1,
)
feat_selector.fit(X, y)

# Keep only the columns flagged in Boruta's support mask.
support_mask = feat_selector.support_
selected_features = X[:, support_mask]

# 3. Model Training and Evaluation
# Every candidate is seeded so the AUC ranking is reproducible across kernel
# restarts instead of changing from run to run.
models = {
    "XGBClassifier": XGBClassifier(random_state=1),
    "CatBoostClassifier": CatBoostClassifier(silent=True, random_seed=1),
    "LGBMClassifier": LGBMClassifier(random_state=1),
    "RandomForestClassifier": RandomForestClassifier(random_state=1),
    # Add other models here
}

# One splitter shared by all models (built once, not per iteration): with a
# fixed integer seed every model is scored on the same stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Maps model name -> mean ROC AUC over the five folds.
# NOTE(review): `selected_features` comes from target encoding and Boruta that
# were fitted on all rows with their labels, so these scores are likely
# optimistic; treat them as a relative ranking, not an unbiased estimate.
results = {}
for name, model in models.items():
    scores = cross_val_score(model, selected_features, y, cv=skf, scoring='roc_auc')
    results[name] = np.mean(scores)  # Store the average score

# 4. Model Comparison and Visualization
best_model_name = max(results, key=results.get)
print(f"Best Model: {best_model_name} with AUC: {results[best_model_name]}")

KeyboardInterrupt: 