## Introduction
This notebook focuses on transforming the cleaned dataset into a model-ready format through preprocessing and feature selection. Categorical variables were encoded using one-hot or label encoding methods, and numerical features were normalized using Min-Max Scaling to ensure consistency across features. Outlier detection and clipping were performed to mitigate their influence on training. Redundant and low-value features were also dropped based on correlation and domain relevance. These preprocessing techniques were critical for optimizing model performance and ensuring the data aligned with the requirements of various machine learning algorithms.

In [1]:
# Importing all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif
import joblib

# Importing additional libraries for advanced tasks
try:
    import plotly.express as px
    import imblearn
    from feature_engine import imputation
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    import missingno as msno
    import statsmodels.api as sm
except ImportError:
    print("Optional libraries not installed. Install them if needed.")
# All libraries are now ready to use
print("Libraries imported successfully!")

# Importing the warnings library to ignore warnings
import warnings
warnings.filterwarnings('ignore')

print("Warnings will now be ignored!")

Optional libraries not installed. Install them if needed.
Libraries imported successfully!


In [None]:
# Load the cleaned datasets
cleaned_diabetic_data = pd.read_csv("cleaned_diabetic_data.csv")
cleaned_IDS_mapping = pd.read_csv("cleaned_IDS_mapping.csv")

# Display a summary of the loaded datasets
print("\n--- Cleaned Diabetic Data ---\n")
print(cleaned_diabetic_data.info())
print(cleaned_diabetic_data.head())

print("\n--- Cleaned IDS Mapping Data ---\n")
print(cleaned_IDS_mapping.info())
print(cleaned_IDS_mapping.head())

In [3]:
cleaned_diabetic_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_description,weight_flag,outlier_flag
0,2278392,8222157,Caucasian,Female,5,Unknown,6,25,1,1,...,No,No,No,No,No Change,No,Not Readmitted,Transfer from another health care facility,Unknown,False
1,149190,55629189,Caucasian,Female,15,Unknown,1,1,7,3,...,No,No,No,No,Change,Yes,Readmitted,Physician Referral,Unknown,False
2,64410,86047875,AfricanAmerican,Female,25,Unknown,1,1,7,2,...,No,No,No,No,No Change,Yes,Not Readmitted,Physician Referral,Unknown,False
3,500364,82442376,Caucasian,Male,35,Unknown,1,1,7,2,...,No,No,No,No,Change,Yes,Not Readmitted,Physician Referral,Unknown,False
4,16680,42519267,Caucasian,Male,45,Unknown,1,1,7,1,...,No,No,No,No,Change,Yes,Not Readmitted,Physician Referral,Unknown,False


In [4]:
cleaned_diabetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 53 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   encounter_id                101766 non-null  int64 
 1   patient_nbr                 101766 non-null  int64 
 2   race                        101766 non-null  object
 3   gender                      101766 non-null  object
 4   age                         101766 non-null  int64 
 5   weight                      101766 non-null  object
 6   admission_type_id           101766 non-null  int64 
 7   discharge_disposition_id    101766 non-null  int64 
 8   admission_source_id         101766 non-null  int64 
 9   time_in_hospital            101766 non-null  int64 
 10  payer_code                  101766 non-null  object
 11  medical_specialty           101766 non-null  object
 12  num_lab_procedures          101766 non-null  int64 
 13  num_procedures              1

In [5]:
cleaned_diabetic_data.isna().sum()

encounter_id                      0
patient_nbr                       0
race                              0
gender                            0
age                               0
weight                            0
admission_type_id                 0
discharge_disposition_id          0
admission_source_id               0
time_in_hospital                  0
payer_code                        0
medical_specialty                 0
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency                  0
number_inpatient                  0
diag_1                            0
diag_2                            0
diag_3                            0
number_diagnoses                  0
max_glu_serum                 96420
A1Cresult                     84748
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide              

In [6]:
# Replace missing values with "Unknown"
cleaned_diabetic_data = cleaned_diabetic_data.fillna("Unknown")

# Confirm that there are no missing values remaining
missing_summary = cleaned_diabetic_data.isnull().sum()

# Display summary of missing values to confirm
missing_summary

encounter_id                  0
patient_nbr                   0
race                          0
gender                        0
age                           0
weight                        0
admission_type_id             0
discharge_disposition_id      0
admission_source_id           0
time_in_hospital              0
payer_code                    0
medical_specialty             0
num_lab_procedures            0
num_procedures                0
num_medications               0
number_outpatient             0
number_emergency              0
number_inpatient              0
diag_1                        0
diag_2                        0
diag_3                        0
number_diagnoses              0
max_glu_serum                 0
A1Cresult                     0
metformin                     0
repaglinide                   0
nateglinide                   0
chlorpropamide                0
glimepiride                   0
acetohexamide                 0
glipizide                     0
glyburid

# Encode categorical variables using one-hot encoding for simplicity and scalability
categorical_columns = [
    "payer_code", "medical_specialty", "diag_1", "diag_2", "diag_3",
    "max_glu_serum", "A1Cresult", "metformin", "repaglinide", "nateglinide",
    "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide",
    "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol",
    "troglitazone", "tolazamide", "examide", "citoglipton", "insulin",
    "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone",
    "metformin-rosiglitazone", "metformin-pioglitazone", "change", "diabetesMed",
    "readmitted", "admission_type_description", "weight_flag"
]

# Perform one-hot encoding
encoded_data = pd.get_dummies(cleaned_diabetic_data, columns=categorical_columns, drop_first=True)

# Check the resulting dataset structure
encoded_data.shape

In [8]:
# List of categorical columns to factorize
categorical_columns = [
    "payer_code", "medical_specialty", "diag_1", "diag_2", "diag_3", "race", "gender", "weight",
    "max_glu_serum", "A1Cresult", "metformin", "repaglinide", "nateglinide",
    "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide",
    "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol",
    "troglitazone", "tolazamide", "examide", "citoglipton", "insulin",
    "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone",
    "metformin-rosiglitazone", "metformin-pioglitazone", "change", "diabetesMed",
    "readmitted", "admission_type_description", "weight_flag"
]


# Perform factorization for the categorical columns
for column in categorical_columns:
    cleaned_diabetic_data[column], _ = pd.factorize(cleaned_diabetic_data[column])


# Confirm the result
cleaned_diabetic_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_description,weight_flag,outlier_flag
0,2278392,8222157,0,0,5,0,6,25,1,1,...,0,0,0,0,0,0,0,0,0,False
1,149190,55629189,0,0,15,0,1,1,7,3,...,0,0,0,0,1,1,1,1,0,False
2,64410,86047875,1,0,25,0,1,1,7,2,...,0,0,0,0,0,1,0,1,0,False
3,500364,82442376,0,1,35,0,1,1,7,2,...,0,0,0,0,1,1,0,1,0,False
4,16680,42519267,0,1,45,0,1,1,7,1,...,0,0,0,0,1,1,0,1,0,False


In [9]:
cleaned_diabetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 53 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   encounter_id                101766 non-null  int64
 1   patient_nbr                 101766 non-null  int64
 2   race                        101766 non-null  int64
 3   gender                      101766 non-null  int64
 4   age                         101766 non-null  int64
 5   weight                      101766 non-null  int64
 6   admission_type_id           101766 non-null  int64
 7   discharge_disposition_id    101766 non-null  int64
 8   admission_source_id         101766 non-null  int64
 9   time_in_hospital            101766 non-null  int64
 10  payer_code                  101766 non-null  int64
 11  medical_specialty           101766 non-null  int64
 12  num_lab_procedures          101766 non-null  int64
 13  num_procedures              101766 non-null 

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Identify numerical columns
numerical_columns = cleaned_diabetic_data.select_dtypes(include=['int64', 'float64']).columns

# Initialize the scaler
scaler = MinMaxScaler()

# Normalize the numerical features
cleaned_diabetic_data[numerical_columns] = scaler.fit_transform(cleaned_diabetic_data[numerical_columns])


# Check the result
cleaned_diabetic_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_description,weight_flag,outlier_flag
0,0.005105,0.043387,0.0,0.0,0.0,0.0,0.714286,0.888889,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,0.000308,0.293553,0.0,0.0,0.111111,0.0,0.0,0.0,0.25,0.181818,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.142857,0.0,False
2,0.000117,0.454072,0.2,0.0,0.222222,0.0,0.0,0.0,0.25,0.090909,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.142857,0.0,False
3,0.001099,0.435046,0.0,0.5,0.333333,0.0,0.0,0.0,0.25,0.090909,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.142857,0.0,False
4,9e-06,0.224372,0.0,0.5,0.444444,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.142857,0.0,False


In [11]:

# Drop unnecessary columns based on domain knowledge
columns_to_drop = ["encounter_id", "patient_nbr"]  # These columns are identifiers and do not contribute to predictions
cleaned_diabetic_data = cleaned_diabetic_data.drop(columns=columns_to_drop)

# Check the structure of the dataset after dropping columns
cleaned_diabetic_data.head(), cleaned_diabetic_data.columns.tolist()

(   race  gender       age  weight  admission_type_id  \
 0   0.0     0.0  0.000000     0.0           0.714286   
 1   0.0     0.0  0.111111     0.0           0.000000   
 2   0.2     0.0  0.222222     0.0           0.000000   
 3   0.0     0.5  0.333333     0.0           0.000000   
 4   0.0     0.5  0.444444     0.0           0.000000   
 
    discharge_disposition_id  admission_source_id  time_in_hospital  \
 0                  0.888889                 0.00          0.000000   
 1                  0.000000                 0.25          0.181818   
 2                  0.000000                 0.25          0.090909   
 3                  0.000000                 0.25          0.090909   
 4                  0.000000                 0.25          0.000000   
 
    payer_code  medical_specialty  ...  glipizide-metformin  \
 0         0.0           0.000000  ...                  0.0   
 1         0.0           0.013889  ...                  0.0   
 2         0.0           0.013889  ... 



# Define target and features
target_column = "readmitted"
X = cleaned_diabetic_data.drop(columns=[target_column])
y = cleaned_diabetic_data[target_column]

# Encode the target variable if needed
#y, _ = pd.factorize(y)

# Select only numeric features
X_numeric = X.select_dtypes(include=["number"])

# Normalize the numeric features for better performance of feature selection
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Compute mutual information scores
mutual_info_scores = mutual_info_classif(X_scaled, y, discrete_features="auto")

# Create a DataFrame to display feature importance
mutual_info_df = pd.DataFrame({
    "Feature": X_numeric.columns,
    "Mutual_Info_Score": mutual_info_scores
}).sort_values(by="Mutual_Info_Score", ascending=False)

# Display the top features
print(mutual_info_df)

# Save the selected features if needed
top_features = mutual_info_df[mutual_info_df["Mutual_Info_Score"] > 0.02]["Feature"].tolist()
X_selected = X_numeric[top_features]
X_selected.to_csv('selected_features.csv', index=False)

In [13]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Define target and features
target_column = "readmitted"
X = cleaned_diabetic_data.drop(columns=[target_column])
y = cleaned_diabetic_data[target_column]

# Select only numeric features
X_numeric = X.select_dtypes(include=["number"])

# Normalize the numeric features for better performance of feature selection
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Compute mutual information scores
mutual_info_scores = mutual_info_classif(X_scaled, y, discrete_features="auto")

# Create a DataFrame to display feature importance
mutual_info_df = pd.DataFrame({
    "Feature": X_numeric.columns,
    "Mutual_Info_Score": mutual_info_scores
}).sort_values(by="Mutual_Info_Score", ascending=False)

# Display the top features
print("\n--- Mutual Information Scores ---\n")
print(mutual_info_df)

# Select the top 10 features
top_10_features = mutual_info_df.head(10)["Feature"].tolist()
print(f"\nTop 10 Features Selected: {top_10_features}\n")

# Create a new DataFrame with only the top 10 features
X_selected = X_numeric[top_10_features]

# Save the selected features if needed
X_selected.to_csv('top_10_selected_features.csv', index=False)
print("\nSelected features saved to 'top_10_selected_features.csv'")


--- Mutual Information Scores ---

                       Feature  Mutual_Info_Score
15            number_inpatient           0.027713
5     discharge_disposition_id           0.015788
46                 diabetesMed           0.013309
17                      diag_2           0.013136
16                      diag_1           0.012956
19            number_diagnoses           0.012001
18                      diag_3           0.009744
14            number_emergency           0.009296
6          admission_source_id           0.009104
45                      change           0.008455
1                       gender           0.008368
13           number_outpatient           0.006561
9            medical_specialty           0.006229
47  admission_type_description           0.004280
7             time_in_hospital           0.004156
12             num_medications           0.004082
39                     insulin           0.003102
4            admission_type_id           0.002047
37            

In [14]:
from sklearn.decomposition import PCA

# Apply PCA for dimensionality reduction
# Retain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Check the shape of the transformed dataset and the explained variance ratio
pca_results = {
    "Original Features": X_numeric.shape[1],
    "Reduced Features (PCA)": X_pca.shape[1],
    "Explained Variance Ratio": sum(pca.explained_variance_ratio_)
}

pca_results

{'Original Features': 49,
 'Reduced Features (PCA)': 24,
 'Explained Variance Ratio': 0.9582745300200146}

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Split the PCA-transformed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy

0.5754151518129115

In [16]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [17]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train a Random Forest model
random_forest = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
random_forest.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred_rf = random_forest.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_classification_rep = classification_report(y_test, y_pred_rf)

print(f"Accuracy: {rf_accuracy}")
print("Classification Report:\n", rf_classification_rep)

# Extract feature importance from the Random Forest model
feature_importance = random_forest.feature_importances_
pca_feature_importance = pd.DataFrame({
    "PCA Component": [f"PC{i+1}" for i in range(len(feature_importance))],
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)

# Display feature importance
print(pca_feature_importance)

Accuracy: 0.5815564508204776
Classification Report:
               precision    recall  f1-score   support

         0.0       0.63      0.55      0.59     10952
         1.0       0.54      0.62      0.58      9402

    accuracy                           0.58     20354
   macro avg       0.58      0.58      0.58     20354
weighted avg       0.59      0.58      0.58     20354

   PCA Component  Importance
5            PC6    0.099889
0            PC1    0.061338
11          PC12    0.054794
9           PC10    0.054206
7            PC8    0.047933
10          PC11    0.044888
22          PC23    0.040781
3            PC4    0.040324
14          PC15    0.039810
18          PC19    0.038997
4            PC5    0.038374
17          PC18    0.037581
15          PC16    0.037241
19          PC20    0.036464
16          PC17    0.036146
6            PC7    0.035512
8            PC9    0.035321
21          PC22    0.035153
23          PC24    0.032661
13          PC14    0.032080
20         

In [18]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest model
random_forest = RandomForestClassifier(random_state=42)

# Initialize the Grid Search
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=2, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train_balanced, y_train_balanced)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)
print("Test Set Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validation Score: 0.6305688193438135
Test Set Accuracy: 0.5939864400117912
Classification Report:
               precision    recall  f1-score   support

         0.0       0.62      0.64      0.63     10952
         1.0       0.56      0.55      0.55      9402

    accuracy                           0.59     20354
   macro avg       0.59      0.59      0.59     20354
weighted avg       0.59      0.59      0.59     20354



In [19]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train and evaluate XGBoost
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_balanced, y_train_balanced)
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_classification_rep = classification_report(y_test, y_pred_xgb)

print("XGBoost Results")
print(f"Accuracy: {xgb_accuracy}")
print(f"Classification Report:\n{xgb_classification_rep}")

# Train and evaluate LightGBM
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(X_train_balanced, y_train_balanced)
y_pred_lgb = lgb_model.predict(X_test)
lgb_accuracy = accuracy_score(y_test, y_pred_lgb)
lgb_classification_rep = classification_report(y_test, y_pred_lgb)

print("\nLightGBM Results")
print(f"Accuracy: {lgb_accuracy}")
print(f"Classification Report:\n{lgb_classification_rep}")

# Train and evaluate CatBoost
cat_model = CatBoostClassifier(random_state=42, verbose=0)
cat_model.fit(X_train_balanced, y_train_balanced)
y_pred_cat = cat_model.predict(X_test)
cat_accuracy = accuracy_score(y_test, y_pred_cat)
cat_classification_rep = classification_report(y_test, y_pred_cat)

print("\nCatBoost Results")
print(f"Accuracy: {cat_accuracy}")
print(f"Classification Report:\n{cat_classification_rep}")

XGBoost Results
Accuracy: 0.5811142772919328
Classification Report:
              precision    recall  f1-score   support

         0.0       0.62      0.56      0.59     10952
         1.0       0.54      0.61      0.57      9402

    accuracy                           0.58     20354
   macro avg       0.58      0.58      0.58     20354
weighted avg       0.59      0.58      0.58     20354

[LightGBM] [Info] Number of positive: 43912, number of negative: 43912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6120
[LightGBM] [Info] Number of data points in the train set: 87824, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

LightGBM Results
Accuracy: 0.5835216665028987
Classification Report:
              precision    recall  f1-score   support

         0.0       0.63      0.56      0.

In [20]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Define base models
base_models = [
    ('random_forest', RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)),
    ('xgboost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
]

# Define the meta-model
meta_model = LogisticRegression(random_state=42, max_iter=1000)

# Combine base models and meta-model in a stacking classifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, n_jobs=-1)

# Train the stacking model
stacking_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred_stacking = stacking_model.predict(X_test)

# Evaluate the stacking model
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_classification_rep = classification_report(y_test, y_pred_stacking)

print("Stacking Ensemble Results")
print(f"Accuracy: {stacking_accuracy}")
print(f"Classification Report:\n{stacking_classification_rep}")



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Stacking Ensemble Results
Accuracy: 0.5872555762994989
Classification Report:
              precision    recall  f1-score   support

         0.0       0.63      0.56      0.60     10952
         1.0       0.55      0.61      0.58      9402

    accuracy                           0.59     20354
   macro avg       0.59      0.59      0.59     20354
weighted avg       0.59      0.59      0.59     20354



In [21]:
# Save the stacking model to a file
stacking_model_filename = "stacking_model.pkl"
joblib.dump(stacking_model, stacking_model_filename)


['stacking_model.pkl']

In [22]:
# Extract feature names from X
features_used = list(X.columns)
print("Features used in training:", features_used)

Features used in training: ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'admission_type_description', 'weight_flag', 'outlier_flag']
