In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply one-hot encoding to categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df_test = pd.read_csv('test.csv')

# Apply the same one-hot encoding to test data
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)

# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)
imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)

X = df.drop(columns='hospital_death')
y = df['hospital_death']

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")

rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])

# Use the best hyperparameters for XGBoost
best_xgb_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
best_xgb_model = XGBClassifier(random_state=42, **best_xgb_params)

# Create a BaggingClassifier with XGBoost as the base estimator
bagging_xgb_model = BaggingClassifier(base_estimator=best_xgb_model, n_estimators=10, random_state=42)

# Fit the bagging model on the training data
bagging_xgb_model.fit(X, y)

# Perform cross-validation to assess bagging model performance
bagging_cv_scores = cross_val_score(bagging_xgb_model, X, y, cv=5, scoring='accuracy')

# Make predictions on the test data using the bagging model
bagging_predictions = bagging_xgb_model.predict_proba(df_test)[:, 1]

# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_xgb.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores and classification report
print("Bagging Model Cross-Validation Scores:", bagging_cv_scores)
print("Mean CV Score for Bagging Model:", np.mean(bagging_cv_scores))

# Evaluate the bagging model on the training data
y_pred_train = bagging_xgb_model.predict(X)
print("Accuracy on Training Data:", accuracy_score(y, y_pred_train))
print("ROC AUC on Training Data:", roc_auc_score(y, y_pred_train))
print("Classification Report on Training Data:\n", classification_report(y, y_pred_train))


Bagging Model Cross-Validation Scores: [0.9276 0.9264 0.9267 0.9233 0.9269]
Mean CV Score for Bagging Model: 0.9261799999999999
Accuracy on Training Data: 0.93618
ROC AUC on Training Data: 0.6691296850175339
Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     45662
           1       0.81      0.35      0.48      4338

    accuracy                           0.94     50000
   macro avg       0.88      0.67      0.73     50000
weighted avg       0.93      0.94      0.92     50000



In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply label encoding to categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

label_encoders = {}  # Store label encoders for each categorical column

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the label encoder for future use

df_test = pd.read_csv('test.csv')

# Apply the same label encoding to test data
for col in categorical_cols:
    le = label_encoders.get(col)  # Get the corresponding label encoder for the column
    if le is not None:
        df_test[col] = le.transform(df_test[col])
    else:
        # Handle the case where the test data has unseen categories
        print(f"Warning: Unseen categories in '{col}' column of test data")

# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)

imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)

X = df.drop(columns='hospital_death')
y = df['hospital_death']

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")

rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])

# Use the best hyperparameters for XGBoost
best_xgb_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
best_xgb_model = XGBClassifier(random_state=42, **best_xgb_params)

# Create a BaggingClassifier with XGBoost as the base estimator
bagging_xgb_model = BaggingClassifier(base_estimator=best_xgb_model, n_estimators=10, random_state=42)

# Fit the bagging model on the training data
bagging_xgb_model.fit(X, y)

# Perform cross-validation to assess bagging model performance
bagging_cv_scores = cross_val_score(bagging_xgb_model, X, y, cv=5, scoring='accuracy')

# Make predictions on the test data using the bagging model
bagging_predictions = bagging_xgb_model.predict_proba(df_test)[:, 1]

# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_xgb.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores and classification report
print("Bagging Model Cross-Validation Scores:", bagging_cv_scores)
print("Mean CV Score for Bagging Model:", np.mean(bagging_cv_scores))

# Evaluate the bagging model on the training data
y_pred_train = bagging_xgb_model.predict(X)
print("Accuracy on Training Data:", accuracy_score(y, y_pred_train))
print("ROC AUC on Training Data:", roc_auc_score(y, y_pred_train))
print("Classification Report on Training Data:\n", classification_report(y, y_pred_train))


Bagging Model Cross-Validation Scores: [0.9275 0.9264 0.926  0.9245 0.9274]
Mean CV Score for Bagging Model: 0.9263600000000001
Accuracy on Training Data: 0.93634
ROC AUC on Training Data: 0.6693215956748688
Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     45662
           1       0.81      0.35      0.49      4338

    accuracy                           0.94     50000
   macro avg       0.88      0.67      0.73     50000
weighted avg       0.93      0.94      0.92     50000



In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply label encoding to categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

label_encoders = {}  # Store label encoders for each categorical column

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the label encoder for future use

df_test = pd.read_csv('test.csv')

# Apply the same label encoding to test data
for col in categorical_cols:
    le = label_encoders.get(col)  # Get the corresponding label encoder for the column
    if le is not None:
        df_test[col] = le.transform(df_test[col])
    else:
        # Handle the case where the test data has unseen categories
        print(f"Warning: Unseen categories in '{col}' column of test data")

# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)

imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)

X = df.drop(columns='hospital_death')
y = df['hospital_death']

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")

rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])

# Use the best hyperparameters for XGBoost
best_xgb_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
best_xgb_model = XGBClassifier(random_state=42, **best_xgb_params)

# Create a BaggingClassifier with XGBoost as the base estimator
bagging_xgb_model = BaggingClassifier(base_estimator=best_xgb_model, n_estimators=10, random_state=42)

# Fit the bagging model on the training data
bagging_xgb_model.fit(X, y)

# Perform cross-validation to assess bagging model performance
bagging_cv_scores = cross_val_score(bagging_xgb_model, X, y, cv=5, scoring='accuracy')

# Make predictions on the test data using the bagging model
bagging_predictions = bagging_xgb_model.predict_proba(df_test)[:, 1]

# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_xgb.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores and classification report
print("Bagging Model Cross-Validation Scores:", bagging_cv_scores)
print("Mean CV Score for Bagging Model:", np.mean(bagging_cv_scores))

# Evaluate the bagging model on the training data
y_pred_train = bagging_xgb_model.predict(X)
print("Accuracy on Training Data:", accuracy_score(y, y_pred_train))
print("ROC AUC on Training Data:", roc_auc_score(y, y_pred_train))
print("Classification Report on Training Data:\n", classification_report(y, y_pred_train))


Bagging Model Cross-Validation Scores: [0.9275 0.9264 0.926  0.9245 0.9274]
Mean CV Score for Bagging Model: 0.9263600000000001
Accuracy on Training Data: 0.93634
ROC AUC on Training Data: 0.6693215956748688
Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     45662
           1       0.81      0.35      0.49      4338

    accuracy                           0.94     50000
   macro avg       0.88      0.67      0.73     50000
weighted avg       0.93      0.94      0.92     50000



In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier  # Import LightGBM
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply label encoding to categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

label_encoders = {}  # Store label encoders for each categorical column

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the label encoder for future use

df_test = pd.read_csv('test.csv')

# Apply the same label encoding to test data
for col in categorical_cols:
    le = label_encoders.get(col)  # Get the corresponding label encoder for the column
    if le is not None:
        df_test[col] = le.transform(df_test[col])
    else:
        # Handle the case where the test data has unseen categories
        print(f"Warning: Unseen categories in '{col}' column of test data")

# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)

imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)

X = df.drop(columns='hospital_death')
y = df['hospital_death']

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")

rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])

# Use the best hyperparameters for XGBoost
best_xgb_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
best_xgb_model = XGBClassifier(random_state=42, **best_xgb_params)

# Use LightGBM as an alternative to XGBoost
best_lgbm_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
best_lgbm_model = LGBMClassifier(random_state=42, **best_lgbm_params)

# Create a BaggingClassifier with LightGBM as the base estimator
bagging_lgbm_model = BaggingClassifier(base_estimator=best_lgbm_model, n_estimators=100, random_state=42)

# Fit the bagging model on the training data
bagging_lgbm_model.fit(X, y)

# Perform cross-validation to assess bagging model performance
bagging_cv_scores = cross_val_score(bagging_lgbm_model, X, y, cv=5, scoring='accuracy')

# Make predictions on the test data using the bagging model
bagging_predictions = bagging_lgbm_model.predict_proba(df_test)[:, 1]

# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_lgbm.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores and classification report
print("Bagging Model (LightGBM) Cross-Validation Scores:", bagging_cv_scores)
print("Mean CV Score for Bagging Model (LightGBM):", np.mean(bagging_cv_scores))

# Evaluate the bagging model on the training data
y_pred_train = bagging_lgbm_model.predict(X)
print("Accuracy on Training Data:", accuracy_score(y, y_pred_train))
print("ROC AUC on Training Data:", roc_auc_score(y, y_pred_train))
print("Classification Report on Training Data:\n", classification_report(y, y_pred_train))


[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5467
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.089600 -> initscore=-2.318529
[LightGBM] [Info] Start training from score -2.318529
[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5467
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 49
[LightGBM] [Info] [b