In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings

warnings.filterwarnings('ignore')

In [14]:
# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply one-hot encoding to categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df_test = pd.read_csv('test.csv')

# Apply the same one-hot encoding to test data
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)

# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)


In [15]:
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)


In [16]:
imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)

In [17]:
X = df.drop(columns='hospital_death')
y = df['hospital_death']
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns


In [18]:
# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")




In [19]:
rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])


In [20]:

# Feature selection using Random Forest to select the top features
rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), max_features=60)
X_train_selected = rf_selector.fit_transform(X, y)


In [21]:
# Use the best hyperparameters for XGBoost
best_xgb_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
best_xgb_model = XGBClassifier(random_state=42, **best_xgb_params)

In [22]:
# Create a BaggingClassifier with XGBoost as the base estimator
bagging_xgb_model = BaggingClassifier(base_estimator=best_xgb_model, n_estimators=10, random_state=42)

# Fit the bagging model on the training data
bagging_xgb_model.fit(X_train_selected, y)


In [23]:
# Perform cross-validation to assess bagging model performance
bagging_cv_scores = cross_val_score(bagging_xgb_model, X_train_selected, y, cv=5, scoring='accuracy')




In [24]:
# Make predictions on the test data using the bagging model
X_test_selected = rf_selector.transform(df_test)
bagging_predictions = bagging_xgb_model.predict_proba(X_test_selected)[:, 1]

In [25]:
# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))


In [26]:

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_xgb.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores to assess bagging model performance
print("Bagging Model Cross-Validation Scores:", bagging_cv_scores)
print("Mean CV Score for Bagging Model:", np.mean(bagging_cv_scores))

Bagging Model Cross-Validation Scores: [0.9266 0.9267 0.9265 0.9233 0.9265]
Mean CV Score for Bagging Model: 0.92592
