In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import warnings

# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [9]:
# Load the training data
df = pd.read_csv('train.csv')

# Label-encode every object-dtype column of the training frame and keep each
# fitted encoder so the test frame can be encoded with the same integer codes.
categorical_cols = df.select_dtypes(include=['object']).columns

label_encoders = {}  # column name -> LabelEncoder fitted on the training column

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df_test = pd.read_csv('test.csv')

# Encode the test data with the training encoders.  LabelEncoder.transform
# raises ValueError for labels it was not fitted on, so test values that never
# appeared in training are reported and left as NaN instead of aborting the
# run; a later imputation step can then fill them in.
for col, le in label_encoders.items():
    known = df_test[col].isin(le.classes_)
    if known.all():
        # Every test value was seen in training: plain transform, as before.
        df_test[col] = le.transform(df_test[col])
        continue

    unseen = df_test.loc[~known, col].unique()
    print(f"Warning: Unseen categories in '{col}' column of test data: {list(unseen)}")
    encoded = pd.Series(np.nan, index=df_test.index, dtype='float64')
    encoded[known] = le.transform(df_test.loc[known, col])
    df_test[col] = encoded

# Drop every column whose name contains the marker text, from both frames
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [10]:
# Fill missing training values with a k-nearest-neighbours imputer.
# The target column is excluded so that the label cannot influence which
# neighbours are chosen (and therefore the imputed feature values).
feature_cols = df.columns.drop('hospital_death')

# NOTE(review): n_neighbors=10000 is very large — confirm it is intentional.
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df[feature_cols].values)
df[feature_cols] = imr.transform(df[feature_cols].values)


In [11]:
# Fill missing test values using the training rows as donors.  Fitting the
# imputer on the test set itself would impute test rows from a different
# population than the one the model is trained on.
# Assumes every test column is also present in the training frame `df`.
imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df[df_test.columns].values)
df_test[:] = imr_test.transform(df_test.values)

In [12]:

# Split the training frame into the feature matrix and the target vector
X = df.drop(columns='hospital_death')
y = df['hospital_death']

# Columns to be scaled: the numeric features minus the identifier columns.
# The target has already been removed from X, so the identifiers are excluded
# unconditionally (any identifier that is absent is simply ignored).
id_columns = ['RecordID', 'hospital_id', 'icu_id']
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
numeric_columns = numeric_columns[~numeric_columns.isin(id_columns)]




In [13]:
# Learn the scaling from the training features only, then apply that same
# fitted scaler to both the training and the test frame.
rbs = RobustScaler()
rbs.fit(X[numeric_columns])

X[numeric_columns] = rbs.transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])


In [15]:
# Hyperparameters for the XGBoost base learner.
# 'binary:logistic' is the binary-classification objective, which is what an
# XGBClassifier predicting a 0/1 target should use.
xgb_params = {
    'learning_rate': 0.1,
    'max_depth': 2,
    'objective': 'binary:logistic',
    'min_child_weight': 3,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'reg_alpha': 0.9,
    'n_jobs': -1
}

# Base XGBoost classifier that the bagging ensemble will wrap
base_xgb_model = XGBClassifier(random_state=42, **xgb_params)


In [20]:
# Create a BaggingClassifier with XGBoost as the base estimator.
# The estimator is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases; the first
# positional argument is the base estimator under either name.
bagging_xgb_model = BaggingClassifier(base_xgb_model, n_estimators=100, random_state=42)

# Fit the bagging model on the full training data
bagging_xgb_model.fit(X, y)

In [22]:
# Score the test set and write the submission file first, so the predictions
# are saved even if the cross-validation below (which refits the whole
# ensemble once per fold) is interrupted.  Test columns are selected in the
# same order as the training feature matrix.
bagging_predictions = bagging_xgb_model.predict_proba(df_test[X.columns])[:, 1]

# Create a DataFrame with the predicted probabilities and record IDs
# NOTE(review): assumes test record IDs are consecutive starting at 50001 — TODO confirm
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_xgb.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Perform 5-fold cross-validation to assess bagging model accuracy.
# cross_val_score fits clones, so the already-fitted model is not modified.
bagging_cv_scores = cross_val_score(bagging_xgb_model, X, y, cv=5, scoring='accuracy')

# Print the per-fold cross-validation scores and their mean
print("Bagging Model Cross-Validation Scores:", bagging_cv_scores)
print("Mean CV Score for Bagging Model:", np.mean(bagging_cv_scores))


KeyboardInterrupt: 