In [15]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [16]:
# Read the three CSV inputs from the working directory, in this order:
# training data, test data, and the submission template.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv', 'sample.csv')
)

In [17]:
# Split the training frame into target and features. 'RecordID' is dropped
# from both feature frames so it is never fed to the models.
y = train_df['hospital_death']
X = train_df.drop(['hospital_death', 'RecordID'], axis=1)
test_df = test_df.drop(['RecordID'], axis=1)

# Names of the object-dtype feature columns; these are the ones that get
# label-encoded.
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [18]:
# Label encode
# Each categorical column is cast to str first, so missing values become the
# literal label 'nan' and are encoded as a category of their own.
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

    # LabelEncoder.transform raises ValueError for labels it was not fitted on,
    # so a category that appears only in the test set would abort the notebook.
    # Look the codes up explicitly instead: labels seen during fitting get the
    # same code transform() would return, anything else gets the sentinel -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = (
        test_df[col].astype(str).map(code_by_label).fillna(-1).astype('int64')
    )

In [19]:
# Fill missing values with per-column medians. The medians are learned from
# the training features only and then applied to both frames.
imputer = SimpleImputer(strategy='median').fit(X)

# transform() returns a plain array, so rebuild DataFrames with the original
# column labels.
X = pd.DataFrame(imputer.transform(X), columns=X.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)


In [20]:
# Hold out 30% of the training rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=70,
)


In [21]:
# Base XGBoost classifier. Nothing is fitted in this cell: the estimator is
# only configured here and handed to the grid search afterwards, which
# overrides n_estimators and max_depth with its candidate values.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=4,
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)

In [23]:
# Candidate values for the XGBoost search: 3 x 3 = 9 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
)

# 5-fold cross-validated grid search on the training split, scored by ROC AUC.
# fit() returns the search object itself, so construction and fitting chain.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='roc_auc',
    cv=5,
).fit(X_train, y_train)

# Keep the estimator the search selected as best.
best_xgb_model = grid_search.best_estimator_


In [24]:
# Two further base learners for the ensemble. They are only instantiated
# here; fitting happens when the ensemble that contains them is fitted.
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
)


In [25]:
# Combine the tuned XGBoost model with the random forest and gradient boosting
# classifiers. voting='soft' makes the ensemble combine the members'
# predicted class probabilities rather than their hard class votes.
ensemble = VotingClassifier(
    estimators=[('xgb', best_xgb_model), ('rf', rf_model), ('gb', gb_model)],
    voting='soft',
)

# Fit every member on the training split.
ensemble.fit(X_train, y_train)

In [26]:
# Ensemble probabilities for the held-out validation rows. Only column index 1
# of predict_proba is kept - presumably the hospital_death == 1 class; check
# ensemble.classes_ to confirm.
val_class_probs = ensemble.predict_proba(X_val)
ensemble_val_probs = val_class_probs[:, 1]


In [27]:
# Turn the 1-D probability vector into a single-column 2-D array, the
# (n_samples, n_features) layout scikit-learn estimators expect as input.
stacked_val_predictions = ensemble_val_probs[:, np.newaxis]

In [28]:
# Train logistic regression meta-model
# LogisticRegression comes from sklearn.linear_model and must be imported in
# the notebook's import cell; without that import this cell raises NameError
# on a fresh kernel.
#
# The model is fitted on a single feature (the ensemble's validation
# probability) against the validation labels.
# NOTE(review): no later cell visible in this notebook reads meta_model - the
# submission is built from the raw ensemble probabilities. Confirm whether the
# meta-model was meant to transform the test predictions.
meta_model = LogisticRegression()
meta_model.fit(X=stacked_val_predictions, y=y_val)

In [29]:
# Ensemble probabilities for the test rows, keeping the same predict_proba
# column (index 1) that is used for the validation predictions.
test_class_probs = ensemble.predict_proba(test_df)
ensemble_test_probs = test_class_probs[:, 1]

In [30]:

# Reshape the test probabilities into a single-column 2-D array, mirroring the
# layout used for the validation predictions.
stacked_test_predictions = ensemble_test_probs[:, np.newaxis]

In [31]:
# Write the predictions out using the layout of the sample submission.
submission_file_path = 'test_predict_file.csv'

# Work on a copy so the loaded template frame stays untouched, then overwrite
# (or add) the 'hospital_death' column with the predicted probabilities.
submission_df = sample_submission_df.copy()
submission_df['hospital_death'] = stacked_test_predictions

# index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv(submission_file_path, index=False)