In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings

warnings.filterwarnings('ignore')

In [46]:


# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply label encoding to categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}  # Store label encoders for each categorical column

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the label encoder for future use

df_test = pd.read_csv('test.csv')

# Apply the same label encoding to test data
for col in categorical_cols:
    le = label_encoders.get(col)  # Get the corresponding label encoder for the column
    if le is not None:
        df_test[col] = le.transform(df_test[col])
    else:
        # Handle the case where the test data has unseen categories
        print(f"Warning: Unseen categories in '{col}' column of test data")

# Rest of your code for data preprocessing and modeling...


In [47]:


text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)
imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)
X = df.drop(columns='hospital_death')
y = df['hospital_death']
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns



In [48]:
# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")




In [49]:
rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])


In [50]:
# Apply label encoding to categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    X[col] = le.fit_transform(X[col])
    df_test[col] = le.transform(df_test[col])

In [51]:
# Feature selection using Random Forest to select the top features
rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), max_features=49)
X_train_selected = rf_selector.fit_transform(X, y)


In [52]:
# Define the parameter grid for hyperparameter tuning for XGBoost
param_grid_xgb = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'subsample': [0.8, 0.9, 1.0]
}

# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42)

# Perform grid search with cross-validation for XGBoost
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train_selected, y)


In [53]:
# Get the best hyperparameters for XGBoost
best_params_xgb = grid_search_xgb.best_params_
print("Best Hyperparameters for XGBoost:", best_params_xgb)


Best Hyperparameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}


In [54]:
# Model training using the best hyperparameters for XGBoost
best_xgb_model = xgb.XGBClassifier(random_state=42, **best_params_xgb)
best_xgb_model.fit(X_train_selected, y)

In [55]:

# Create the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42)

# Model training using Random Forest
rf_model.fit(X_train_selected, y)


KeyboardInterrupt: 

In [None]:
# Create an ensemble of models using a VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('xgb', best_xgb_model),
    ('rf', rf_model)
], voting='soft')

# Fit the ensemble model on the training data
ensemble_model.fit(X_train_selected, y)

In [None]:
# Perform cross-validation to assess ensemble model performance
ensemble_cv_scores = cross_val_score(ensemble_model, X_train_selected, y, cv=5, scoring='accuracy')

# Make predictions on the test data using the ensemble model
X_test_selected = rf_selector.transform(df_test)
ensemble_predictions = ensemble_model.predict_proba(X_test_selected)[:, 1]


In [None]:
# Create a DataFrame with ensemble predictions and record IDs
df_predictions = pd.DataFrame(ensemble_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))


In [None]:
# Specify the file path for saving the CSV file
csv_file_path = 'prediction_ensemble.csv'

# Save the ensemble predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores to assess ensemble model performance
print("Ensemble Model Cross-Validation Scores:", ensemble_cv_scores)
print("Mean CV Score for Ensemble Model:", np.mean(ensemble_cv_scores))

Ensemble Model Cross-Validation Scores: [0.9273 0.9262 0.9267 0.9238 0.9256]
Mean CV Score for Ensemble Model: 0.92592
