In [80]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
import warnings

warnings.filterwarnings('ignore')

In [81]:
# Load and preprocess the training data
df = pd.read_csv('train.csv')
df = pd.get_dummies(df)

df_test = pd.read_csv('test.csv')
df_test = pd.get_dummies(df_test)

In [82]:
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

In [83]:
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

In [84]:
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)

In [85]:
imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)

In [86]:
X = df.drop(columns='hospital_death')
y = df[['hospital_death']]

In [87]:
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")



In [88]:
rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])

In [89]:
# Feature selection using Random Forest to select the top features
rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), max_features=60)
X_train_selected = rf_selector.fit_transform(X, y)

In [90]:
# Model training using XGBoost
xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_selected, y)


In [91]:
# Cross-validation to assess model performance
cv_scores = cross_val_score(xgb_model, X_train_selected, y, cv=5)

In [92]:
# Make predictions on the test data
X_test_selected = rf_selector.transform(df_test)
predictions = xgb_model.predict_proba(X_test_selected)[:, 1]


In [93]:
# Create a DataFrame with predictions and record IDs
df_predictions = pd.DataFrame(predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_xgb.csv'

# Save the predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)

# Print cross-validation scores to assess model performance
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.9258 0.9245 0.9261 0.9228 0.9239]
Mean CV Score: 0.92462
