In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
# from catboost import CatBoostClassifier  # Import CatBoost
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings
from sklearn.ensemble import BaggingClassifier
import lightgbm as lgb


warnings.filterwarnings('ignore')

In [2]:
# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply one-hot encoding to categorical columns
df = pd.get_dummies(df)

df_test = pd.read_csv('test.csv')

# Apply the same one-hot encoding to test data
df_test = pd.get_dummies(df_test)


In [3]:
# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

In [4]:
numeric_columns = df.select_dtypes(include=['int64','float64']).columns
numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
object_columns = df.select_dtypes(include=['object']).columns
object_columns

Index([], dtype='object')

In [5]:
from sklearn.impute import SimpleImputer
import numpy as np

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numeric_columns] = imr.fit_transform(df[numeric_columns].values)

In [6]:
from sklearn.preprocessing import RobustScaler


rbs1 = RobustScaler()
df[numeric_columns] = rbs1.fit_transform(df[numeric_columns])

In [7]:
X = df.loc[:, df.columns != 'hospital_death']
y = df[['hospital_death']]

In [8]:
imr_test = SimpleImputer(missing_values=np.nan, strategy='mean')
df_test[numeric_columns] = imr_test.fit_transform(df_test[numeric_columns].values)

rbs = RobustScaler()
df_test[numeric_columns] = rbs.fit_transform(df_test[numeric_columns])

In [9]:
# Specify the LightGBM parameters
lgbm_params = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'reg_alpha': 0.9,
    'reg_lambda': 0.8,
    'n_jobs': -1,
    'num_leaves': 10
}

# Create a LightGBMClassifier as the base estimator
base_lgbm_model = lgb.LGBMClassifier(n_estimators=500, random_state=42, **lgbm_params)

# Create a BaggingClassifier with LightGBM as the base estimator
bagging_lgbm_model = BaggingClassifier(base_estimator=base_lgbm_model, n_estimators=209, random_state=42)


In [10]:
# Fit the bagging model on the training data
bagging_lgbm_model.fit(X, y)

[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4009
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.089600 -> initscore=-2.318529
[LightGBM] [Info] Start training from score -2.318529
[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4009
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 87
[LightGBM] [Info] [b

In [11]:
# Make predictions on the test data using the bagging model
bagging_predictions = bagging_lgbm_model.predict_proba(df_test)[:, 1]

In [12]:
# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_lgbm.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)