In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from catboost import CatBoostClassifier
import warnings

warnings.filterwarnings('ignore')


In [37]:
# Load and preprocess the training data
df = pd.read_csv('train.csv')

# Apply one-hot encoding to categorical columns
df = pd.get_dummies(df)

df_test = pd.read_csv('test.csv')

# Apply the same one-hot encoding to test data
df_test = pd.get_dummies(df_test)

# Drop columns as needed
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)


In [38]:
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df.values)
df[:] = imr.transform(df.values)

imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df_test.values)
df_test[:] = imr_test.transform(df_test.values)


In [39]:
X = df.drop(columns='hospital_death')
y = df['hospital_death']

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Check if 'hospital_death' is in numeric_columns before dropping it
if 'hospital_death' in numeric_columns:
    numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])
else:
    # Handle the case where 'hospital_death' is not in numeric_columns
    print("Warning: 'hospital_death' not found in numeric_columns")




In [40]:
rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])

In [41]:
# Specify the CatBoost parameters
catboost_params = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'iterations': 500,  # Number of boosting iterations (equivalent to n_estimators)
    'subsample': 0.8,
    'colsample_bylevel': 0.8,  # Similar to colsample_bytree in LightGBM
    'l2_leaf_reg': 0.9,
    'thread_count': -1,  # Use all available CPU cores
    'random_seed': 42
}

# Create a CatBoostClassifier as the base estimator
base_catboost_model = CatBoostClassifier(**catboost_params)
#bg_1 = BaggingClassifier(CatBoostClassifier(n_estimators=373, learning_rate=0.1,max_depth=3, subsample = 0.8),n_estimators=100,n_jobs=-1)

# Create a BaggingClassifier with CatBoost as the base estimator
bagging_catboost_model = BaggingClassifier(base_estimator=base_catboost_model, n_estimators=100, random_state=42)


In [42]:


# Fit the bagging model on the training data
bagging_catboost_model.fit(X, y)



0:	learn: 0.5789212	total: 6.37ms	remaining: 3.18s
1:	learn: 0.5034649	total: 16.6ms	remaining: 4.12s
2:	learn: 0.4360112	total: 22.6ms	remaining: 3.74s
3:	learn: 0.3895846	total: 32.8ms	remaining: 4.07s
4:	learn: 0.3496637	total: 37.8ms	remaining: 3.75s
5:	learn: 0.3232913	total: 42.7ms	remaining: 3.51s
6:	learn: 0.3019535	total: 47.6ms	remaining: 3.35s
7:	learn: 0.2841259	total: 52.6ms	remaining: 3.23s
8:	learn: 0.2730080	total: 57.1ms	remaining: 3.11s
9:	learn: 0.2618748	total: 61.9ms	remaining: 3.03s
10:	learn: 0.2551188	total: 66.8ms	remaining: 2.97s
11:	learn: 0.2487630	total: 72ms	remaining: 2.93s
12:	learn: 0.2428214	total: 77ms	remaining: 2.88s
13:	learn: 0.2383207	total: 82.5ms	remaining: 2.86s
14:	learn: 0.2353143	total: 87.5ms	remaining: 2.83s
15:	learn: 0.2324710	total: 93.1ms	remaining: 2.82s
16:	learn: 0.2309267	total: 98ms	remaining: 2.79s
17:	learn: 0.2284858	total: 103ms	remaining: 2.76s
18:	learn: 0.2269509	total: 108ms	remaining: 2.72s
19:	learn: 0.2258089	total: 11

In [43]:
# Make predictions on the test data using the bagging model
bagging_predictions = bagging_catboost_model.predict_proba(df_test)[:, 1]


In [44]:
# Create a DataFrame with bagging model predictions and record IDs
df_predictions = pd.DataFrame(bagging_predictions, columns=['hospital_death'])
custom_starting_index = 50001
df_predictions.insert(0, 'RecordID', range(custom_starting_index, custom_starting_index + len(df_predictions)))

# Specify the file path for saving the CSV file
csv_file_path = 'prediction_bagging_catboost.csv'

# Save the bagging model predictions to a CSV file
df_predictions.to_csv(csv_file_path, index=False)