In [3]:
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler

# Silence every warning for the rest of the session: the 'ignore' action is
# installed with no category or module restriction, so it matches all of them.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# used in this notebook -- consider narrowing the filter.
warnings.filterwarnings('ignore')


In [4]:
# Load the raw training and test tables.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# One-hot encode the categorical (object-dtype) columns.
# The category list of each column is taken from the training frame and
# imposed on both frames before encoding.  Encoding the two frames
# independently would derive the dummy columns from whatever values happen
# to occur in each file, so train and test could end up with different
# columns.  A test value never seen in training becomes NaN under the fixed
# category list and therefore encodes as all-zero dummies.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    train_categories = pd.Categorical(df[col]).categories
    df[col] = pd.Categorical(df[col], categories=train_categories)
    if col in df_test.columns:
        df_test[col] = pd.Categorical(df_test[col], categories=train_categories)

df = pd.get_dummies(df)
df_test = pd.get_dummies(df_test)

# Drop every column whose name contains the marker text.  The list is built
# from the training frame; on the test frame a missing column is tolerated
# because the goal is only that none of these columns remain.
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop, errors='ignore')

# Numeric feature columns, excluding the identifier columns and the target.
numeric_columns = df.select_dtypes(include=['int64','float64']).columns
numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id', 'hospital_death'])

# get_dummies has already replaced the object-dtype columns, so this is
# expected to display an empty Index.
object_columns = df.select_dtypes(include=['object']).columns
object_columns

In [5]:
# Impute missing values and scale the numeric feature columns.
#
# Both transformers are fitted on the training frame only and then applied
# unchanged to the test frame, so the two frames are filled with the same
# medians and scaled with the same centre/scale values.  Fitting a second
# imputer/scaler on the test frame would transform the two frames
# differently.
#
# A KNNImputer variant (n_neighbors=10000, weights='uniform') was tried here
# previously and is left disabled.
imr = SimpleImputer(missing_values=np.nan, strategy='median')
df[numeric_columns] = imr.fit_transform(df[numeric_columns])
df_test[numeric_columns] = imr.transform(df_test[numeric_columns])

rbs1 = RobustScaler()
df[numeric_columns] = rbs1.fit_transform(df[numeric_columns])
df_test[numeric_columns] = rbs1.transform(df_test[numeric_columns])

# Former names of the test-side transformers, kept as aliases of the
# train-fitted objects so any remaining reference still resolves.
imr_test = imr
rbs = rbs1


KeyboardInterrupt: 

In [None]:
# Separate the feature matrix from the target column.
X = df.drop(columns='hospital_death')
y = df['hospital_death']

# Numeric feature columns to scale.  The target is already absent from X, so
# only the identifier columns have to be excluded; errors='ignore' keeps this
# working if one of them is not present.
# NOTE(review): the identifier columns are still part of X and therefore
# still reach the model as features -- confirm that is intended.
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
numeric_columns = numeric_columns.drop(['RecordID', 'hospital_id', 'icu_id'], errors='ignore')

# Fit the scaler on the training features and reuse it for the test frame so
# both are scaled with the same statistics.
rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])


In [None]:
# LightGBM settings shared by every bagged model.
lgbm_params = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; without this entry 'subsample' has no effect.
    'subsample_freq': 1,
    'reg_alpha': 0.9,
    'reg_lambda': 0.8,
    'n_jobs': -1,
    'num_leaves': 10
}

# LightGBM classifier used as the inner model of the bagging ensemble.
base_lgbm_model = LGBMClassifier(n_estimators=500, random_state=42, **lgbm_params)

# BaggingClassifier takes its inner model as `base_estimator` in older
# scikit-learn releases and as `estimator` in newer ones (the old keyword was
# removed in 1.4).  The legacy name is used wherever it is still accepted so
# that `base_estimator__*` parameter paths keep resolving; otherwise the new
# name is used.
bagging_keyword = (
    'base_estimator'
    if 'base_estimator' in BaggingClassifier().get_params(deep=False)
    else 'estimator'
)

# Bagging ensemble of LightGBM classifiers.
bagging_lgbm_model = BaggingClassifier(
    n_estimators=200,
    random_state=42,
    **{bagging_keyword: base_lgbm_model},
)


In [None]:
# Work out under which parameter name the bagging model holds its inner
# model.  Depending on the scikit-learn release (and on which keyword the
# model was built with) this is either `estimator` or `base_estimator`; a
# hard-coded prefix makes the grid invalid whenever the other one is in use.
bagging_params = bagging_lgbm_model.get_params(deep=False)
inner_name = next(
    (
        name
        for name in ('estimator', 'base_estimator')
        if hasattr(bagging_params.get(name), 'get_params')
    ),
    'estimator',
)

# Parameter grid: learning rate and tree depth of the inner LightGBM model,
# and the number of bagged models.
param_grid = {
    f'{inner_name}__learning_rate': [0.01, 0.1, 0.2],
    f'{inner_name}__max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300]
}

# 5-fold cross-validated grid search scored by accuracy.
# NOTE(review): both this search and the inner LightGBM model request
# n_jobs=-1 -- confirm the machine is not oversubscribed.
grid_search = GridSearchCV(estimator=bagging_lgbm_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [None]:
# Run the cross-validated search over the whole parameter grid.
grid_search.fit(X, y)

# Report the winning parameter combination and its mean cross-validated score.
for label, value in (
    ("Best Hyperparameters:", grid_search.best_params_),
    ("Best Accuracy Score:", grid_search.best_score_),
):
    print(label, value)