In [13]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, RobustScaler


In [14]:
# Load the raw training and test sets.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Columns that pandas parsed with object dtype in the training frame.
categorical_cols = df.select_dtypes(include=['object']).columns

# Integer-encode every categorical column and keep the fitted encoder per column.
# Each encoder is fitted on the combined train + test values: LabelEncoder.transform
# raises ValueError for a label it did not see during fit, so a category that occurs
# only in test.csv would otherwise abort the notebook. When the test column holds no
# new labels, the learned classes (and therefore the integer codes) are identical to
# those obtained by fitting on the training column alone.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(pd.concat([df[col], df_test[col]], ignore_index=True))
    df[col] = le.transform(df[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Remove every column whose name contains 'noninvasive' from both frames.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run in isolation
# from other references to the original frames.
text_to_find = 'noninvasive'
columns_to_drop = [col for col in df.columns if text_to_find in col]
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [15]:
# Fill missing feature values in the training frame with a KNN imputer.
# The target column `hospital_death` is deliberately excluded from the imputer's
# input: if it were included, the label would take part in the neighbour distance
# computation and leak into the imputed feature values, and the fitted imputer
# would expect a column that is not a model feature.
feature_cols = [col for col in df.columns if col != 'hospital_death']
imr = KNNImputer(n_neighbors=10000, weights='uniform')
imr = imr.fit(df[feature_cols].values)
df[feature_cols] = imr.transform(df[feature_cols].values)

In [16]:
# Fill missing values in the test frame using neighbours drawn from the TRAINING rows.
# Fitting the imputer on the test set itself would impute train and test from two
# different reference populations; fitting on the training rows (restricted to the
# columns present in the test frame, in the test frame's order) keeps the
# preprocessing consistent with what the model is trained on.
# NOTE(review): assumes every column of df_test also exists in df — a KeyError here
# means the two files do not share the same feature columns.
imr_test = KNNImputer(n_neighbors=10000, weights='uniform')
imr_test = imr_test.fit(df[df_test.columns].values)
df_test[:] = imr_test.transform(df_test.values)

In [17]:
# Separate the features from the target column.
X = df.drop(columns='hospital_death')
y = df['hospital_death']

# Columns to scale: every feature that was not label-encoded as a categorical.
# NOTE(review): `numeric_columns` was not defined anywhere in the visible notebook,
# so this cell raised NameError on a fresh kernel. The definition below is the
# presumed intent — confirm it matches the original column selection.
categorical_set = set(categorical_cols)
numeric_columns = [col for col in X.columns if col not in categorical_set]

# Fit the scaler on the training features only, then apply the same fitted
# transformation to the test frame.
rbs = RobustScaler()
X[numeric_columns] = rbs.fit_transform(X[numeric_columns])
df_test[numeric_columns] = rbs.transform(df_test[numeric_columns])


In [19]:
# Specify the LightGBM parameters
lgbm_params = {
    'learning_rate': 0.1,
    'max_depth': 4,
    'boosting_type': 'gbdt',
    'min_child_weight': 3,
    'colsample_bytree': 0.8,
    # NOTE(review): per the LightGBM parameter docs, row subsampling is only applied
    # when `subsample_freq` (bagging_freq) is > 0; it is not set here, so this value
    # presumably has no effect. Left unchanged to keep the model identical — confirm
    # whether per-tree row subsampling was intended.
    'subsample': 0.8,
    'reg_alpha': 0.9,
    'reg_lambda': 0.8,
    'n_jobs': -1,
    'num_leaves': 10
}

# LightGBM classifier used as the base learner of the bagging ensemble.
base_lgbm_model = LGBMClassifier(n_estimators=1000, random_state=42, **lgbm_params)

# Bagging ensemble of 200 LightGBM models.
# The base learner is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed in
# 1.4, while the first positional parameter is the base learner under both names.
bagging_lgbm_model = BaggingClassifier(base_lgbm_model, n_estimators=200, random_state=42)


In [20]:
# Train the bagged LightGBM ensemble on the prepared feature matrix and target
bagging_lgbm_model.fit(X=X, y=y)


[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5467
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.089600 -> initscore=-2.318529
[LightGBM] [Info] Start training from score -2.318529
[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5467
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 49
[LightGBM] [Info] [b

In [23]:
# Predict with the bagging model and keep column 1 of predict_proba
# (the probability of the second class in the model's `classes_`).
# The test frame is selected down to the training feature columns, in training
# order, so the model always receives the same features it was fitted on: any extra
# column in df_test is ignored and a missing one raises KeyError here instead of
# producing misaligned inputs.
bagging_predictions = bagging_lgbm_model.predict_proba(df_test[X.columns])[:, 1]


TypeError: predict_proba() takes 2 positional arguments but 3 were given

In [22]:
# Assemble the submission table: sequential record IDs next to the predicted probabilities
custom_starting_index = 50001
df_predictions = pd.DataFrame({
    'RecordID': range(custom_starting_index, custom_starting_index + len(bagging_predictions)),
    'hospital_death': bagging_predictions,
})

# Destination of the submission file
csv_file_path = 'prediction_bagging_lgbm.csv'

# Write the table to disk without the DataFrame index column
df_predictions.to_csv(csv_file_path, index=False)