In [20]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
# Read the training and test tables from the CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [22]:
# Identifier columns are removed from both tables before any feature processing.
id_columns = ["RecordID", "hospital_id"]
train_data = train_data.drop(columns=id_columns)
test_data = test_data.drop(columns=id_columns)

In [23]:
# Separate the target variable from the features.
X_train = train_data.drop("hospital_death", axis=1)
y_train = train_data["hospital_death"]
# Work on a copy: the cells below overwrite X_test columns in place. Without the
# copy those writes would land in test_data itself, so re-running from this cell
# would pair a freshly rebuilt raw X_train with an already imputed/scaled X_test.
X_test = test_data.copy()

In [24]:
# Partition the feature columns by dtype: int/float columns form the numeric group,
# object columns form the string group. Columns of any other dtype end up in
# neither group.
numeric_dtypes = ["int", "float"]
numeric_columns = X_train.select_dtypes(include=numeric_dtypes).columns
string_columns = X_train.select_dtypes(include="object").columns

In [25]:
# Fill missing numeric values with a KNN imputer that is fitted on the training
# features only and then applied to both the training and the test features.
# NOTE(review): n_neighbors=10001 is a very large neighbourhood and this step runs
# before the numeric columns are scaled — confirm both choices are intentional.
numeric_imputer = KNNImputer(n_neighbors=10001).fit(X_train[numeric_columns])
for frame in (X_train, X_test):
    frame[numeric_columns] = numeric_imputer.transform(frame[numeric_columns])

In [26]:
# Scale the numeric columns with RobustScaler; the scaling statistics come from the
# training features only and are reused unchanged for the test features.
scaler = RobustScaler()
scaler.fit(X_train[numeric_columns])
X_train[numeric_columns] = scaler.transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

In [27]:
# Fill missing values in the string columns with each column's most frequent value,
# learned from the training features and applied to both feature tables.
string_imputer = SimpleImputer(strategy='most_frequent')
string_imputer.fit(X_train[string_columns])
X_train[string_columns] = string_imputer.transform(X_train[string_columns])
X_test[string_columns] = string_imputer.transform(X_test[string_columns])

In [28]:
# One-hot encode the string columns. The category vocabulary is learned from the
# training features only. handle_unknown='ignore' encodes a category that appears
# only in the test features as an all-zero row for that column instead of raising
# during transform; when no unseen category exists the output is unchanged.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[string_columns])
X_test_encoded = encoder.transform(X_test[string_columns])

In [29]:
# Assemble the final feature matrices: the numeric columns come first, followed by
# the one-hot columns converted from sparse to dense.
X_train_processed = np.concatenate(
    [X_train[numeric_columns].to_numpy(), X_train_encoded.toarray()],
    axis=1,
)
X_test_processed = np.concatenate(
    [X_test[numeric_columns].to_numpy(), X_test_encoded.toarray()],
    axis=1,
)

In [30]:
# Feature selection driven by Random Forest importances: keeps the features that
# clear SelectFromModel's default importance threshold, capped at 60 features.
# NOTE(review): scikit-learn documents threshold=-np.inf together with max_features
# for keeping exactly the top-N features — confirm which behaviour is intended.
rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    max_features=60,
)
rf_selector.fit(X_train_processed, y_train)
X_train_selected = rf_selector.transform(X_train_processed)
X_test_selected = rf_selector.transform(X_test_processed)

In [31]:
# Model training using AdaBoost with RandomForest as the base estimator.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling was
# removed in 1.4, while the first positional parameter is the base learner in
# both the old and the new signature.
# NOTE(review): this allows up to 1000 boosting rounds, each fitting a 1000-tree
# forest — expect a long fit.
base_estimator = RandomForestClassifier(n_estimators=1000, random_state=42)
adaboost_model = AdaBoostClassifier(
    base_estimator,
    n_estimators=1000,
    random_state=42,
    learning_rate=0.1,
)
adaboost_model.fit(X_train_selected, y_train)



In [32]:
# Score the selected test features: predict_proba returns per-class probability
# estimates for every test row.
predictions = adaboost_model.predict_proba(X_test_selected)
# Print NumPy arrays with ten digits of precision from here on.
np.set_printoptions(precision=10)

In [33]:
# Keep only the probability column at index 1 (presumably hospital_death == 1 —
# verify against adaboost_model.classes_).
# The ndim guard keeps this cell safe to re-run: once predictions has been reduced
# to 1-D, slicing it with [:, 1] again would raise an IndexError.
if predictions.ndim == 2:
    predictions = predictions[:, 1]
predictions

array([0.0072057646, 0.6116893515, 0.064051241 , ..., 0.0128102482,
       0.0104083267, 0.0048038431])

In [34]:
# Build the submission table: one row per test record with its predicted
# probability, written as columns RecordID and hospital_death.
# RecordID was dropped from test_data before modelling, so the ids are re-read
# from test.csv here instead of being regenerated from a hard-coded starting
# number; the ids therefore always belong to the rows that were scored.
record_ids = pd.read_csv("test.csv", usecols=["RecordID"])["RecordID"].to_numpy()
if len(record_ids) != len(predictions):
    raise ValueError(
        f"test.csv has {len(record_ids)} RecordID values "
        f"but {len(predictions)} predictions were produced"
    )

custom_column_names = ['RecordID', 'hospital_death']
df = pd.DataFrame({
    custom_column_names[0]: record_ids,
    custom_column_names[1]: predictions,
})

csv_file_path = '../iml-fall-2023-first-challenge/prediction.csv'

# The frame already carries the final column names, so no header override is needed.
df.to_csv(csv_file_path, index=False)