In [37]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


In [38]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")


In [39]:
# Identify categorical columns in your dataset
categorical_columns_train = df_train.select_dtypes(include=['object']).columns
categorical_columns_test = df_test.select_dtypes(include=['object']).columns

# Apply one-hot encoding to categorical columns in both datasets
df_train_encoded = pd.get_dummies(df_train, columns=categorical_columns_train)
df_test_encoded = pd.get_dummies(df_test, columns=categorical_columns_test)

In [40]:
# Separate the target variable (assuming it's in a column called 'hospital_death')
X_train = df_train_encoded.drop(columns=['hospital_death'])
y_train = df_train_encoded['hospital_death']
X_test = df_test_encoded


In [41]:
# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('scaler', RobustScaler()),  # Use RobustScaler for robust scaling
    ('imputer', KNNImputer(n_neighbors=12000)),  # You can adjust 'n_neighbors' as needed
])

In [42]:
# Apply preprocessing to both train and test data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

In [43]:
# Feature scaling using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 1.73208545,  1.11215214,  0.4917898 , ..., -0.21463661,
        -0.06224916, -0.17921931],
       [ 1.73215473, -0.56545771, -0.26106263, ..., -0.21463661,
        -0.06224916, -0.17921931],
       [ 1.73222401,  1.42868231,  1.31508457, ..., -0.21463661,
        -0.06224916, -0.17921931],
       ...,
       [ 3.81033857,  0.85892802, -0.17741236, ..., -0.21463661,
        -0.06224916, -0.17921931],
       [ 3.81040785,  0.19421468, -1.83720983, ..., -0.21463661,
        -0.06224916, -0.17921931],
       [ 3.81047714, -0.56545771, -0.20382824, ..., -0.21463661,
        -0.06224916, -0.17921931]])

In [44]:

# Apply preprocessing to both train and test data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)


In [45]:
# Train a KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=1500)  # You can adjust 'n_neighbors' as needed
knn_classifier.fit(X_train_preprocessed, y_train)

# Predict probabilities on the test set for class 1
y_pred_proba_class_1 = knn_classifier.predict_proba(X_test_preprocessed)[:, 1]



In [46]:
# Create a DataFrame with record ID and predicted probabilities for class 1
output_df = pd.DataFrame({'RecordID': df_test['RecordID'], 'probability_class_1': y_pred_proba_class_1})

# Save the DataFrame to a CSV file
output_df.to_csv("predicted_probabilities_class_1.csv", index=False)