In [48]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report



In [49]:
# Load the two CSV inputs: the training set and the test set to predict on.
train_csv = '/content/Train Dataset.csv'
test_csv = '/content/Test Dataset.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [50]:
# Separate predictors from the target. The identifier column is removed from
# both feature frames; the test identifiers are kept aside for later use.
id_col = 'patient_id'
target_col = 'heart_attack_risk'

y = train[target_col]
X = train.drop(columns=[id_col, target_col])

test_ids = test[id_col]
X_test = test.drop(columns=[id_col])


In [51]:
# Partition the feature columns by dtype for the two preprocessing branches.
# Numeric and boolean columns form the numeric group; every remaining dtype
# (object, category, pandas string, ...) is treated as categorical.
# The numeric group is selected positively instead of via "everything that is
# not object", so non-numeric, non-object columns are never routed into the
# median-imputed / scaled numeric branch, which cannot process them.
numeric_dtypes = ['number', 'bool']
num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(exclude=numeric_dtypes).columns

In [52]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' is set on the encoder).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each branch to its own group of columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [53]:
# Full model: preprocessing followed by a class-weighted random forest.
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

In [54]:
# Hold out 20% of the training data for validation, stratified on the target
# and seeded so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [60]:
# Candidate hyperparameters for the pipeline's 'classifier' step.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [8, 12, None],
    'classifier__min_samples_split': [2, 5, 10],
}

# Randomized search: 5 sampled candidates, 3-fold CV, scored on recall.
search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=5,
    cv=3,
    scoring='recall',
    n_jobs=-1,
    random_state=42,
)

# fit() returns the search object itself, so the best pipeline can be read
# straight off the fitted result.
best_model = search.fit(X_train, y_train).best_estimator_
print("Best parameters:", search.best_params_)

Best parameters: {'classifier__n_estimators': 200, 'classifier__min_samples_split': 5, 'classifier__max_depth': 8}


In [70]:
# Score the tuned pipeline on the held-out validation split.
y_pred = best_model.predict(X_val)
y_proba = best_model.predict_proba(X_val)[:, 1]  # second probability column

# Metrics computed from the hard class predictions, printed in a fixed order.
label_metrics = (
    ("Accuracy  = ", accuracy_score),
    ("Precision = ", precision_score),
    ("Recall    = ", recall_score),
    ("F1-score  = ", f1_score),
)
for metric_label, metric_fn in label_metrics:
    print(metric_label, metric_fn(y_val, y_pred))

# ROC-AUC is computed from the predicted probabilities instead of the labels.
print("ROC-AUC   = ", roc_auc_score(y_val, y_proba))

report = classification_report(y_val, y_pred)
print("\nClassification Report:\n\n", report)


Accuracy  =  0.5568110483364721
Precision =  0.3244444444444444
Recall    =  0.2664233576642336
F1-score  =  0.2925851703406814
ROC-AUC   =  0.48832815283064995

Classification Report:

               precision    recall  f1-score   support

           0       0.65      0.71      0.68      1045
           1       0.32      0.27      0.29       548

    accuracy                           0.56      1593
   macro avg       0.49      0.49      0.48      1593
weighted avg       0.54      0.56      0.54      1593



In [72]:
# Predict on the test features and write the identifier/prediction pairs to
# CSV without the DataFrame index.
test_pred = best_model.predict(X_test)

submission_columns = {
    'patient_id': test_ids,
    'heart_attack_risk': test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('EM02_cloud9_Predictions.csv', index=False)