<a href="https://colab.research.google.com/github/Nandini-1811/healthcare_prediction/blob/main/Copy_of_Heathcare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# ============================
# 1) Load dataset
# ============================
# Read the preprocessed CSV (path is relative to the working directory)
# and show its (rows, columns) shape.
df = pd.read_csv("processed_full_dataset.csv")
print(f"Loaded: {df.shape}")

# Name of the label column, followed by the number of rows in each class.
target = "Outcome"
print(f"Target column: {target}")
class_counts = df[target].value_counts()
print(class_counts)


# ============================
# 2) Report missing values in key numeric columns
# ============================
# Nothing is imputed here on purpose: fitting an imputer on the full frame
# would let test-set rows influence the fill values. Medians are learned from
# the training split only, by the "imputer" step of the pipeline in step 4.
for col in ["bmi", "BMI", "Glucose"]:
    if col in df.columns:
        print(f"Missing values in {col}: {df[col].isna().sum()}")


# ============================
# 3) Identify numeric & categorical columns
# ============================
# Work on the feature frame (target already dropped) so the column lists can
# never contain the label, whatever its dtype is.
X = df.drop(columns=[target])
y = df[target]

numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# ColumnTransformer drops every column it is not told about; surface that
# instead of losing features silently.
assigned_cols = set(numeric_cols) | set(categorical_cols)
unassigned_cols = [c for c in X.columns if c not in assigned_cols]
if unassigned_cols:
    print("WARNING: columns ignored by the preprocessor:", unassigned_cols)


# ============================
# 4) Preprocessing pipeline
# ============================
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)


# ============================
# 5) Split dataset
# ============================
# Stratify so the rare positive class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train class counts before SMOTE:", y_train.value_counts().to_dict())


# ============================
# 6) SMOTE oversampling
# ============================
# The sampler is only configured here; it runs inside the pipeline below,
# after imputation/scaling/encoding. SMOTE interpolates between nearest
# neighbours, so it needs complete, numeric, comparably-scaled features.
sm = SMOTE(random_state=42)


# ============================
# 7) Final pipeline (preprocess + SMOTE + model)
# ============================
# imblearn's Pipeline applies sampler steps during fit only; predict() on the
# test split goes through the preprocessor and classifier without resampling.
model = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", sm),
    ("classifier", RandomForestClassifier(n_estimators=300, random_state=42))
])


# ============================
# 8) Train the model
# ============================
# Fit on the original training split: the preprocessor statistics come from
# real training rows, and oversampling happens on the transformed features.
model.fit(X_train, y_train)
print("Model training complete.")


# ============================
# 9) Evaluate
# ============================
# Predict labels for the held-out split with the fitted pipeline.
y_pred = model.predict(X_test)

# Counts of (true label, predicted label) pairs on the test split.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 plus the overall averages.
report = classification_report(y_test, y_pred)
print("\nClassification Report:", report, sep="\n")


Loaded: (6798, 16)
Target column: Outcome
Outcome
0    6530
1     268
Name: count, dtype: int64
Imputed median for column: bmi
Imputed median for column: BMI
Imputed median for column: Glucose
Numeric columns: ['Age', 'Glucose', 'BloodPressure', 'BMI', 'age', 'chol', 'num', 'avg_glucose_level', 'bmi', 'stroke', 'sex_Female', 'sex_Male', 'gender_Female', 'gender_Male', 'gender_Other']
Categorical columns: []
After SMOTE: [5224 5224]
Model training complete.

Confusion Matrix:
[[1270   36]
 [   5   49]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1306
           1       0.58      0.91      0.71        54

    accuracy                           0.97      1360
   macro avg       0.79      0.94      0.84      1360
weighted avg       0.98      0.97      0.97      1360

