In [10]:
# import statements: These import necessary libraries and modules.
# pandas (pd): Used for data manipulation and analysis, particularly for handling data in tabular form (DataFrames).
# numpy (np): Used for numerical operations, often with arrays.
# os: Used for interacting with the operating system (file paths, directories).
# pickle: Used for serializing (saving) and deserializing (loading) Python objects like models.
# train_test_split: A method from sklearn to split data into training and test sets.
# RandomForestClassifier: A machine learning algorithm used to build a classification model.
# StandardScaler: A preprocessing technique that scales features to have a mean of 0 and standard deviation of 1.
# classification_report: Generates a report that shows precision, recall, and F1 score for a classification model.

import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Artifact locations. Every file lives directly under BASE_DIR, which is the
# current working directory (".").
#   DATA_PATH     - input CSV dataset
#   MODEL_PATH    - pickle file the trained model is written to
#   SCALER_PATH   - pickle file the fitted scaler is written to
#   FEATURES_PATH - pickle file the list of feature names is written to
BASE_DIR = "."


def _in_base_dir(filename):
    """Return the path of *filename* inside BASE_DIR."""
    return os.path.join(BASE_DIR, filename)


DATA_PATH = _in_base_dir("healthcare_dataset.csv")
MODEL_PATH = _in_base_dir("model.pkl")
SCALER_PATH = _in_base_dir("scaler.pkl")
FEATURES_PATH = _in_base_dir("features.pkl")

# Load the dataset into a DataFrame and discard every row that contains at
# least one missing (NaN) value. If the CSV is not present at DATA_PATH, stop
# immediately with a FileNotFoundError that names the expected location.
if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH).dropna()
else:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Please ensure the file is present in {BASE_DIR}.")

# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per category value (e.g. Gender_Male).
# Update this list if the dataset contains other categorical columns.
categorical_columns = [
    'Gender',
    'Diagnosis',
    'Insurance_Type',
]
df = pd.get_dummies(data=df, columns=categorical_columns)

# The target is the Readmission column; the features are all remaining
# columns except Patient_ID.
y = df['Readmission']
X = df.drop(['Readmission', 'Patient_ID'], axis=1)

# Hold out 20% of the rows for testing and train on the other 80%. The fixed
# random_state keeps the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features (zero mean, unit variance per column).
# The scaler learns its statistics from the training split only and then
# applies that same transformation to both splits, so the test rows never
# influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a random forest of 100 trees on the scaled training features and
# their labels. The fixed random_state makes the fitted forest reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Persist the training artifacts so that inference code can reload exactly
# what was produced here: the ordered feature names, the trained model and
# the fitted scaler.

# Make sure the output directory exists BEFORE the first file is written.
# (Previously this call ran only after features.pkl had already been saved,
# so it could not protect that first write.)
os.makedirs(BASE_DIR, exist_ok=True)

# Column order of the training matrix; saved so that later inputs can be
# arranged with the same columns in the same order.
FEATURES = X.columns.tolist()

# Serialize each artifact to its own pickle file.
for artifact_path, artifact in (
    (FEATURES_PATH, FEATURES),
    (MODEL_PATH, model),
    (SCALER_PATH, scaler),
):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

# Evaluate on the held-out split: predict labels for the scaled test features
# and print the classification report (precision, recall, F1 score and
# support per class, plus overall accuracy).
y_pred = model.predict(X_test_scaled)
print("Model Evaluation:\n", classification_report(y_test, y_pred))

# Confirmation messages listing where each artifact was written.
# The check-mark prefix had been corrupted by a UTF-8 / cp1252 decoding
# mix-up; it is restored here as U+2705, written as an escape so the source
# line stays ASCII and cannot be mangled the same way again.
print(f"\u2705 Model saved at: {MODEL_PATH}")
print(f"\u2705 Scaler saved at: {SCALER_PATH}")
print(f"\u2705 Features saved at: {FEATURES_PATH}")


Model Evaluation:
               precision    recall  f1-score   support

           0       0.74      0.95      0.83       147
           1       0.27      0.06      0.09        53

    accuracy                           0.71       200
   macro avg       0.50      0.50      0.46       200
weighted avg       0.61      0.71      0.63       200

✅ Model saved at: .\model.pkl
✅ Scaler saved at: .\scaler.pkl
✅ Features saved at: .\features.pkl
