In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Train and evaluate a random forest on the preprocessed stroke dataset,
# then persist the fitted model to disk.

# Everything except the 'stroke' column is used as a feature.
df = pd.read_csv('final_stroke_dataset.csv')
y = df['stroke']
X = df.drop(columns='stroke')

# 80/20 hold-out split; stratify=y keeps the class ratio of y in both parts,
# and the fixed seed makes the split repeatable.
# NOTE(review): the recorded output shows equal support (498/498) for both
# classes. If the CSV was class-balanced by resampling before this split,
# the held-out metrics may be optimistic -- confirm how the file was built.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Default hyper-parameters; only the seed is pinned. fit() returns the
# estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Held-out metrics, computed first and printed afterwards.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

# Persist the fitted classifier next to the notebook.
joblib.dump(clf, 'rf_model.pkl')
print("\nModel saved as rf_model.pkl")


Accuracy: 0.8905622489959839

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.87      0.89       498
           1       0.87      0.91      0.89       498

    accuracy                           0.89       996
   macro avg       0.89      0.89      0.89       996
weighted avg       0.89      0.89      0.89       996


Confusion Matrix:
 [[432  66]
 [ 43 455]]

Model saved as rf_model.pkl


In [6]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# Fit the preprocessing transformers on the raw stroke dataset and save them.
# NOTE(review): every transformer below is fitted on the full raw file, not on
# a training split -- confirm this matches how 'final_stroke_dataset.csv' (the
# data the model was trained on) was produced.
raw = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Columns handled by this cell.
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
num_cols = ['age', 'avg_glucose_level']

# Categorical columns: integer-encode with a dedicated LabelEncoder, then
# standardise the resulting codes with a dedicated StandardScaler. Columns are
# independent of each other, so both steps run in a single pass per column.
# NOTE(review): standard-scaling label codes treats the categories as ordered
# numbers -- kept as-is because the saved artefacts depend on it.
label_encoders = {}
cat_scalers = {}
for col in cat_cols:
    le = LabelEncoder()
    raw[col] = le.fit_transform(raw[col])
    label_encoders[col] = le

    # Fit on a one-column DataFrame so the scaler receives 2-D input.
    scaler = StandardScaler()
    raw[col] = scaler.fit_transform(raw[[col]])
    cat_scalers[col] = scaler

# Numeric columns: one MinMaxScaler each, paired with num_cols in order
# (age first, then avg_glucose_level).
scaler_age, scaler_glucose = MinMaxScaler(), MinMaxScaler()
for num_col, num_scaler in zip(num_cols, (scaler_age, scaler_glucose)):
    raw[num_col] = num_scaler.fit_transform(raw[[num_col]])

# Write every fitted transformer to its own pickle file.
artifacts = {
    'label_encoders.pkl': label_encoders,   # dict: column -> LabelEncoder
    'cat_scalers.pkl': cat_scalers,         # dict: column -> StandardScaler
    'scaler_age.pkl': scaler_age,           # MinMaxScaler fitted on 'age'
    'scaler_glucose.pkl': scaler_glucose,   # MinMaxScaler fitted on 'avg_glucose_level'
}
for filename, transformer in artifacts.items():
    joblib.dump(transformer, filename)

print("✅ Saved: label_encoders.pkl, cat_scalers.pkl, scaler_age.pkl, scaler_glucose.pkl")


✅ Saved: label_encoders.pkl, cat_scalers.pkl, scaler_age.pkl, scaler_glucose.pkl
