In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd

# Load heart disease dataset
heart_df = pd.read_csv(r"C:\Users\devar\Downloads\heart_project\data\heart.csv")
print("✅ Dataset Loaded Successfully!")

# Basic overview
print("🔎 Shape:", heart_df.shape)
print("🧾 Columns:", heart_df.columns.tolist())
heart_df.head()



# Summary statistics
heart_df.describe()

# Check for missing values
print("\n🔍 Missing Values:\n", heart_df.isnull().sum())

# Check data types
print("\n🧬 Data Types:\n", heart_df.dtypes)


heart_df['age_bin'] = pd.cut(heart_df['Age'], bins=[0, 40, 50, 60, 70, 100], labels=['<40', '40-50', '50-60', '60-70', '70+'])


heart_df['Sex'] = heart_df['Sex'].map({'M': 1, 'F': 0})
heart_df['ExerciseAngina'] = heart_df['ExerciseAngina'].map({'Y': 1, 'N': 0})

# 2. Ordinal Encoding for 'ST_Slope'
st_slope_mapping = {'Up': 2, 'Flat': 1, 'Down': 0}
heart_df['ST_Slope'] = heart_df['ST_Slope'].map(st_slope_mapping)

# 3. One-hot Encoding for 'ChestPainType' and 'RestingECG'
heart_df = pd.get_dummies(heart_df, columns=['ChestPainType', 'RestingECG'], drop_first=True)

# 4. Derived Feature: Heart Rate Reserve
heart_df['HR_Reserve'] = 220 - heart_df['Age'] - heart_df['MaxHR']

# 5. Skipping BMI or cholesterol ratios since BMI not present

# 6. Drop 'age_bin' (optional — not useful for most ML models)
if 'age_bin' in heart_df.columns:
    heart_df.drop('age_bin', axis=1, inplace=True)


# Convert all boolean columns to integers (0/1)
bool_cols = heart_df.select_dtypes(include=['bool']).columns
heart_df[bool_cols] = heart_df[bool_cols].astype(int)


from sklearn.preprocessing import StandardScaler

# Select features (exclude target column)
X = heart_df.drop('HeartDisease', axis=1)
y = heart_df['HeartDisease']

# Apply StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame (optional)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

import joblib
joblib.dump(scaler, 'heart_scaler.pkl')



X_scaled['HeartDisease'] = y

# Save normalized dataset
X_scaled.to_csv("heart_processed.csv", index=False)

# Show top 5 rows
print(X_scaled.head())


def train_heart_model():
    heart_df = pd.read_csv("heart_processed.csv")
    X = heart_df.drop("HeartDisease", axis=1)
    y = heart_df["HeartDisease"]

    pipeline = ImbPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('model', RandomForestClassifier(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    print("\n🫀 Heart Disease Model:")
    print(classification_report(y_test, y_pred))

    joblib.dump(pipeline, "heart_model.pkl")


if __name__ == "__main__":
    train_heart_model()

✅ Dataset Loaded Successfully!
🔎 Shape: (918, 12)
🧾 Columns: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']

🔍 Missing Values:
 Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

🧬 Data Types:
 Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object
        Age       Sex  RestingBP  Cholesterol  FastingBS     MaxHR  \
0 -1.433140  0.515952   0.410909     0.825070  -0.551341  1.382928   
1 -0.478484 -1.938163   1.49175