In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import TomekLinks


# === Step 1: Load and preprocess raw data ===
data = pd.read_csv("heart.csv")  # Replace with actual path

original_features = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                     'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

target = 'target'

# Separate features and target
X = data[original_features]
y = data[target]

# Save original column names
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['cp', 'restecg', 'slope', 'ca', 'thal']

# === Step 2: Create Preprocessing Pipeline ===
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
], remainder='passthrough')  # 'sex', 'fbs', 'exang' pass through

# Apply preprocessing
X_encoded = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

# === Step 3: Split and Resample ===
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Step 1: Apply SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Step 2: Borderline SMOTE
borderline_smote = BorderlineSMOTE(sampling_strategy='all', k_neighbors=15, random_state=42)
X_borderline, y_borderline = borderline_smote.fit_resample(X_smote, y_smote)

# Step 3: Tomek Links
tomek = TomekLinks(sampling_strategy='all')
X_final, y_final = tomek.fit_resample(X_borderline, y_borderline)

# === Step 4: Define Models ===
base_models = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=42)),
    ('svc', SVC(probability=True, kernel='rbf', C=1.0, gamma='scale', random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5, algorithm='auto', p=2)),
    ('log', LogisticRegression(max_iter=2000, C=0.5, solver='lbfgs', random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=42)),
    ('dt', DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=2, random_state=42))
]

meta_model = LogisticRegression(max_iter=2000, C=0.5, solver='lbfgs')
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# === Step 5: Train the Model ===
stacked_model.fit(X_final, y_final)

# === Step 6: Evaluate ===
X_test_scaled = X_test  # Already scaled by preprocessor
y_pred = stacked_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# === Step 7: Wrap everything in a class ===
class HeartDiseasePredictor:
    def __init__(self, model, preprocessor):
        self.model = model
        self.preprocessor = preprocessor
        self.feature_order = original_features

    def predict(self, input_data):
        """
        input_data: pd.DataFrame with original 13 columns (raw, unencoded)
        Returns prediction and probability
        """
        if not all(col in input_data.columns for col in self.feature_order):
            raise ValueError(f"Missing columns in input data. Required: {self.feature_order}")
        transformed = self.preprocessor.transform(input_data[self.feature_order])
        prediction = self.model.predict(transformed)
        probability = self.model.predict_proba(transformed)[:, 1] * 100  # percentage
        return prediction, probability

# Create final object
final_model = HeartDiseasePredictor(model=stacked_model, preprocessor=preprocessor)

# === Step 8: Save Final Model ===
with open("final_heart.pkl", "wb") as file:
    pickle.dump(final_model, file)


Accuracy: 0.9609756097560975
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       102
           1       0.98      0.94      0.96       103

    accuracy                           0.96       205
   macro avg       0.96      0.96      0.96       205
weighted avg       0.96      0.96      0.96       205



In [43]:
import pandas as pd

class HeartDiseasePredictor:
    def __init__(self, model, scaler, dummy_columns):
        self.model = model
        self.scaler = scaler
        self.dummy_columns = dummy_columns  # List of final column names after get_dummies()

    def preprocess(self, df):
        # Step 1: One-hot encode same as training
        df_encoded = pd.get_dummies(df, columns=['cp', 'restecg', 'slope', 'ca', 'thal'])

        # Step 2: Align columns with training data
        for col in self.dummy_columns:
            if col not in df_encoded.columns:
                df_encoded[col] = 0
        df_encoded = df_encoded[self.dummy_columns]

        # Step 3: Scale numeric features
        cols_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
        df_encoded[cols_to_scale] = self.scaler.transform(df_encoded[cols_to_scale])

        return df_encoded

    def predict(self, input_df):
        processed = self.preprocess(input_df)
        prediction = self.model.predict(processed)
        probability = self.model.predict_proba(processed)[0][1] * 100
        return prediction, round(probability, 2)


In [44]:
final_model = HeartDiseasePredictor(model=stacked_model, scaler=scaler, dummy_columns=X.columns.tolist())

import pickle
with open('final_heart.pkl', 'wb') as file:
    pickle.dump(final_model, file)


In [45]:
import pandas as pd
import pickle

# Load the model
with open('final_heart.pkl', 'rb') as file:
    model_obj = pickle.load(file)

# Raw input with 13 features
new_data = pd.DataFrame([{
    'age': 21,
    'sex': 1,
    'cp': 0,
    'trestbps': 115,
    'chol': 170,
    'fbs': 0,
    'restecg': 1,
    'thalach': 180,
    'exang': 0,
    'oldpeak': 0.0,
    'slope': 2,
    'ca': 0,
    'thal': 2
}])

prediction, probability = model_obj.predict(new_data)

print("Prediction:", prediction[0])
print(f"Probability of Heart Disease: {probability}%")


ValueError: X has 13 features, but RandomForestClassifier is expecting 22 features as input.