In [16]:
import warnings

import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): this blanket filter silences every warning category raised in
# this process, which includes scikit-learn's convergence and failed-fit
# warnings. Consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')  

In [17]:
# Load the feature-selected heart-disease table.
# A forward-slash relative path is used so the notebook runs on Windows, Linux
# and macOS alike (a backslash separator only resolves on Windows).
heart_disease = pd.read_csv("Data/heart_disease_selected_feature.csv")

# Drop the first column by position.
# NOTE(review): presumably this is the unnamed index column written by a
# previous `to_csv` call — confirm against the CSV header.
heart_disease = heart_disease.iloc[:, 1:]

# Split the data into features and target.
# The target is removed by name so the split stays correct (no target leakage
# into the features) even if 'num' is not the last column of the file.
x = heart_disease.drop(columns=["num"])
y = heart_disease["num"]

In [26]:

# 1️⃣ Hold out 20% of the rows as a test set; stratify so every class keeps
#    the same proportion in both splits, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# 2️⃣ Hyper-parameter grids, one per model family
param_logreg = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2'],
    'max_iter': [100, 200],
    'class_weight': [None, 'balanced']
}

param_dt = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt']
}

param_svm = {
    # C must be strictly positive for SVC. The previous grid also contained 0
    # (from "10,0"), which made every C=0 candidate fail to fit.
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1],
    'kernel': ['rbf'],
    'shrinking': [True, False],
    'probability': [True]
}

param_rf = {
    'n_estimators': [100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True]
}

# 3️⃣ Initialize models
# Every estimator gets a fixed seed so that cross-validation scores and the
# final refit are reproducible across "Restart & Run All".
models = {
    "Logistic Regression": (LogisticRegression(random_state=42), param_logreg),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), param_dt),
    "SVM (RBF)": (SVC(random_state=42), param_svm),
    "Random Forest": (RandomForestClassifier(random_state=42), param_rf)
}

# Keep every fitted search, keyed by model name, so later cells can look up a
# specific model's result instead of only the last `grid` left by the loop.
fitted_searches = {}

# 4️⃣ Tune each model with 5-fold CV on the training split, then evaluate the
#    winner on the held-out test split.
for name, (model, params) in models.items():
    print(f"\n🔷 {name}")

    # GridSearchCV
    grid = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    fitted_searches[name] = grid

    best_params = grid.best_params_

    # With the default refit=True, GridSearchCV has already re-trained the best
    # configuration on the full training set. Using that estimator avoids a
    # redundant fit and keeps constructor arguments that are not part of the
    # grid (e.g. random_state), which `type(estimator)(**best_params)` dropped.
    model_best = grid.best_estimator_
    y_pred = model_best.predict(X_test)
    print("✅ Best Params:", grid.best_params_)
    print("✅ Test Accuracy:", model_best.score(X_test, y_test))
    print(classification_report(y_test, y_pred))

Train shape: (242, 6), Test shape: (61, 6)

🔷 Logistic Regression
✅ Best Params: {'C': 10, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
✅ Test Accuracy: 0.6229508196721312
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        33
           1       0.44      0.36      0.40        11
           2       0.25      0.14      0.18         7
           3       0.29      0.29      0.29         7
           4       0.00      0.00      0.00         3

    accuracy                           0.62        61
   macro avg       0.36      0.35      0.35        61
weighted avg       0.57      0.62      0.59        61


🔷 Decision Tree
✅ Best Params: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}
✅ Test Accuracy: 0.6229508196721312
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       

In [29]:
def fit_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit a classifier, print its evaluation, and return the results.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_fit, y_fit
        Training features and labels passed to ``model.fit``.
    X_eval, y_eval
        Evaluation features and true labels.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the evaluation set.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)

    print(title)
    print("Accuracy:", accuracy)
    print(classification_report(y_eval, predictions))
    return predictions, accuracy


# 1️⃣ Train Dummy Classifier (Baseline): always predicts the majority class
dummy = DummyClassifier(strategy='most_frequent')
y_dummy_pred, dummy_acc = fit_and_report(
    "🔴 Baseline Dummy Classifier (Most Frequent)",
    dummy, X_train, y_train, X_test, y_test,
)

# 2️⃣ Train Logistic Regression (Default)
default_model = LogisticRegression()
y_default_pred, default_acc = fit_and_report(
    "🔵 Logistic Regression (Default)",
    default_model, X_train, y_train, X_test, y_test,
)

# 3️⃣ Train Logistic Regression (Best Params from GridSearch)
# The values below are copied from the Logistic Regression grid-search output.
# They are deliberately NOT read from `grid.best_params_`: after the tuning
# loop, `grid` refers to the last model searched (Random Forest), not to
# Logistic Regression.
log_best = LogisticRegression(C=10, max_iter=100, penalty='l2', solver='liblinear')
y_best_pred, best_acc = fit_and_report(
    "🟢 Logistic Regression (Best Model)",
    log_best, X_train, y_train, X_test, y_test,
)

# 4️⃣ Show improvement
improvement = best_acc - dummy_acc
print(f"📈 Model improvement over Dummy: {improvement:.4f} (from {dummy_acc:.4f} to {best_acc:.4f})")


🔴 Baseline Dummy Classifier (Most Frequent)
Accuracy: 0.5409836065573771
              precision    recall  f1-score   support

           0       0.54      1.00      0.70        33
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.54        61
   macro avg       0.11      0.20      0.14        61
weighted avg       0.29      0.54      0.38        61

🔵 Logistic Regression (Default)
Accuracy: 0.6229508196721312
              precision    recall  f1-score   support

           0       0.81      0.91      0.86        33
           1       0.40      0.36      0.38        11
           2       0.25      0.14      0.18         7
           3       0.38      0.43      0.40         7
           4       0.00      0.00      0.00         3

    accuracy                           0.62        