In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint

In [None]:
file_path = "../data/heart_disease_selected_features.csv"
df = pd.read_csv(file_path)

In [3]:
X = df.drop(columns="target")
y = df["target"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [5]:
# -------------------------------
# 1️⃣ Logistic Regression - GridSearchCV
# -------------------------------
log_reg = LogisticRegression(solver='liblinear')
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"]
}

In [6]:
grid_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring="accuracy")
grid_lr.fit(X_train, y_train)
best_lr = grid_lr.best_estimator_
print("Best Logistic Regression:", grid_lr.best_params_)

Best Logistic Regression: {'C': 0.1, 'penalty': 'l2'}


In [7]:
# -------------------------------
# 2️⃣ Decision Tree - RandomizedSearchCV
# -------------------------------
tree = DecisionTreeClassifier(random_state=42)
param_dist_tree = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "criterion": ["gini", "entropy"]
}

In [8]:
rand_tree = RandomizedSearchCV(tree, param_distributions=param_dist_tree, n_iter=20, cv=5, random_state=42, scoring="accuracy")
rand_tree.fit(X_train, y_train)
best_tree = rand_tree.best_estimator_
print("Best Decision Tree:", rand_tree.best_params_)

Best Decision Tree: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 9}


In [9]:
# -------------------------------
# 3️⃣ Random Forest - RandomizedSearchCV
# -------------------------------
rf = RandomForestClassifier(random_state=42)
param_dist_rf = {
    "n_estimators": randint(50, 200),
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "bootstrap": [True, False]
}

In [10]:
rand_rf = RandomizedSearchCV(rf, param_distributions=param_dist_rf, n_iter=20, cv=5, random_state=42, scoring="accuracy")
rand_rf.fit(X_train, y_train)
best_rf = rand_rf.best_estimator_
print("Best Random Forest:", rand_rf.best_params_)

Best Random Forest: {'bootstrap': True, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 99}


In [11]:
# -------------------------------
# 4️⃣ SVM - GridSearchCV
# -------------------------------
svm = SVC(probability=True)
param_grid_svm = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"]
}

In [12]:
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring="accuracy")
grid_svm.fit(X_train, y_train)
best_svm = grid_svm.best_estimator_
print("Best SVM:", grid_svm.best_params_)

Best SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [13]:
# -------------------------------
# 5️⃣ تقييم كل موديل
# -------------------------------
models = {
    "Logistic Regression": best_lr,
    "Decision Tree": best_tree,
    "Random Forest": best_rf,
    "SVM": best_svm
}

In [14]:
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    results.append({"Model": name, "Accuracy": acc})



Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.97      0.88        33
           1       0.50      0.18      0.27        11
           2       0.29      0.29      0.29         7
           3       0.33      0.43      0.38         7
           4       0.00      0.00      0.00         3

    accuracy                           0.64        61
   macro avg       0.38      0.37      0.36        61
weighted avg       0.59      0.64      0.60        61


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.67      0.71        33
           1       0.12      0.18      0.15        11
           2       0.00      0.00      0.00         7
           3       0.17      0.14      0.15         7
           4       0.00      0.00      0.00         3

    accuracy                           0.41        61
   macro avg       0.21      0.20      0.20      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)

In [16]:
print("📊 Model Performance:")
print(results_df)

📊 Model Performance:
                 Model  Accuracy
0  Logistic Regression  0.639344
3                  SVM  0.606557
2        Random Forest  0.557377
1        Decision Tree  0.409836


In [17]:
rf = RandomForestClassifier(random_state=42)
param_dist_rf = {
    "n_estimators": randint(50, 200),
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "bootstrap": [True, False]
}

rand_rf = RandomizedSearchCV(
    rf,
    param_distributions=param_dist_rf,
    n_iter=20,
    cv=5,
    random_state=42,
    scoring="accuracy"
)
rand_rf.fit(X_train, y_train)

best_rf = rand_rf.best_estimator_
print("Best Random Forest:", rand_rf.best_params_)

Best Random Forest: {'bootstrap': True, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 99}


In [None]:
results_df.to_csv("../data/hyperparameter_tuning_results.csv", index=False)
print("✔ Results saved in ./data/hyperparameter_tuning_results.csv")

✔ Results saved in hyperparameter_tuning_results.csv


In [19]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [20]:

best_model = best_rf
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", best_model)
])

pipeline.fit(X, y)

In [None]:
model_filename = "../models/final_model.pkl"
joblib.dump(pipeline, model_filename)

print(f"model saved successfully")

✔ تم حفظ الموديل في ملف: final_model.pkl
