In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#load
X_train = np.load("X_train.npy",allow_pickle=True)
X_test = np.load("X_test.npy",allow_pickle=True)
Y_train = np.load("Y_train.npy",allow_pickle=True)
Y_test = np.load("Y_test.npy",allow_pickle=True)
# Logistic Regression
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],        # keep it simple (avoid elasticnet issues)
    "solver": ["liblinear", "saga"] # both support l1 & l2
}

# SVM
param_grid_svm = {
    "C": [0.1, 1, 10],        
    "gamma": [0.01, 0.1],      
    "kernel": ["linear", "rbf"]  
}


# Random Forest
param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

# Decision Tree
param_grid_dt = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
# i have got a warning in the random forest so this step is to check and clean 
# If X_train is a DataFrame
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.apply(pd.to_numeric, errors="coerce")
    X_test = X_test.apply(pd.to_numeric, errors="coerce")

# If it's a NumPy array with object dtype
if isinstance(X_train, np.ndarray) and X_train.dtype == "object":
    X_train = X_train.astype(float)
    X_test = X_test.astype(float)

# Double check for NaN/inf
print("NaN in train:", np.isnan(X_train).any())
print("NaN in test:", np.isnan(X_test).any())
print("Inf in train:", np.isinf(X_train).any())
print("Inf in test:", np.isinf(X_test).any())


NaN in train: False
NaN in test: False
Inf in train: False
Inf in test: False


In [4]:
# Dictionary of models and param grids
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000), param_grid_lr),
    "Random Forest": (RandomForestClassifier(random_state=42), param_grid_rf),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), param_grid_dt),
    "SVM": (SVC(probability=True), param_grid_svm)
}

for name, (model, param_grid) in models.items():
    print(f"\n🔹 {name} 🔹")
    
    # GridSearchCV
    grid = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, Y_train)
    print("Best (GridSearchCV):", grid.best_params_)
    print("Accuracy:", grid.score(X_test, Y_test))
    
    # RandomizedSearchCV
    rand = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5, scoring="accuracy", n_jobs=-1, random_state=42)
    rand.fit(X_train, Y_train)
    print("Best (RandomizedSearchCV):", rand.best_params_)
    print("Accuracy:", rand.score(X_test, Y_test))
3


🔹 Logistic Regression 🔹
Best (GridSearchCV): {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.8152173913043478
Best (RandomizedSearchCV): {'solver': 'liblinear', 'penalty': 'l1', 'C': 1}
Accuracy: 0.8097826086956522

🔹 Random Forest 🔹


  _data = np.array(data, dtype=dtype, copy=copy,


Best (GridSearchCV): {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.8478260869565217
Best (RandomizedSearchCV): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10, 'bootstrap': False}
Accuracy: 0.8586956521739131

🔹 Decision Tree 🔹
Best (GridSearchCV): {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy: 0.7391304347826086
Best (RandomizedSearchCV): {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5, 'criterion': 'gini'}
Accuracy: 0.7608695652173914

🔹 SVM 🔹
Best (GridSearchCV): {'C': 10, 'gamma': 0.01, 'kernel': 'linear'}
Accuracy: 0.8206521739130435
Best (RandomizedSearchCV): {'kernel': 'linear', 'gamma': 0.1, 'C': 10}
Accuracy: 0.8206521739130435


3

In [5]:
# model export
import joblib
from sklearn.metrics import classification_report, confusion_matrix
best_params_rf={'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10, 'bootstrap': False}
# Train final model
final_model = RandomForestClassifier(**best_params_rf, random_state=42)
final_model.fit(X_train, Y_train)

# Evaluate
y_pred = final_model.predict(X_test)
accuracy = final_model.score(X_test, Y_test)
print("Final Random Forest Accuracy:", accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(Y_test, y_pred))
print("\nClassification Report:\n", classification_report(Y_test, y_pred))
#exporting model
joblib.dump(final_model, "final_model.pkl")
print("Model saved as final_model.pkl")

Final Random Forest Accuracy: 0.8586956521739131

Confusion Matrix:
 [[62 13]
 [13 96]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83        75
           1       0.88      0.88      0.88       109

    accuracy                           0.86       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.86      0.86      0.86       184

Model saved as final_model.pkl


In [6]:
#exporting pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Final chosen features
final_feature = [
    'thalch', 'age', 'oldpeak', 'chol', 'sex_Male',
    'cp_non-anginal', 'fbs', 'cp_typical angina',
    'restecg_st-t abnormality', 'exang',
    'slope_upsloping', 'cp_atypical angina'
]

# Preprocessing
preprocessor = ColumnTransformer([
    ("scaler", StandardScaler(), final_feature)
], remainder="drop") 

# Pipeline with your tuned Random Forest
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=4,
        bootstrap=False,
        random_state=42
    ))
])
X_train = pd.DataFrame(X_train, columns=final_feature)
X_test = pd.DataFrame(X_test, columns=final_feature)
# Fit on your training set
pipeline.fit(X_train, Y_train)

# Evaluate
print("Pipeline accuracy:", pipeline.score(X_test, Y_test))
#saving
joblib.dump(pipeline, "final_pipeline.pkl")
print("pipeline saved as final_pipeline.pk1")



Pipeline accuracy: 0.8586956521739131
pipeline saved as final_pipeline.pk1
