In [1]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

import pandas as pd


In [2]:
# load data 
df = pd.read_csv('../data/feature_selected.csv')
x= df.drop('num', axis =1)
y =df['num']
#split data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [3]:
#hyperparameter grids 
param_grids ={
    'Random Forest': {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto', 0.01, 0.1],
        'model__kernel': ['rbf', 'poly']
    }
}

In [4]:
#grid search 
best_models={}
for model_name , params in param_grids.items():
    if model_name == 'Random Forest':
        pipeline = Pipeline([
            ('scalar', StandardScaler()),
            ('model', RandomForestClassifier(random_state=42))
        ])
    elif model_name== 'SVM':
        pipeline =Pipeline([
            ('scaler', StandardScaler()),
            ('model', SVC(random_state=42, probability=True))
        ])
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring = 'f1', n_jobs=-1)
    grid_search.fit(x_train , y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"{model_name} best params: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.3f}")
    
    

Random Forest best params: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 200}
Random Forest best score: 0.757
SVM best params: {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
SVM best score: 0.774


In [5]:
#save the best model 
final_model = best_models['SVM']
joblib.dump(final_model, '../models/complete_pipeline.pkl')


['../models/complete_pipeline.pkl']

In [6]:
# Save selected features for the app
selected_features = list(x.columns)
joblib.dump(selected_features, '../models/selected_features.pkl')
print(f"Final model saved: {type(final_model)}")
print(f"Selected features: {selected_features}")


Final model saved: <class 'sklearn.pipeline.Pipeline'>
Selected features: ['oldpeak', 'thal_7.0', 'cp_4', 'exang', 'slope_2']


In [7]:
#test model 
test_prediction = final_model.predict(x_test[:1])
test_probability = final_model.predict_proba(x_test[:1])
print(f"Test prediction: {test_prediction}")
print(f"Test probability: {test_probability}")

Test prediction: [0]
Test probability: [[0.79388396 0.20611604]]
