Hyperparameter Tunning

GridSearch & Randomized Search

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/heart-disease-dataset


In [2]:
import pandas as pd
import os


In [3]:
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")


In [5]:
csv_file = os.path.join(path, "heart.csv")
df = pd.read_csv(csv_file)

df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [10]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline


# Features and target
X = df.drop('target', axis=1)
y = df['target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [12]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict & Evaluate
lr_preds = lr.predict(X_test)
print("Logistic Regression Accuracy (Test):", accuracy_score(y_test, lr_preds))


Logistic Regression Accuracy (Test): 0.7951219512195122


In [14]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predict & Evaluate
rf_preds = rf.predict(X_test)
print("Random Forest Accuracy (Test):", accuracy_score(y_test, rf_preds))


Random Forest Accuracy (Test): 0.9853658536585366


In [15]:
# 5-fold cross-validation
lr_scores = cross_val_score(LogisticRegression(), X, y, cv=5, scoring='accuracy')
rf_scores = cross_val_score(RandomForestClassifier(), X, y, cv=5, scoring='accuracy')

print("Logistic Regression Cross-Val Accuracy Scores:", lr_scores)
print("Mean Logistic Regression CV Accuracy:", lr_scores.mean())

print("Random Forest Cross-Val Accuracy Scores:", rf_scores)
print("Mean Random Forest CV Accuracy:", rf_scores.mean())


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Cross-Val Accuracy Scores: [0.88292683 0.85365854 0.86341463 0.8195122  0.8       ]
Mean Logistic Regression CV Accuracy: 0.8439024390243903
Random Forest Cross-Val Accuracy Scores: [1.         1.         1.         1.         0.98536585]
Mean Random Forest CV Accuracy: 0.9970731707317073


In [13]:
# Logistic Regression
lr = LogisticRegression()

# Hyperparameter grids
lr_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

# Grid Search
lr_grid_search = GridSearchCV(lr, lr_grid, cv=5, scoring='accuracy')
lr_grid_search.fit(X_train, y_train)

print("Best Params (GridSearchCV):", lr_grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, lr_grid_search.predict(X_test)))


Best Params (GridSearchCV): {'C': 1, 'solver': 'liblinear'}
Test Accuracy: 0.7951219512195122


In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Randomized Search
rf_random_search = RandomizedSearchCV(rf, rf_grid, n_iter=5, cv=5, scoring='accuracy', random_state=42)
rf_random_search.fit(X_train, y_train)

print("Best Params (RandomizedSearchCV):", rf_random_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, rf_random_search.predict(X_test)))


Best Params (RandomizedSearchCV): {'n_estimators': 50, 'min_samples_split': 2, 'max_depth': None}
Test Accuracy: 0.9853658536585366


In [9]:
from sklearn.svm import SVC

svc = SVC()

svc_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svc_grid_search = GridSearchCV(svc, svc_grid, cv=5, scoring='accuracy')
svc_grid_search.fit(X_train, y_train)

print("Best Params (GridSearchCV):", svc_grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, svc_grid_search.predict(X_test)))


Best Params (GridSearchCV): {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Test Accuracy: 0.975609756097561
