In [4]:
# We need to first import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from scipy import stats
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [5]:
data = pd.read_csv("dataset.csv")
data.head()

Unnamed: 0,Age,Sex,cp,trestbps,Cholesterol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
import pandas as pd

# Define 'categorical_val' as a list of the categorical column names you want to one-hot encode
categorical_val = ['Sex','cp','fbs','restecg','exang','slope','ca','thal','target']

# Assuming 'data' is your DataFrame
# Remove the 'target' column from the list of categorical values
categorical_val.remove('target')

# Use get_dummies to one-hot encode the categorical columns
dataset = pd.get_dummies(data, columns=categorical_val)

# Print the resulting DataFrame
dataset

Unnamed: 0,Age,trestbps,Cholesterol,thalach,oldpeak,target,Sex_0,Sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,63,145,233,150,2.3,1,False,True,False,False,...,False,True,False,False,False,False,False,True,False,False
1,37,130,250,187,3.5,1,False,True,False,False,...,False,True,False,False,False,False,False,False,True,False
2,41,130,204,172,1.4,1,True,False,False,True,...,True,True,False,False,False,False,False,False,True,False
3,56,120,236,178,0.8,1,False,True,False,True,...,True,True,False,False,False,False,False,False,True,False
4,57,120,354,163,0.6,1,True,False,True,False,...,True,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,0,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
299,45,110,264,132,1.2,0,False,True,False,False,...,False,True,False,False,False,False,False,False,False,True
300,68,144,193,141,3.4,0,False,True,True,False,...,False,False,False,True,False,False,False,False,False,True
301,57,130,131,115,1.2,0,False,True,True,False,...,False,False,True,False,False,False,False,False,False,True


In [7]:
from sklearn.preprocessing import StandardScaler
s_sc = StandardScaler()
col_to_scale = ['Age', 'trestbps', 'Cholesterol', 'thalach', 'oldpeak']
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])

In [8]:
from sklearn.model_selection import train_test_split
X = dataset.drop('target', axis=1)
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Support Vector Machine

In [9]:
from sklearn.svm import SVC
svm_clf = SVC(kernel='rbf', gamma='auto', C=1.0)
svm_clf.fit(X_train, y_train)

In [10]:
pred = svm_clf.predict(X_train)
from sklearn.metrics import accuracy_score
print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")

Accuracy Score: 87.26%


#### Support Vector Machine Hyper-parameter Tuning

In [22]:
from sklearn.model_selection import GridSearchCV
svm_clf = SVC(kernel='rbf', gamma=0.1, C=1.0)

params = {"C":(0.1, 0.5, 1, 2, 5, 10, 20),
          "gamma":(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
          "kernel":('linear', 'poly', 'rbf')}

svm_cv = GridSearchCV(svm_clf, params, n_jobs=-1, cv=5, verbose=1, scoring="accuracy")
svm_cv.fit(X_train, y_train)
best_params = svm_cv.best_params_
print(f"Best params: {best_params}")
svm_clf = SVC(**best_params)
svm_clf.fit(X_train, y_train)

Fitting 5 folds for each of 147 candidates, totalling 735 fits
Best params: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}


In [23]:
predsvm = svm_clf.predict(X_train)
print(f"Accuracy Score: {accuracy_score(y_train, predsvm) * 100:.2f}%")

Accuracy Score: 87.74%


### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_clf.fit(X_train, y_train)

In [25]:
predrf = rf_clf.predict(X_test)
print(f"Accuracy Score:{accuracy_score(y_test,predrf)*100:.2f}%")

Accuracy Score:85.71%


In a random forest algorithm giving 100% accuracy i.e. maybe the model is learned very precisely and may not be good in testing or real data.

So, we can do hyperparameter tuning to set the best parameter for all the cases.

#### Random Forest Classifier Hyperparameter Tuning

In [15]:
from sklearn.model_selection import GridSearchCV
svm_clf = SVC(kernel='rbf', gamma=0.1, C=1.0)
params = {"C":(0.1, 0.5, 1, 2, 5, 10, 20),
          "gamma":(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
          "kernel":('linear', 'poly', 'rbf')}
svm_cv = GridSearchCV(svm_clf, params, n_jobs=-1, cv=5, verbose=1, scoring="accuracy")
svm_cv.fit(X_train, y_train)
best_params = svm_cv.best_params_
print(f"Best params: {best_params}")
svm_clf = SVC(**best_params)
svm_clf.fit(X_train, y_train)

Fitting 5 folds for each of 147 candidates, totalling 735 fits


Best params: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}


In [24]:
# Check the accuracy of the model after hyperparameter tuning.

predsvm = svm_clf.predict(X_train)
print(f"Accuracy Score: {accuracy_score(y_train, predsvm) * 100:.2f}%")

Accuracy Score: 84.62%


In [17]:
n_estimators = [500, 900, 1100, 1500]
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
max_depth=[2,3,4]

params_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
              }
rf_clf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf_clf, params_grid, scoring="accuracy", cv=3,
                     verbose=1, n_jobs=-1)
rf_cv.fit(X_train, y_train)
best_params = rf_cv.best_params_
rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(X_train, y_train)
print(f"Best parameters: {best_params}")

Fitting 3 folds for each of 216 candidates, totalling 648 fits


Best parameters: {'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1100}
