In [1]:
# Import data
import pandas as pd
df = pd.read_csv("../dataset/heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [2]:
# Features and target variables
X = df.drop("HeartDisease",axis=1)
y = df["HeartDisease"]

In [8]:
# Categorical variables 
X.ChestPainType.unique(), X.RestingECG.unique(), X.ExerciseAngina.unique() , X.ST_Slope.unique()

(array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object),
 array(['Normal', 'ST', 'LVH'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['Up', 'Flat', 'Down'], dtype=object))

In [6]:
# Missing values
X.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64

In [57]:
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from dense_transformer import DenseTransformer
import warnings
warnings.filterwarnings("ignore")

# Splitting features into categories and numerics
numerics = ["Age","RestingBP","Cholesterol","FastingBS","MaxHR","Oldpeak"]
categoricals = ["Sex","ChestPainType","RestingECG" , "ExerciseAngina"]
ordinals = ["ST_Slope"]


# Numeric transformer
num_transformer = Pipeline(steps=[
    ("Normalization", MinMaxScaler())
])
# Categorical transformation
cat_transformer = Pipeline(steps=[
    ("Encoder", OneHotEncoder())
])

ord_transformer = Pipeline(steps=[

     ("OrdinalEncoder" , OrdinalEncoder())
])


# Column transformer 
col_transformer = ColumnTransformer( transformers=[
    ("numeric", num_transformer, numerics),
    ("categorics", cat_transformer, categoricals),
    ("ordinals" , ord_transformer , ordinals)
])

#Dimensionality reduction
pca = PCA()
lda = LDA()

# Pipeline
pipe = Pipeline( steps=[
    ("preprocessing", col_transformer),
    ("dimensionReduction" , pca),
    ("model", LogisticRegression())
])
# Grid params
grid_params = {
    "dimensionReduction" : [pca,lda],
    "dimensionReduction__n_components" : range(1,11),
    "model__C" : range(1,11),
    "model__solver" : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    "model__multi_class" : ['auto', 'ovr', 'multinomial']
}
# grid searching for K tuning
grid = GridSearchCV(pipe,grid_params)
# Diagram 
set_config(display="diagram")
grid

In [29]:
# Fitting the data to the grid search process
fitted = grid.fit(X,y)

In [30]:
# Inspecting results
result_df = pd.DataFrame(fitted.cv_results_).drop(["mean_fit_time","std_fit_time",
                                       "mean_score_time","std_score_time",
                                       "params","split0_test_score"	,"split1_test_score",
                                       "split2_test_score","split3_test_score","split4_test_score"],axis=1)

result_df.sort_values(by=["rank_test_score"])

Unnamed: 0,param_dimensionReduction,param_dimensionReduction__n_components,param_model__C,param_model__multi_class,param_model__solver,mean_test_score,std_test_score,rank_test_score
458,PCA(n_components=4),4,1,ovr,sag,0.848521,0.048366,1
450,PCA(n_components=4),4,1,auto,newton-cg,0.848521,0.048366,1
451,PCA(n_components=4),4,1,auto,lbfgs,0.848521,0.048366,1
452,PCA(n_components=4),4,1,auto,liblinear,0.848521,0.048366,1
453,PCA(n_components=4),4,1,auto,sag,0.848521,0.048366,1
...,...,...,...,...,...,...,...,...
2021,LinearDiscriminantAnalysis(),4,5,multinomial,lbfgs,,,2996
2020,LinearDiscriminantAnalysis(),4,5,multinomial,newton-cg,,,2997
2019,LinearDiscriminantAnalysis(),4,5,ovr,saga,,,2998
2017,LinearDiscriminantAnalysis(),4,5,ovr,liblinear,,,2999


In [31]:
# Best parameters and best score
"Best parameters : " , fitted.best_params_ , "Best score : " , fitted.best_score_

('Best parameters : ',
 {'dimensionReduction': PCA(n_components=4),
  'dimensionReduction__n_components': 4,
  'model__C': 1,
  'model__multi_class': 'auto',
  'model__solver': 'newton-cg'},
 'Best score : ',
 0.84852102637206)

In [35]:
# Confusion matrix using the best estimator from the grid search
from sklearn.metrics import confusion_matrix
final_model = fitted.best_estimator_
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 0)
final_model.fit(X_train,y_train)
y_pred = final_model.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
cm

array([[62, 15],
       [16, 91]])