#Import the Dependencies

In [2]:
import numpy as np   
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Data collection and processing

In [3]:
# Read the CSV file into the `heart_data` DataFrame
heart_data = pd.read_csv(filepath_or_buffer='dataset.csv')

# Preview the first five rows to sanity-check the columns and value formats
heart_data.head(n=5)


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
# Summarize the DataFrame: index range, per-column non-null counts, dtypes and memory usage.
# The non-null counts show which columns contain missing values.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [5]:
# Separate the features from the target variable

# Features: every column of the original DataFrame except 'id', 'num' and 'dataset'
X = heart_data.drop(['id', 'num', 'dataset'], axis=1)

# Target: the 'num' column
y = heart_data.loc[:, 'num']

In [6]:
# Identify the categorical and numerical feature columns by dtype.
#
# Numerical: any integer/float width ('number'), not only int64/float64, so a column
# stored as e.g. int32 or float32 is not left out of both groups.
# Categorical: object (string) columns plus genuine bool and category columns. Columns
# such as 'fbs' and 'exang' hold True/False values and are object dtype only because
# they contain missing values; without NaNs they would be bool and must still be encoded.
# Any column that lands in neither group would be dropped by a ColumnTransformer using
# the default remainder='drop', so the selection is kept deliberately broad.

categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns
numerical_cols = X.select_dtypes(include='number').columns

print(f"Categorical : {categorical_cols} \n Numerical : {numerical_cols}")

Categorical : Index(['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], dtype='object') 
 Numerical : Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca'], dtype='object')


In [7]:
# Numerical preprocessing: impute missing values, then standardize

numerical_transformer = Pipeline(
    [
        # fill each missing value with the mean of its column
        ('imputer', SimpleImputer(strategy='mean')),
        # rescale every column to zero mean and unit standard deviation
        ('scaler', StandardScaler()),
    ]
)

In [8]:
# Categorical preprocessing: impute missing values, then one-hot encode

categorical_transformer = Pipeline(
    [
        # fill each missing value with the most frequent value of its column
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # expand every category into its own 0/1 column; categories unseen during fit are ignored
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [9]:
# Route each group of columns through its matching preprocessing pipeline.
# The output columns follow the order of this list: 'num' first, then 'cat'.

preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [10]:
# Apply the transformation to the dataset: fit every preprocessing step on X and
# transform X in a single pass.
# NOTE(review): this fits the imputers, scaler and encoder on all rows of X. If a
# train/test split happens afterwards (train_test_split is imported, but its use is not
# visible in this section), the fitted statistics would include test rows. Consider
# splitting first and fitting on the training part only. TODO confirm.

X_preprocessed = preprocessor.fit_transform(X)

In [14]:
# Rebuild a labelled DataFrame from the preprocessed feature matrix.

# Numerical columns keep their original names
numerical_columns = numerical_cols

# One-hot encoded column names are produced by the fitted encoder inside the 'cat' pipeline
categorical_columns = (
    preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_cols)
)

# The order must mirror the transformer order in the ColumnTransformer: 'num' then 'cat'
all_column_names = list(numerical_columns) + list(categorical_columns)

# ColumnTransformer returns a SciPy sparse matrix when the stacked output is sparse enough
# (density below its sparse_threshold); densify in that case so pandas can build a
# regular DataFrame with named columns.
X_dense = X_preprocessed.toarray() if hasattr(X_preprocessed, 'toarray') else X_preprocessed

# Reuse X's index so the rows stay aligned with the original features and with y
df_preprocessed = pd.DataFrame(X_dense, columns=all_column_names, index=X.index)

# Display the first rows of the DataFrame
print(df_preprocessed.head())


        age  trestbps      chol    thalch   oldpeak        ca  sex_Female  \
0  1.007386  0.698041  0.311021  0.495698  1.349421 -1.249371         0.0   
1  1.432034  1.511761  0.797713 -1.175955  0.589832  4.292099         0.0   
2  1.432034 -0.658158  0.274289 -0.340128  1.634267  2.444942         0.0   
3 -1.752828 -0.115679  0.467130  1.968345  2.488805 -1.249371         0.0   
4 -1.328180 -0.115679  0.044717  1.371326  0.494884 -1.249371         1.0   

   sex_Male  cp_asymptomatic  cp_atypical angina  ...  restecg_normal  \
0       1.0              0.0                 0.0  ...             0.0   
1       1.0              1.0                 0.0  ...             0.0   
2       1.0              1.0                 0.0  ...             0.0   
3       1.0              0.0                 0.0  ...             1.0   
4       0.0              0.0                 1.0  ...             0.0   

   restecg_st-t abnormality  exang_False  exang_True  slope_downsloping  \
0                      