# Predicting Heart Diseases (Cleveland)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline

## Feature Classification

In [2]:
features = [ 
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", 
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
]

ordinal = ["slope", "ca"]
norminal = ["sex", "thal", "cp", "restecg", "exang"]

# Running EDA

In [3]:
def wrangle_data(filepath):
    df = pd.read_csv(filepath)
    df.columns = features
    for ordi in ordinal:
        df[ordi] = pd.to_numeric(df[ordi], errors="coerce")
    
    #use the original value of cp, restecg and thal
    df["cp"] = df["cp"].replace({1: "typical_angina", 2: "atypical_angina", 3: "non_angina", 4: "asymptomatic"})
    df["thal"] = df["thal"].replace({"3.0": "normal", "6.0": "fixed_defect", "7.0": "reversible_defect", "?": "normal"})
    df["restecg"] = df["restecg"].replace({0: "normal", 1: "abnormal", 2: "probably"})
    
    #Convert and clean the ca features
    df["ca"] = df["ca"].replace("?", "0").fillna(0.0).astype(int)
    
    #Convert neccesary columns to int
    discrete_value = ["age", "sex", "fbs", "exang", "trestbps", "thalach", "slope", "chol"]
    df[discrete_value] = df[discrete_value].astype(int)
    
    #Change the target variable to binary
    df["num"] = (df["num"] > 0).astype(int)
    
    return df

In [4]:
clev = wrangle_data("heart+disease/cleveland.csv")
clev.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,67,1,asymptomatic,160,286,0,probably,108,1,1.5,2,3,normal,1
1,67,1,asymptomatic,120,229,0,probably,129,1,2.6,2,2,reversible_defect,1
2,37,1,non_angina,130,250,0,normal,187,0,3.5,3,0,normal,0
3,41,0,atypical_angina,130,204,0,probably,172,0,1.4,1,0,normal,0
4,56,1,atypical_angina,120,236,0,normal,178,0,0.8,1,0,normal,0


In [5]:
clev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    object 
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    object 
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    object 
 13  num       302 non-null    int64  
dtypes: float64(1), int64(10), object(3)
memory usage: 33.2+ KB


In [6]:
clev.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64