# Heart Disease Classification

In [1]:
import pandas as pd

df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [3]:
import numpy as np

df = df.sample(frac=1).reset_index(drop=True) # Shuffle
df=(df-df.min())/(df.max()-df.min())
df = df.astype(np.int16)

In [4]:
def train_test_split(df, test_radio = 0.2):
    thresh = int(df.shape[0] * test_radio)
    labels = df['target']
    df.drop(['target'], axis=1, inplace=True)
    return df.iloc[:thresh], df.iloc[thresh:], labels[:thresh], labels[thresh:]

test_x, train_x, test_y, train_y = train_test_split(df)
print(test_x.shape)
print(train_x.shape)
train_x

(60, 13)
(243, 13)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
60,0,0,0,0,0,0,0,0,0,0,1,0,0
61,0,0,0,0,0,0,0,0,1,0,1,0,0
62,0,1,0,0,0,0,0,0,1,0,0,0,0
63,0,1,1,0,0,0,0,0,0,0,1,0,0
64,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0,1,0,0,0,0,0,0,1,0,1,0,1
299,0,1,0,0,0,0,0,0,0,0,1,0,0
300,0,0,0,0,0,1,0,0,0,0,0,0,0
301,0,0,0,0,0,0,0,0,0,0,0,0,0


## Model class


In [18]:
class SoftmaxRegression:
    def __init__(self):
        np.random.seed(42)
        pass
    
    def one_hot_encode(self, Y):
        y = np.zeros((Y.shape[0], Y.max()+1))
        y[np.arange(Y.shape[0]), Y] = 1
        return y
    
    def softmax(self, X):
        exps = np.exp(X - X.max())
        for i in range(len(X)):
            exps[i] /= np.sum(exps[i])
        return exps
    
    def fit(self, X, Y, eta=0.1, alpha=0.001, iterations = 10000, epsilon=1e-7):
        self.num_classes = Y.max() + 1
        self.features = X.shape[1] + 1 # Add bias
        self.samples = X.shape[0]
        self.theta = np.random.randn(self.features, self.num_classes)
        X_bias = np.c_[np.ones((self.samples)), X]
        Y_hot_encoded = self.one_hot_encode(Y)
        
        for i in range(iterations):
            y_hat = X_bias.dot(self.theta)
            y_prob = self.softmax(y_hat)
            error = y_prob - Y_hot_encoded
            gradient = 1/self.samples * X_bias.T.dot(error)
            self.theta = self.theta - eta * gradient
            if i % 500 == 0:
                loss = -np.mean(np.sum(Y_hot_encoded * np.log(y_prob + epsilon), axis=1))
                print(i, loss)
            
    def predict(self, X):
        X = np.c_[(np.ones(X.shape[0])), X]
        y_hat = X.dot(self.theta)
        pred = self.softmax(y_hat)
        return pred.argmax(axis=1)
        
        

In [19]:
model = SoftmaxRegression()
model.fit(train_x, train_y)

0 1.344089791923541
500 0.5051574743118452
1000 0.5005518004471757
1500 0.4983311592324403
2000 0.4967605744321973
2500 0.4955842871801357
3000 0.49468228447176205
3500 0.4939768549095713
4000 0.49341475710597726
4500 0.4929589221526972
5000 0.49258321341507505
5500 0.49226895321568703
6000 0.49200260136229096
6500 0.49177419453888976
7000 0.49157628950080223
7500 0.49140324115165507
8000 0.49125070451651265
8500 0.4911152874931494
9000 0.49099430598057797
9500 0.4908856091491013


In [20]:
pred = model.predict(train_x)
print('Train: ', np.sum(pred == train_y) / len(pred))

pred = model.predict(test_x)
print('Test: ', np.sum(pred == test_y) / len(pred))

Train:  0.7818930041152263
Test:  0.9333333333333333
