# Heart Disease Classification

In [1]:
import pandas as pd

df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [3]:
import numpy as np

df = df.sample(frac=1).reset_index(drop=True) # Shuffle
df=(df-df.min())/(df.max()-df.min())
df = df.astype(np.int16)

In [4]:
def train_test_split(df, test_radio = 0.2):
    thresh = int(df.shape[0] * test_radio)
    labels = df['target']
    df.drop(['target'], axis=1, inplace=True)
    return df.iloc[:thresh], df.iloc[thresh:], labels[:thresh], labels[thresh:]

test_x, train_x, test_y, train_y = train_test_split(df)
print(test_x.shape)
print(train_x.shape)
train_x

(60, 13)
(243, 13)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
60,0,1,0,0,0,0,0,0,0,0,1,0,1
61,0,1,1,0,0,0,0,0,0,0,1,0,0
62,0,1,0,0,0,0,0,0,0,0,0,0,1
63,0,0,0,0,0,0,0,0,0,0,1,0,0
64,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0,1,0,0,0,1,0,0,1,0,0,0,0
299,0,1,0,0,0,0,0,0,0,0,0,0,1
300,0,1,0,0,0,1,0,0,0,0,0,0,1
301,0,0,0,0,0,0,0,0,0,0,1,0,0


## Model class


In [47]:
class SoftmaxRegression:
    def __init__(self):
        np.random.seed(42)
        pass
    
    def one_hot_encode(self, Y):
        y = np.zeros((Y.shape[0], Y.max()+1))
        y[np.arange(Y.shape[0]), Y] = 1
        return y
    
    def softmax(self, X):
        exps = np.exp(X - X.max())
        for i in range(len(X)):
            exps[i] /= np.sum(exps[i])
        return exps
    
    def fit(self, X, Y, eta=0.05, alpha=0.001, iterations = 5001, epsilon=1e-7):
        self.num_classes = Y.max() + 1
        self.features = X.shape[1] + 1 # Add bias
        self.samples = X.shape[0]
        self.theta = np.random.randn(self.features, self.num_classes)
        X_bias = np.c_[np.ones((self.samples)), X]
        Y_hot_encoded = self.one_hot_encode(Y)
        
        for i in range(iterations):
            y_hat = X_bias.dot(self.theta)
            y_prob = self.softmax(y_hat)
            
            if i%500 == 0:
                xentropy = -np.mean(np.sum(Y_hot_encoded * np.log(y_prob + epsilon), axis=1))
                l2_loss = 1/2 * np.sum(np.square(self.theta[1:]))
                loss = xentropy + l2_loss * alpha
                print(i, loss)
            
            error = y_prob - Y_hot_encoded
            gradient = 1/self.samples * X_bias.T.dot(error) + np.r_[np.zeros([1,self.num_classes]), alpha * self.theta[1:]]
            self.theta = self.theta - eta * gradient 
            
    def predict(self, X):
        X = np.c_[(np.ones(X.shape[0])), X]
        y_hat = X.dot(self.theta)
        pred = self.softmax(y_hat)
        return pred.argmax(axis=1)
        
        

In [48]:
model = SoftmaxRegression()
model.fit(train_x, train_y)

0 1.340603134429683
500 0.48596557839505555
1000 0.4717594289985235
1500 0.4680250552786299
2000 0.46591098503321476
2500 0.46434904042567343
3000 0.4630779617836974
3500 0.4620046211049316
4000 0.46108322543983926
4500 0.4602848203430526
5000 0.45958820604275497


In [46]:
pred = model.predict(train_x)
print('Train: ', np.sum(pred == train_y) / len(pred))

pred = model.predict(test_x)
print('Test: ', np.sum(pred == test_y) / len(pred))

Train:  0.7695473251028807
Test:  0.8166666666666667
