In [23]:
import numpy as np
import pandas as pd
import statistics

In [24]:
data = pd.read_csv("datasetframingham.csv")

In [25]:
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [26]:
# Pre-training data preparation

data = np.asarray(data)
X = data[:, :-1]
Y = data[:, -1]

# Replacing nan value with average value of values from the same column
for column in range(X.shape[1]):
    mean = statistics.mean(v for v in X[:, column] if not np.isnan(v))
    X[:, column][np.isnan(X[:, column])] = mean

# Feature scaling to range[0:1]
for column in range(X.shape[1]):
    min_value = min(X[:, column])
    max_value = max(X[:, column])
    diff = max_value - min_value
    scaled_vector = list(map(lambda v: (v - min_value)/ diff, X[:, column]))
    X[:, column] = scaled_vector

# Extend data by adding x0 = 1 to each input entry point
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis = 1)

# Randomize the order of data
permutation = np.random.permutation(X.shape[0])
X = X[permutation]
Y = Y[permutation]

# Use 15% of total input for testing, and the rest for training
num_test = int(0.15 * len(X))
# Training dataset
X_train = X[: -num_test]
Y_train = Y[: -num_test]
# Testing dataset
X_test = X[-num_test:]
Y_test = Y[-num_test:]

In [27]:
# Sigmoid function
def sigmoid(s):
    return 1/(1 + np.exp(-s))

# Method to train logistic regression model with the training dataset
def logistic_regression(X, Y, w_init, alpha, eps = 1e-4, max_cnt = 100000):
    w = [w_init]
    N = X.shape[0]
    d = X.shape[1]
    cnt = 0
    check_w_after = 20
    while cnt < max_cnt:
        # Randomize the order of training point in the dataset
        permutation = np.random.permutation(N)
        for i in permutation:
            xi = X[i, :].reshape(d, 1)
            yi = Y[i]
            zi = sigmoid(np.dot(w[-1].T, xi))
            w_new = w[-1] + alpha * (yi - zi) * xi
            cnt += 1
            # Stop when the weight barely changes after 20 iterations
            if cnt % check_w_after == 0:
                if np.linalg.norm(w_new - w[-check_w_after]) < eps:
                    return w
            w.append(w_new)
    return w

# Train
alpha = 0.05
d = X.shape[1]
w_init = np.random.randn(d, 1)
w = logistic_regression(X_train, Y_train, w_init, alpha)
w = w[-1]

In [28]:
# Test the logistic regression model with testing dataset
correctTest = 0
num_test = len(X_test)
for i in range(num_test):
    prob = np.dot(X_test[i].T, w)
    if prob >= 0.5 and Y_test[i] == 1:
        correctTest += 1
    elif prob < 0.5 and Y_test[i] == 0:
        correctTest += 1
print("Accuracy: {} %".format((correctTest / num_test) * 100))

Accuracy: 86.77165354330708 %
