In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats 

In [12]:
class Scaler:
    """Standardize data to zero mean and unit standard deviation.

    The mean and standard deviation are computed over *all* elements of the
    array passed to ``fit`` (no axis is given to ``np.mean`` / ``np.std``), so
    one ``Scaler`` is meant to be used per feature column.

    Attributes are kept in ``self.params`` under the keys ``'mean'`` and
    ``'std_dev'``. Calling ``standardize`` or ``unstandardize`` before ``fit``
    raises ``KeyError`` because those keys do not exist yet.
    """

    def __init__(self):
        # Filled in by fit(); empty until then.
        self.params = {}

    def fit(self, data):
        """Record the mean and standard deviation of ``data``."""
        self.params['mean'] = np.mean(data)
        self.params['std_dev'] = np.std(data)

    def _divisor(self):
        """Return the std-dev to divide by, substituting 1.0 for an exact zero.

        A constant input has zero standard deviation; dividing by it would turn
        every value into NaN (0/0). Dividing by 1.0 instead maps a constant
        input to all zeros, and ``unstandardize`` still recovers the mean
        because it multiplies by the stored (zero) ``std_dev``.
        """
        std_dev = self.params['std_dev']
        return std_dev if std_dev != 0 else 1.0

    def standardize(self, data):
        """Return ``(data - mean) / std_dev`` using the fitted statistics."""
        return (data - self.params['mean']) / self._divisor()

    def fit_standardize(self, data):
        """Fit on ``data`` and return its standardized version."""
        self.fit(data)
        return self.standardize(data)

    def unstandardize(self, data):
        """Invert ``standardize``: return ``data * std_dev + mean``."""
        return (data * self.params['std_dev']) + self.params['mean']

In [14]:
def one_hot_encode(data):
    """One-hot encode an iterable of categorical values.

    Every item is converted with ``str`` first, so ``3`` and ``'3'`` are the
    same category. Categories are numbered in sorted (string) order.

    Returns:
        (matrix, category_index): ``matrix`` is ``np.array`` built from one
        0/1 row per input item; ``category_index`` maps each category string
        to its column position.
    """
    labels = [str(value) for value in data]
    category_index = {
        label: position for position, label in enumerate(sorted(set(labels)))
    }
    width = len(category_index)
    # One row per item: a 1 in the item's category column, 0 elsewhere.
    rows = [
        [int(column == category_index[label]) for column in range(width)]
        for label in labels
    ]
    return np.array(rows), category_index

In [3]:
def get_features(data):
    """Build the feature matrix ``X`` and label column ``y`` from a data frame.

    Each listed column of ``data`` is turned into an ``(n, 1)`` array; the
    columns flagged for standardization are passed through a freshly fitted
    ``Scaler``; all fifteen columns are then stacked side by side.

    Column order of ``X``: male, age*, currentSmoker, cigsPerDay*, BPMeds,
    prevalentStroke, prevalentHyp, diabetes, totChol*, sysBP*, diaBP*, BMI*,
    heartRate*, glucose*, education  (* = standardized).

    NOTE: the scalers are fitted on whatever ``data`` is passed in, so calling
    this on a test split standardizes it with the test split's own statistics.

    Args:
        data: mapping from column name to a 1-D sequence of values (e.g. a
            pandas DataFrame) containing the columns above and 'TenYearCHD'.

    Returns:
        (X, y): ``X`` has shape ``(n, 15)``; ``y`` has shape ``(n, 1)`` and
        holds the 'TenYearCHD' column.
    """
    # (column name, standardize?) in the exact order the columns appear in X.
    feature_spec = [
        ('male', False),
        ('age', True),
        ('currentSmoker', False),
        ('cigsPerDay', True),
        ('BPMeds', False),
        ('prevalentStroke', False),
        ('prevalentHyp', False),
        ('diabetes', False),
        ('totChol', True),
        ('sysBP', True),
        ('diaBP', True),
        ('BMI', True),
        ('heartRate', True),
        ('glucose', True),
        ('education', False),
    ]

    columns = []
    for name, standardize in feature_spec:
        column = np.array(data[name])
        # Every column, including 'education', must be (n, 1) for np.hstack
        # to place it side by side with the others.
        column = np.reshape(column, (column.shape[0], 1))
        if standardize:
            column = Scaler().fit_standardize(column)
        columns.append(column)
    X = np.hstack(columns)

    y = np.array(data['TenYearCHD'])
    y = np.reshape(y, (y.shape[0], 1))

    return X, y

In [4]:
def load_data(path='../framingham.csv', train_size=0.85, random_state=10):
    """Read the CSV, drop incomplete rows, and split into train/test frames.

    Args:
        path: CSV file to read. Defaults to the notebook's original location.
        train_size: value forwarded to ``train_test_split`` as ``train_size``.
        random_state: seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        (train, test): the two frames returned by ``train_test_split``.
    """
    data = pd.read_csv(path)
    # Rows with any missing value are removed before splitting.
    data = data.dropna()
    train, test = train_test_split(
        data, train_size=train_size, random_state=random_state
    )
    return train, test

In [5]:
# Load the CSV and split it into the train/test frames used by the cells below.
train, test = load_data()
# Last expression of the cell: shows (rows, columns) of the training frame.
train.shape

(3107, 16)

In [6]:
# Build the training feature matrix and label column.
X_train, y_train = get_features(train)

In [7]:
# Shape check on the training feature matrix.
X_train.shape

(3107, 15)

In [8]:
# NOTE(review): get_features fits new Scaler objects on the frame it is given,
# so these test features are standardized with test-set statistics rather than
# the statistics of the training split.
X_test, y_test = get_features(test)

In [9]:
# Shape check on the test feature matrix.
X_test.shape

(549, 15)

In [11]:
# Display the training matrix for a quick visual check of the values.
X_train

array([[ 1.        ,  1.45632121,  4.        , ..., -0.24814503,
        -0.31840178, -0.40621065],
       [ 1.        ,  1.45632121,  1.        , ...,  0.04271922,
         2.42174032, -0.24199277],
       [ 1.        ,  0.8710576 ,  3.        , ..., -0.15361415,
         0.92711736,  0.78436895],
       ...,
       [ 0.        ,  0.51989944,  1.        , ..., -0.4759887 ,
        -0.65054022,  0.04538851],
       [ 0.        , -0.29946961,  2.        , ...,  0.39902794,
        -1.31481709,  0.4559332 ],
       [ 1.        , -1.46999683,  4.        , ..., -0.46144548,
         1.01015197,  0.04538851]])

In [8]:
# First ten rows of the training frame (raw columns, before get_features).
train.head(10)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
726,1,62,4.0,1,20.0,0.0,0,0,0,292.0,130.0,77.5,24.75,72.0,72.0,0
1783,1,62,1.0,1,10.0,0.0,0,0,0,157.0,134.0,84.0,25.95,105.0,76.0,1
2061,1,57,3.0,1,20.0,0.0,0,0,0,257.0,138.5,90.0,25.14,87.0,101.0,0
3178,0,49,3.0,0,0.0,0.0,0,0,0,278.0,131.0,93.0,31.4,80.0,66.0,0
290,0,44,3.0,0,0.0,0.0,0,0,0,270.0,131.5,76.0,22.19,68.0,113.0,0
1173,0,36,3.0,0,0.0,0.0,0,0,0,185.0,123.0,69.0,18.98,79.0,75.0,0
908,1,64,2.0,1,40.0,0.0,0,0,0,206.0,126.0,82.0,24.35,95.0,97.0,0
3686,0,51,2.0,1,5.0,0.0,0,0,0,315.0,119.0,75.0,25.79,75.0,55.0,0
2499,0,46,2.0,1,15.0,0.0,0,0,0,232.0,115.0,70.0,25.18,75.0,59.0,1
892,1,38,1.0,1,15.0,0.0,0,0,0,180.0,111.0,61.0,21.51,66.0,75.0,0


In [70]:
def initialize_params(X):
    """Create starting parameters sized to the columns of ``X``.

    Returns a dict with:
        'w': ``(X.shape[1], 1)`` array drawn from ``np.random.randn`` and
             scaled by 0.1.
        'b': ``(1, 1)`` array of zeros.
    """
    n_features = X.shape[1]
    return {
        'w': 0.1 * np.random.randn(n_features, 1),
        'b': np.zeros((1, 1)),
    }

In [71]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    ``np.exp(-z)`` overflows for large negative ``z``. Here the exponential is
    only ever taken of a non-positive number:

        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))

    The two forms are algebraically identical.

    Args:
        z: scalar or array-like.

    Returns:
        Array of the same shape as ``z`` (a NumPy scalar for scalar input).
    """
    z_arr = np.asarray(z)
    decay = np.exp(-np.abs(z_arr))  # always in (0, 1], never overflows
    a = np.where(z_arr >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for scalar input; unwrap it to a scalar.
    return a[()] if a.ndim == 0 else a

In [72]:
def forward_prop(X, params):
    """Compute predicted probabilities ``sigmoid(X @ w + b)``.

    Args:
        X: feature matrix.
        params: dict holding the weights under 'w' and the bias under 'b'.

    Returns:
        The result of applying ``sigmoid`` to the linear scores.
    """
    weights, bias = params['w'], params['b']
    scores = X @ weights + bias
    return sigmoid(scores)

In [73]:
def calculate_cost(yhat, y, eps=1e-15):
    """Mean binary cross-entropy between predictions and labels.

    Predictions are clipped to ``[eps, 1 - eps]`` before the logarithm. Without
    the clip, a prediction of exactly 0 or 1 gives ``log(0) = -inf`` and the
    product ``0 * -inf`` turns the whole cost into NaN.

    Args:
        yhat: predicted probabilities, same shape as ``y``.
        y: 0/1 labels; ``y.shape[0]`` is used as the number of examples.
        eps: clipping margin for ``yhat``.

    Returns:
        The cost, summed over axis 0 with ``keepdims=True`` and divided by the
        number of examples (shape ``(1, 1)`` for ``(m, 1)`` inputs).
    """
    m = y.shape[0]
    yhat = np.clip(yhat, eps, 1 - eps)
    losses = y * np.log(yhat) + (1 - y) * np.log(1 - yhat)
    cost = -np.sum(losses, axis=0, keepdims=True) / m
    return cost

In [74]:
def back_prop(y, yhat, X):
    """Gradients of the mean cross-entropy cost w.r.t. weights and bias.

    Chaining the cost derivative ``-(y/yhat - (1-y)/(1-yhat)) / m`` with the
    sigmoid derivative ``yhat * (1 - yhat)`` simplifies algebraically to
    ``(yhat - y) / m``. The simplified form is used directly because the
    unsimplified one divides by ``yhat`` and ``1 - yhat``, producing 0/0 = NaN
    whenever a prediction is exactly 0 or 1.

    Args:
        y: 0/1 labels; ``y.shape[0]`` is used as the number of examples.
        yhat: predicted probabilities, same shape as ``y``.
        X: feature matrix whose rows correspond to the rows of ``y``.

    Returns:
        dict with 'dw' (``X.T @ dz``) and 'db' (``dz`` summed over axis 0 with
        ``keepdims=True``).
    """
    m = y.shape[0]
    dc_dz = (yhat - y) / m
    dc_dw = np.matmul(X.T, dc_dz)
    dc_db = np.sum(dc_dz, axis=0, keepdims=True)
    grads = {'dw': dc_dw, 'db': dc_db}
    return grads
    

In [75]:
def update_params(params, grads, lr=0.1):
    """Take one gradient-descent step and return the new parameters.

    Args:
        params: dict with the current 'w' and 'b'.
        grads: dict with the matching gradients 'dw' and 'db'.
        lr: step size.

    Returns:
        A new dict ``{'w': w - lr * dw, 'b': b - lr * db}``; ``params`` itself
        is not modified.
    """
    w, b = params['w'], params['b']
    dw, db = grads['dw'], grads['db']
    return {'w': w - lr * dw, 'b': b - lr * db}

In [76]:
def save_params(model, params, base_dir='../src/models/'):
    """Write each parameter array to ``<base_dir>/<model>/<name>.npy``.

    The model directory is created if it does not exist yet, so saving works
    even when nothing has created the directory beforehand.

    Args:
        model: name of the model; used as the sub-directory name.
        params: dict mapping parameter names to arrays.
        base_dir: directory that holds one sub-directory per model.
    """
    path = os.path.join(base_dir, model)
    os.makedirs(path, exist_ok=True)
    for name, value in params.items():
        np.save(os.path.join(path, name + '.npy'), value)

In [93]:
def load_params(model, X_train, base_dir='../src/models/'):
    """Load saved parameters for ``model``, falling back to fresh ones.

    Parameters are first created with ``initialize_params(X_train)``. Then:

    * if ``<base_dir>/<model>`` does not exist, it is created and the fresh
      parameters are returned;
    * otherwise each parameter is replaced by the contents of
      ``<base_dir>/<model>/<name>.npy`` when that file exists; a missing file
      prints a message and leaves the fresh value in place.

    NOTE: loaded arrays are not checked against the shape of ``X_train``.

    Args:
        model: name of the model; used as the sub-directory name.
        X_train: array passed to ``initialize_params`` to size the defaults.
        base_dir: directory that holds one sub-directory per model.

    Returns:
        dict of parameters (same keys as ``initialize_params`` returns).
    """
    params = initialize_params(X_train)
    path = os.path.join(base_dir, model)
    if not os.path.exists(path):
        # exist_ok avoids a crash if the directory appears between the check
        # above and this call.
        os.makedirs(path, exist_ok=True)
        return params

    for name in params:
        try:
            params[name] = np.load(os.path.join(path, name + '.npy'))
        except FileNotFoundError:
            print(f'couldnt find param {name}, continuing with default value')

    return params

In [99]:
def train_loop(model, epochs, X_train, y_train, lr):
    """Run ``epochs`` full-batch gradient-descent steps and save the result.

    Starts from ``load_params(model, X_train)``, prints the cost once per
    epoch, then writes the final parameters with ``save_params`` and prints
    them. Returns nothing.

    Args:
        model: model name passed to ``load_params`` / ``save_params``.
        epochs: number of update steps.
        X_train: training feature matrix.
        y_train: training labels.
        lr: learning rate passed to ``update_params``.
    """
    current = load_params(model, X_train)
    for step in range(epochs):
        predictions = forward_prop(X_train, current)
        # The printed cost belongs to the parameters *before* this step's update.
        loss = calculate_cost(predictions, y_train)
        current = update_params(
            current, back_prop(y_train, predictions, X_train), lr
        )
        print(f'epoch {step} cost {loss}')
    save_params(model, current)
    print(f'saved params {current} in model {model}')

In [100]:
def predict(model, X_test, y_test):
    """Return the accuracy of the saved ``model`` on ``(X_test, y_test)``.

    Parameters come from ``load_params(model, X_test)``. ``X_test`` is passed
    so that any default parameters ``load_params`` falls back to are sized for
    the data actually being scored. Probabilities above 0.5 are predicted as
    class 1.

    Args:
        model: model name passed to ``load_params``.
        X_test: feature matrix to score.
        y_test: true 0/1 labels, one per row of ``X_test``; any shape with the
            same number of elements as the predictions is accepted.

    Returns:
        Fraction of predictions that equal the labels.

    Raises:
        ValueError: if ``y_test`` does not have one label per prediction.
    """
    params = load_params(model, X_test)
    yhat = forward_prop(X_test, params)
    ypred = (yhat > 0.5).astype(int)
    # Align the labels with the predictions; comparing a 1-D label vector with
    # an (n, 1) prediction column would broadcast to an (n, n) matrix and give
    # a meaningless score.
    y_true = np.asarray(y_test).reshape(ypred.shape)
    score = np.mean(ypred == y_true)
    return score

In [101]:
# Training run configuration.
model = 'm2'   # sub-directory name under which parameters are stored
epochs = 1     # number of gradient-descent steps for this run
lr = 0.1       # learning rate

# train_loop starts from load_params(model, ...), so re-running this cell
# continues from whatever was saved for 'm2' by a previous run.
train_loop(model, epochs, X_train, y_train, lr)

epoch 0 cost [[0.36881446]]
saved params {'w': array([[ 0.52844298],
       [ 0.51755028],
       [-0.0593299 ],
       [ 0.16646724],
       [ 0.1064266 ],
       [ 0.22807806],
       [ 0.31169054],
       [ 0.29072745],
       [ 0.00482143],
       [ 0.08294769],
       [ 0.25492814],
       [ 0.00620659],
       [ 0.06544819],
       [ 0.01366865],
       [ 0.17139746]]), 'b': array([[-2.33365928]])} in model m2


In [98]:
# Evaluate the parameters saved under this model name on the held-out split.
model = 'm2'

# predict returns the fraction of test rows classified correctly.
score = predict(model, X_test, y_test)
score

0.8142076502732241