In [110]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
%matplotlib inline
import platform

In [111]:
import sys

# Make the project's src/ directory importable so data_loader can be loaded.
# The directory is resolved relative to the notebook's working directory (the
# same base that the '../framingham.csv' and '../src/models/' paths used in
# this notebook rely on) rather than per-user absolute paths, so the cell also
# works on Linux and on any other checkout location.
# NOTE(review): assumes data_loader.py lives in ../src relative to the
# notebook's working directory - confirm against the repository layout.
src_dir = os.path.abspath(os.path.join('..', 'src'))
if src_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.insert(0, src_dir)
from data_loader import prepare_train, prepare_test

In [112]:
def load_data(csv_path='../framingham.csv', train_size=0.85, random_state=10):
    """Read the dataset, split it, and return prepared train/test arrays.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file; defaults to the path used by this notebook.
    train_size : float
        Fraction of rows assigned to the training split.
    random_state : int
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``prepare_train`` / ``prepare_test``.
    """
    # Rows containing any missing value are discarded before splitting.
    data = pd.read_csv(csv_path).dropna()
    train, test = train_test_split(
        data, train_size=train_size, random_state=random_state
    )
    # The scalers returned for the training split are handed to prepare_test
    # (presumably so the test split is transformed with train statistics -
    # confirm in data_loader).
    X_train, y_train, scalers = prepare_train(train)
    X_test, y_test = prepare_test(test, scalers)
    return X_train, y_train, X_test, y_test

In [113]:
# Build the train/test arrays once; the cells below reuse these names.
X_train, y_train, X_test, y_test = load_data()
# Quick sanity check of the training matrix dimensions (rows, columns).
X_train.shape

(3107, 18)

In [114]:
# Inspect the prepared training features (the array repr below is truncated by NumPy).
X_train

array([[ 1.        ,  1.45632121,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        ,  1.45632121,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.8710576 ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.        ,  0.51989944,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.29946961,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 1.        , -1.46999683,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [115]:
def initialize_params(X):
    """Create starting parameters sized to the columns of ``X``.

    Returns a dict with:
      * ``'w'`` - an ``(n, 1)`` array of standard-normal draws scaled by 0.1,
        where ``n = X.shape[1]``;
      * ``'b'`` - a ``(1, 1)`` array of zeros.

    Draws come from NumPy's global random state, so results vary between
    calls unless that state is seeded by the caller.
    """
    n_features = X.shape[1]
    return {
        'w': 0.1 * np.random.randn(n_features, 1),
        'b': np.zeros((1, 1)),
    }

In [116]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive form overflows in ``exp(-z)`` for large negative ``z``. Here the
    identity ``sigmoid(z) = exp(-log(1 + exp(-z)))`` is used, with the inner
    logarithm evaluated by ``np.logaddexp(0, -z)``, which never exponentiates
    a large positive number.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Value(s) in ``[0, 1]`` with the same shape as ``z``.
    """
    a = np.exp(-np.logaddexp(0, -z))
    return a

In [117]:
def forward_prop(X, params):
    """Compute the model output for ``X``.

    Applies the affine map ``X @ w + b`` using ``params['w']`` and
    ``params['b']``, then passes the result through ``sigmoid``.
    """
    weights, bias = params['w'], params['b']
    linear = X @ weights + bias
    return sigmoid(linear)

In [118]:
def calculate_cost(yhat, y, eps=1e-15):
    """Mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    yhat : ndarray
        Predicted probabilities.
    y : ndarray
        Labels, broadcast-compatible with ``yhat``; ``y.shape[0]`` is taken
        as the number of examples.
    eps : float
        Probabilities are clipped to ``[eps, 1 - eps]`` before the logarithm
        so that saturated predictions (exactly 0 or 1) yield a finite cost
        instead of NaN/inf.

    Returns
    -------
    ndarray
        The cost, summed over axis 0 with ``keepdims=True`` and divided by
        the number of examples.
    """
    m = y.shape[0]
    # Guard against log(0): 0 * log(0) would otherwise evaluate to NaN.
    yhat = np.clip(yhat, eps, 1 - eps)
    losses = y * np.log(yhat) + (1 - y) * np.log(1 - yhat)
    cost = -np.sum(losses, axis=0, keepdims=True) / m
    return cost

In [119]:
def back_prop(y, yhat, X):
    """Gradients of the mean cross-entropy cost for logistic regression.

    The chain-rule product
    ``(-1/m) * (y/yhat - (1-y)/(1-yhat)) * yhat * (1-yhat)``
    simplifies algebraically to ``(yhat - y) / m``. Using the simplified form
    avoids dividing by ``yhat`` or ``1 - yhat``, which produced NaN (0/0)
    whenever a prediction saturated at exactly 0 or 1.

    Parameters
    ----------
    y : ndarray
        Labels; ``y.shape[0]`` is taken as the number of examples ``m``.
    yhat : ndarray
        Predicted probabilities, same shape as ``y``.
    X : ndarray
        Input matrix whose transpose is multiplied with the per-example
        gradient.

    Returns
    -------
    dict
        ``{'dw': X.T @ dz, 'db': sum of dz over axis 0 (keepdims)}``.
    """
    m = y.shape[0]
    dc_dz = (yhat - y) / m
    dc_dw = np.matmul(X.T, dc_dz)
    dc_db = np.sum(dc_dz, axis=0, keepdims=True)
    grads = {'dw': dc_dw, 'db': dc_db}
    return grads
    

In [120]:
def update_params(params, grads, lr=0.1):
    """Take one gradient-descent step and return the new parameters.

    Computes ``w - lr * dw`` and ``b - lr * db`` from ``params`` and
    ``grads``. A fresh dict is returned; the dict passed in is not modified.
    """
    step = {'w': lr * grads['dw'], 'b': lr * grads['db']}
    return {name: params[name] - step[name] for name in ('w', 'b')}

In [121]:
def save_params(model, params):
    """Write each parameter array to ``../src/models/<model>/<name>.npy``.

    The model directory is created when it does not exist yet, so the
    function can be called on its own rather than only after a prior
    ``load_params`` call has created the directory.

    Parameters
    ----------
    model : str
        Model name, used as the sub-directory under ``../src/models/``.
    params : dict
        Mapping of parameter name to array; one ``.npy`` file is written per
        entry.
    """
    path = '../src/models/' + model
    os.makedirs(path, exist_ok=True)
    for name, value in params.items():
        np.save(os.path.join(path, name + '.npy'), value)

In [122]:
def load_params(model, X_train):
    """Return parameters for ``model``, reading saved arrays when available.

    Fresh parameters are always initialised first from ``X_train``. If the
    directory ``../src/models/<model>`` is absent it is created and the fresh
    parameters are returned as-is. Otherwise each parameter is replaced by
    the array stored in ``<name>.npy``; a parameter whose file is missing
    keeps its freshly initialised value and a message is printed.
    """
    params = initialize_params(X_train)
    path = '../src/models/' + model

    # Brand-new model: create its directory and start from the initial values.
    if not os.path.exists(path):
        os.makedirs(path)
        return params

    for p in list(params):
        param_path = f'{path}/{p}.npy'
        try:
            params[p] = np.load(param_path)
        except FileNotFoundError:
            print(f'couldnt find param {p}, continuing with default value')
    return params

In [123]:
def train_loop(model, epochs, X_train, y_train, lr, log_every=1):
    """Run batch gradient descent and persist the resulting parameters.

    Parameters are obtained through ``load_params`` (so previously saved
    values for ``model`` are picked up when present), updated for ``epochs``
    full passes over the training data, then written with ``save_params``.

    Parameters
    ----------
    model : str
        Model name passed to ``load_params`` / ``save_params``.
    epochs : int
        Number of gradient-descent iterations.
    X_train, y_train : ndarray
        Training inputs and labels.
    lr : float
        Learning rate passed to ``update_params``.
    log_every : int
        Print the cost every ``log_every`` epochs. The default of 1 keeps the
        per-epoch output; a larger value avoids flooding the notebook on long
        runs, and 0 or ``None`` disables the per-epoch line.
    """
    params = load_params(model, X_train)
    for epoch in range(epochs):
        yhat = forward_prop(X_train, params)
        # The reported cost is the one measured before this epoch's update.
        cost = calculate_cost(yhat, y_train)
        grads = back_prop(y_train, yhat, X_train)
        params = update_params(params, grads, lr)
        if log_every and epoch % log_every == 0:
            print(f'epoch {epoch} cost {cost}')
    save_params(model, params)
    print(f'saved params {params} in model {model}')

In [124]:
def predict(model, X_test, y_test):
    """Return the accuracy of the saved ``model`` on the test split.

    Parameters are loaded via ``load_params``; predictions are the outputs
    of ``forward_prop`` thresholded at 0.5, and the score is the fraction of
    entries where the prediction equals ``y_test``.

    ``X_test`` itself is passed to ``load_params`` so that fallback
    parameters (used when a saved file is missing) have one weight per
    column of ``X_test``. The earlier dummy ``randn(80, 2)`` array produced a
    ``(2, 1)`` weight that could not be multiplied with ``X_test``.

    NOTE(review): assumes ``y_test`` has the same shape as the predictions;
    a 1-D ``y_test`` would broadcast in the comparison - confirm.
    """
    params = load_params(model, X_test)
    yhat = forward_prop(X_test, params)
    ypred = (yhat > 0.5).astype(int)
    match = (ypred == y_test)
    score = np.sum(match) / match.size
    return score

In [134]:
# Training run settings.
model = 'm8'      # model name; parameters are stored under ../src/models/<model>
epochs = 10000    # number of gradient-descent iterations
lr = 0.1          # learning rate

# Train and save the parameters (prints the cost for every epoch).
train_loop(model, epochs, X_train, y_train, lr)

epoch 0 cost [[0.7029395]]
epoch 1 cost [[0.67843666]]
epoch 2 cost [[0.6562108]]
epoch 3 cost [[0.63604141]]
epoch 4 cost [[0.61772525]]
epoch 5 cost [[0.60107662]]
epoch 6 cost [[0.58592702]]
epoch 7 cost [[0.5721244]]
epoch 8 cost [[0.55953208]]
epoch 9 cost [[0.54802762]]
epoch 10 cost [[0.53750153]]
epoch 11 cost [[0.52785608]]
epoch 12 cost [[0.51900407]]
epoch 13 cost [[0.51086775]]
epoch 14 cost [[0.50337779]]
epoch 15 cost [[0.49647236]]
epoch 16 cost [[0.49009624]]
epoch 17 cost [[0.48420012]]
epoch 18 cost [[0.47873992]]
epoch 19 cost [[0.47367616]]
epoch 20 cost [[0.46897346]]
epoch 21 cost [[0.46460009]]
epoch 22 cost [[0.46052752]]
epoch 23 cost [[0.45673009]]
epoch 24 cost [[0.45318467]]
epoch 25 cost [[0.44987041]]
epoch 26 cost [[0.44676845]]
epoch 27 cost [[0.44386175]]
epoch 28 cost [[0.44113485]]
epoch 29 cost [[0.43857372]]
epoch 30 cost [[0.43616565]]
epoch 31 cost [[0.43389903]]
epoch 32 cost [[0.43176331]]
epoch 33 cost [[0.42974887]]
epoch 34 cost [[0.42784692]

In [136]:
# NOTE(review): this evaluates 'm7', whereas the training cell in this
# notebook uses 'm8' - confirm that scoring a different model is intended.
model = 'm7'

# Accuracy on the held-out split: fraction of thresholded predictions equal to y_test.
score = predict(model, X_test, y_test)
score

0.6812386156648452