In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score


s3= boto3.resource('s3')
bucket_name= 'macklins-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'framingham.csv'

bucket_object= bucket.Object(file_key)
file_object= bucket_object.get()
file_content_stream= file_object.get('Body')

## Reading the data file
heart= pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
## Removing observations with NA
heart= heart.dropna()

In [3]:
## Defining the input and target variables
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y= heart['TenYearCHD']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [5]:
## Building the logistic model
logit_md = LogisticRegression().fit(X_train, Y_train)

## Predicting on the test dataset
logit_pred = logit_md.predict_proba(X_test)[:,1]
logit_pred

array([0.2697741 , 0.17085718, 0.30736246, 0.15919261, 0.04029578,
       0.04530155, 0.0444711 , 0.23276604, 0.08264551, 0.14715548,
       0.27143597, 0.2559753 , 0.03211807, 0.11354654, 0.08358672,
       0.26362986, 0.09593516, 0.13727856, 0.09171493, 0.11081168,
       0.10015417, 0.23903843, 0.073364  , 0.15217816, 0.17701163,
       0.07554785, 0.05807901, 0.14071803, 0.07304673, 0.0719338 ,
       0.05580748, 0.17997952, 0.25288835, 0.05193155, 0.12372307,
       0.07602834, 0.13069575, 0.11714308, 0.19744683, 0.10557462,
       0.15816031, 0.06340559, 0.06792692, 0.12829768, 0.20054978,
       0.1624947 , 0.06916048, 0.29835977, 0.18293107, 0.30522231,
       0.32813231, 0.05079721, 0.12786727, 0.31141791, 0.10342646,
       0.04032323, 0.14849774, 0.16495218, 0.04784679, 0.11568315,
       0.2553514 , 0.10152689, 0.20772302, 0.0587106 , 0.10363748,
       0.28959545, 0.05332102, 0.21971564, 0.05441342, 0.09248442,
       0.1578664 , 0.19925881, 0.22021214, 0.12291989, 0.09913

In [6]:
## Changing the likelihoods of labels
logit_label= np.where(logit_pred < 0.25, 0, 1)
logit_label

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,

In [7]:
## Constructing the confusion matrix
confusion_matrix(Y_test, logit_label)

array([[534,  95],
       [ 64,  39]])

In [8]:
## Computing the accuracy
confusion_matrix(Y_test, logit_label)

array([[534,  95],
       [ 64,  39]])