In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


In [4]:
df = pd.read_csv('./framingham.csv')
df

Unnamed: 0.1,Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,male,39,4.0,No,0.0,No,No,No,No,195.0,106.0,70.0,26.97,80.0,77.0,0
1,1,female,46,2.0,No,0.0,No,No,No,No,250.0,121.0,81.0,28.73,95.0,76.0,0
2,2,male,48,1.0,Yes,20.0,No,No,No,No,245.0,127.5,80.0,25.34,75.0,70.0,0
3,3,female,61,3.0,Yes,30.0,No,No,Yes,No,225.0,150.0,95.0,28.58,65.0,103.0,1
4,4,female,46,3.0,Yes,23.0,No,No,No,No,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,4233,male,50,1.0,Yes,1.0,No,No,Yes,No,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,4234,male,51,3.0,Yes,43.0,No,No,No,No,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,4235,female,48,2.0,Yes,20.0,,No,No,No,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,4236,female,44,1.0,Yes,15.0,No,No,No,No,210.0,126.5,87.0,19.16,86.0,,0


**Description of Variables**
- sex - male or female
- age - Age of the patient
- education
- currentsmoker - whether or not the patient is a current smoker
- cigsperday - the number of cigarettes that the person smoked on average in one day
- BPMeds - whether or not the patient was on blood pressure medication
- PrevalentStroke - whether or not the patient had previously had a stroke
- PrevalentHyp - whether or not the patient was hypertensive 
- diabetes - whether or not the patient had diabetes
- TotChol - total cholesterol level
- SysBP - systolic blood pressure
- diaBP - diastolic blood pressure
- BMI - Body Mass Index
- heartrate - heart rate
- glucose - glucose level
- TenYearCHD - 10 year risk of coronary heart disease CHD (“1”, means “Yes”, “0” means “No”)

## DATA EXPLORATION

In [9]:
df.shape

(4238, 17)

There are 4238 observations and 17 features

In [98]:
#Identify and replacing missing values

df[categ_cols] = df[categ_cols].astype(str)

num_cols = df.select_dtypes(include = ['int64', 'float64']).columns
categ_cols = df.select_dtypes(include = ['object']).columns

#We have chosen to fill in the missing numerical and categorical values in the dataset with the 
#most frequently occurring data. This decision is motivated by our belief that the prevalence of 
#heart disease may be linked to the dietary habits of residents based on their geographical 
#location. We are assuming that most patients fall within a similar income bracket and, as a result, 
#are only able to afford certain types of meals.
num_imputer = SimpleImputer(strategy = 'most_frequent')
df[num_cols] = num_imputer.fit_transform(df[num_cols])


In [100]:

cat_imputer = SimpleImputer(strategy = 'most_frequent')
df[categ_cols] = cat_imputer.fit_transform(df[categ_cols])


ValueError: at least one array or dtype is required

In [95]:

print("Missing values:")
print(df.isnull().sum())


Missing values:
Unnamed: 0             0
age                    0
education              0
cigsPerDay             0
totChol                0
sysBP                  0
diaBP                  0
BMI                    0
heartRate              0
glucose                0
TenYearCHD             0
sex_male               0
currentSmoker_Yes      0
BPMeds_Yes             0
prevalentStroke_Yes    0
prevalentHyp_Yes       0
diabetes_Yes           0
dtype: int64


In [63]:
CHD_col = 'TenYearCHD' 
X = df.drop(CHD_col, axis=1)
y = df[CHD_col]

In [64]:
# Dividing the data set into training(60%), validation(20%) and testing(20%)
x_train, x_remaining, y_train, y_remaining = train_test_split(x, y, test_size=0.4, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_remaining, y_remaining, test_size=0.5, random_state=42)

In [65]:
print(f"Training set size: {x_train.shape[0]}")
print(f"Validation set size: {x_val.shape[0]}")
print(f"Testing set size: {x_test.shape[0]}")

Training set size: 2542
Validation set size: 848
Testing set size: 848
