In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


In [48]:
df = pd.read_csv('./framingham.csv')

In [54]:
# Display dataset
print(df.info())
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       4238 non-null   int64  
 1   sex              4238 non-null   object 
 2   age              4238 non-null   int64  
 3   education        4133 non-null   float64
 4   currentSmoker    4238 non-null   object 
 5   cigsPerDay       4209 non-null   float64
 6   BPMeds           4185 non-null   object 
 7   prevalentStroke  4238 non-null   object 
 8   prevalentHyp     4238 non-null   object 
 9   diabetes         4238 non-null   object 
 10  totChol          4188 non-null   float64
 11  sysBP            4238 non-null   float64
 12  diaBP            4238 non-null   float64
 13  BMI              4219 non-null   float64
 14  heartRate        4237 non-null   float64
 15  glucose          3850 non-null   float64
 16  TenYearCHD       4238 non-null   int64  
dtypes: float64(8),

In [98]:
#Identify and replacing missing values

df[categ_cols] = df[categ_cols].astype(str)

num_cols = df.select_dtypes(include = ['int64', 'float64']).columns
categ_cols = df.select_dtypes(include = ['object']).columns

#We have chosen to fill in the missing numerical and categorical values in the dataset with the 
#most frequently occurring data. This decision is motivated by our belief that the prevalence of 
#heart disease may be linked to the dietary habits of residents based on their geographical 
#location. We are assuming that most patients fall within a similar income bracket and, as a result, 
#are only able to afford certain types of meals.
num_imputer = SimpleImputer(strategy = 'most_frequent')
df[num_cols] = num_imputer.fit_transform(df[num_cols])


In [99]:
# Ensure categorical columns are of type 'object'
df[categ_cols] = df[categ_cols].astype(str)

# Check if there are any missing values in categorical columns
print("Missing values in categorical columns before imputation:")
print(df[categ_cols].isnull().sum())

# Impute missing values for numerical columns
num_imputer = SimpleImputer(strategy='most_frequent')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute missing values for categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categ_cols] = cat_imputer.fit_transform(df[categ_cols])

# Verify that there are no more missing values
print("Missing values after imputation:")
print(df.isnull().sum())

# Encoding of categorical values, drop the categorical columns and add new columns with encoded values
onehot_encoder = OneHotEncoder(drop='first', sparse=False)
encoded_cat_df = pd.DataFrame(onehot_encoder.fit_transform(df[categ_cols]), 
                              columns=onehot_encoder.get_feature_names_out(categ_cols))

# Verify the encoded DataFrame
print("Encoded categorical DataFrame:")
print(encoded_cat_df.head())

# Drop original categorical columns and concatenate the encoded ones
df = df.drop(columns=categ_cols)
df = pd.concat([df.reset_index(drop=True), encoded_cat_df.reset_index(drop=True)], axis=1)


Missing values in categorical columns before imputation:
Series([], dtype: float64)


ValueError: at least one array or dtype is required

In [100]:

cat_imputer = SimpleImputer(strategy = 'most_frequent')
df[categ_cols] = cat_imputer.fit_transform(df[categ_cols])


ValueError: at least one array or dtype is required

In [95]:

print("Missing values:")
print(df.isnull().sum())


Missing values:
Unnamed: 0             0
age                    0
education              0
cigsPerDay             0
totChol                0
sysBP                  0
diaBP                  0
BMI                    0
heartRate              0
glucose                0
TenYearCHD             0
sex_male               0
currentSmoker_Yes      0
BPMeds_Yes             0
prevalentStroke_Yes    0
prevalentHyp_Yes       0
diabetes_Yes           0
dtype: int64


In [63]:
CHD_col = 'TenYearCHD' 
X = df.drop(CHD_col, axis=1)
y = df[CHD_col]

In [64]:
# Dividing the data set into training(60%), validation(20%) and testing(20%)
x_train, x_remaining, y_train, y_remaining = train_test_split(x, y, test_size=0.4, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_remaining, y_remaining, test_size=0.5, random_state=42)

In [65]:
print(f"Training set size: {x_train.shape[0]}")
print(f"Validation set size: {x_val.shape[0]}")
print(f"Testing set size: {x_test.shape[0]}")

Training set size: 2542
Validation set size: 848
Testing set size: 848
