# Cardiovascular Disease Risk Prediction

In [55]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [56]:
disease_data_filepath = "CVD_cleaned.csv"
disease_data = pd.read_csv(disease_data_filepath)
print(disease_data.head())
print(disease_data.shape)

  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \
0           Poor  Within the past 2 years       No            No          No   
1      Very Good     Within the past year       No           Yes          No   
2      Very Good     Within the past year      Yes            No          No   
3           Poor     Within the past year      Yes           Yes          No   
4           Good     Within the past year       No            No          No   

  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \
0           No         No       No       Yes  Female        70-74   
1           No         No      Yes        No  Female        70-74   
2           No         No      Yes        No  Female        60-64   
3           No         No      Yes        No    Male        75-79   
4           No         No       No        No    Male          80+   

   Height_(cm)  Weight_(kg)    BMI Smoking_History  Alcohol_Consumption  \
0        150.0        32.66  

In [57]:
disease_data.columns.isnull()
y = disease_data.Heart_Disease
disease_features = ["Exercise", "Diabetes", "Arthritis", "Sex", "Age_Category", "BMI", "Smoking_History",
                    "Alcohol_Consumption", "Fruit_Consumption", "Green_Vegetables_Consumption", "FriedPotato_Consumption"]
X = disease_data[disease_features]

In [58]:
X.describe()
X.head()

Unnamed: 0,Exercise,Diabetes,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,No,No,Yes,Female,70-74,14.54,Yes,0.0,30.0,16.0,12.0
1,No,Yes,No,Female,70-74,28.29,No,0.0,30.0,0.0,4.0
2,Yes,Yes,No,Female,60-64,33.47,No,4.0,12.0,3.0,16.0
3,Yes,Yes,No,Male,75-79,28.73,No,0.0,30.0,30.0,8.0
4,No,No,No,Male,80+,24.37,Yes,0.0,8.0,4.0,0.0


In [59]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
s = (X_train.dtypes=='object')
object_cols = list(s[s].index)
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_val.select_dtypes(exclude=['object'])


In [60]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_val[object_cols]))
# OH encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_val.index
# Remove categorical columns and replace them with one hot encoding
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_val.drop(object_cols, axis=1)

# Add one hot encoded columns to numerical
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Make sure all columns have dtype 'str'ArithmeticError
OH_X_train.columns = OH_X_train.columns.astype('str')
OH_X_valid.columns = OH_X_valid.columns.astype('str')

In [61]:
disease_model = DecisionTreeClassifier(random_state=1)
disease_model.fit(OH_X_train, y_train)
y_pred = disease_model.predict(OH_X_valid)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy: .2f}")

Accuracy:  0.86
