In [1]:
# Import required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Read in data
df = pd.read_csv('../data/raw/CVD_cleaned.csv')

In [3]:
# Get snippet of data
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [4]:
# Columns in dataset
df.columns

Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',
       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',
       'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',
       'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],
      dtype='object')

In [5]:
# Unique values of categorical features
cat_cols = ['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer', 
            'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Age_Category', 
            'Smoking_History']

for col in cat_cols:
    print(col)
    print(set(df[col]))
    print('\n')

General_Health
{'Poor', 'Very Good', 'Good', 'Fair', 'Excellent'}


Checkup
{'5 or more years ago', 'Never', 'Within the past 5 years', 'Within the past year', 'Within the past 2 years'}


Exercise
{'Yes', 'No'}


Heart_Disease
{'Yes', 'No'}


Skin_Cancer
{'Yes', 'No'}


Other_Cancer
{'Yes', 'No'}


Depression
{'Yes', 'No'}


Diabetes
{'Yes', 'No, pre-diabetes or borderline diabetes', 'No', 'Yes, but female told only during pregnancy'}


Arthritis
{'Yes', 'No'}


Sex
{'Male', 'Female'}


Age_Category
{'40-44', '25-29', '60-64', '70-74', '45-49', '35-39', '65-69', '75-79', '18-24', '80+', '50-54', '55-59', '30-34'}


Smoking_History
{'Yes', 'No'}




In [6]:
df.isna().sum()

General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [7]:
# Encode categorical features
df['General_Health'] = df['General_Health'].replace({'Poor': 0, 'Fair': 1, 'Good': 2, 'Very Good': 3, 'Excellent': 4})
df['Checkup'] = df['Checkup'].replace({'Never': 0, '5 or more years ago': 1, 'Within the past 5 years': 2, 
                                       'Within the past 2 years': 3, 'Within the past year': 4})
df['General_Health'] = df['General_Health'].replace({'Poor': 0, 'Fair': 1, 'Good': 2, 'Very Good': 3, 'Excellent': 4})
df = df.replace({'No': 0, 'Yes': 1})
df['Diabetes'] = df['General_Health'].replace({'No, pre-diabetes or borderline diabetes': 0, 
                                               'Yes, but female told only during pregnancy': 1})
df['Sex'] = df['Sex'].replace({'Female': 0, 'Male': 1})
df['Age_Category'] = df['Age_Category'].replace({'18-24': 0, '25-29': 1, '30-34': 2, '35-39': 3,
                                                '40-44': 4, '45-49': 5, '50-54': 6, '55-59': 7,
                                                '60-64': 8, '65-69': 9, '70-74': 10,
                                                '75-79': 11, '80+': 12})

In [8]:
# Get df after preprocessing
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,0,3,0,0,0,0,0,0,1,0,10,150.0,32.66,14.54,1,0.0,30.0,16.0,12.0
1,3,4,0,1,0,0,0,3,0,0,10,165.0,77.11,28.29,0,0.0,30.0,0.0,4.0
2,3,4,1,0,0,0,0,3,0,0,8,163.0,88.45,33.47,0,4.0,12.0,3.0,16.0
3,0,4,1,1,0,0,0,0,0,1,11,180.0,93.44,28.73,0,0.0,30.0,30.0,8.0
4,2,4,0,0,0,0,0,2,0,1,12,191.0,88.45,24.37,1,0.0,8.0,4.0,0.0


In [9]:
# Get X & y features
y = df['Heart_Disease']
X = df.drop('Heart_Disease', axis = 1)

In [10]:
# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [11]:
# Get train and test df
X_train['Heart_Disease'] = y_train
X_test['Heart_Disease'] = y_test

In [12]:
# Save to .csv files
X_train.to_csv('../data/model_input/train.csv', index = False)
X_test.to_csv('../data/model_input/test.csv', index = False)