In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os as os


## Build a model on the training data to predict heart attack risk for the test set

In [6]:
# Locate the CSV files relative to the directory the notebook is run from.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, 'Data')

# os.path.join assembles the paths with the platform's separator instead of
# concatenating '/'-delimited string fragments by hand.
training_set = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_set = pd.read_csv(os.path.join(data_dir, 'test.csv'))


In [9]:
# Show the first five rows of the training data.
training_set.head(n=5)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,RDG0550,33,Male,200,129/90,48,0,1,1,1,...,0.138443,184066,30.449815,63,6,7,Argentina,South America,Southern Hemisphere,1
1,NMA3851,56,Female,262,159/105,46,1,0,1,0,...,0.369552,211755,34.973685,333,7,8,Nigeria,Africa,Northern Hemisphere,1
2,TUI5807,19,Female,140,161/109,54,0,1,0,0,...,8.646334,252203,30.554246,537,2,10,Thailand,Asia,Northern Hemisphere,0
3,YYT5016,50,Female,163,120/62,53,0,1,1,1,...,1.107884,121954,35.390265,591,0,9,Spain,Europe,Southern Hemisphere,1
4,ZAC5937,89,Female,144,153/110,92,1,0,1,0,...,1.33757,180121,39.575483,145,2,5,Germany,Europe,Northern Hemisphere,1


In [10]:
# Patient ID is not used by the model, so remove it from the features.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a plain drop would raise KeyError.
training_set = training_set.drop(columns='Patient ID', errors='ignore')


In [11]:
# Encode the Sex labels as integers for the model: Male -> 0, Female -> 1.
gender_values = {"Male": 0, "Female": 1}

# Only map while the column still holds labels. Series.map with a dict turns
# every value that is not a key into NaN, so re-running this cell on the
# already-encoded 0/1 column would silently wipe it out.
if not pd.api.types.is_numeric_dtype(training_set['Sex']):
    training_set['Sex'] = training_set['Sex'].map(gender_values)


In [12]:

# Split the 'Blood Pressure' column ("systolic/diastolic" strings) into two
# numeric columns and drop the original.
# The guard makes the cell re-runnable: after the first run 'Blood Pressure'
# no longer exists, and indexing it again would raise KeyError.
if 'Blood Pressure' in training_set.columns:
    training_set[['Systolic_BP', 'Diastolic_BP']] = (
        training_set['Blood Pressure'].str.split('/', expand=True)
    )

    # str.split yields strings; convert both halves to numeric types.
    training_set['Systolic_BP'] = pd.to_numeric(training_set['Systolic_BP'])
    training_set['Diastolic_BP'] = pd.to_numeric(training_set['Diastolic_BP'])

    # The combined column has been replaced by the two numeric ones.
    training_set = training_set.drop(columns='Blood Pressure')


In [11]:
# Display the first five rows of training_set in its current state.
training_set.head(n=5)

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Systolic_BP,Diastolic_BP
0,33,0,200,48,0,1,1,1,1,7.80769,...,30.449815,63,6,7,Argentina,South America,Southern Hemisphere,1,129,90
1,56,1,262,46,1,0,1,0,0,17.297628,...,34.973685,333,7,8,Nigeria,Africa,Northern Hemisphere,1,159,105
2,19,1,140,54,0,1,0,0,1,8.695288,...,30.554246,537,2,10,Thailand,Asia,Northern Hemisphere,0,161,109
3,50,1,163,53,0,1,1,1,1,10.161587,...,35.390265,591,0,9,Spain,Europe,Southern Hemisphere,1,120,62
4,89,1,144,92,1,0,1,0,1,16.436787,...,39.575483,145,2,5,Germany,Europe,Northern Hemisphere,1,153,110
