In [26]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import os as os


## Build a model using training data that will predict the heart attack risk of the test set 

In [40]:
current_dir = os.getcwd()
current_dir

training_set = pd.read_csv(current_dir + '/Data/train.csv')
test_set = pd.read_csv(current_dir + '/Data/test.csv')


In [41]:
training_set.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,RDG0550,33,Male,200,129/90,48,0,1,1,1,...,0.138443,184066,30.449815,63,6,7,Argentina,South America,Southern Hemisphere,1
1,NMA3851,56,Female,262,159/105,46,1,0,1,0,...,0.369552,211755,34.973685,333,7,8,Nigeria,Africa,Northern Hemisphere,1
2,TUI5807,19,Female,140,161/109,54,0,1,0,0,...,8.646334,252203,30.554246,537,2,10,Thailand,Asia,Northern Hemisphere,0
3,YYT5016,50,Female,163,120/62,53,0,1,1,1,...,1.107884,121954,35.390265,591,0,9,Spain,Europe,Southern Hemisphere,1
4,ZAC5937,89,Female,144,153/110,92,1,0,1,0,...,1.33757,180121,39.575483,145,2,5,Germany,Europe,Northern Hemisphere,1


In [42]:
def data_clean(df):# Getting rid of the Patient ID column because it will not be used in the model
    cleaned_data = df.drop('Patient ID', axis=1)

    # Formatting the data to be used in the model
    gender_values = {"Male": 0, "Female": 1}
    cleaned_data['Sex'] = df['Sex'].map(gender_values)

    Diet_values= {"Unhealthy": 1, "Average": 2 ,"Healthy": 3}
    cleaned_data['Diet'] = df['Diet'].map(Diet_values)


    # Split the 'blood pressure' column
    cleaned_data[['Systolic_BP', 'Diastolic_BP']] = df['Blood Pressure'].str.split('/', expand=True)

    # Convert the new columns to numeric types
    cleaned_data['Systolic_BP'] = pd.to_numeric(cleaned_data['Systolic_BP'])
    cleaned_data['Diastolic_BP'] = pd.to_numeric(cleaned_data['Diastolic_BP'])
    cleaned_data = cleaned_data.drop('Blood Pressure', axis=1) # Has been replaced.

    # One-hot encode
    cleaned_data = pd.get_dummies(cleaned_data, columns=['Country', 'Continent', 'Hemisphere']).astype(int)

    return cleaned_data


In [43]:
clean_trained = data_clean(training_set)
clean_test = data_clean(test_set)

In [47]:
features = clean_trained.drop('Heart Attack Risk', axis=1)
target = clean_trained['Heart Attack Risk']

# Training the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(features, target)

# Predict the heart attack risk on the test data
predictions = pd.DataFrame(model.predict(clean_test))
predictions.columns = ['Heart Attack Risk']

In [48]:
# Assuming that 'Patient ID' was kept in 'test_set' DataFrame
predictions_df = pd.DataFrame(predictions, columns=['Heart Attack Risk'])
predictions_df['Patient ID'] = test_set['Patient ID']

# Reorder the columns
predictions_df = predictions_df[['Patient ID', 'Heart Attack Risk']]
predictions_df

Unnamed: 0,Patient ID,Heart Attack Risk
0,BMW7812,0.35
1,XXM0972,0.40
2,RQX1211,0.25
3,RVN4963,0.16
4,NCU1956,0.39
...,...,...
1748,GQZ5013,0.31
1749,FDK8693,0.30
1750,SVO4635,0.38
1751,KQR8949,0.35


In [49]:
# Saving
predictions_df.to_csv(current_dir + '/Data/predictions.csv', index=False)