# 1. Imports

In [36]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import category_encoders as ce

# 2. Functions and classes

In [37]:
def to_encoded(frame: pd.DataFrame) -> pd.DataFrame:
    """
    This function converts a dataframe to another dataframe which can be used
    to train a model or predict values from the model.

    - Replaces cholesterol 0 values with mean values of other cholesterol
    - deletes the ID field which shouldn't be used for training or predicting
    - Encodes the data to integers usable in training or predicting
    """
    # Replace all 0 ch values with mean of set
    cholesterol = frame['Cholesterol']
    cholesterol_mean = int(round(sum((f:=[c for c in cholesterol if c != 0])) / len(f), 0)) # magic mean function for non 0 values
    cholesterol.replace(0, cholesterol_mean, inplace=True)

    # Delete ID field
    del frame['id']

    # Encode the non int columns
    encodable_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'] # non-integer columns in our dataset
    to_encode_cols = frame[encodable_cols]

    encoded_cols = ce.OrdinalEncoder(cols=encodable_cols).fit_transform(to_encode_cols)

    # delete the non-encoded columns from the frame
    for col in encodable_cols:
        del frame[col]

    # Add (merge) the encoded cols to the frame
    encoded_frame = pd.merge(frame, encoded_cols, right_index = True, left_index = True)

    return encoded_frame

# 3. Loading and preparing training set

First we load the train_heart.csv that we'll use to train our model.
We encode the data and remove some data so that it can become a model.

In [38]:
# Load the training set
training_set = pd.read_csv('train_heart.csv')

training_set_encoded = to_encoded(training_set)
training_set_encoded.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,55,135,204,1,126,1.1,1,1,1,1,1,1
1,67,160,286,0,108,1.5,1,1,1,2,1,1
2,56,120,242,0,100,-1.0,1,1,1,1,1,2
3,56,120,236,0,178,0.8,0,1,2,3,2,3
4,75,170,203,1,108,0.0,1,1,1,1,2,1
5,51,110,175,0,123,0.6,0,1,3,3,2,3
6,60,135,242,0,63,0.5,1,1,1,3,1,3
7,61,146,241,0,148,3.0,1,1,1,3,1,2
8,54,150,365,0,134,1.0,0,1,1,1,2,3
9,50,110,254,0,159,0.0,0,2,1,2,2,3


# 4. Split data

Here, we split our training set into 70% training data, and 30% testing data

In [39]:
# extract the label and feature colums from the dataframe
label = training_set_encoded['HeartDisease']
features = training_set_encoded.loc[:, training_set_encoded.columns != 'HeartDisease']

# Split for training
train_x, test_x, train_y, test_y = train_test_split(features, label, test_size=0.3) # 70% training, 30% test


# 5. Train data

In these cells we train our model based on the data we created in the previous sections

In [40]:
# Create a random forest
random_forest = RandomForestClassifier(n_estimators=100)

# Train the classifiers
random_forest.fit(train_x, train_y)
predictions = random_forest.predict(test_x) # predicts the values for heartdiseases
df_predictions = pd.Series(predictions, name="HeartDisease") # creates a dataframe from the predictions
test_x = pd.merge(test_x, df_predictions, right_index = True,left_index = True) # merges the predictions with our test data

# 6. Evaluation

evaluate the model and its performance in comparison to our test data (30%)
Also include a confusion matrix

In [41]:
# Model Classification report
classification_report = classification_report(test_y, predictions)
print("\nClassification Report:\n", classification_report)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.83      0.87        87
           1       0.87      0.94      0.90       106

    accuracy                           0.89       193
   macro avg       0.90      0.89      0.89       193
weighted avg       0.89      0.89      0.89       193



In [42]:
# Confusion Matrix
conf_matrix = confusion_matrix(test_y, predictions)
print("\nConfusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[ 72  15]
 [  6 100]]


# ------------------------------------------------
# 7. testing and submission

## 7.1 Run Test

We have now created (and tested) our model. It can now be used with other
data to predict the results of features of which we don't know whether or not they
are a heart disease

First we load and encode our new testing data (test_heart.csv).
Once this data is loaded we can just use the already trained model to predict the values of the data. After which we put them into a new csv.

## 7.2 to .csv -> Kaggle

After making the predictions, save the results to a csv file and submit to Kaggle

In [53]:
# load and encode testing set
test_set = pd.read_csv('test_heart.csv')
test_set_encoded = to_encoded(test_set.copy(deep=True)) # Use a copy otherwise all will be in_place and "ID" will be lost
training_set_encoded.head(10)

# predict values from test set
test_predictions = random_forest.predict(test_set_encoded)
# convert back to serie
test_predictions = pd.Series(test_predictions, name="HeartDisease")

# merge predictions with the ID's of the test set
result = pd.merge(test_set['id'], test_predictions, right_index = True,left_index = True)


Unnamed: 0,id,HeartDisease
0,637,1
1,430,1
2,711,1
3,375,0
4,183,1
...,...,...
271,133,1
272,66,0
273,470,1
274,898,1


In [54]:
result[['id', 'HeartDisease']].to_csv('tester.csv', index = False, encoding='utf-8')