# 1. Import + Inputs

In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import category_encoders as ce


# Read the training and test CSV files (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train_heart.csv', 'test_heart.csv')
)

# 2. EDA


In [2]:
# Replace zero cholesterol readings by the rounded mean of the non-zero readings
# (presumably a 0 encodes "not measured" -- confirm against the data description).
nonzero_cholesterol = df_train.loc[df_train['Cholesterol'] != 0, 'Cholesterol']
cholesterol_mean = int(round(float(nonzero_cholesterol.mean())))

# Assign the imputed column back to the frame. Calling replace(inplace=True) on a
# Series selected from the frame only updates that temporary Series when pandas
# Copy-on-Write is active, which would silently leave the zeros in df_train.
df_train['Cholesterol'] = df_train['Cholesterol'].replace(0, cholesterol_mean)
cholesterol = df_train['Cholesterol']

# Keep only the rows whose Oldpeak is non-negative. The surviving rows keep their
# original index labels, so the index is no longer contiguous after this step.
df_train = df_train[df_train['Oldpeak'] >= 0]

# Drop the 'id' column so it is not used as a feature; errors='ignore' lets the
# cell be re-run without raising once the column is already gone.
df_train = df_train.drop(columns=['id'], errors='ignore')

In [3]:
# The target column is taken to be the last column of the training frame.
target_name = list(df_train.columns)[-1]

In [4]:
# Preview the first 10 rows of the training data.
df_train.iloc[:10]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,55,M,ASY,135,204,1,ST,126,Y,1.1,Flat,1
1,67,M,ASY,160,286,0,LVH,108,Y,1.5,Flat,1
3,56,M,ATA,120,236,0,Normal,178,N,0.8,Up,0
4,75,M,ASY,170,203,1,ST,108,N,0.0,Flat,1
5,51,M,NAP,110,175,0,Normal,123,N,0.6,Up,0
6,60,M,ASY,135,242,0,Normal,63,Y,0.5,Up,1
7,61,M,ASY,146,241,0,Normal,148,Y,3.0,Down,1
8,54,M,ASY,150,365,0,ST,134,N,1.0,Up,0
9,50,F,ASY,110,254,0,LVH,159,N,0.0,Up,0
10,61,M,ASY,130,242,1,Normal,77,N,2.5,Flat,1


# 3. Encoding

In [5]:
# String-valued columns that have to be converted to integer codes.
feature_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
X = df_train[feature_cols]

# Fit the ordinal encoder on the training data and convert the strings to integers.
# The fitted encoder (ce_ord) holds the category -> integer mapping used for training.
ce_ord = ce.OrdinalEncoder(cols = feature_cols)
X_cat = ce_ord.fit_transform(X)

# Combine the remaining columns with the encoded ones on the shared row index.
# The string columns are dropped from a copy instead of being deleted from df_train,
# so re-running this cell does not fail on already-removed columns.
df_train_clean = pd.merge(
    df_train.drop(columns=feature_cols), X_cat, right_index = True, left_index = True
)
df_train_clean

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,55,135,204,1,126,1.1,1,1,1,1,1,1
1,67,160,286,0,108,1.5,1,1,1,2,1,1
3,56,120,236,0,178,0.8,0,1,2,3,2,2
4,75,170,203,1,108,0.0,1,1,1,1,2,1
5,51,110,175,0,123,0.6,0,1,3,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
637,50,170,209,0,116,0.0,0,1,2,1,2,2
638,60,125,242,1,110,0.1,1,1,1,3,2,2
639,50,140,288,0,140,0.0,1,2,3,3,1,1
640,65,138,282,1,174,1.4,1,1,4,2,2,1


# 4. Split


In [6]:
# Features are every column of the encoded frame except the target. drop() returns
# a new frame, so df_train_clean is left intact and the cell can be re-run.
X = df_train_clean.drop(columns=[target_name]) # Features
y = df_train[target_name]  # Labels

# Split dataset into training set and hold-out set: 70% training, 30% test.
# A fixed random_state makes the split -- and the metrics reported below --
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 5. Train


In [7]:
# Build the model: a random forest of 100 trees. random_state is fixed so that
# repeated runs on the same training split produce the same fitted forest.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training split.
random_forest.fit(x_train, y_train)

In [8]:
# Predict on the hold-out features. A prediction column attached by an earlier run
# of this cell is dropped first, so the model always sees only the feature columns.
x_test_features = x_test.drop(columns=["HeartDisease"], errors="ignore")
rf_pred = random_forest.predict(x_test_features)

# Give the predictions the same row labels as the hold-out frame. With a default
# 0..n-1 index, an index-based merge would drop every hold-out row whose label is
# not in that range and pair the remaining rows with predictions of other rows,
# because x_test keeps the shuffled, non-contiguous labels of the original frame.
rf_pred_serie = pd.Series(rf_pred, index=x_test_features.index, name="HeartDisease")
x_test = x_test_features.assign(HeartDisease=rf_pred_serie)

# 6. Evaluation


In [9]:
# Text report of precision, recall and F1 for the forest on the hold-out split.
classification_rep_rf = classification_report(y_true=y_test, y_pred=rf_pred)
print("\nClassification Report:\n", classification_rep_rf)


Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.75      0.74        72
           1       0.85      0.84      0.84       118

    accuracy                           0.81       190
   macro avg       0.79      0.79      0.79       190
weighted avg       0.81      0.81      0.81       190



In [10]:
# Confusion matrix of true hold-out labels versus the forest's predictions.
conf_matrix_rf = confusion_matrix(y_true=y_test, y_pred=rf_pred)
print("\nConfusion Matrix:\n", conf_matrix_rf)


Confusion Matrix:
 [[54 18]
 [19 99]]


# 7. Run Test


In [11]:
# Work on a copy so the raw test frame, including its 'id' column, stays intact.
df_test_copy = df_test.copy()

# Replace zero cholesterol readings by the rounded mean of the non-zero readings.
# The column is assigned back to the frame: replace(inplace=True) on a selected
# Series does not update the frame when pandas Copy-on-Write is active.
# NOTE(review): the mean is computed on the test set itself (unchanged from before);
# consider reusing the training mean so both sets are imputed with the same value.
nonzero_cholesterol = df_test_copy.loc[df_test_copy['Cholesterol'] != 0, 'Cholesterol']
cholesterol_mean = int(round(float(nonzero_cholesterol.mean())))
df_test_copy['Cholesterol'] = df_test_copy['Cholesterol'].replace(0, cholesterol_mean)
cholesterol = df_test_copy['Cholesterol']

# 'id' is not a model feature; it is read from df_test again when building the result.
df_test_copy = df_test_copy.drop(columns=['id'])

# String-valued columns that have to be converted to integer codes.
feature_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
X = df_test_copy[feature_cols]

# Encode with the encoder that was already fitted on the training data (ce_ord).
# Fitting a fresh encoder here would derive a new category -> integer mapping from
# the test rows, which can disagree with the codes the model was trained on.
X_cat = ce_ord.transform(X)

# Swap the string columns for their encoded versions, joined on the row index.
df_test_copy = df_test_copy.drop(columns=feature_cols)
df_test_copy_clean = pd.merge(df_test_copy, X_cat, right_index = True, left_index = True)

# Predict and attach the predictions using the frame's own row labels, so the
# alignment does not depend on the test frame having a default 0..n-1 index.
rf_pred_test = random_forest.predict(df_test_copy_clean)
rf_pred_serie = pd.Series(rf_pred_test, index=df_test_copy_clean.index, name="HeartDisease")
df_test_copy_clean = df_test_copy_clean.assign(HeartDisease=rf_pred_serie)

# Pair every original test id with its predicted label.
result = pd.merge(df_test['id'], df_test_copy_clean['HeartDisease'], right_index = True,left_index = True)

result

Unnamed: 0,id,HeartDisease
0,637,1
1,430,1
2,711,1
3,375,0
4,183,1
...,...,...
271,133,1
272,66,0
273,470,1
274,898,0


# 8. to .csv -> Kaggle

In [12]:
# Write the id / prediction pairs to disk without the index column.
submission_columns = ['id', 'HeartDisease']
result[submission_columns].to_csv('tester.csv', encoding='utf-8', index=False)