# **Heartbit data analysis**

---



*   **Marcin Bieganek**


In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.impute import KNNImputer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Loading medical data from a CSV file.

In [None]:
heartbit = pd.read_csv('/content/drive/MyDrive/Data Science/heartbit.csv', skiprows=1)

heartbit

Unnamed: 0,ID CODES,DEATH?,DEATHDATE,TIMEFU,QOL,OQLsub1,OQLsub2,DOB,DOE,AGE,...,EXERCISE5,CPX.TIME,CPX.PEAKVO2,CPX.PEAKVO2FORBM,RER,SLOPE,METS,WEBER,PEAK>18,SLOPE>35
0,HB1,0.0,3-2-2012,2076,27.0,23.0,4.0,14-6-1959,29-5-2006,46.99,...,-10.0,14.523,2577.8,23.98,1.04,37.485,7.000000,1.0,1.0,1.0
1,HB2,0.0,12-8-2012,2316,42.0,37.0,5.0,22-12-1958,10-4-2006,47.33,...,-13.0,17.267,1555.8,22.22,1.00,37.232,6.348571,1.0,1.0,1.0
2,HB3,0.0,21-2-2012,2349,0.0,0.0,0.0,25-11-1945,16-9-2005,59.85,...,-22.0,15.195,1748.6,21.86,1.17,44.719,6.245714,1.0,1.0,1.0
3,HB4,0.0,3-2-2013,2459,10.0,7.0,3.0,20-3-1945,12-5-2006,61.19,...,-38.0,13.450,1799.8,20.58,1.16,39.423,5.880000,1.0,1.0,1.0
4,HB5,0.0,3-2-2013,2629,4.0,2.0,2.0,7-3-1982,23-11-2005,23.73,...,4.0,16.380,2626.4,26.54,1.21,21.051,7.582857,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,,,,,,,,,,,...,,,,,,,,,,
495,,,,,,,,,,,...,,,,,,,,,,
496,,,,,,,,,,,...,,,,,,,,,,
497,,,,,,,,,,,...,,,,,,,,,,


The NYHA values in the dataset are 1, 1.5, 2, 2.5, 3, 3.5, and 4 (not just 1, 2, 3, 4 as stated in the data description). Values 1 and 1.5 can be combined, 2 and 2.5 can be combined, and 3.5 can be combined with 4.

Below is the transformation where the NYHA values have been merged accordingly:

In [None]:
# Merge categories as described
def merge_nyha(values):
    if values in [1, 1.5]:
        return 1
    elif values in [2, 2.5]:
        return 2
    elif values in [3, 3.5]:
        return 3
    elif values == 4:
        return 4

# Apply merging to NYHA column
heartbit['NYHA'] = heartbit['NYHA'].apply(merge_nyha)

heartbit

Unnamed: 0,ID CODES,DEATH?,DEATHDATE,TIMEFU,QOL,OQLsub1,OQLsub2,DOB,DOE,AGE,...,EXERCISE5,CPX.TIME,CPX.PEAKVO2,CPX.PEAKVO2FORBM,RER,SLOPE,METS,WEBER,PEAK>18,SLOPE>35
0,HB1,0.0,3-2-2012,2076,27.0,23.0,4.0,14-6-1959,29-5-2006,46.99,...,-10.0,14.523,2577.8,23.98,1.04,37.485,7.000000,1.0,1.0,1.0
1,HB2,0.0,12-8-2012,2316,42.0,37.0,5.0,22-12-1958,10-4-2006,47.33,...,-13.0,17.267,1555.8,22.22,1.00,37.232,6.348571,1.0,1.0,1.0
2,HB3,0.0,21-2-2012,2349,0.0,0.0,0.0,25-11-1945,16-9-2005,59.85,...,-22.0,15.195,1748.6,21.86,1.17,44.719,6.245714,1.0,1.0,1.0
3,HB4,0.0,3-2-2013,2459,10.0,7.0,3.0,20-3-1945,12-5-2006,61.19,...,-38.0,13.450,1799.8,20.58,1.16,39.423,5.880000,1.0,1.0,1.0
4,HB5,0.0,3-2-2013,2629,4.0,2.0,2.0,7-3-1982,23-11-2005,23.73,...,4.0,16.380,2626.4,26.54,1.21,21.051,7.582857,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,,,,,,,,,,,...,,,,,,,,,,
495,,,,,,,,,,,...,,,,,,,,,,
496,,,,,,,,,,,...,,,,,,,,,,
497,,,,,,,,,,,...,,,,,,,,,,


Categorical data has been encoded below:

In [None]:
# Encode categorical features before imputation
label_encoders = {}
for col in heartbit.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    heartbit.loc[:, col] = le.fit_transform(heartbit[col].astype(str))
    label_encoders[col] = le

heartbit

Unnamed: 0,ID CODES,DEATH?,DEATHDATE,TIMEFU,QOL,DOB,DOE,AGE,HEIGHT.CM,WEIGHT.KG,...,EXERCISE4,EXERCISE5,CPX.TIME,CPX.PEAKVO2FORBM,RER,SLOPE,METS,WEBER,PEAK>18,SLOPE>35
0,0,0.0,203,66,27.0,81,247,46.99,170.0,116.0,...,-26.0,-10.0,14.523,23.98,1.04,37.485,7.000000,1.0,1.0,1.0
1,111,0.0,40,103,42.0,206,23,47.33,165.0,70.0,...,2.0,-13.0,17.267,22.22,1.00,37.232,6.348571,1.0,1.0,1.0
2,222,0.0,137,105,0.0,239,105,59.85,177.0,80.0,...,-7.0,-22.0,15.195,21.86,1.17,44.719,6.245714,1.0,1.0,1.0
3,333,0.0,204,114,10.0,177,51,61.19,173.0,87.5,...,-8.0,-38.0,13.450,20.58,1.16,39.423,5.880000,1.0,1.0,1.0
4,414,0.0,204,130,4.0,369,180,23.73,180.0,99.0,...,-4.0,4.0,16.380,26.54,1.21,21.051,7.582857,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,469,,271,317,,396,356,,,,...,,,,,,,,,,
495,469,,271,317,,396,356,,,,...,,,,,,,,,,
496,469,,271,317,,396,356,,,,...,,,,,,,,,,
497,469,,271,317,,396,356,,,,...,,,,,,,,,,


As for the missing values, features with more than 50% missing data were excluded. For the remaining missing values, the k-nearest neighbors algorithm with k=5 was used for imputation.

In [None]:
# Drop features with a high number of missing values (e.g., >50%)
missing_threshold = 0.5
missing_ratios = heartbit.isnull().mean()
heartbit = heartbit.loc[:, missing_ratios < missing_threshold]

# Handle remaining missing values using KNN Imputation
imputer = KNNImputer(n_neighbors=5)
heartbit_imputed = pd.DataFrame(imputer.fit_transform(heartbit), columns=heartbit.columns)

heartbit_imputed

Unnamed: 0,ID CODES,DEATH?,DEATHDATE,TIMEFU,QOL,DOB,DOE,AGE,HEIGHT.CM,WEIGHT.KG,...,EXERCISE4,EXERCISE5,CPX.TIME,CPX.PEAKVO2FORBM,RER,SLOPE,METS,WEBER,PEAK>18,SLOPE>35
0,0.0,0.0,203.0,66.0,27.0,81.0,247.0,46.990,170.0,116.0,...,-26.0,-10.0,14.523000,23.98,1.040,37.485,7.000000,1.0,1.0,1.0
1,111.0,0.0,40.0,103.0,42.0,206.0,23.0,47.330,165.0,70.0,...,2.0,-13.0,17.267000,22.22,1.000,37.232,6.348571,1.0,1.0,1.0
2,222.0,0.0,137.0,105.0,0.0,239.0,105.0,59.850,177.0,80.0,...,-7.0,-22.0,15.195000,21.86,1.170,44.719,6.245714,1.0,1.0,1.0
3,333.0,0.0,204.0,114.0,10.0,177.0,51.0,61.190,173.0,87.5,...,-8.0,-38.0,13.450000,20.58,1.160,39.423,5.880000,1.0,1.0,1.0
4,414.0,0.0,204.0,130.0,4.0,369.0,180.0,23.730,180.0,99.0,...,-4.0,4.0,16.380000,26.54,1.210,21.051,7.582857,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,469.0,0.2,271.0,317.0,35.8,396.0,356.0,59.186,174.4,86.8,...,-6.0,-9.8,10.866733,16.36,1.014,36.198,5.530286,2.4,0.2,0.4
495,469.0,0.2,271.0,317.0,35.8,396.0,356.0,59.186,174.4,86.8,...,-6.0,-9.8,10.866733,16.36,1.014,36.198,5.530286,2.4,0.2,0.4
496,469.0,0.2,271.0,317.0,35.8,396.0,356.0,59.186,174.4,86.8,...,-6.0,-9.8,10.866733,16.36,1.014,36.198,5.530286,2.4,0.2,0.4
497,469.0,0.2,271.0,317.0,35.8,396.0,356.0,59.186,174.4,86.8,...,-6.0,-9.8,10.866733,16.36,1.014,36.198,5.530286,2.4,0.2,0.4


Below, we define the features and the target variable:

In [None]:
# Define features and target
target = 'NYHA'
X = heartbit_imputed.drop(columns=[target])
y = heartbit_imputed[target].astype(int)

The dataset contains highly correlated features. Below, features with a correlation greater than 0.9 were removed.

In [None]:
# Remove highly correlated features
correlation_matrix = X.corr().abs()
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
# Drop features with correlation > 0.9
high_correlation_columns = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]
X = X.drop(columns=high_correlation_columns)

For numerical features, it is advisable to perform standardization.

In [None]:
# Standardize numerical features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

Below, the data was split into training and test sets:

In [None]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Training the logistic regression model:

In [None]:
# Train a Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

Model evaluation:

In [None]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.68      0.65        25
           2       0.71      0.69      0.70        51
           3       0.65      0.59      0.62        22
           4       0.50      1.00      0.67         2

    accuracy                           0.67       100
   macro avg       0.62      0.74      0.66       100
weighted avg       0.67      0.67      0.67       100



Training the Random Forest model:

In [None]:
# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

Random Forest model evaluation:

In [None]:
# Make predictions using Random Forest
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           1       0.88      0.56      0.68        25
           2       0.68      0.88      0.77        51
           3       0.67      0.55      0.60        22
           4       0.00      0.00      0.00         2

    accuracy                           0.71       100
   macro avg       0.56      0.50      0.51       100
weighted avg       0.71      0.71      0.70       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
