In [41]:
import pandas as pd
import numpy as np
import mrmr
import os

In [42]:
"""
Declare the dataframes and the path to the data
Please change to normal cleveland if you want to.
"""
df = pd.read_csv(os.path.join('data', 'cleveland-challenge.csv'))

df['num'] = df['num'].apply(lambda x: 1 if x > 0 else 0)

In [43]:
"""
Declare categorical and continuous features
"""
categoricalFeatures = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
continuousFeatures = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']


In [44]:
"""
Prepare categorical features
"""
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Remove '?' from the categorical features
for feature in categoricalFeatures:
    df = df[df[feature] != '?']

# Use the label encoder to convert categorical features to numerical
label_encoder = LabelEncoder()
min_max_scaler = MinMaxScaler()
for col in categoricalFeatures:
    df[col] = label_encoder.fit_transform(df[col])

# Normalize the encoded values to a range of 0 to 1
df[categoricalFeatures] = min_max_scaler.fit_transform(df[categoricalFeatures])

In [45]:
"""
Clean up Continuous Features
"""

# remove rows that are not numeric
for feature in continuousFeatures:
    print(f"Converting {feature} to numeric")
    df = df[pd.to_numeric(df[feature], errors='coerce').notna()]

# Z-score normalization
for feature in continuousFeatures:
    mean = df[feature].mean()
    std = df[feature].std()
    df[feature] = (df[feature] - mean) / std

Converting age to numeric
Converting trestbps to numeric
Converting chol to numeric
Converting thalach to numeric
Converting oldpeak to numeric


In [46]:

# Convert the data types to float16
df[categoricalFeatures] = df[categoricalFeatures].astype('float16')
df[continuousFeatures] = df[continuousFeatures].astype('float16')

In [47]:
""" Now with all features cleaned, we can perform mRMR on the data"""
from mrmr import mrmr_classif

allFeatures = df.columns[:-1]
selectedFeatures = mrmr_classif(X=df[allFeatures], y=df['num'], K=len(allFeatures))
print(f'ranked features according to mrmr: {selectedFeatures}')

# Take the top 9 features
selectedFeatures = selectedFeatures[:9] + ['num']
df = df[selectedFeatures]


100%|██████████| 14/14 [00:04<00:00,  3.14it/s]

ranked features according to mrmr: ['exang', 'slope', 'cp', 'restecg', 'thal', 'thalach', 'chol', 'sex', 'ca', 'oldpeak', 'fbs', 'age', 'Unnamed: 0', 'trestbps']





In [48]:
""" Now lets save the data """
print(df.info())
print(f"\n\n DF HEAD")
print(df.head(5))
df.to_csv(os.path.join('data', 'cleveland_cleaned.csv'), index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 38 entries, 0 to 39
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   exang    38 non-null     float16
 1   slope    38 non-null     float16
 2   cp       38 non-null     float16
 3   restecg  38 non-null     float16
 4   thal     38 non-null     float16
 5   thalach  38 non-null     float16
 6   chol     38 non-null     float16
 7   sex      38 non-null     float16
 8   ca       38 non-null     float16
 9   num      38 non-null     int64  
dtypes: float16(9), int64(1)
memory usage: 1.3 KB
None


 DF HEAD
   exang  slope        cp  restecg  thal   thalach      chol  sex        ca   
0    1.0    0.5  1.000000      0.0   1.0 -0.175659  1.469727  0.0  0.000000  \
1    1.0    0.5  1.000000      1.0   0.5 -1.396484  0.802734  1.0  0.333252   
2    0.0    0.0  0.333252      0.0   0.0  1.330078 -1.059570  0.0  0.666504   
3    0.0    0.0  0.000000      0.0   0.0  0.190674 -0.047791  0.