In [8]:
import pandas as pd
import numpy as np
import torch
import mrmr

In [9]:
# Quick look at the challenge dataset: load it and list its columns.
df = pd.read_csv('cleveland-challenge.csv')
print(df.columns)

# Binarise the target: any value above 0 becomes 1, everything else becomes 0.
df['num'] = df['num'].gt(0).astype('int64')

Index(['Unnamed: 0', 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')


In [10]:
"""
Read in the dataframe and refactor the disease column to be binary
Sort the columns into categorical and continuous features
"""

df = pd.read_csv('cleveland.csv')
# Collapse the target to 0/1 (1 when num > 0) and store it as a categorical.
df['num'] = df['num'].gt(0).astype('int64').astype('category')

# Every candidate feature column.
selectedFeatures = [
    'thal', 'cp', 'ca', 'oldpeak', 'thalach',
    'exang', 'chol', 'sex', 'trestbps', 'fbs',
    'age', 'restecg', 'slope',
]

# The same features split by kind; the two groups are cleaned differently.
categoricalFeatures = [
    'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal',
]
continuousFeatures = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
]



In [11]:
"""
Prepare categorical features
"""
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Drop every row that carries the '?' placeholder in any categorical column.
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not write into a slice of the previously loaded frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
has_placeholder = df[categoricalFeatures].eq('?').any(axis=1)
df = df.loc[~has_placeholder].copy()

# Encode each categorical column as integer codes. fit_transform refits the
# encoder on every call, so a single instance is reused across columns.
label_encoder = LabelEncoder()
for col in categoricalFeatures:
    df[col] = label_encoder.fit_transform(df[col])

# Rescale the integer codes of every categorical column to the range 0 to 1.
min_max_scaler = MinMaxScaler()
df[categoricalFeatures] = min_max_scaler.fit_transform(df[categoricalFeatures])

In [12]:
"""
Clean up Continuous Features
"""

# Convert each continuous column to a numeric dtype and drop the rows whose
# value cannot be parsed. Storing the converted values (rather than only
# filtering on them) guarantees the arithmetic below runs on numbers even if
# a column was read from the CSV as strings.
for feature in continuousFeatures:
    print(f"Converting {feature} to numeric")
    numeric = pd.to_numeric(df[feature], errors='coerce')
    parseable = numeric.notna()
    # .assign returns a new frame, so df is never a view of an older frame.
    df = df.loc[parseable].assign(**{feature: numeric[parseable]})

# Z-score normalization: subtract the column mean and divide by the sample
# standard deviation (pandas' default, ddof=1).
for feature in continuousFeatures:
    mean = df[feature].mean()
    std = df[feature].std()
    df[feature] = (df[feature] - mean) / std

Converting age to numeric
Converting trestbps to numeric
Converting chol to numeric
Converting thalach to numeric
Converting oldpeak to numeric


In [13]:

# Downcast both feature groups to half precision (float16).
for column_group in (categoricalFeatures, continuousFeatures):
    df[column_group] = df[column_group].astype('float16')

In [14]:
""" Now with all features cleaned, we can perform mRMR on the data"""
from mrmr import mrmr_classif

target_column = 'num'
top_k = 9  # number of top-ranked features kept for the saved dataset

# Rank only the cleaned feature columns, kept in the frame's column order.
# Selecting them by name (instead of "every column except the last") keeps
# the ranking correct when the target is not the last column or when the
# frame carries extra columns such as a saved index.
cleaned_features = set(categoricalFeatures) | set(continuousFeatures)
allFeatures = [col for col in df.columns if col in cleaned_features]

# K=len(allFeatures) asks for a ranking of every feature, not just a subset.
selectedFeatures = mrmr_classif(X=df[allFeatures], y=df[target_column], K=len(allFeatures))
print(f'ranked features according to mrmr: {selectedFeatures}')

# Keep the top-ranked features plus the target; .copy() detaches the result
# from the wider frame it was selected from.
selectedFeatures = selectedFeatures[:top_k] + [target_column]
df = df[selectedFeatures].copy()


100%|██████████| 13/13 [00:04<00:00,  2.90it/s]

ranked features according to mrmr: ['thal', 'restecg', 'ca', 'exang', 'thalach', 'oldpeak', 'cp', 'sex', 'slope', 'age', 'trestbps', 'chol', 'fbs']





In [15]:
""" Now lets save the data """
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() emits a stray "None" line.
df.info()
print("\n\n DF HEAD")
print(df.head(5))
# index=False leaves the row index out of the saved file.
df.to_csv('cleveland_cleaned.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   thal     297 non-null    float16 
 1   restecg  297 non-null    float16 
 2   ca       297 non-null    float16 
 3   exang    297 non-null    float16 
 4   thalach  297 non-null    float16 
 5   oldpeak  297 non-null    float16 
 6   cp       297 non-null    float16 
 7   sex      297 non-null    float16 
 8   slope    297 non-null    float16 
 9   num      297 non-null    category
dtypes: category(1), float16(9)
memory usage: 8.0 KB
None


 DF HEAD
   thal  restecg        ca  exang   thalach   oldpeak        cp  sex  slope   
0   0.5      1.0  0.000000    0.0  0.017471  1.067383  0.000000  1.0    1.0  \
1   0.0      1.0  1.000000    1.0 -1.813477  0.381104  1.000000  1.0    0.5   
2   1.0      1.0  0.666504    1.0 -0.897949  1.324219  1.000000  1.0    0.5   
3   0.0      0.0  0.000000    0.0  1.629883  2.