In [None]:
import pandas as pd
import numpy as np
import mrmr

In [None]:
# Load the raw dataset and show a quick overview of its structure.
df = pd.read_csv('data.csv')
print(f'\ncolumns: {df.columns}')
print(f'\n head(2): {df.head(2)}')
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; embedding it in an f-string printed "INFO: None".
print('\nINFO:')
df.info()

In [None]:
"""
Declare categorical and continuous features
"""
# Columns handled as categorical (label-encoded) in the later cells.
categoricalFeatures = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

# Columns handled as continuous (z-score normalized) in the later cells.
continuousFeatures = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]


In [None]:
"""
Prepare categorical features
"""
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Remove rows that carry the '?' marker in any categorical feature.
# One combined mask replaces the per-column filtering loop, and the explicit
# .copy() makes the result an independent frame so the column assignments
# below do not trigger SettingWithCopyWarning on a filtered slice.
hasMissingMarker = (df[categoricalFeatures] == '?').any(axis=1)
df = df[~hasMissingMarker].copy()

# Use the label encoder to convert categorical features to numerical.
# fit_transform() refits the encoder for each column, so a single instance
# can be reused across columns.
label_encoder = LabelEncoder()
min_max_scaler = MinMaxScaler()
for col in categoricalFeatures:
    df[col] = label_encoder.fit_transform(df[col])

# Normalize the encoded values to a range of 0 to 1
df[categoricalFeatures] = min_max_scaler.fit_transform(df[categoricalFeatures])

In [None]:
"""
Clean up Continuous Features
"""

# Coerce every continuous column to a numeric dtype. Filtering rows alone
# leaves the column dtype untouched, so a column that pandas parsed as object
# would still hold strings and break the z-score arithmetic below.
# Entries that cannot be parsed become NaN.
df = df.copy()  # work on an independent frame before assigning columns
for feature in continuousFeatures:
    print(f"Converting {feature} to numeric")
    df[feature] = pd.to_numeric(df[feature], errors='coerce')

# remove rows that are not numeric in at least one continuous feature
df = df.dropna(subset=continuousFeatures).copy()

# Z-score normalization (Series.std() uses the sample standard deviation)
for feature in continuousFeatures:
    mean = df[feature].mean()
    std = df[feature].std()
    df[feature] = (df[feature] - mean) / std

In [None]:

# Downcast both feature groups to half-precision floats, one group at a time.
for featureGroup in (categoricalFeatures, continuousFeatures):
    df[featureGroup] = df[featureGroup].astype('float16')

In [None]:
""" Now with all features cleaned, we can perform mRMR on the data"""
from mrmr import mrmr_classif

# Every column except the target 'num' is a candidate feature. Excluding the
# target by name (instead of "all but the last column") keeps it out of X
# even when 'num' is not the final column of the frame.
allFeatures = df.columns.drop('num')

# Rank all candidate features; K equals the number of candidates so the
# returned list covers every one of them.
selectedFeatures = mrmr_classif(X=df[allFeatures], y=df['num'], K=len(allFeatures))
print(f'ranked features according to mrmr: {selectedFeatures}')

# Take the top 9 features and keep the target column alongside them
selectedFeatures = selectedFeatures[:9] + ['num']
df = df[selectedFeatures]


In [None]:
""" Now lets save the data """
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() emitted an extra "None" line after the report.
df.info()
print("\n\n DF HEAD")
print(df.head(5))

# Persist the selected features plus target without the index column.
df.to_csv('data_cleaned.csv', index=False)
