In [None]:
import pandas as pd
import numpy as np
import os
import mrmr

In [None]:
"""
read in the dataframe and name it 'df'. We will use this dataframe throughout the rest of the code.
this code prints out the unique values in each column of the dataframe
"""
# Load the raw dataset; `df` is the working frame used by the following cells.
df = pd.read_csv(os.path.join('data', 'adult.csv'), index_col=False)

# Print the distinct values found in every column.
for col in df.columns:
    print(f"Unique values in Columns '{col}'")
    print(f'{col}: {df[col].unique()}\n')

print(f'\ncolumns: {df.columns}')
print(f'\n head(2): {df.head(2)}')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be interpolated into an f-string (that printed "INFO: None" after
# the report). Print the label first, then let info() emit the summary.
print('\nINFO:')
df.info()


In [None]:
"""
Declare categorical and continuous features
"""
# Table with one row per feature: its name ('feature') and its declared 'type'.
featureTypes = pd.read_csv(os.path.join('data', 'featureTypes.csv'), index_col=False)


def _featuresOfType(*typeNames):
    """Return the 'feature' names whose 'type' entry is one of `typeNames`."""
    isWanted = featureTypes['type'].isin(typeNames)
    return featureTypes.loc[isWanted, 'feature'].tolist()


categoricalFeatures = _featuresOfType('categorical')
ordinalFeatures = _featuresOfType('ordinal')
# Discrete and continuous features are handled together as "numerical".
numericalFeatures = _featuresOfType('discrete', 'continuous')

print(f'categoricalFeatures: {categoricalFeatures}')
print(f'ordinalFeatures: {ordinalFeatures}')
print(f'numericalFeatures: {numericalFeatures}')


In [None]:
"""
Prepare categorical features
"""
# Keep only the rows in which none of the categorical features holds the '?'
# placeholder (one combined row mask instead of filtering column by column).
hasNoPlaceholder = df[categoricalFeatures].ne('?').all(axis=1)
df = df[hasNoPlaceholder]

In [None]:
"""
Clean up Continuous Features
"""

# Clean the numerical (discrete + continuous) features listed in
# `numericalFeatures`. The name `continuousFeatures` used here before is not
# defined in any visible cell and raised a NameError.
# NOTE(review): this also standardizes the 'discrete' features; confirm that
# is intended.

# Drop the rows whose value does not parse as a number AND store the parsed
# values, so every column really ends up with a numeric dtype (filtering alone
# leaves an object column as object).
for feature in numericalFeatures:
    print(f"Converting {feature} to numeric")
    parsedValues = pd.to_numeric(df[feature], errors='coerce')
    isNumeric = parsedValues.notna()
    # .copy() detaches the filtered frame so the column assignment below does
    # not write into a slice of the previous frame (SettingWithCopyWarning).
    df = df[isNumeric].copy()
    df[feature] = parsedValues[isNumeric]

# Z-score normalization. Done in a second pass so that mean and std are
# computed on the final set of rows, after all filtering above.
for feature in numericalFeatures:
    mean = df[feature].mean()
    std = df[feature].std()
    # A constant column has std == 0 and a frame with fewer than two rows has
    # std == NaN; dividing by either would fill the column with NaN/inf, so
    # only center the values in that case.
    scale = std if std > 0 else 1.0
    df[feature] = (df[feature] - mean) / scale

In [None]:

# Convert the data types to float16.
# Earlier cells build `df` by boolean filtering; copy it so the per-column
# assignments below act on an independent frame.
df = df.copy()

for feature in categoricalFeatures:
    column = df[feature]
    asNumbers = pd.to_numeric(column, errors='coerce')
    if asNumbers.notna().sum() == column.notna().sum():
        # Every non-missing entry already parses as a number: cast directly.
        df[feature] = asNumbers.astype('float16')
    else:
        # Text labels cannot be cast to float (astype raises ValueError), so
        # encode each distinct label as an integer code first. Missing values
        # get code -1 from pandas; map them back to NaN.
        # NOTE(review): the codes impose an arbitrary order on the labels;
        # confirm this is acceptable for the downstream feature ranking.
        codes = column.astype('category').cat.codes
        df[feature] = codes.where(codes >= 0).astype('float16')

# `continuousFeatures` is not defined in any visible cell; the numerical
# feature list built from featureTypes.csv is `numericalFeatures`.
# NOTE: float16 cannot represent magnitudes above 65504, so this relies on the
# numerical columns having been standardized beforehand.
df = df.astype({feature: 'float16' for feature in numericalFeatures})

In [None]:
""" Now with all features cleaned, we can perform mRMR on the data"""
from mrmr import mrmr_classif

# The label is selected by name, so the candidate features must be derived by
# name as well. Taking `df.columns[:-1]` silently assumed the label is the last
# column; if it is not, the label leaks into X and a real feature is dropped.
# NOTE(review): confirm that 'num' is the label column of the loaded dataset.
targetColumn = 'num'
# Index.drop raises KeyError when the label column is missing.
allFeatures = df.columns.drop(targetColumn)

# K equals the number of features, so every feature is ranked rather than
# only a top subset.
selectedFeatures = mrmr_classif(X=df[allFeatures], y=df[targetColumn], K=len(allFeatures))
print(f'ranked features according to mrmr: {selectedFeatures}')

In [None]:
""" Now lets save the data """
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() added a stray "None" line after the report.
df.info()
print("\n\n DF HEAD")
print(df.head(5))
# Write the cleaned frame without the index column.
df.to_csv('data_cleaned.csv', index=False)
