In [31]:
import pandas as pd
import numpy as np
import os
import mrmr

In [32]:
"""
read in the dataframe and name it 'df'. We will use this dataframe throughout the rest of the code.
this code prints out the unique values in each column of the dataframe
"""
df = pd.read_csv(os.path.join('data', 'adult.csv'), index_col=False)
for col in df.columns:
    print(f"Unique values in Columns '{col}'")
    print(f'{col}: {df[col].unique()}\n')

# remove spaces in every entry in the dataframe
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df.columns = df.columns.str.strip()

# Convert <=50k to 0 and >50k to 1
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

print(f'\ncolumns: {df.columns}')
print(f'\n head(2): {df.head(2)}')
print(f'\nINFO: {df.info()}')


Unique values in Columns 'age'
age: [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]

Unique values in Columns ' workclass'
 workclass: [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']

Unique values in Columns ' fnlwgt'
 fnlwgt: [ 77516  83311 215646 ...  34066  84661 257302]

Unique values in Columns ' education'
 education: [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']

Unique values in Columns ' education-num'
 education-num: [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]

Unique values in Columns ' marital-status'
 marital-status: [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Marrie

In [33]:
"""
Declare categorical and continuous features
"""
featureTypes = pd.read_csv(os.path.join('data', 'featureTypes.csv'), index_col=False)
categoricalFeatures = featureTypes.query("type == 'categorical'")['feature'].tolist()
ordinalFeatures = featureTypes.query("type == 'ordinal'")['feature'].tolist()
numericalFeatures = featureTypes.query("type in ['discrete', 'continuous']")['feature'].tolist()

print(f'categoricalFeatures: {categoricalFeatures}')
print(f'ordinalFeatures: {ordinalFeatures}')
print(f'numericalFeatures: {numericalFeatures}')


categoricalFeatures: ['workclass', 'education', 'marital-status', 'occupation', 'race', 'sex', 'native country']
ordinalFeatures: ['age']
numericalFeatures: ['fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [34]:
"""
Prepare categorical features
"""
# Remove '?' from the categorical features
for feature in categoricalFeatures:
    df = df[df[feature] != '?']
    # print(df[feature].value_counts())

# one-hot encode the categorical features
df = pd.get_dummies(df, columns=categoricalFeatures, drop_first=True)

In [35]:
"""
Clean up Continuous Features
"""

# remove rows that are not numeric
for feature in numericalFeatures:
    print(f"Converting {feature} to numeric")
    df = df[pd.to_numeric(df[feature], errors='coerce').notna()]

# Z-score normalization
for feature in numericalFeatures:
    mean = df[feature].mean()
    std = df[feature].std()
    df[feature] = (df[feature] - mean) / std

Converting fnlwgt to numeric
Converting education-num to numeric
Converting capital-gain to numeric
Converting capital-loss to numeric
Converting hours-per-week to numeric


In [36]:

# Convert the data types to float16
df[ordinalFeatures] = df[ordinalFeatures].astype('float16')
df[numericalFeatures] = df[numericalFeatures].astype('float16')

In [37]:
""" Now with all features cleaned, we can perform mRMR on the data"""
from mrmr import mrmr_classif

allFeatures = df.columns[:-1]
selectedFeatures = mrmr_classif(X=df[allFeatures], y=df['income'], K=len(allFeatures))
print(f'ranked features according to mrmr: {selectedFeatures}')

KeyError: ''

In [None]:
""" Now lets save the data """
print(df.info())
print(f"\n\n DF HEAD")
print(df.head(5))
df.to_csv('data_cleaned.csv', index=False)
