In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from kmodes.kmodes import KModes

# Data preparation

In [24]:
# Load the Adult dataset (UCI) from the local CSV copy and preview it.
ADULT_CSV_PATH = '../data/adult.csv'
data = pd.read_csv(ADULT_CSV_PATH)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [25]:
# Remove the columns that are not kept for KModes clustering:
# the continuous-valued ones, and 'country' as well.
columns_to_drop = ['fnlwgt', 'education-num', 'capital-gain', 'country', 'capital-loss']
data = data.drop(columns=columns_to_drop)

In [26]:
# 'age' and 'hours-per-week' are still numeric; they get bucketed into a few
# ranges below so that they become categorical columns.
data.loc[:, ['age', 'hours-per-week']].head(3)

Unnamed: 0,age,hours-per-week
0,39,40
1,50,13
2,38,40


In [27]:
def get_age_group(value):
    """Bucket a numeric age into a coarse categorical label.

    Returns one of:
      'Unknown'  - value equals 0, or no comparison holds (e.g. NaN)
      '<20'      - value below 20
      '20 - 40'  - 20 up to and including 40
      '40 - 60'  - above 40 up to and including 60
      '60+'      - above 60
    """
    # An age of exactly zero is reported as unknown.
    if value == 0:
        return 'Unknown'
    if value < 20:
        return '<20'
    # Bounds are tested in ascending order, so exactly 40 lands in '20 - 40'.
    for upper_bound, label in ((40, '20 - 40'), (60, '40 - 60')):
        if value <= upper_bound:
            return label
    # Falls through to 'Unknown' only when every comparison above was False.
    return '60+' if value > 60 else 'Unknown'

In [41]:
def get_hpw_group(value):
    """Bucket an hours-per-week figure into a coarse categorical label.

    Returns one of:
      '<20'      - value below 20
      '20 - 40'  - 20 up to and including 40
      '40 - 60'  - above 40 up to and including 60
      '60+'      - above 60
      'Unknown'  - no comparison holds (e.g. NaN)
    """
    if value < 20:
        return '<20'
    # Bounds are tested in ascending order, so exactly 40 lands in '20 - 40'.
    for upper_bound, label in ((40, '20 - 40'), (60, '40 - 60')):
        if value <= upper_bound:
            return label
    # Falls through to 'Unknown' only when every comparison above was False.
    return '60+' if value > 60 else 'Unknown'

In [33]:
# Swap the numeric 'age' column for its categorical bucket from get_age_group.
data = (
    data
    .assign(age_group=data['age'].apply(get_age_group))
    .drop(columns=['age'])
)

In [47]:
# Swap the numeric 'hours-per-week' column for its categorical bucket from get_hpw_group.
data = (
    data
    .assign(hpw_group=data['hours-per-week'].apply(get_hpw_group))
    .drop(columns=['hours-per-week'])
)

In [48]:
# Show the prepared frame after 'age' and 'hours-per-week' were replaced by their bucketed columns.
data

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,salary,age_group,hpw_group
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,<=50K,20 - 40,20 - 40
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,<=50K,40 - 60,<20
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,<=50K,20 - 40,20 - 40
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,<=50K,40 - 60,20 - 40
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,<=50K,20 - 40,20 - 40
...,...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,<=50K,20 - 40,20 - 40
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,>50K,20 - 40,20 - 40
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,<=50K,40 - 60,20 - 40
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,<=50K,20 - 40,20 - 40
