In [1]:
import warnings
# Silences every warning category for the rest of the session, which includes
# pandas deprecation and chained-assignment warnings.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import preprocessing

In [2]:
# Quartile cut points (not referenced by any cell visible in this notebook).
quantiles = [0, 0.25, 0.5, 0.75, 1.0]

# Bin edges handed to pd.cut for the age column.
ageBins = [0, 20, 30, 40, 50, np.inf]

# Bin edges handed to pd.cut for the hours-per-week column.
hrsperhrbin = [0, 40, 50, 60, np.inf]

In [3]:
# Columns (and their order) selected when the processed frame is exported.
columns = [
    'index', 'train', 'target',
    'agegroup', 'fnlwgtgroup',
    'workclass', 'education', 'maritalstatus', 'occupation',
    'relationship', 'race', 'sex',
    'cptgaingroup', 'cptlossgroup',
    'nativecountry',
]

# Category labels for the binned columns; each list has one label per bin.
finalWeightLabels = [f'fnlwgt{i:02d}' for i in range(10)]
agelabels = ['ageadultunk'] + [f'ageadult{i:02d}' for i in range(1, 5)]
capitalGainLabels = [f'cptgain{i:02d}' for i in range(4)]
capitalLossLabels = [f'cptloss{i:02d}' for i in range(4)]
hoursWeekLabels = ['hrsmissing', 'parttime', 'fulltime', 'overtime']

In [4]:
# Load the combined dataset and discard the 'educationnum' column
# (presumably redundant with 'education' — TODO confirm).
data = (
    pd.read_csv('data/adult_alldata.csv', low_memory=False)
    .drop(columns=['educationnum'])
)

In [5]:
# Normalise every string cell (lower-case, then trim surrounding whitespace) and
# leave non-string values untouched. Series.map is applied column by column
# instead of DataFrame.applymap, which is deprecated since pandas 2.1.
data = data.apply(
    lambda column: column.map(lambda x: x.lower().strip() if isinstance(x, str) else x)
)

In [6]:
# Preview the first five rows (DataFrame.head default).
data.head()

Unnamed: 0,index,train,target,age,workclass,finalweight,education,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry
0,1,1,0,39,state-gov,77516,bachelors,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states
1,2,1,0,50,self-emp-not-inc,83311,bachelors,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states
2,3,1,0,38,private,215646,hs-grad,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states
3,4,1,0,53,private,234721,11th,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states
4,5,1,0,28,private,338409,bachelors,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba


In [7]:
# Age
# Bucket ages using the edges in ageBins; pd.cut intervals are right-closed by
# default and values outside the edges become NaN. The raw column is then removed.
data['agegroup'] = pd.cut(data['age'], bins=ageBins, labels=agelabels)
del data['age']

In [8]:
## Work Class
# Recode the '?' marker to 'wcmissing'. A single .loc assignment is used instead
# of chained indexing (data.workclass[mask] = ...), which pandas warns about and
# which does not update the frame when Copy-on-Write is enabled.
data.loc[data['workclass'] == '?', 'workclass'] = 'wcmissing'

In [9]:
## Final Weight
# Split finalweight into ten quantile-based bins (q=10) labelled by
# finalWeightLabels, then remove the raw column.
data['fnlwgtgroup'] = pd.qcut(data['finalweight'], q=10, labels=finalWeightLabels)
del data['finalweight']

In [10]:
# Recode the '?' marker to 'occpmissing'. A single .loc assignment is used
# instead of chained indexing (data.occupation[mask] = ...), which pandas warns
# about and which does not update the frame when Copy-on-Write is enabled.
data.loc[data['occupation'] == '?', 'occupation'] = 'occpmissing'

In [11]:
# Capital gain
# An integer bins argument makes pd.cut build that many equal-width intervals
# over the column's observed range; the raw column is then removed.
data['cptgaingroup'] = pd.cut(data['capitalgain'], bins=4, labels=capitalGainLabels)
del data['capitalgain']

In [12]:
# Capital loss
# An integer bins argument makes pd.cut build that many equal-width intervals
# over the column's observed range; the raw column is then removed.
data['cptlossgroup'] = pd.cut(data['capitalloss'], bins=4, labels=capitalLossLabels)
del data['capitalloss']

In [13]:
# Hours per week
# NOTE(review): hrsperhrbin yields four intervals, (0, 40], (40, 50], (50, 60] and
# (60, inf], and the labels are applied in order, so 'hrsmissing' is assigned to
# (0, 40] — the preview further down shows 40h and 13h rows labelled
# 'hrsmissing'. Confirm the intended bins/labels.
data['hrsgroup'] = pd.cut(data['hoursperweek'], bins=hrsperhrbin, labels=hoursWeekLabels)
del data['hoursperweek']

In [14]:
# Recode the '?' marker to 'countrymissing'. A single .loc assignment is used
# instead of chained indexing (data.nativecountry[mask] = ...), which pandas
# warns about and which does not update the frame when Copy-on-Write is enabled.
data.loc[data['nativecountry'] == '?', 'nativecountry'] = 'countrymissing'

In [15]:
# Preview the first five rows after binning: the raw numeric columns are gone
# and the *group columns appear on the right.
data.head()

Unnamed: 0,index,train,target,workclass,education,maritalstatus,occupation,relationship,race,sex,nativecountry,agegroup,fnlwgtgroup,cptgaingroup,cptlossgroup,hrsgroup
0,1,1,0,state-gov,bachelors,never-married,adm-clerical,not-in-family,white,male,united-states,ageadult02,fnlwgt01,cptgain00,cptloss00,hrsmissing
1,2,1,0,self-emp-not-inc,bachelors,married-civ-spouse,exec-managerial,husband,white,male,united-states,ageadult03,fnlwgt01,cptgain00,cptloss00,hrsmissing
2,3,1,0,private,hs-grad,divorced,handlers-cleaners,not-in-family,white,male,united-states,ageadult02,fnlwgt06,cptgain00,cptloss00,hrsmissing
3,4,1,0,private,11th,married-civ-spouse,handlers-cleaners,husband,black,male,united-states,ageadult04,fnlwgt07,cptgain00,cptloss00,hrsmissing
4,5,1,0,private,bachelors,married-civ-spouse,prof-specialty,wife,black,female,cuba,ageadult01,fnlwgt09,cptgain00,cptloss00,hrsmissing


In [16]:
# (rows, columns) of the transformed frame.
data.shape

(48842, 16)

In [17]:
# Column count minus 3 — presumably the non-feature columns
# (index, train, target); TODO confirm.
len(data.columns) - 3

13

In [18]:
# Export only the columns named in `columns`, in that order, without the row index.
# NOTE(review): 'hrsgroup' is created earlier in the notebook but is not listed in
# `columns`, so it is not written — confirm that is intended.
data.loc[:, columns].to_csv('data/train_data_cat.csv', index=False)