## Imports

In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os

### Read data into DataFrames

In [75]:
# Column names handed to read_csv via `names=`, in file order.
# The last column, 'pred', holds the income label (e.g. '<=50K' / '>50K').
headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'pred',
]

In [106]:
# Load the training split; column names are supplied by `headers`.
original_train_df = pd.read_csv(
    "census-income.data.csv",
    names=headers,
    index_col=False,
    low_memory=False,
)

In [128]:
# Load the test split with the same column names as the training split.
original_test_df = pd.read_csv(
    "census-income.test.csv",
    names=headers,
    index_col=False,
    low_memory=False,
)

In [107]:
# Count NaN entries per column of the raw training frame.
original_train_df.isna().sum(axis=0)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
pred              0
dtype: int64

In [108]:
# isnull() is an alias of isna(), so this reports the same per-column NaN counts.
original_train_df.isnull().sum(axis=0)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
pred              0
dtype: int64

In [115]:
# Work on an independent (deep) copy so the raw training frame stays untouched.
train_df = original_train_df.copy(deep=True)

In [130]:
# Work on an independent (deep) copy so the raw test frame stays untouched.
# NOTE(review): the cleanup cells visible in this part of the notebook are only
# run on train_df - confirm test_df receives the same treatment later.
test_df = original_test_df.copy(deep=True)

In [116]:
# Preview the first 25 rows of the working training frame.
train_df.head(25)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,pred
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Cleanup:

### Remove whitespace

In [117]:
# Peek at one raw value - the output shows a leading space.
train_df['occupation'].iloc[0]

' Adm-clerical'

In [118]:
def data_to_str(df):
    """Strip leading/trailing whitespace from the text columns of ``df`` in place.

    Object-dtype columns are cleaned element by element: only values that are
    real ``str`` instances are stripped; everything else (NaN, None, numbers,
    ...) is kept exactly as it is.  Columns using pandas' dedicated string
    dtype are stripped through the ``.str`` accessor.  All other columns are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  It is modified in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        column = df[col]

        if pd.api.types.is_object_dtype(column):
            # A plain `.str.strip()` replaces non-string entries with NaN and
            # raises AttributeError when the column holds no strings at all,
            # so only genuine strings are stripped here.
            stripped = [
                value.strip() if isinstance(value, str) else value
                for value in column
            ]
            # dtype=object stops pandas from re-inferring a different dtype.
            df[col] = pd.Series(stripped, index=df.index, dtype=object)
        elif isinstance(column.dtype, pd.StringDtype):
            # Dedicated string columns contain only strings / missing values,
            # so the vectorised accessor is safe.
            df[col] = column.str.strip()

In [119]:
# Strip whitespace from train_df's text columns; the frame is modified in place.
data_to_str(train_df)

In [120]:
# Re-check the same value - the leading space should be gone now.
train_df['occupation'].iloc[0]

'Adm-clerical'

### Downcast integer columns to int32

In [121]:
# Integers are read in as int64 - not wrong, but wider than these values need.
train_df['age'].iloc[0]

np.int64(39)

In [122]:
def data_to_int32(df):
    """Downcast the integer columns of ``df`` to ``int32`` in place.

    Only columns with an integer dtype are converted.  Float and boolean
    columns are left alone (casting them would truncate fractions or change
    the column's type), as are text columns.  An integer column is also left
    unchanged when it contains missing values or when any value falls outside
    the int32 range, so that no data is altered by the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.  It is modified in place.

    Returns
    -------
    None
    """
    int32_range = np.iinfo(np.int32)

    for col in df.columns:
        column = df[col]

        # Only genuine integer columns qualify for the downcast.
        if not pd.api.types.is_integer_dtype(column):
            continue

        # Nullable integer columns with missing values cannot become plain int32.
        if column.isna().any():
            continue

        # A plain astype() can wrap out-of-range values instead of failing,
        # so columns that do not fit keep their wider dtype.
        if not column.empty and (
            column.min() < int32_range.min or column.max() > int32_range.max
        ):
            continue

        df[col] = column.astype('int32')

In [123]:
# Downcast train_df's numeric columns to int32; the frame is modified in place.
data_to_int32(train_df)

In [124]:
# Re-check the same value - it should now be an int32 scalar.
train_df['age'].iloc[0]

np.int32(39)

In [125]:
# List every column's dtype to confirm the numeric columns were converted.
train_df.dtypes

age                int32
workclass         object
fnlwgt             int32
education         object
education-num      int32
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int32
capital-loss       int32
hours-per-week     int32
native-country    object
pred              object
dtype: object

In [126]:
# Count NaN entries per column after the cleanup steps.
# NOTE(review): literal '?' strings are not NaN, so they are not counted here;
# the occupation counts further down show 1843 of them - presumably
# missing-value placeholders, confirm and handle explicitly.
train_df.isna().sum(axis=0)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
pred              0
dtype: int64

In [127]:
# isnull() is an alias of isna(), so this reports the same per-column NaN counts.
train_df.isnull().sum(axis=0)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
pred              0
dtype: int64

In [28]:
# Row counts for every (race, sex) combination, most frequent first.
train_df.value_counts(subset=['race', 'sex'])

race                sex   
White               Male      19174
                    Female     8642
Black               Male       1569
                    Female     1555
Asian-Pac-Islander  Male        693
                    Female      346
Amer-Indian-Eskimo  Male        192
Other               Male        162
Amer-Indian-Eskimo  Female      119
Other               Female      109
Name: count, dtype: int64

In [29]:
# Row counts per occupation, most frequent first.
# NOTE(review): the '?' category in the output presumably marks missing
# occupations - confirm and decide how to treat those rows.
train_df.value_counts(subset=['occupation'])

occupation       
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64