## Load Libraries

In [2]:
import pandas
from pathlib import Path
import numpy as np

## Setup

In [115]:
# Notebook configuration: account name and dataset stem used to build paths.
user = "user-name"
data_name = "income"
# Project root: the `bootcamp` folder inside /home/jupyter-<user>.
root_path = Path("/home") / f"jupyter-{user}" / "bootcamp"

## Read Data

In [107]:
# Load the raw CSV for the configured dataset from <root>/data/raw/.
raw_csv_path = root_path / "data" / "raw" / f"{data_name}.csv"
raw_data = pandas.read_csv(raw_csv_path)

## Clean Data

### Format Column Names

In [108]:
# Map the raw headers to snake_case names: hyphens become underscores and
# two terse headers (fnlwgt, educational-num) get spelled-out replacements.
column_renames = {
    "fnlwgt": "final_weight",
    "educational-num": "education_code",
    "marital-status": "marital_status",
    "capital-gain": "capital_gain",
    "capital-loss": "capital_loss",
    "hours-per-week": "hours_per_week",
    "native-country": "native_country",
}
raw_data = raw_data.rename(columns=column_renames)

### Format Missing Values

In [109]:
# Cells holding a literal '?' are treated as missing and converted to NaN so
# pandas recognises them as nulls.
# `np.nan` is used rather than the `np.NaN` alias: the alias was removed in
# NumPy 2.0 and raises AttributeError there, while `np.nan` works everywhere.
raw_data = raw_data.replace({
    '?': np.nan,
})

### Format Column Types

In [110]:
# Cast the listed numeric columns to float.
float_columns = [
    'age',
    'final_weight',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]
raw_data = raw_data.astype({column: 'float' for column in float_columns})

### Test for Uniqueness

In [111]:
# Flag rows that exactly repeat an earlier row, report how many there are,
# then keep only the first occurrence of each.
duplicate_mask = raw_data.duplicated()
n_duplicates = duplicate_mask.sum()
print(f'There are {n_duplicates} duplicates.')
raw_data = raw_data.drop_duplicates()

There are 52 duplicates.


### Test for Validity

In [112]:
# Any age above 120 is treated as invalid: report the count and drop those rows.
age_too_high = raw_data['age'] > 120
n_age_outliers = age_too_high.sum()
print(f'There are {n_age_outliers} age outliers.')
raw_data = raw_data.drop(index=raw_data.loc[age_too_high].index)

There are 0 age outliers.


### Test for Consistency

In [113]:
# `education` (label) and `education_code` (numeric code) should describe the
# same thing, so every label must pair with exactly one code and vice versa.
# Comparing the two frequency tables position by position is not enough: it
# misses mismatches that leave the counts unchanged, and it breaks when the
# two columns have a different number of distinct values. Check the pairing
# itself instead.
codes_per_label = raw_data.groupby('education')['education_code'].nunique()
labels_per_code = raw_data.groupby('education_code')['education'].nunique()
# An error is a label tied to several codes, or a code tied to several labels.
n_education_errors = int((codes_per_label > 1).sum() + (labels_per_code > 1).sum())
print(f"There are {n_education_errors} education errors.")

There are 0 education errors.


## Write Data

In [114]:
# Persist the cleaned frame as a pickle under <root>/data/clean/.
clean_path = root_path / f"data/clean/{data_name}.pkl"
# Create the output folder first; without it the write fails when the
# directory does not exist yet.
clean_path.parent.mkdir(parents=True, exist_ok=True)
raw_data.to_pickle(clean_path)