% Load Libraries

In [None]:
import pandas
from pathlib import Path

% Setup

In [None]:
# Placeholders: replace both values before running the notebook.
user = "<user-name>"
data_name = "<data-name>"

# Project root inside the user's JupyterHub home directory.
root_path = Path("/home") / f"jupyter-{user}" / "bootcamp"

% Read Data

In [None]:
# Load the raw CSV for the selected data set from <root_path>/data/raw/.
raw_data = pandas.read_csv(
    filepath_or_buffer=root_path.joinpath(f"data/raw/{data_name}.csv"),
)

% Clean Data

In [None]:
# Work on an independent copy so that in-place cleaning steps applied to
# clean_data never modify raw_data (plain assignment would only alias it).
clean_data = raw_data.copy()

## Format Column Names
Make sure column names are consistently formatted e.g., `snake_case`, `PascalCase`.

```
# Rename on clean_data, the frame that is written out at the end of the
# notebook; reassigning raw_data would leave clean_data unchanged.
# Replace the placeholder keys/values with real old/new column names.
clean_data = clean_data.rename(
    columns={
        'column1': 'name1',
        'column2': 'name2',
    }
)
```

## Format Column Types
Make sure column types are appropriately formatted e.g., `float`, `category`, `datetime`.

```
# Cast on clean_data, the frame that is written out at the end of the
# notebook; reassigning raw_data would leave clean_data unchanged.
# Replace the placeholders with real column names and target dtypes.
clean_data = clean_data.astype(
    {
        'column1': 'type1',
        'column2': 'type2',
    }
)
```

## Format Missing Values
Make sure missing values are correctly formatted, e.g., NAs that should be zero.

```
# Fill on clean_data, the frame that is written out at the end of the
# notebook; reassigning raw_data would leave clean_data unchanged.
# Replace the placeholders with real column names and fill values.
clean_data = clean_data.fillna(
    {
        'column1': 'value1',
        'column2': 'value2',
    }
)
```

## Test for Uniqueness
Test for unique rows and correct for or remove **duplicates**.

```
# duplicated() flags every repeat of an earlier row, so the sum of the
# boolean mask is the number of rows drop_duplicates() will remove.
n_duplicates = clean_data.duplicated().sum()
print(f"There are {n_duplicates} duplicates.")
# Deduplicate clean_data (the frame that is written out), not raw_data.
clean_data = clean_data.drop_duplicates()
```

## Test for Validity
Test for valid values and correct for or remove **outliers**.

```
# Boolean mask of rows whose value exceeds the threshold.
# Replace 'column' and 'threshold' with a real column name and limit.
is_outlier = clean_data['column'] > 'threshold'
# Count rows by summing the mask; calling .sum() on the filtered frame
# would return per-column totals instead of the number of outliers.
n_outliers = is_outlier.sum()
print(f"There are {n_outliers} outliers.")
# Drop the flagged rows from clean_data (the frame that is written out).
clean_data = clean_data.drop(clean_data[is_outlier].index)
```

## Test for Consistency
Test for consistent columns and correct for or remove **errors**.

```
# Boolean mask of rows that violate the consistency rule between columns.
# Replace the placeholder columns and comparison with the real rule.
is_error = clean_data['column1'] == clean_data['column2']
# Count rows by summing the mask; calling .sum() on the filtered frame
# would return per-column totals instead of the number of errors.
n_errors = is_error.sum()
print(f"There are {n_errors} errors.")
# Drop the flagged rows from clean_data (the frame that is written out).
clean_data = clean_data.drop(clean_data[is_error].index)
```

% Write Data

In [None]:
# Persist the cleaned frame as a pickle under <root_path>/data/clean/.
clean_data.to_pickle(
    path=root_path.joinpath(f"data/clean/{data_name}.pkl"),
)