## Data Preprocessing Class

This notebook shows an example of how you might organize your data preparation into a data reader class. It uses some of the techniques from the previous notebooks.

This class only serves as an illustrative example; many improvements can still be made. Adding logging would be one; moving generic functionality into a base class would be another.

In [None]:
import datetime as dt

import numpy as np
import pandas as pd


class SalesReader:
    """Load a sales CSV file and add engineered feature columns.

    Use ``read(path)`` to load the file, verify the required columns and
    return a DataFrame extended with birthdate and sales features.
    """

    # Columns that must be present in the loaded data. The list is empty, so
    # ``_check_required`` currently accepts any input.
    # NOTE(review): the feature methods read "birthdate", "order_amount" and
    # "orders" -- consider listing them here so missing columns fail early.
    _required = []

    # Keyword arguments forwarded to ``pd.read_csv``. ``parse_dates`` selects
    # the column at position 3 (zero-based); presumably that is "birthdate",
    # which ``_birthday_features`` needs as a datetime column -- TODO confirm.
    _csv_params = {
        "sep": ",",
        "parse_dates": [3],
    }

    def _load_csv(self, path):
        """Load the CSV at ``path`` using the class-level ``_csv_params``."""

        return pd.read_csv(path, **self._csv_params)

    def _check_required(self, df: pd.DataFrame) -> None:
        """Check that all required columns are present.

        Raises:
            RuntimeError: If any column listed in ``_required`` is missing
                from ``df``; the message lists the missing columns.
        """

        missing = set(self._required) - set(df.columns)
        if missing:
            # Concatenate explicitly: two adjacent string literals would be
            # merged first and the whole prefix used as the join separator.
            # Sorting keeps the message deterministic (sets are unordered).
            raise RuntimeError(
                "Missing columns in the data: "
                + ", ".join(sorted(map(str, missing)))
            )

    def _birthday_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Engineer birthdate related features.

        Returns a copy of ``df`` with two extra columns:

        * ``age`` -- number of days between today and ``birthdate``
          (note: days, not years).
        * ``birth_month`` -- month number of ``birthdate``.
        """

        today = pd.to_datetime(dt.date.today())
        return df.assign(
            age=(today - df["birthdate"]).dt.days,
            birth_month=df["birthdate"].dt.month,
        )

    def _sales_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Engineer sales related features.

        Returns a copy of ``df`` with an ``avg_order_amount`` column, computed
        as ``order_amount`` divided by ``orders``.
        """

        return df.assign(
            avg_order_amount=df["order_amount"] / df["orders"],
        )

    def read(self, path) -> pd.DataFrame:
        """Load and preprocess sales data.

        Loads the CSV at ``path``, checks the required columns and returns
        the data with the birthdate and sales features added.

        Raises:
            RuntimeError: If a required column is missing from the file.
        """

        # Load and check
        df = self._load_csv(path)
        self._check_required(df)

        # Pre-processing
        return (
            df
            .pipe(self._birthday_features)
            .pipe(self._sales_features)
        )


In [None]:
# Create a reader instance; SalesReader defines no __init__, so no arguments are needed.
sr = SalesReader()

In [None]:
# Load "sales_data.csv", check the required columns and return the DataFrame
# with the engineered birthdate and sales feature columns added.
sr.read("sales_data.csv")