# DSTA-0000--bsr-design-sprint--munge-bsis-test-data

## Context

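Load four KC63 test datasets, perturb the numeric values of the BSO-, GPP- and UTLA-level tables, and write the results to CSV files split by table section (age, invited, high-risk).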

## Setup

In [None]:
%load_ext autoreload
%load_ext jupyter_black

In [None]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd

import _01_munge_bsis_test_data as utils

# Project helper from `_01_munge_bsis_test_data`; presumably configures logging
# handlers/levels for this notebook — confirm in that module.
utils.set_up_logging()
# Logger for this notebook, named after the running module (`__name__`).
logger = logging.getLogger(__name__)

In [None]:
# Input and output folders, relative to the notebook's working directory.
DATA_FOLDER__INPUTS = Path("data/inputs")
DATA_FOLDER__OUTPUTS = Path("data/outputs")
# dtype passed to `pd.read_csv`, so every column is loaded as a string.
DEFAULT_DATA_TYPE = str

# Identifier columns: these are excluded from perturbation, and the line-number
# column is the one used to split the tables into sections.
COL__KC63_OUTPUT_BY_RUN_ID = "kc63_output_by_run_id"
COL__LINE_NUMBER = "col_b"

# Line-number labels that make up each table section.
# Age section: "001" .. "014" (zero-padded to three digits) followed by "999".
LINE_NUMBERS__AGE = [f"{line_number:03d}" for line_number in range(1, 15)] + ["999"]
# NOTE(review): unlike the other sets these labels are not zero-padded —
# confirm they match the values found in the line-number column.
LINE_NUMBERS__INVITED = ["1", "2"]
LINE_NUMBERS__HIGH_RISK = ["001 (col 1)", "001 (col 2)"]

# Section name (also used in the output filenames) -> labels in that section.
LINE_NUMBERS_SETS = {
    "age": LINE_NUMBERS__AGE,
    "invited": LINE_NUMBERS__INVITED,
    "high-risk": LINE_NUMBERS__HIGH_RISK,
}

## Load KC63 test datasets

In [None]:
# Read the four input files in a fixed order; `dtype=DEFAULT_DATA_TYPE` applies
# the same dtype to every column of every file.
raw__bso_gpp, raw__bso, raw__gpp, raw__utla = (
    pd.read_csv(DATA_FOLDER__INPUTS / input_filename, dtype=DEFAULT_DATA_TYPE)
    for input_filename in (
        "bso_gpp_20231123.csv",
        "kc63_bso_20231123_FromBSS.csv",
        "kc63_gpp_20231123_FromBSS.csv",
        "kc63_utla_20231123_FromBSS.csv",
    )
)

## Munge

### Perturb numeric values

In [None]:
%%time

# `bso_gpp` is the raw `bso_gpp` input under a new name: this is an alias (same
# object, not a copy) and no perturbation is applied to it in this cell.
bso_gpp = raw__bso_gpp


def perturb_dataframe_numeric_values_where_possible(
    df: pd.DataFrame,
    *,
    apply_random_number_below_threshold=10,
    above_threshold_smear_factor=0.5,
) -> pd.DataFrame:
    """Return a copy of `df` with its value columns perturbed cell by cell.

    The identifier columns `COL__KC63_OUTPUT_BY_RUN_ID` and `COL__LINE_NUMBER`
    are left as they are; every other column is passed, element-wise, through
    `utils.perturb_numeric_values_where_possible`. The input frame itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to perturb. It must contain both identifier columns, otherwise
        `Index.drop` raises `KeyError`.
    apply_random_number_below_threshold, above_threshold_smear_factor
        Forwarded unchanged to `utils.perturb_numeric_values_where_possible`.
        The defaults (10 and 0.5) are the values previously hard-coded here.
        NOTE(review): their exact meaning is defined by that helper — confirm
        there. If the helper draws random numbers, results will differ between
        runs unless it is seeded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as `df`.
    """
    cols__perturbation = df.columns.drop([COL__KC63_OUTPUT_BY_RUN_ID, COL__LINE_NUMBER])
    df_out = df.copy()

    # Nothing besides the identifier columns: return the untouched copy.
    if cols__perturbation.empty:
        return df_out

    def _perturb(value):
        # Element-wise wrapper so `DataFrame.map` can forward the settings.
        return utils.perturb_numeric_values_where_possible(
            value,
            apply_random_number_below_threshold=apply_random_number_below_threshold,
            above_threshold_smear_factor=above_threshold_smear_factor,
        )

    df_out[cols__perturbation] = df_out[cols__perturbation].map(_perturb)
    return df_out


# Perturb the three raw KC63 tables in the order BSO, GPP, UTLA; each call
# returns a new frame, so the raw frames remain available for comparison.
bso, gpp, utla = (
    perturb_dataframe_numeric_values_where_possible(raw_table)
    for raw_table in (raw__bso, raw__gpp, raw__utla)
)

In [None]:
# Show the first rows of the raw and the perturbed BSO table, one after the
# other, for a quick visual comparison.
top_n = 3
display(raw__bso.head(top_n), bso.head(top_n))

In [None]:
# Cross-check each perturbed table against the raw table it was derived from.
# NOTE(review): the checks live in `utils.xcheck__perturbation_worked_as_expected`;
# presumably they raise or log on failure — confirm in that module.
utils.xcheck__perturbation_worked_as_expected(bso, raw__bso)
utils.xcheck__perturbation_worked_as_expected(gpp, raw__gpp)
utils.xcheck__perturbation_worked_as_expected(utla, raw__utla)

### Split out table sections

In [None]:
# `to_csv` does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
DATA_FOLDER__OUTPUTS.mkdir(parents=True, exist_ok=True)

# NOTE(review): `bso_gpp` is written exactly as loaded — it is not passed
# through the perturbation step. Confirm this is intended.
output_filename = "2023-11-23--bso-gpp.csv"
bso_gpp.to_csv(DATA_FOLDER__OUTPUTS / output_filename, index=False)

In [None]:
# Label -> perturbed table. An explicit mapping replaces setting a `.name`
# attribute on each DataFrame: attribute assignment on a frame writes to a
# column if one called "name" exists, and custom attributes are not carried
# through pandas operations.
perturbed_tables = {"bso": bso, "gpp": gpp, "utla": utla}

# `to_csv` does not create missing directories.
DATA_FOLDER__OUTPUTS.mkdir(parents=True, exist_ok=True)

# One CSV per (table, section): keep only the rows whose line number belongs
# to the section's set of labels.
for table_name, df in perturbed_tables.items():
    for line_number_set_name, line_number_set in LINE_NUMBERS_SETS.items():
        filter_ = df[COL__LINE_NUMBER].isin(line_number_set)
        if not filter_.any():
            # An empty section usually means the labels do not match the data;
            # the (header-only) file is still written, as before.
            logger.warning(
                "No rows matched section %r in table %r", line_number_set_name, table_name
            )
        df_out = df.loc[filter_]
        output_filename = f"2023-11-23--{table_name}-level--{line_number_set_name}--perturbed-values.csv"
        df_out.to_csv(DATA_FOLDER__OUTPUTS / output_filename, index=False)