# 10-process-data
> Importing, cleaning, testing, and saving data

This series of notebooks documents the steps used to scrape, prepare, validate, and save the data.

#### Partition the dataset by category and subcategory

In [1]:
import sys
from pathlib import Path
import dask.dataframe as dd 
import dask as dk
import pandas as pd 
import pyarrow
import re

sys.path.append(str(Path().resolve().parent))

from src.constants import raw_data_dir, processed_data_dir, raw_data_name

In [3]:
def read_sheet(sheet: int):
    """Build a lazy task that loads one sheet of the raw Excel workbook.

    Nothing is read when this function is called; the workbook is only
    opened once the returned object is computed.

    Parameters
    ----------
    sheet : int
        Passed to ``pd.read_excel`` as ``sheet_name``. An integer there is a
        zero-based sheet position.

    Returns
    -------
    dask.delayed.Delayed
        Deferred ``pd.read_excel`` call on ``raw_data_dir / raw_data_name``.
    """
    workbook_path = raw_data_dir / raw_data_name
    lazy_read_excel = dk.delayed(pd.read_excel)
    return lazy_read_excel(workbook_path, sheet_name=sheet)

In [6]:
# Both handles are lazy dask Delayed objects produced by read_sheet; the sheet
# arguments are zero-based positions in the workbook.
ddf_factors, ddf_categories = read_sheet(1), read_sheet(2)

# Reshape the factor sheet from wide to long: 'Date' stays as the identifier,
# every other column name moves into 'Variable' and its cell into 'value'.
ddf_factors_long = ddf_factors.melt(
    id_vars=["Date"],
    var_name="Variable",
    value_name="value",
)

In [14]:
# Any run of spaces, ampersands, or slashes in a category name collapses to a
# single hyphen in the output file name. Replacing only the first separator
# would leave later spaces in the name and let a later "/" act as a path
# separator.
p = re.compile(r"[ &/]+")

# Long-format factors joined to their category metadata. Still lazy; nothing
# else in this cell uses it.
ddf_all = ddf_factors_long.merge(ddf_categories, on = 'Variable', how = 'left')

# to_parquet does not create missing parent directories.
# NOTE(review): the "Group summary" cells read from processed_data_dir / "split" / ...,
# while this loop writes directly into processed_data_dir — confirm the intended location.
processed_data_dir.mkdir(parents=True, exist_ok=True)

ddfs = []
# The category labels are computed eagerly so the loop can iterate over concrete values.
categories = ddf_categories["Category"].unique().compute()
for c in categories:
    # Column names (factors) that belong to this category.
    factors = ddf_categories.loc[ddf_categories["Category"] == c, "Variable"].tolist()
    ddf = ddf_factors[factors]

    # e.g. "Money & Credit" -> "money-credit"; kept in its own name so the
    # loop variable is not overwritten.
    slug = p.sub("-", c).strip("-").lower()
    ddfs.append(ddf.to_parquet(processed_data_dir / f"{slug}.parquet"))

# Run every queued write in a single dask graph.
dk.compute(*ddfs)

#### Group summary

In [121]:
import itertools as it

# Parquet partitions are looked up under <processed_data_dir>/split/<group>/.
category_split_dir = Path(processed_data_dir) / "split" / "category"
subcategory_split_dir = Path(processed_data_dir) / "split" / "subcategory"

# Materialise the matches into a sorted list. A bare itertools.chain is a
# one-shot iterator, so re-running a cell that loops over it would silently
# see no files; sorting also makes the processing order deterministic.
files = sorted(it.chain(category_split_dir.glob("*.parquet"),
                        subcategory_split_dir.glob("*.parquet")))

In [122]:
summary_dir = processed_data_dir / "summary"
out = []

@dk.delayed
def summarize_category(file: Path):
    """Write per-column summary statistics for one Parquet partition to CSV.

    Decorated with ``dk.delayed``, so calling it only builds a lazy task; the
    work happens when that task is computed.

    Parameters
    ----------
    file : pathlib.Path
        Parquet file to summarise. A plain string path is also accepted and
        converted to ``Path``.

    Returns
    -------
    None
        The result of ``DataFrame.to_csv`` when given a path. The summary is
        written to ``summary_dir / <parent directory name> / <file stem>.csv``.
    """
    # The body relies on Path attributes (.parents, .stem), so coerce first.
    file = Path(file)

    aggs = pd.read_parquet(file).agg(
        ["min", "max", "mean", "median", "std", "skew", "kurt", lambda col: col.isnull().mean()])

    # Relabel the rows in the same order as the aggregations above. The last
    # row is the fraction of missing values (0-1), despite the "pct_" label.
    aggs.index = ["min", "max", "mean", "median", "std", "skewness", "kurtosis", "pct_missing"]

    group = file.parents[0].name
    c = file.stem

    # Join by components so an empty group name cannot produce an absolute path.
    target = summary_dir / group / f"{c}.csv"
    # to_csv does not create missing directories; exist_ok tolerates parallel tasks.
    target.parent.mkdir(parents=True, exist_ok=True)
    return aggs.to_csv(target)


In [123]:
# Rebuild the task list from scratch so re-running this cell does not queue
# duplicate writes on top of tasks appended by an earlier run.
out = [summarize_category(f) for f in files]

In [125]:
# Execute every queued summary task. Each task returns the result of
# DataFrame.to_csv(path), so the displayed value is a list of None.
dk.compute(out)

([None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],)