In [1]:
import os
# Switch the process working directory so that the relative 'data/...' paths
# built later in this notebook resolve under /home/jovyan/work.
# NOTE(review): this is a hard-coded absolute path; os.chdir raises if the
# directory does not exist, so the notebook only runs where it is present.
os.chdir('/home/jovyan/work')

In [2]:
import pandas as pd
import polars as pl

In [3]:
import logging

# Logger for this notebook's own messages; created however the code is run.
log = logging.getLogger(__name__)

# Root logging is only configured when this code executes as the main module.
if __name__ == "__main__":
  # force=True drops any handlers already attached to the root logger, so
  # re-running this cell applies the configuration again.
  logging.basicConfig(
    format='%(asctime)s %(levelname)s %(name)s: %(message)s',
    level=logging.INFO,
    force=True,
  )


In [4]:
from pathlib import Path

class Converter:
  """Base class for converting one CSV dataset under ``data/`` to Parquet.

  Subclasses provide the dataset stem through ``get_stem`` and the loading
  logic through ``get_data``. This class derives the input and output paths
  from the stem and runs the read/write sequence in ``convert_to_parquet``.
  """

  def __init__(self):
    # One logger per concrete class, named "<module>.<ClassName>".
    logger_name = '.'.join((__name__, type(self).__name__))
    self.log = logging.getLogger(logger_name)

  def get_stem(self):
    """Return the dataset file name without directory or extension.

    Must be overridden; the base implementation always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError

  def get_data(self, source_path: Path):
    """Load and return the dataset stored at ``source_path``.

    Must be overridden; the base implementation always raises
    ``NotImplementedError``. ``convert_to_parquet`` calls ``len()``,
    ``.columns`` and ``.write_parquet(...)`` on the returned object.
    """
    raise NotImplementedError

  def _data_path(self, suffix):
    """Build the relative path ``data/<stem><suffix>``."""
    return Path('data/' + self.get_stem() + suffix)

  def get_data_source_path(self):
    """Return the relative path of the CSV input, ``data/<stem>.csv``."""
    return self._data_path('.csv')

  def get_data_destination_path(self):
    """Return the relative path of the output, ``data/<stem>.parquet.snappy``."""
    return self._data_path('.parquet.snappy')

  def convert_to_parquet(self):
    """Load the source CSV via ``get_data`` and write it as snappy-compressed Parquet.

    Progress is reported at INFO level on ``self.log``. Exceptions raised
    while reading or writing are not caught here.
    """
    logger = self.log
    logger.info('Begin data conversion.')

    src = self.get_data_source_path()
    logger.info('Will read source data from %s', src.resolve())
    frame = self.get_data(src)
    n_rows, n_cols = len(frame), len(frame.columns)
    logger.info('read %s: %i rows, %i columns', src, n_rows, n_cols)

    dst = self.get_data_destination_path()
    logger.info('Will write result data to %s', dst.resolve())
    write_options = dict(
      file=dst,
      compression='snappy',
      # workaround for 'ArrowErrorException: ExternalFormat("A page can only
      # contain i32::MAX uncompressed bytes. This one contains 2165579577")'
      use_pyarrow=True,
    )
    frame.write_parquet(**write_options)

In [5]:
class ArticlesConverter(Converter):
  """Converter for the ``articles`` dataset (``data/articles.csv``)."""

  def get_stem(self):
    """Return the file stem of the articles dataset."""
    return 'articles'

  def get_data(self, source_path: Path):
    """Read the articles CSV at ``source_path`` into a polars DataFrame.

    The columns listed below have their dtype pinned to ``Utf8``; all other
    columns are left to ``pl.read_csv``.
    """
    text_columns = (
      'article_id',
      'colour_group_code',
      'department_no',
      'garment_group_no',
      'graphical_appearance_no',
      'index_code',
      'index_group_no',
      'perceived_colour_master_id',
      'perceived_colour_value_id',
      'product_code',
      'product_type_no',
      'section_no',
    )
    overrides = dict.fromkeys(text_columns, pl.datatypes.Utf8)
    return pl.read_csv(file=source_path, dtypes=overrides)

In [6]:
class CustomersConverter(Converter):
  """Converter for the ``customers`` dataset (``data/customers.csv``)."""

  def get_stem(self):
    """Return the file stem of the customers dataset."""
    return 'customers'

  def get_data(self, source_path: Path):
    """Read the customers CSV at ``source_path`` into a polars DataFrame.

    ``customer_id``, ``FN`` and ``Active`` have their dtype pinned to
    ``Utf8``; all other columns are left to ``pl.read_csv``.
    """
    text_columns = ('customer_id', 'FN', 'Active')
    overrides = dict.fromkeys(text_columns, pl.datatypes.Utf8)
    return pl.read_csv(file=source_path, dtypes=overrides)

In [7]:
class TransactionsConverter(Converter):
  """Converter for the ``transactions_train`` dataset (``data/transactions_train.csv``)."""

  def get_stem(self):
    """Return the file stem of the transactions dataset."""
    return 'transactions_train'

  def get_data(self, source_path: Path):
    """Read the transactions CSV at ``source_path`` into a polars DataFrame.

    ``article_id`` and ``sales_channel_id`` have their dtype pinned to
    ``Utf8``; all other columns are left to ``pl.read_csv``.
    """
    text_columns = ('article_id', 'sales_channel_id')
    overrides = dict.fromkeys(text_columns, pl.datatypes.Utf8)
    return pl.read_csv(file=source_path, dtypes=overrides)

In [8]:
def load_dataset(stem: str) -> "pl.DataFrame":
  """Load a previously converted dataset from ``data/<stem>.parquet.snappy``.

  Args:
    stem: Dataset file name without directory or extension, e.g. ``'articles'``.

  Returns:
    The polars DataFrame returned by ``pl.read_parquet``. This is not a
    pandas DataFrame; call ``.to_pandas()`` on the result if pandas is needed.

  Raises:
    Whatever ``pl.read_parquet`` raises when the file is missing or unreadable.
  """
  path = f"data/{stem}.parquet.snappy"
  return pl.read_parquet(path)

In [9]:
if __name__ == "__main__":
  # Instantiate in a fixed order: articles, customers, transactions.
  converters = [
    converter_class()
    for converter_class in (ArticlesConverter, CustomersConverter, TransactionsConverter)
  ]

  # Each converter reads its CSV and writes the Parquet counterpart; an
  # exception in one conversion stops the remaining ones.
  for converter in converters:
    converter.convert_to_parquet()

2022-04-17 07:25:13,581 INFO __main__.ArticlesConverter: Begin data conversion.
2022-04-17 07:25:13,583 INFO __main__.ArticlesConverter: Will read source data from /home/jovyan/work/data/articles.csv
2022-04-17 07:25:13,631 INFO __main__.ArticlesConverter: read data/articles.csv: 105542 rows, 25 columns
2022-04-17 07:25:13,632 INFO __main__.ArticlesConverter: Will write result data to /home/jovyan/work/data/articles.parquet.snappy
2022-04-17 07:25:13,741 INFO __main__.CustomersConverter: Begin data conversion.
2022-04-17 07:25:13,743 INFO __main__.CustomersConverter: Will read source data from /home/jovyan/work/data/customers.csv
2022-04-17 07:25:13,914 INFO __main__.CustomersConverter: read data/customers.csv: 1371980 rows, 7 columns
2022-04-17 07:25:13,915 INFO __main__.CustomersConverter: Will write result data to /home/jovyan/work/data/customers.parquet.snappy
2022-04-17 07:25:14,728 INFO __main__.TransactionsConverter: Begin data conversion.
2022-04-17 07:25:14,729 INFO __main__.T