In [2]:
%load_ext jupyter_black

# Question: Create a Dataset from the reading of the folder cars

In [34]:
import torchdata.datapipes as dp
from torch.utils.data import DataLoader
import os

In [78]:
def filtering_paths(filename: str):
    checks = (
        ("unclean" not in filename)
        and ("focus" not in filename)
        and ("cclass" not in filename)
        and filename.endswith(".csv")
    )
    return checks


def get_manufacturer(content):
    path, data = content
    manuf = os.path.splitext(os.path.basename(path))
    manuf = manuf[0].upper()
    data.extend([manuf])
    return data


def gen_encoder_dict(series):
    values = series.unique()
    values = values.to_numpy().flatten().tolist()
    return dict(zip(values, range(len(values))))

In [79]:
dropdown_encoders = {col: gen_encoder_dict(df.select(col)) for col in cat_attr}

In [59]:
import polars as pl
import pandas as pd

df = pl.DataFrame(list(datapipe)).transpose(
    column_names=[
        "model",
        "year",
        "price",
        "transmission",
        "mileage",
        "fuel_type",
        "road_tax",
        "mpg",
        "engine_size",
        "manufacturer",
    ]
)
N_ROWS = len(df)
cont_attr = ["year", "mileage", "road_tax", "mpg", "engine_size"]
cat_attr = ["model", "transmission", "fuel_type", "manufacturer"]

In [119]:
import numpy as np


def preproc(row: list):
    colnames = [
        "model",
        "year",
        "price",
        "transmission",
        "mileage",
        "fuel_type",
        "road_tax",
        "mpg",
        "engine_size",
        "manufacturer",
    ]

    cat_attr = ["model", "transmission", "fuel_type", "manufacturer"]
    cont_attr = ["year", "mileage", "road_tax", "mpg", "engine_size"]
    target = "price"

    vals = dict(zip(colnames, row))
    cont_X = [float(vals[name]) for name in cont_attr]
    cat_X = [dropdown_encoders[name][vals[name]] for name in cat_attr]

    return {
        "label": np.array([float(vals[target])], dtype=np.float32),
        "cont_X": np.array(cont_X, dtype=np.float32),
        "cat_X": np.array(cat_X, dtype=int),
    }

In [120]:
datapipe = dp.iter.FileLister("./cars/")
datapipe = datapipe.filter(filter_fn=filtering_paths)
# the openfiles and parsecsv works together
datapipe = datapipe.open_files(mode="rt")
datapipe = datapipe.parse_csv(delimiter=",", skip_lines=1, return_path=True)
datapipe = datapipe.map(get_manufacturer)
datapipe = datapipe.map(preproc)

In [121]:
# ways to check
dataloader = DataLoader(datapipe, batch_size=5)

next(iter(dataloader))

{'label': tensor([[12500.],
         [16500.],
         [11000.],
         [16800.],
         [17300.]]),
 'cont_X': tensor([[2.0170e+03, 1.5735e+04, 1.5000e+02, 5.5400e+01, 1.4000e+00],
         [2.0160e+03, 3.6203e+04, 2.0000e+01, 6.4200e+01, 2.0000e+00],
         [2.0160e+03, 2.9946e+04, 3.0000e+01, 5.5400e+01, 1.4000e+00],
         [2.0170e+03, 2.5952e+04, 1.4500e+02, 6.7300e+01, 2.0000e+00],
         [2.0190e+03, 1.9980e+03, 1.4500e+02, 4.9600e+01, 1.0000e+00]]),
 'cat_X': tensor([[ 8,  0,  1,  7],
         [62,  3,  0,  7],
         [ 8,  0,  1,  7],
         [85,  3,  0,  7],
         [91,  0,  1,  7]])}