In [1]:
%load_ext jupyter_black

# Question 18

Create a Python script that does the following:
- load the Auto MPG dataset from a CSV file
- split it into train, test and validation sets
- clean the missing data
- standardize the numerical features
- apply `OrdinalEncoder` to the discrete features
- use `nn.Embedding` on just one discrete feature
- create a PyTorch `Dataset` class
- create a PyTorch `DataLoader`


# Answer

In [90]:
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import torch.nn as nn
import torch
import polars.selectors as cs
from torch.utils.data import Dataset, DataLoader


class DataPrep:
    """Prepare the Auto MPG table for a PyTorch model.

    The steps are separate methods that must be called in this order:
    ``cleaning_data``, ``shuffling_data``, ``split_data``, ``encoder``,
    ``fit_transform`` and finally ``using_embedding``.  Attributes that are
    not available yet hold the ``...`` (Ellipsis) placeholder until the step
    that fills them has run.

    Args:
        path: Location of the Auto MPG CSV file.
    """

    # Column names produced by the ColumnTransformer for the ordinal features
    # ("<transformer name>__<input column>").
    _CYL_COL = "ordinal_scaler__cyl"
    _ORDINAL_COLS = [
        "ordinal_scaler__cyl",
        "ordinal_scaler__origin",
        "ordinal_scaler__year",
    ]

    def __init__(self, path: str = "../data/auto_mpg.csv"):
        self.data = pl.read_csv(path)
        self.X_train = ...
        self.y_train = ...
        self.X_test = ...
        self.y_test = ...
        self.X_val = ...
        self.y_val = ...
        self.ct = ...
        self.new_features = ...
        self.unique_classes_cyl = ...
        self.embedding_cyl = ...

    def cleaning_data(self) -> None:
        """Drop every row of ``self.data`` that contains a null value."""
        # NOTE(review): only values polars parsed as null are removed; confirm
        # the CSV does not encode missing values with a placeholder string.
        self.data = self.data.drop_nulls()

    def shuffling_data(self) -> None:
        """Shuffle all rows of ``self.data`` with a fixed seed."""
        self.data = self.data.sample(fraction=1, shuffle=True, seed=42)

    def split_data(self) -> None:
        """Split ``self.data`` into train / validation / test features and targets.

        ``mpg`` is the target.  20% of the rows become the test set; 20% of
        the remaining rows become the validation set.
        """
        features = self.data.drop("mpg")
        target = self.data.select("mpg")

        X_rest, self.X_test, y_rest, self.y_test = train_test_split(
            features,
            target,
            test_size=0.2,
            random_state=42,
        )
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_rest,
            y_rest,
            test_size=0.2,
            random_state=42,
        )

    def encoder(self) -> None:
        """Build (without fitting) the column transformer stored in ``self.ct``.

        Numerical columns are standardized and discrete columns are ordinal
        encoded; columns not listed here are dropped by the transformer.
        """
        self.ct = ColumnTransformer(
            [
                ("std_scaler", StandardScaler(), ["disp", "hp", "weight", "acc"]),
                ("ordinal_scaler", OrdinalEncoder(), ["cyl", "origin", "year"]),
            ]
        )

    def _to_frame(self, array) -> "pl.DataFrame":
        """Wrap a transformed numpy array in a polars frame named after the features."""
        frame = pl.from_numpy(array)
        frame.columns = list(self.new_features)
        return frame

    def fit_transform(self) -> None:
        """Fit the transformer on the train split and transform all three splits.

        The transformer is fitted on the training data only, so no statistics
        from the validation or test splits leak into the preprocessing.
        Afterwards ``self.unique_classes_cyl`` holds the shape of the frame of
        distinct encoded ``cyl`` values in the train split; its first entry is
        the number of distinct classes.
        """
        train_array = self.ct.fit_transform(self.X_train.to_pandas())
        test_array = self.ct.transform(self.X_test.to_pandas())
        val_array = self.ct.transform(self.X_val.to_pandas())
        self.new_features = self.ct.get_feature_names_out()

        self.X_train = self._to_frame(train_array)
        self.X_test = self._to_frame(test_array)
        self.X_val = self._to_frame(val_array)

        self.unique_classes_cyl = (
            self.X_train.select(self._CYL_COL).unique(self._CYL_COL).shape
        )

    def _embed_cyl(self, df, embedding) -> "pl.DataFrame":
        """Return the scaled columns of ``df`` followed by the ``cyl`` embedding columns."""
        # The ordinal codes are stored as floats; embedding lookups need
        # integer (long) indices.
        codes = torch.tensor(
            df.get_column(self._CYL_COL).to_numpy(), dtype=torch.long
        )
        # The vectors are exported as plain numbers, so no gradient tracking
        # is needed.
        with torch.no_grad():
            vectors = embedding(codes)

        embedded = pl.from_numpy(vectors.numpy())
        scaled = df.drop(self._ORDINAL_COLS)
        return pl.concat([scaled, embedded], how="horizontal")

    def using_embedding(self, embedding_dim: int = 50) -> "list[pl.DataFrame]":
        """Replace the encoded ``cyl`` column with embedding vectors.

        One ``nn.Embedding`` table is created and shared by all three splits,
        so the same class maps to the same vector everywhere.  The table is
        kept on ``self.embedding_cyl``.  Its weights are freshly initialised
        on every call and are not trained here.  The encoded ``origin`` and
        ``year`` columns are dropped from the result.

        Args:
            embedding_dim: Length of the vector used for each ``cyl`` class.

        Returns:
            The new train, test and validation frames, in that order.  The
            frames stored on the instance are left unchanged.
        """
        self.embedding_cyl = nn.Embedding(self.unique_classes_cyl[0], embedding_dim)

        return [
            self._embed_cyl(df, self.embedding_cyl)
            for df in (self.X_train, self.X_test, self.X_val)
        ]

In [86]:
# Seed torch's global RNG: the embedding weights are drawn from it, so without
# a seed X_train / X_test / X_val would differ after every kernel restart.
torch.manual_seed(42)

# Each step relies on the state left by the previous one, so keep this order.
data_prep = DataPrep()
data_prep.cleaning_data()
data_prep.shuffling_data()
data_prep.split_data()
data_prep.encoder()
data_prep.fit_transform()
X_train, X_test, X_val = data_prep.using_embedding()

In [88]:
class MPGDataset(Dataset):
    """Map-style dataset that pairs one feature row with its target row.

    Args:
        X: Feature table exposing ``to_numpy()``; one row per sample.
        y: Target table exposing ``to_numpy()``; same number of rows as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y):
        super().__init__()
        self.X = X
        self.y = y
        # Convert once here instead of on every __getitem__ call; converting
        # the whole table per item makes one pass over the data quadratic.
        self._X_array = X.to_numpy()
        self._y_array = y.to_numpy()
        if len(self._X_array) != len(self._y_array):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {len(self._X_array)} and {len(self._y_array)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self._X_array)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self._X_array[idx], self._y_array[idx]

In [89]:
# Training features (with the cyl embedding) paired with the mpg targets
# produced by the same split.
train_dataset = MPGDataset(X=X_train, y=data_prep.y_train)

In [91]:
# A dedicated, seeded generator makes the shuffled batch order reproducible
# across kernel restarts.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=5,
    generator=torch.Generator().manual_seed(42),
)

# Peek at the first batch only.
for X, y in train_dataloader:
    print(X)
    print(y)
    break

tensor([[-0.9079, -0.6607, -0.9064, -0.5411,  0.3979,  2.1349,  0.0736,  0.3205,
         -0.7000, -0.0707,  1.3242,  1.5165,  0.7971,  2.1962, -0.2196,  1.3388,
         -0.6371, -0.2119,  0.6129,  0.0561,  0.8354,  0.8953, -0.3982, -1.8467,
         -0.7590, -0.0091,  0.0426, -0.0468, -0.1222,  0.2128,  0.2577,  1.1217,
         -0.6829,  1.4282, -0.2626, -0.8961,  1.7603,  2.3195,  0.3237,  0.4234,
          1.1591,  0.0160, -0.5305,  0.6294, -0.6996,  0.4748,  1.3023,  0.0254,
         -0.7422,  0.5812, -0.8319,  1.5225, -0.3287,  0.2177],
        [-0.9751, -0.7382, -1.0040, -0.0459,  0.3979,  2.1349,  0.0736,  0.3205,
         -0.7000, -0.0707,  1.3242,  1.5165,  0.7971,  2.1962, -0.2196,  1.3388,
         -0.6371, -0.2119,  0.6129,  0.0561,  0.8354,  0.8953, -0.3982, -1.8467,
         -0.7590, -0.0091,  0.0426, -0.0468, -0.1222,  0.2128,  0.2577,  1.1217,
         -0.6829,  1.4282, -0.2626, -0.8961,  1.7603,  2.3195,  0.3237,  0.4234,
          1.1591,  0.0160, -0.5305,  0.6294, 