# Model demo using the UCI `covertype` data

In [1]:
# Last run.
!date

Tue Jun 11 15:42:11 PDT 2024


In [2]:
from typing import List

import numpy as np
import pandas as pd

import lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder, RobustScaler, OneHotEncoder, OrdinalEncoder

import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo  

In [3]:
%reload_ext autoreload
%autoreload 2

## Fetch and prepare the data

Fetch the `covertype` data from UCI, split and preprocess, and create a data loader.

In [4]:
data = fetch_ucirepo("Covertype")

In [5]:
data_df = pd.concat([data.data.features, data.data.targets], axis=1)

In [6]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   Elevation                           581012 non-null  int64
 1   Aspect                              581012 non-null  int64
 2   Slope                               581012 non-null  int64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  int64
 4   Vertical_Distance_To_Hydrology      581012 non-null  int64
 5   Horizontal_Distance_To_Roadways     581012 non-null  int64
 6   Hillshade_9am                       581012 non-null  int64
 7   Hillshade_Noon                      581012 non-null  int64
 8   Hillshade_3pm                       581012 non-null  int64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  int64
 10  Wilderness_Area1                    581012 non-null  int64
 11  Soil_Type1                          581012 non-null 

In [7]:
data_df["Wilderness_Area1"].value_counts()

Wilderness_Area1
0    320216
1    260796
Name: count, dtype: int64

In [8]:
data_config = {
    "target": "Cover_Type",
    "task": "classification",
    # Numerical columns that will be normalized.
    "numerical_columns_norm": [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ],
    # Numerical non-transformed (raw) columns.
    "numerical_columns_raw": [
        "Wilderness_Area1",
        "Wilderness_Area2",
        "Wilderness_Area3",
        "Wilderness_Area4",
        "Soil_Type1",
        "Soil_Type2",
        "Soil_Type3",
        "Soil_Type4",
        "Soil_Type5",
        "Soil_Type6",
        "Soil_Type7",
        "Soil_Type8",
        "Soil_Type9",
        "Soil_Type10",
        "Soil_Type11",
        "Soil_Type12",
        "Soil_Type13",
        "Soil_Type14",
        "Soil_Type15",
        "Soil_Type16",
        "Soil_Type17",
        "Soil_Type18",
        "Soil_Type19",
        "Soil_Type20",
        "Soil_Type21",
        "Soil_Type22",
        "Soil_Type23",
        "Soil_Type24",
        "Soil_Type25",
        "Soil_Type26",
        "Soil_Type27",
        "Soil_Type28",
        "Soil_Type29",
        "Soil_Type30",
        "Soil_Type31",
        "Soil_Type32",
        "Soil_Type33",
        "Soil_Type34",
        "Soil_Type35",
        "Soil_Type36",
        "Soil_Type37",
        "Soil_Type38",
        "Soil_Type39",
        "Soil_Type40",
    ],
    # Categorical columns that will be embedded.
    "categorical_columns_embed": [],
    # Categorical columns that will be one-hot encoded.
    "categorical_columns_onehot": [],
}

In [30]:
class TabDataset(Dataset):
    """Tabular dataset.
    
    Args:
        data_df (pd.DataFrame): Dataframe containing the data.
        config (dict): Configuration dictionary.
        transform (callable): Optional transform to be applied on a sample.
    """
    def __init__(self, data_df: pd.DataFrame, config: dict, transform=None):
        self.data_size = len(data_df)
        # All numerical features to be normalized.
        self.numerical_features_norm = data_df[
            config["numerical_columns_norm"]
        ].values.astype("float32")
        # All numerical features that are not normalized.
        self.numerical_features_raw = data_df[
            config["numerical_columns_raw"]
        ].values.astype("float32")
        # Categorical features that will be embedded.
        self.categorical_features_embed = data_df[
            config["categorical_columns_embed"]
        ].values.astype("int64")
        # Categorical features that will be one-hot encoded.
        self.categorical_features_onehot = data_df[
            config["categorical_columns_onehot"]
        ].values.astype("int64")
        # Target variable.
        target_type = "int64" if config["task"] == "classification" else "float32"
        self.target = (
            data_df[config["target"]].values.astype(target_type)
        )
        self.config = config
        self.transform = transform

    def __len__(self) -> int:
        return self.data_size

    def __getitem__(self, idx: List[int]) -> dict:
        if torch.is_tensor(idx):
            idx = idx.tolist()
        X_norm = self.numerical_features_norm[idx]
        X_raw = self.numerical_features_raw[idx]
        X_embed = self.categorical_features_embed[idx]
        X_onehot = self.categorical_features_onehot[idx]
        y = self.target[idx]
        data = {
            "X_norm": X_norm,
            "X_raw": X_raw,
            "X_embed": X_embed,
            "X_onehot": X_onehot,
            "y": y,
        }
        if self.transform:
            data = self.transform(data)
        return data

    @property
    def data(self) -> pd.DataFrame:
        """A DataFrame containing the data."""
        return pd.concat(
            [
                pd.DataFrame(
                    self.numerical_features_norm,
                    columns=self.config["numerical_columns_norm"],
                ),
                pd.DataFrame(
                    self.numerical_features_raw,
                    columns=self.config["numerical_columns_raw"],
                ),
                pd.DataFrame(
                    self.categorical_features_embed,
                    columns=self.config["categorical_columns_embed"],
                ),
                pd.DataFrame(
                    self.categorical_features_onehot,
                    columns=self.config["categorical_columns_onehot"],
                ),
                pd.DataFrame(self.target, columns=[self.config["target"]]),
            ],
        )


class TabDataTransformer:
    """Transform the data given by the TabDataset.

    This includes normalize numerical features and for categorical features, either one-hot encode or apply
    ordinal encoding (in preperaton for creating embeddings). In addition, apply label encoding for the target
    variable if it is a classification task.

    Args:
        config (dict): Configuration dictionary.

    Returns:
        dict: Transformed data.
    """

    def __init__(self, config: dict):
        self.config = config
        if config["task"] == "classification":
            self.label_encoder = LabelEncoder()
        self.scaler = self._scaler()
        # Keep track of encoders for each categorical column.
        self.one_hod_encoders = {cn: self._one_hod_encoder() for cn in config["categorical_columns_onehot"]}
        self.embed_encoders = {cn: self._embed_encoder() for cn in config["categorical_columns_embed"]}

    def _one_hod_encoder():
        """For the purpose of cusomization."""
        return OneHotEncoder()
    
    def _scaler(self):
        """For the purpose of cusomization."""
        return RobustScaler()
    
    def _embed_encoder(self):
        """For the purpose of cusomization."""
        return OrdinalEncoder()

    def _fit_scaler(self, X: np.ndarray):
        self.scaler.fit(X)

    def _fit_label_encoders(self, y: np.ndarray):
        self.label_encoder.fit(y)

    def _fit_one_hot_encoders(self, X: np.ndarray):
        for i, cn in enumerate(self.config["categorical_columns_onehot"]):
            self.one_hod_encoders[cn].fit(X[:, i].reshape(-1, 1))

    def _fit_embed_encoders(self, X: np.ndarray):
        for i, cn in enumerate(self.config["categorical_columns_embed"]):
            self.embed_encoders[cn].fit(X[:, i].reshape(-1, 1))

    def fit(self, data: TabDataset):
        self._fit_scaler(data.numerical_features_norm)
        self._fit_label_encoders(data.target)
        self._fit_one_hot_encoders(data.categorical_features_onehot)
        self._fit_embed_encoders(data.categorical_features_embed)

    def transform(self, data: dict):
        X_norm = self.scaler.transform(data["X_norm"])
        X_onehot = np.hstack(
            [
                self.one_hod_encoders[cn].transform(data["X_onehot"][:, i].reshape(-1, 1)).toarray()
                for i, cn in enumerate(self.config["categorical_columns_onehot"])
            ]
        )
        X_embed = np.hstack(
            [
                self.embed_encoders[cn].transform(data["X_embed"][:, i].reshape(-1, 1))
                for i, cn in enumerate(self.config["categorical_columns_embed"])
            ]
        )
        y = self.label_encoder.transform(data["y"])
        return {
            "X_norm": X_norm,
            "X_onehot": X_onehot,
            "X_embed": X_embed,
            "y": y,
        }
    
 

#### Testing

In [31]:
tmp_ds = TabDataset(data_df.sample(100), data_config)

In [32]:
len(tmp_ds)

100

In [33]:
tmp_ds[[0,1]]

{'X_norm': array([[3035.,   91.,   13.,  182.,   36., 5155.,  240.,  219.,  108.,
         3302.],
        [2110.,  349.,   21.,  319.,    0.,  417.,  177.,  201.,  159.,
          685.]], dtype=float32),
 'X_raw': array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32),
 'X_embed': array([], shape=(2, 0), dtype=int64),
 'X_onehot': array([], shape=(2, 0), dtype=int64),
 'y': array([2, 3])}

In [34]:
uci_dataset.data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2905060 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Dtype  
---  ------                              -----  
 0   Elevation                           float32
 1   Aspect                              float32
 2   Slope                               float32
 3   Horizontal_Distance_To_Hydrology    float32
 4   Vertical_Distance_To_Hydrology      float32
 5   Horizontal_Distance_To_Roadways     float32
 6   Hillshade_9am                       float32
 7   Hillshade_Noon                      float32
 8   Hillshade_3pm                       float32
 9   Horizontal_Distance_To_Fire_Points  float32
 10  Wilderness_Area1                    float32
 11  Wilderness_Area2                    float32
 12  Wilderness_Area3                    float32
 13  Wilderness_Area4                    float32
 14  Soil_Type1                          float32
 15  Soil_Type2                          float32
 16  Soil_T

In [35]:
transformer = TabDataTransformer(data_config)

In [36]:
transformer.fit(tmp_ds)

In [37]:
transformer.transform(tmp_ds[[0,1]])

ValueError: need at least one array to concatenate

In [17]:
torch.backends.mps.is_available()

True