In [1]:
# Data packages
import pandas as pd 
import polars as pl     # requires installing polars first
import pyarrow          # requires installing pyarrow first
import re

from data_processing.tabnet_utils import CodeBookFilter

# Model
import numpy as np
import torch_frame as tf
import torch
torch.set_default_dtype(torch.float32)

In [2]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


## Data 

In [3]:
cbf = CodeBookFilter(path = "data/codebooks/PreFer_codebook.csv", accept_missing_rate=0.02, column_appeared_times=8)
PID_col = "nomem_encr"
df = pl.read_csv("data/training_data/PreFer_train_data.csv",
                     infer_schema_length=7418, columns=[PID_col, "outcome_available"]+ cbf.return_valid_column_names()).to_pandas()

In [4]:
cbf.codebook.head()

Unnamed: 0,var_name,var_label,values_cat,labels_cat,unique_values_n,n_missing,prop_missing,type_var,note,year,survey,dataset
0,nomem_encr,Number of household member encrypted,,,,0,0.0,numeric,,,All surveys,PreFer_train_data.csv
2,cf08a_m,Year and month of field work period,,,2.0,0,0.0,numeric,,2008.0,Family & Household,PreFer_train_data.csv
3,cf09b_m,Year and month of field work period,,,2.0,0,0.0,numeric,,2009.0,Family & Household,PreFer_train_data.csv
4,cf10c_m,Year and month of field work period,,,1.0,0,0.0,numeric,,2010.0,Family & Household,PreFer_train_data.csv
5,cf11d_m,Year and month of field work period,,,2.0,0,0.0,numeric,,2011.0,Family & Household,PreFer_train_data.csv


In [87]:
### create data table 
dfs = []
for year in range(2007,2020):
    tempdf = df[[PID_col] + cbf.year2col[year]]
    tempdf["year"] = year
    tempdf = tempdf[tempdf.isna().sum(axis=1) < 80]
    new_names = dict()
    for _c in tempdf.columns:
        try:
            new_names[_c] = cbf.col2id[_c]
        except:
            pass
    tempdf = tempdf.rename(new_names, axis=1)
    dfs.append(tempdf)
result = pd.concat(dfs, axis=0, join='outer', ignore_index=True).sort_values(PID_col).reset_index(drop=True)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempdf["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempdf["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempdf["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

Unnamed: 0,nomem_encr,ch004,ch005,ch011,ch012,ch013,ch014,ch015,ch016,ch017,...,cw098,cw099,cw100,cw101,cw102,cr120,cw522,cw523,cw525,cf432
0,700008,3.0,4.0,2.0,1.0,4.0,2.0,5.0,174.0,70.0,...,0.0,0.0,0.0,0.0,0.0,299.000000,0.0,0.0,,
1,700008,3.0,3.0,2.0,3.0,6.0,2.0,5.0,174.0,64.0,...,,,,,,,,,,
2,700025,3.0,3.0,2.0,1.0,5.0,2.0,5.0,171.0,61.0,...,0.0,0.0,0.0,1.0,0.0,439.000000,0.0,0.0,7.0,5.0
3,700025,3.0,3.0,3.0,1.0,4.0,1.0,5.0,171.0,63.0,...,0.0,0.0,0.0,1.0,0.0,275.000000,0.0,0.0,7.0,4.0
4,700025,3.0,3.0,3.0,1.0,4.0,1.0,5.0,173.0,64.0,...,0.0,0.0,0.0,0.0,0.0,96.000000,0.0,0.0,11.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14138,733171,3.0,4.0,3.0,1.0,5.0,1.0,5.0,183.0,74.0,...,0.0,0.0,0.0,0.0,0.0,197.369999,0.0,1.0,7.0,4.0
14139,733171,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,131.000000,0.0,1.0,,4.0
14140,733176,5.0,3.0,1.0,1.0,5.0,1.0,6.0,190.0,95.0,...,0.0,0.0,0.0,0.0,0.0,250.000000,0.0,0.0,1.0,4.0
14141,733176,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,4.0


In [88]:
data = tf.data.Dataset(df=result, target_col=PID_col, col_to_stype=cbf.get_dtype(result.columns))
data.materialize()
train_data, test_data = data[:0.7], data[0.7:]

In [94]:
import numpy as np
from torch.utils.data import Sampler

class CustomSampler(Sampler):
    def __init__(self, data_source, batch_size=4, seed=None):
        self.data_source = data_source.df.set_index(PID_col)
        self.batch_size = batch_size
        self.unique_indices = list(set(self.data_source.index))
        self.num_batches = len(self.unique_indices) // batch_size
        if seed is not None:
            np.random.seed(seed)

    def __iter__(self):
        # Shuffle the unique indices
        shuffled_indices = np.random.permutation(self.unique_indices)
        for i in range(self.num_batches):
            batch_unique_indices = shuffled_indices[i * self.batch_size: (i + 1) * self.batch_size]
            # Convert unique index values to row indices
            row_indices = []
            for unique_index in batch_unique_indices:
                loc = self.data_source.index.get_loc(unique_index)
                if isinstance(loc, slice):
                    row_indices.extend(range(loc.start, loc.stop))
                elif isinstance(loc, np.ndarray):
                    row_indices.extend(loc.tolist())
                else:
                    row_indices.append(loc)
            yield row_indices

    def __len__(self):
        return self.num_batches


In [95]:
train_loader = tf.data.DataLoader(dataset=data, batch_sampler=CustomSampler(data, batch_size=6))

In [121]:
from typing import Any, Dict, List

from torch import Tensor
from torch.nn import Linear, Module, ModuleList

import torch_frame
from torch_frame import TensorFrame, stype
from torch_frame.data.stats import StatType
from torch_frame.nn.conv import TabTransformerConv
from torch_frame.nn.decoder import ExcelFormerDecoder

from torch_frame.nn.encoder import (
    EmbeddingEncoder,
    LinearEncoder,
    StypeWiseFeatureEncoder,
)


class TabEncoder(Module):
    def __init__(
        self,
        hidden_size: int,
        output_size: int,
        num_layers: int,
        num_heads: int,
        num_cols: int,
        col_stats: Dict[str, Dict[StatType, Any]],
        col_names_dict: Dict[torch_frame.stype, List[str]],
        dropout: float = 0.1,
    ):
        super().__init__()
        self.encoder = StypeWiseFeatureEncoder(
            out_channels=hidden_size,
            col_stats=col_stats,
            col_names_dict=col_names_dict,
            stype_encoder_dict={
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder()
            },
        )
        self.tab_transformer_convs = ModuleList([
            TabTransformerConv(
                channels=hidden_size,
                num_heads=num_heads,
                ffn_dropout=dropout,
                attn_dropout= dropout / 3.0,
            ) for _ in range(num_layers)
        ])
        #self.aggregator = Linear(hidden_size, output_size)
        self.norm = torch.nn.LayerNorm(hidden_size)
        self.aggregator = ExcelFormerDecoder(in_channels=hidden_size, out_channels=output_size, num_cols=num_cols)

    def forward(self, tf: TensorFrame) -> Tensor:
        x, _ = self.encoder(tf)
        for tab_transformer_conv in self.tab_transformer_convs:
            x = tab_transformer_conv(x)
        out = self.aggregator(x)
        return out

In [122]:
model = TabEncoder(
    hidden_size=256,
    output_size=32,
    num_cols=429,
    num_layers=3,
    num_heads=8,
    col_stats=train_data.col_stats,
    col_names_dict=train_data.tensor_frame.col_names_dict,
).to(device)

In [123]:
xx = next(iter(train_loader))
y = model(xx.to(device))

In [124]:
y

tensor([[-3.6252e-02, -4.6947e-02, -7.7023e-02, -6.0085e-02, -3.1129e-02,
         -5.3647e-02, -8.7403e-02, -6.8112e-02, -1.7283e-02, -3.0689e-02,
         -1.5013e-02, -3.6461e-02, -1.6170e-02, -2.6299e-02,  1.6068e-02,
         -9.9421e-02, -2.9579e-02, -4.3630e-02, -1.1460e-01, -9.0588e-02,
         -1.0174e-01, -5.5973e-03, -6.4619e-02, -2.4721e-02, -4.7826e-02,
         -5.9996e-02, -4.8672e-02, -3.0717e-02, -8.5568e-02, -4.4220e-02,
         -6.7584e-02, -6.9223e-02],
        [-7.8907e-02, -3.6700e-02, -4.6518e-02, -6.2875e-03, -4.8517e-02,
         -9.5982e-02, -5.7317e-02, -6.3849e-02, -5.3177e-02,  9.5596e-03,
         -7.9327e-04, -6.2242e-02, -3.9515e-02, -2.5262e-02,  4.2245e-03,
         -9.4060e-02, -3.4260e-02, -5.7771e-02, -2.6479e-02, -8.8618e-02,
         -6.5816e-02,  2.2451e-02, -8.5282e-02, -3.5081e-02, -3.4729e-02,
         -6.7824e-02, -3.5838e-02, -3.3604e-02, -2.7031e-02, -3.6856e-02,
          5.6459e-03, -4.3361e-02],
        [-6.6066e-02,  9.0571e-04, -1.06

In [99]:
xx.get_col_feat("year")

tensor([[2008.],
        [2007.],
        [2008.],
        [2007.],
        [2014.],
        [2016.],
        [2015.],
        [2013.],
        [2012.],
        [2014.],
        [2012.],
        [2015.],
        [2016.],
        [2009.],
        [2011.],
        [2008.],
        [2017.],
        [2010.],
        [2007.],
        [2013.],
        [2013.]])

In [100]:
y

tensor([[-0.0022,  0.0496, -0.0217,  ..., -0.0172, -0.0736,  0.0259],
        [ 0.0190,  0.0168,  0.0391,  ..., -0.0509, -0.0722,  0.1620],
        [-0.0015,  0.0532, -0.0146,  ..., -0.0252, -0.0716,  0.0398],
        ...,
        [ 0.0153,  0.0193,  0.0372,  ..., -0.0508, -0.0717,  0.1598],
        [-0.0042,  0.0496, -0.0185,  ..., -0.0262, -0.0756,  0.0321],
        [-0.0031,  0.0564, -0.0154,  ..., -0.0227, -0.0760,  0.0416]],
       device='mps:0', grad_fn=<LinearBackward0>)

In [101]:
xx.y

tensor([718795., 718795., 714323., 714323., 725563., 711529., 711529., 711529.,
        711529., 711529., 730280., 730280., 730280., 730280., 730280., 730280.,
        730280., 730280., 730280., 730280., 716709.])