In [23]:
# Data packages
import pandas as pd 
import polars as pl     # requires installing polars first
import pyarrow          # requires installing pyarrow first
import re

from collections import defaultdict


# Model
import numpy as np
import torch_frame as tf
import torch
torch.set_default_dtype(torch.float32)

In [24]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


In [3]:
class CodeBookFilter:
    def __init__(self, path: str, accept_missing_rate: float = 0.05, column_appeared_times: int = 8) -> None:
        assert accept_missing_rate >= 0 and accept_missing_rate <=1.0
        assert column_appeared_times > 0
        self.column_appeared_times = column_appeared_times
        self.accept_missing_rate = accept_missing_rate


        codebook = pl.read_csv(path).to_pandas()

        self.column_appeared_times = column_appeared_times
        self.codebook = self._filter_codebook(codebook=codebook) # CodeBook Filtered
        self.valid_columns = self._return_valid_columns()
        self._create_column_dict()
    
    def _filter_codebook(self, codebook):
            codebook = codebook[codebook["prop_missing"] < self.accept_missing_rate]
            codebook = codebook[codebook["type_var"].isin(["numeric", "categorical"])]
            return codebook
    def _return_valid_columns(self):
        column_appeared = defaultdict(int)
        for year in range(2007,2020):
            _cols = self.codebook[self.codebook["year"] == year]["var_name"].values
            _cols = [self._match(x) for x in _cols]
            for _c in _cols:
                if _c != None:
                    column_appeared[_c] += 1
        return set([k for k,v in column_appeared.items() if v>self.column_appeared_times])
    
    def _create_column_dict(self):
        self.col2id = dict()
        self.year2col = dict()
        self.col2dtype = dict()
        for year in range(2007,2020):
            _cols = self.codebook[self.codebook["year"] == year]["var_name"].values
            _matched_cols = list()
            for _c in _cols:
                if self._match(_c) in self.valid_columns:
                    self.col2id[_c] = self._match(_c)
                    self.col2dtype[_c] = self.codebook[self.codebook["var_name"] == _c]["type_var"].values[0]
                    self.col2dtype[self._match(_c)] = self.codebook[self.codebook["var_name"] == _c]["type_var"].values[0]
                    _matched_cols.append(_c)
            self.year2col[year] = _matched_cols


    def _match(self, x):
        """
        Returns standardized name of the column if possible: XXNNN"""
        pattern = re.compile(r'^([a-zA-Z]{2}).*([0-9]{3})$')
        m = pattern.match(x)
        if m:
            return("%s%s"%(m.group(1), m.group(2)))
        return None

    def return_valid_column_names(self):
        return list(self.col2id.keys())
    def return_valid_column_transformed_names(self):
        return list(set([i for i in self.col2id.values()]))

## Data 

In [4]:
cbf = CodeBookFilter(path = "data/codebooks/PreFer_codebook.csv", accept_missing_rate=0.02, column_appeared_times=8)
PID_col = "nomem_encr"
df = pl.read_csv("data/training_data/PreFer_train_data.csv",
                     infer_schema_length=7418, columns=[PID_col, "outcome_available"]+ cbf.return_valid_column_names()).to_pandas()

In [5]:
cbf.codebook.head()

Unnamed: 0,var_name,var_label,values_cat,labels_cat,unique_values_n,n_missing,prop_missing,type_var,note,year,survey,dataset
0,nomem_encr,Number of household member encrypted,,,,0,0.0,numeric,,,All surveys,PreFer_train_data.csv
2,cf08a_m,Year and month of field work period,,,2.0,0,0.0,numeric,,2008.0,Family & Household,PreFer_train_data.csv
3,cf09b_m,Year and month of field work period,,,2.0,0,0.0,numeric,,2009.0,Family & Household,PreFer_train_data.csv
4,cf10c_m,Year and month of field work period,,,1.0,0,0.0,numeric,,2010.0,Family & Household,PreFer_train_data.csv
5,cf11d_m,Year and month of field work period,,,2.0,0,0.0,numeric,,2011.0,Family & Household,PreFer_train_data.csv


In [6]:
### create data table 
dfs = []
for year in range(2007,2020):
    tempdf = df[[PID_col] + cbf.year2col[year]]
    tempdf["year"] = year
    tempdf = tempdf[tempdf.isna().sum(axis=1) < 80]
    new_names = dict()
    for _c in tempdf.columns:
        try:
            new_names[_c] = cbf.col2id[_c]
        except:
            pass
    tempdf = tempdf.rename(new_names, axis=1)
    dfs.append(tempdf)
result = pd.concat(dfs, axis=0, join='outer', ignore_index=True).sort_values(PID_col).reset_index(drop=True).set_index(PID_col)

# Get unique indices
#unique_indices = result.index.unique()

# Randomly assign each unique index to 0 (train) or 1 (validation)
#np.random.seed(42)  # For reproducibility
#index_split = pd.Series(np.random.choice([0, 1], size=len(unique_indices), p=[0.7, 0.3]), index=unique_indices)

# Map the index_split to the original DataFrame
#result['data_split'] = result.index.map(index_split)
# Select columns with 'float64' dtype  
float64_cols = list(result.select_dtypes(include='float64'))

# The same code again calling the columns
result = result.astype("float32")
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempdf["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempdf["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempdf["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

Unnamed: 0_level_0,ch004,ch005,ch011,ch012,ch013,ch014,ch015,ch016,ch017,ch018,...,cw098,cw099,cw100,cw101,cw102,cr120,cw522,cw523,cw525,cf432
nomem_encr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
700008,3.0,4.0,2.0,1.0,4.0,2.0,5.0,174.0,70.0,2.0,...,0.0,0.0,0.0,0.0,0.0,299.000000,0.0,0.0,,
700008,3.0,3.0,2.0,3.0,6.0,2.0,5.0,174.0,64.0,2.0,...,,,,,,,,,,
700025,3.0,3.0,2.0,1.0,5.0,2.0,5.0,171.0,61.0,1.0,...,0.0,0.0,0.0,1.0,0.0,439.000000,0.0,0.0,7.0,5.0
700025,3.0,3.0,3.0,1.0,4.0,1.0,5.0,171.0,63.0,1.0,...,0.0,0.0,0.0,1.0,0.0,275.000000,0.0,0.0,7.0,4.0
700025,3.0,3.0,3.0,1.0,4.0,1.0,5.0,173.0,64.0,1.0,...,0.0,0.0,0.0,0.0,0.0,96.000000,0.0,0.0,11.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733171,3.0,4.0,3.0,1.0,5.0,1.0,5.0,183.0,74.0,2.0,...,0.0,0.0,0.0,0.0,0.0,197.369995,0.0,1.0,7.0,4.0
733171,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,131.000000,0.0,1.0,,4.0
733176,5.0,3.0,1.0,1.0,5.0,1.0,6.0,190.0,95.0,2.0,...,0.0,0.0,0.0,0.0,0.0,250.000000,0.0,0.0,1.0,4.0
733176,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,4.0


In [7]:
set(cbf.col2dtype.values())
{'categorical',
 'character [almost exclusively empty strings]',
 'date or time',
 'numeric',
 'response to open-ended question'}

{'categorical',
 'character [almost exclusively empty strings]',
 'date or time',
 'numeric',
 'response to open-ended question'}

In [8]:
def get_dtype(cols):
    result = dict()
    for _c in cols:
        try:
            col_type = cbf.col2dtype[_c]
        except:
            col_type = None
        if col_type == "categorical":
            result[_c] = tf.categorical
        elif col_type =="numerical":
            result[_c] = tf.numerical
        else:
            result[_c] = tf.numerical

    try:
        del result["data_split"]
    except:
        pass
    return result
        


In [9]:
result.dtypes

ch004    float32
ch005    float32
ch011    float32
ch012    float32
ch013    float32
          ...   
cr120    float32
cw522    float32
cw523    float32
cw525    float32
cf432    float32
Length: 429, dtype: object

In [15]:
data = tf.data.Dataset(df=result, col_to_stype=get_dtype(result.columns))
data.materialize()
train_data, test_data = data[:0.7], data[0.7:]

In [48]:
train_loader = tf.data.DataLoader(dataset=train_data.tensor_frame, batch_size=2)

TypeError: __init__() got an unexpected keyword argument 'index'

In [54]:
data.index_select([1,2,3]).tensor_frame

TensorFrame(
  num_cols=429,
  num_rows=3,
  categorical (391): ['cd002', 'cd003', 'cd035', 'cd038', 'cd041', 'cd042', 'cd043', 'cd044', 'cd045', 'cd046', 'cd047', 'cd048', 'cd049', 'cd050', 'cd051', 'cd052', 'cd053', 'cd054', 'cd055', 'cd058', 'cd073', 'cd074', 'cd075', 'cd076', 'cd077', 'cf001', 'cf003', 'cf024', 'cf388', 'cf389', 'cf390', 'cf391', 'cf392', 'cf432', 'ch004', 'ch005', 'ch011', 'ch012', 'ch013', 'ch014', 'ch015', 'ch018', 'ch020', 'ch021', 'ch022', 'ch023', 'ch024', 'ch025', 'ch026', 'ch027', 'ch028', 'ch029', 'ch030', 'ch031', 'ch032', 'ch033', 'ch034', 'ch035', 'ch036', 'ch037', 'ch038', 'ch039', 'ch040', 'ch041', 'ch042', 'ch043', 'ch044', 'ch045', 'ch070', 'ch071', 'ch072', 'ch073', 'ch074', 'ch075', 'ch076', 'ch077', 'ch078', 'ch079', 'ch099', 'ch125', 'ch133', 'ch159', 'ch160', 'ch161', 'ch162', 'ch163', 'ch169', 'ch170', 'ch171', 'ch172', 'ch173', 'ch174', 'ch175', 'ch176', 'ch177', 'ch178', 'ch179', 'ch180', 'ch181', 'ch182', 'ch183', 'ch184', 'ch196', 'ch197',

In [19]:
xx.to("mps")

TensorFrame(
  num_cols=429,
  num_rows=2,
  categorical (391): ['cd002', 'cd003', 'cd035', 'cd038', 'cd041', 'cd042', 'cd043', 'cd044', 'cd045', 'cd046', 'cd047', 'cd048', 'cd049', 'cd050', 'cd051', 'cd052', 'cd053', 'cd054', 'cd055', 'cd058', 'cd073', 'cd074', 'cd075', 'cd076', 'cd077', 'cf001', 'cf003', 'cf024', 'cf388', 'cf389', 'cf390', 'cf391', 'cf392', 'cf432', 'ch004', 'ch005', 'ch011', 'ch012', 'ch013', 'ch014', 'ch015', 'ch018', 'ch020', 'ch021', 'ch022', 'ch023', 'ch024', 'ch025', 'ch026', 'ch027', 'ch028', 'ch029', 'ch030', 'ch031', 'ch032', 'ch033', 'ch034', 'ch035', 'ch036', 'ch037', 'ch038', 'ch039', 'ch040', 'ch041', 'ch042', 'ch043', 'ch044', 'ch045', 'ch070', 'ch071', 'ch072', 'ch073', 'ch074', 'ch075', 'ch076', 'ch077', 'ch078', 'ch079', 'ch099', 'ch125', 'ch133', 'ch159', 'ch160', 'ch161', 'ch162', 'ch163', 'ch169', 'ch170', 'ch171', 'ch172', 'ch173', 'ch174', 'ch175', 'ch176', 'ch177', 'ch178', 'ch179', 'ch180', 'ch181', 'ch182', 'ch183', 'ch184', 'ch196', 'ch197',

In [34]:
from typing import Any, Dict, List

from torch import Tensor
from torch.nn import Linear, Module, ModuleList

import torch_frame
from torch_frame import TensorFrame, stype
from torch_frame.data.stats import StatType
from torch_frame.nn.conv import TabTransformerConv
from torch_frame.nn.encoder import (
    EmbeddingEncoder,
    LinearEncoder,
    StypeWiseFeatureEncoder,
)


class TabEncoder(Module):
    def __init__(
        self,
        hidden_size: int,
        output_size: int,
        num_layers: int,
        num_heads: int,
        col_stats: Dict[str, Dict[StatType, Any]],
        col_names_dict: Dict[torch_frame.stype, List[str]],
    ):
        super().__init__()
        self.encoder = StypeWiseFeatureEncoder(
            out_channels=hidden_size,
            col_stats=col_stats,
            col_names_dict=col_names_dict,
            stype_encoder_dict={
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder()
            },
        )
        self.tab_transformer_convs = ModuleList([
            TabTransformerConv(
                channels=hidden_size,
                num_heads=num_heads,
            ) for _ in range(num_layers)
        ])
        self.aggregator = Linear(hidden_size, output_size)

    def forward(self, tf: TensorFrame) -> Tensor:
        x, _ = self.encoder(tf)
        for tab_transformer_conv in self.tab_transformer_convs:
            x = tab_transformer_conv(x)
        out = self.aggregator(x.mean(dim=1))
        return out

In [41]:
model = TabEncoder(
    hidden_size=256,
    output_size=64,
    num_layers=3,
    num_heads=8,
    col_stats=train_data.col_stats,
    col_names_dict=train_data.tensor_frame.col_names_dict,
).to(device)

In [42]:
xx = next(iter(train_loader))
y = model(xx.to(device))

In [45]:
xx.y

In [44]:
y

tensor([[-0.0304, -0.0303, -0.0299, -0.0375, -0.0466, -0.0027, -0.0418, -0.0426,
         -0.0293, -0.0646, -0.0043,  0.0447,  0.0745, -0.0342, -0.0002,  0.0457,
         -0.0242,  0.0316, -0.0397, -0.0048,  0.0160, -0.0299, -0.0487, -0.0344,
          0.0129,  0.0260,  0.0072, -0.0262, -0.0710,  0.0639, -0.0516,  0.0242,
          0.0102, -0.0139,  0.0007,  0.0206, -0.0117,  0.0280, -0.0086,  0.0528,
         -0.0283, -0.0374,  0.0257, -0.0641, -0.0070,  0.0590, -0.0139, -0.0026,
         -0.0180,  0.0847, -0.0059,  0.0022, -0.0314, -0.0223, -0.0384, -0.0639,
          0.0187, -0.0420, -0.0032, -0.0059,  0.0532,  0.0212, -0.0022,  0.0221],
        [-0.0011,  0.0002, -0.0214, -0.1086, -0.0510, -0.0578, -0.0719, -0.0463,
         -0.0784, -0.1176, -0.0196,  0.0210,  0.0928, -0.0345,  0.0223,  0.0399,
         -0.0217,  0.0710, -0.0673,  0.0362,  0.0064,  0.0209, -0.0907, -0.1350,
          0.0236,  0.0436, -0.0450, -0.0370, -0.0704,  0.0712, -0.0547, -0.0465,
          0.0079, -0.0185, 

In [46]:
train_data.index

AttributeError: 'Dataset' object has no attribute 'index'