<a href="https://colab.research.google.com/github/MengOonLee/Deep_learning/blob/master/PyTorch/Transformer/TabTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Column names, in order, handed to pd.read_csv(names=...) when loading the
# census CSV files.
csv_headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket',
]

def create_X_y(df):
    """Split a raw census frame into numeric features, categorical features and labels.

    The input frame is not modified. The 'fnlwgt' and 'education_num' columns
    are discarded and every row containing a NaN is dropped before splitting.

    Args:
        df: DataFrame holding at least 'fnlwgt', 'education_num',
            'capital_gain', 'capital_loss', 'hours_per_week' and
            'income_bracket'; every other column is treated as categorical.

    Returns:
        A tuple ``(X_num, X_cat, y)``:
          * X_num: float32 ndarray with the columns capital_gain,
            capital_loss, hours_per_week (in that order).
          * X_cat: DataFrame of whitespace-stripped strings, one column per
            remaining feature. It is an independent copy, so callers may
            assign to its columns freely.
          * y: float32 ndarray, 0.0 for '<=50K' and 1.0 for '>50K'
            (NaN for any label outside those two).
    """
    df = df.drop(columns=['fnlwgt', 'education_num']).dropna()

    # Labels may carry periods and padding (e.g. ' >50K.').  regex=False makes
    # '.' a literal on every pandas version; under the old regex=True default
    # it is a wildcard that would erase the whole label.
    labels = df['income_bracket'].astype('str')\
        .str.replace('.', '', regex=False).str.strip()
    y = labels.map({'<=50K': 0, '>50K': 1}).astype('float32')

    X = df.drop(columns=['income_bracket'])

    numeric_headers = ['capital_gain', 'capital_loss', 'hours_per_week']
    X_num = X[numeric_headers].astype('float32')

    categoric_headers = [n for n in X.columns if n not in numeric_headers]
    # Explicit copy: the caller re-assigns these columns when encoding them.
    X_cat = X[categoric_headers].copy()
    for col in categoric_headers:
        X_cat[col] = X_cat[col].astype('str').str.strip()

    return X_num.values, X_cat, y.values

# Training split: the file has no header row, so csv_headers supplies the names.
# NOTE(review): values are read verbatim; if the files mark missing entries
# with a placeholder string rather than an empty field, the dropna() inside
# create_X_y will not remove those rows -- confirm this is intended.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df_train = pd.read_csv(url, names=csv_headers, header=None)
X_num_train, X_cat_train, y_train = create_X_y(df_train)

# Test split: header=0 consumes the file's first line and replaces it with
# csv_headers, so that line never becomes a data row.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
df_test = pd.read_csv(url, names=csv_headers, header=0)
X_num_test, X_cat_test, y_test = create_X_y(df_test)

# Integer-encode every categorical column.  Each vocabulary is built from the
# TRAINING split only: known values get 1..n in sorted order and index 0 is
# reserved for '__UNK__'.
# Work on copies so the frames returned by create_X_y are not written through.
X_cat_train, X_cat_test = X_cat_train.copy(), X_cat_test.copy()
cat_maps = {}
for col in X_cat_train.columns:
    cat_maps[col] = {val: i+1 for i, val in
        enumerate(sorted(X_cat_train[col].unique()))}
    cat_maps[col]['__UNK__'] = 0
    unk_index = cat_maps[col]['__UNK__']
    # Series.map yields NaN for values missing from the dict (e.g. a category
    # that only occurs in the test split).  Send those to the UNK index and
    # force an integer dtype; otherwise .values becomes a float array with
    # NaNs that cannot be meaningfully cast to torch.long.
    X_cat_train[col] = X_cat_train[col].map(cat_maps[col])\
        .fillna(unk_index).astype('int64')
    X_cat_test[col] = X_cat_test[col].map(cat_maps[col])\
        .fillna(unk_index).astype('int64')
X_cat_train, X_cat_test = X_cat_train.values, X_cat_test.values

In [2]:
import torch

class TabularDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves (numeric, categorical, target) triples.

    All three inputs are converted to tensors once, at construction time:
    numeric features and targets as float32, categorical codes as int64.
    """

    def __init__(self, X_num, X_cat, y):
        """Store tensor copies of the numeric, categorical and target data."""
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples, taken from the target tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(x_num, x_cat, y)`` entries stored at ``idx``."""
        numeric = self.X_num[idx]
        categorical = self.X_cat[idx]
        target = self.y[idx]
        return numeric, categorical, target

# Wrap each split in a dataset, then in a batch loader.
ds_train = TabularDataset(X_num_train, X_cat_train, y_train)
ds_test = TabularDataset(X_num_test, X_cat_test, y_test)

# Only the training loader reshuffles; the test loader keeps dataset order.
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=256, shuffle=True)
dl_test = torch.utils.data.DataLoader(ds_test, batch_size=256, shuffle=False)

In [3]:
import torch

class Normalization(torch.nn.Module):
    """Standardise inputs with fixed statistics: ``(x - mean) / std``.

    ``mean`` and ``std`` are stored as float32 buffers, so they follow the
    module across devices and appear in its ``state_dict`` without being
    trainable parameters.

    Args:
        mean: Per-feature means (array-like or tensor), broadcastable to the
            inputs passed to ``forward``.
        std: Per-feature standard deviations, same shape as ``mean``. Entries
            equal to zero (constant features) are replaced by 1 so the
            division cannot produce NaN or inf.
    """

    def __init__(self, mean, std):
        super().__init__()
        # as_tensor + clone accepts arrays, lists and tensors alike and always
        # yields storage that is independent of the caller's object.
        mean_t = torch.as_tensor(mean, dtype=torch.float32).detach().clone()
        std_t = torch.as_tensor(std, dtype=torch.float32).detach().clone()
        # A constant column has std == 0; dividing by it would give 0/0 = NaN
        # (or +/-inf for unseen values).  Using 1 leaves such columns centred.
        std_t = torch.where(std_t == 0, torch.ones_like(std_t), std_t)
        self.register_buffer('mean', mean_t)
        self.register_buffer('std', std_t)

    def forward(self, x):
        """Return ``x`` shifted by the stored mean and scaled by the stored std."""
        return (x - self.mean) / self.std

In [19]:
import torch

# Per-column statistics of the training numerics, used for standardisation.
mean = X_num_train.mean(axis=0)
std = X_num_train.std(axis=0)
# Number of numeric feature columns.
num_dim = X_num_train.shape[1]
# Embedding table size per categorical column: largest training code plus one,
# so every code from 0 up to the maximum has a row.
cat_cardinalities = 1 + X_cat_train.max(axis=0)

class ClassifyModel(torch.nn.Module):
    """Feature front-end: standardises numerics and embeds each categorical column.

    Args:
        cat_cardinalities: Iterable with one embedding-table size per
            categorical column, in column order.
    """

    def __init__(self, cat_cardinalities):
        super().__init__()
        # `mean` and `std` are read from module-level variables, not arguments.
        self.normalizer = Normalization(mean=mean, std=std)

        # One embedding table per categorical column; the width is half the
        # cardinality (rounded up), capped at 4.
        tables = []
        for cardinality in cat_cardinalities:
            width = min(4, (cardinality + 1) // 2)
            tables.append(torch.nn.Embedding(
                num_embeddings=cardinality, embedding_dim=width))
        self.emb_layers = torch.nn.ModuleList(modules=tables)

    def forward(self, x_num, x_cat):
        """Return the normalised numerics and a list of per-column embeddings.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding table.
        """
        normalized = self.normalizer(x_num)

        embedded = []
        for i, table in enumerate(self.emb_layers):
            embedded.append(table(x_cat[:, i]))

        return normalized, embedded

# Smoke test: push the first ten training rows through a fresh model.
model = ClassifyModel(cat_cardinalities)
x_num = torch.tensor(data=X_num_train[:10], dtype=torch.float32)
# Despite its name, x_emb holds the integer category codes, not embeddings.
x_emb = torch.tensor(data=X_cat_train[:10], dtype=torch.long)
model(x_num, x_emb)

(tensor([[ 0.1485, -0.2167, -0.0354],
         [-0.1459, -0.2167, -2.2222],
         [-0.1459, -0.2167, -0.0354],
         [-0.1459, -0.2167, -0.0354],
         [-0.1459, -0.2167, -0.0354],
         [-0.1459, -0.2167, -0.0354],
         [-0.1459, -0.2167, -1.9792],
         [-0.1459, -0.2167,  0.3695],
         [ 1.7611, -0.2167,  0.7745],
         [ 0.5552, -0.2167, -0.0354]]),
 [tensor([[ 0.1893,  0.0246,  1.2913,  0.1899],
          [-1.2577,  0.3759, -1.3739, -0.6427],
          [-0.6679,  1.1394,  0.0518, -1.0582],
          [ 0.7615, -0.6128,  0.5224, -1.1209],
          [ 1.9621,  0.8056, -0.0843, -0.5087],
          [-0.9374,  0.6213, -1.7214, -1.5217],
          [-0.4241,  0.7210, -1.9187, -0.0584],
          [ 1.8087, -0.4617,  1.1644,  0.9504],
          [ 0.6953, -0.0394, -1.3335, -0.9625],
          [ 0.0104, -0.0883,  1.0266, -0.0743]], grad_fn=<EmbeddingBackward0>),
  tensor([[-1.7141,  0.7404,  0.4529, -0.0996],
          [ 0.6384,  0.9471, -0.2653,  0.5266],
          