<a href="https://colab.research.google.com/github/MengOonLee/Deep_learning/blob/master/PyTorch/Transformer/TabTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Column names handed to `pd.read_csv(names=...)` when the data files are loaded.
csv_headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket',
]

def create_X_y(df):
    """Split a raw frame into numeric features, categorical features and labels.

    The input frame is left untouched. The 'fnlwgt' and 'education_num'
    columns are discarded and rows containing NaN are dropped before the
    split.

    Args:
        df: DataFrame that contains at least 'fnlwgt', 'education_num',
            'income_bracket', 'capital_gain', 'capital_loss' and
            'hours_per_week'.

    Returns:
        A tuple ``(X_num, X_cat, y)`` sharing the same index:

        * ``X_num`` - float32 DataFrame holding 'capital_gain',
          'capital_loss' and 'hours_per_week'.
        * ``X_cat`` - independent DataFrame with every remaining feature
          column (this includes 'age' when present) converted to
          whitespace-stripped strings.
        * ``y`` - float32 Series, 0.0 for '<=50K' and 1.0 for '>50K'; any
          other label becomes NaN.
    """
    df = df.drop(columns=['fnlwgt', 'education_num']).dropna()

    # regex=False makes '.' a literal dot. With regex=True (the default in
    # pandas < 2.0) '.' matches every character and wipes the whole label.
    labels = df['income_bracket'].astype('str')\
        .str.replace('.', '', regex=False).str.strip()
    y = labels.map({'<=50K': 0, '>50K': 1}).astype('float32')

    X = df.drop(columns=['income_bracket'])

    numeric_headers = ['capital_gain', 'capital_loss', 'hours_per_week']
    X_num = X[numeric_headers].astype('float32')

    categoric_headers = [n for n in X.columns if n not in numeric_headers]
    # Explicit copy: callers assign encoded values back into X_cat, which
    # must not be treated as a view of the intermediate frame.
    X_cat = X[categoric_headers].copy()
    for col in categoric_headers:
        X_cat[col] = X_cat[col].astype('str').str.strip()

    return X_num, X_cat, y

# NOTE(review): these files are believed to mark missing values with '?',
# which read_csv does not parse as NaN by default, so the dropna() inside
# create_X_y may be a no-op - confirm the intended handling.

# Training split: no header row is read; column names come from csv_headers.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df_train = pd.read_csv(url, names=csv_headers, header=None)
X_num_train, X_cat_train, y_train = create_X_y(df_train)

# Test split: header=0 consumes the first line of the file and replaces it
# with csv_headers (presumably that line is not a data row - confirm).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
df_test = pd.read_csv(url, names=csv_headers, header=0)
X_num_test, X_cat_test, y_test = create_X_y(df_test)

# Integer-encode every categorical column in place, using a vocabulary built
# from the training split only. Ids start at 1; id 0 is reserved for
# '__UNK__', i.e. any value that has no entry in the training vocabulary.
cat_maps = {}
for col in X_cat_train.columns:
    cat_maps[col] = {val: i for i, val in
        enumerate(sorted(X_cat_train[col].unique()), start=1)}
    cat_maps[col]['__UNK__'] = 0
    unk_id = cat_maps[col]['__UNK__']
    # Series.map yields NaN for values missing from the dict; without the
    # fillna a category that only occurs in the test split would reach the
    # tensor conversion as NaN instead of the reserved unknown id.
    X_cat_train[col] = X_cat_train[col].map(cat_maps[col])\
        .fillna(unk_id).astype('int64')
    X_cat_test[col] = X_cat_test[col].map(cat_maps[col])\
        .fillna(unk_id).astype('int64')

In [3]:
import torch

class TabularDataset(torch.utils.data.Dataset):
    """In-memory dataset yielding (numeric, categorical, target) per row.

    Args:
        X_num: object whose ``.values`` holds the numeric features; stored
            as a float32 tensor.
        X_cat: object whose ``.values`` holds the categorical features;
            stored as an int64 tensor.
        y: object whose ``.values`` holds the targets; stored as a float32
            tensor.
    """

    def __init__(self, X_num, X_cat, y):
        to_tensor = torch.tensor
        self.X_num = to_tensor(X_num.values, dtype=torch.float32)
        self.X_cat = to_tensor(X_cat.values, dtype=torch.int64)
        self.y = to_tensor(y.values, dtype=torch.float32)

    def __len__(self):
        """Return the number of stored targets."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(X_num, X_cat, y)`` entries selected by ``idx``."""
        numeric_part = self.X_num[idx]
        categoric_part = self.X_cat[idx]
        target = self.y[idx]
        return numeric_part, categoric_part, target

# Wrap each split in a TabularDataset.
ds_train = TabularDataset(X_num_train, X_cat_train, y_train)
ds_test = TabularDataset(X_num_test, X_cat_test, y_test)

# Both loaders use batches of 256; only the training loader shuffles.
dl_train = torch.utils.data.DataLoader(
    ds_train, shuffle=True, batch_size=256)
dl_test = torch.utils.data.DataLoader(
    ds_test, shuffle=False, batch_size=256)

<torch.utils.data.dataloader.DataLoader at 0x78fad1f2cef0>

In [None]:
import torch

# Per-column statistics of the numeric training features (axis=0 reduces
# over rows).
mean = X_num_train.values.mean(axis=0)
std = X_num_train.values.std(axis=0)