<a href="https://colab.research.google.com/github/MengOonLee/Deep_learning/blob/master/PyTorch/Transformer/TabTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Column names applied to both CSV files, which are read with header=None
# (i.e. no header row is taken from the files themselves).
CSV_HEADERS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket',
]

train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Both frames are fetched from the URLs above every time this cell runs.
df_train = pd.read_csv(train_url, names=CSV_HEADERS, header=None)
# The first line of the test file is skipped
# (presumably a non-data banner line -- TODO confirm against the file).
df_test = pd.read_csv(test_url, names=CSV_HEADERS, header=None, skiprows=1)

def load_data(df):
    """Split a raw census frame into numeric features, categorical features and labels.

    The input frame is not modified. The 'fnlwgt' and 'education_num' columns
    are dropped and the index is reset before splitting.

    Args:
        df: DataFrame holding at least the columns referenced below
            ('fnlwgt', 'education_num', the three numeric columns and
            'income_bracket').

    Returns:
        A tuple ``(X_num, X_cat, y)``:
        - ``X_num``: the numeric columns cast to float32.
        - ``X_cat``: every remaining non-label column, cast to ``str`` with
          surrounding whitespace stripped.
        - ``y``: 'income_bracket' with every '.' removed and whitespace stripped.
    """
    label_col = 'income_bracket'
    numeric_cols = ['capital_gain', 'capital_loss', 'hours_per_week']

    frame = df.drop(columns=['fnlwgt', 'education_num']).reset_index(drop=True)

    # Everything that is neither numeric nor the label is treated as categorical.
    # NOTE(review): 'age' is not listed in numeric_cols, so it ends up here as a
    # string category -- confirm that this is intended.
    excluded = numeric_cols + [label_col]
    categoric_cols = [name for name in frame.columns if name not in excluded]

    X_num = frame[numeric_cols].astype('float32')

    as_text = frame[categoric_cols].astype(str)
    X_cat = as_text.apply(lambda col: col.str.strip())

    # Dots are removed before stripping so labels such as ' >50K.' and ' >50K'
    # collapse to the same value.
    without_dots = frame[label_col].str.replace('.', '', regex=False)
    y = without_dots.str.strip()

    return X_num, X_cat, y

# Run the same column selection and cleaning over both splits.
X_num_train, X_cat_train, y_train = load_data(df_train)
X_num_test, X_cat_test, y_test = load_data(df_test)

In [None]:
from sklearn import preprocessing

# Labels: the binarizer is fitted on the training labels and reused unchanged
# for the test labels; the result is squeezed and cast to int.
y_lb = preprocessing.LabelBinarizer()
y_train = y_lb.fit_transform(y_train).squeeze().astype(int)
y_test = y_lb.transform(y_test).squeeze().astype(int)

# Numeric features: scaling statistics come from the training split only.
num_scaler = preprocessing.StandardScaler()
X_num_train = num_scaler.fit_transform(X_num_train).astype('float32')
X_num_test = num_scaler.transform(X_num_test).astype('float32')
num_features = X_num_train.shape[1]

# Categorical features: values not seen during fit are encoded as -1, so the
# +1 shift puts them at index 0 and the fitted categories at 1..n.
cat_encoder = preprocessing.OrdinalEncoder(
    handle_unknown='use_encoded_value', unknown_value=-1)
X_cat_train = 1 + cat_encoder.fit_transform(X_cat_train).astype(int)
X_cat_test = 1 + cat_encoder.transform(X_cat_test).astype(int)
# One extra slot per column accounts for the shifted "unknown" index 0.
cat_cardinalities = [1 + len(categories) for categories in cat_encoder.categories_]

In [None]:
import torch
# Reset PyTorch's global RNG seed at the top of this cell.
torch.manual_seed(42)

class CensusDataset(torch.utils.data.Dataset):
    """Dataset of (numeric features, categorical indices, label) triples.

    The three inputs are copied into tensors once at construction time:
    ``X_num`` as float32, ``X_cat`` and ``y`` as int64 (``torch.long``).
    """

    def __init__(self, X_num, X_cat, y):
        """Convert the array-likes to tensors with the dtypes listed on the class."""
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Return the number of labels held."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(X_num, X_cat, y)`` entries stored at ``idx``."""
        numeric = self.X_num[idx]
        categorical = self.X_cat[idx]
        label = self.y[idx]
        return numeric, categorical, label

# Wrap the preprocessed arrays; neither constructor draws random numbers.
ds_temp = CensusDataset(X_num=X_num_train, X_cat=X_cat_train, y=y_train)
ds_test = CensusDataset(X_num=X_num_test, X_cat=X_cat_test, y=y_test)

# 90% / 10% train-validation split, driven by its own generator seeded with 42.
ds_train, ds_val = torch.utils.data.random_split(
    ds_temp, [0.9, 0.1], generator=torch.Generator().manual_seed(42))

# Only the training loader shuffles; validation and test keep dataset order.
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=256, shuffle=True)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=256, shuffle=False)
dl_test = torch.utils.data.DataLoader(ds_test, batch_size=256, shuffle=False)

In [None]:
import torch
# Reset PyTorch's global RNG seed before the model below is constructed.
torch.manual_seed(42)

class CensusClassifier(torch.nn.Module):
    """MLP classifier over numeric features plus embedded categorical features.

    Every categorical column gets its own embedding table. The embedded
    columns are concatenated after the numeric features and passed through a
    stack of ``Linear -> ReLU -> Dropout`` blocks followed by a final linear
    layer that produces one logit per class.

    Args:
        num_features: Number of numeric input columns.
        cat_cardinalities: One entry per categorical column giving the number
            of rows in that column's embedding table. Index 0 is the padding
            index of every table. May be empty for a numeric-only model.
        hidden_dims: Output widths of the hidden linear layers, in order.
        dropout: Dropout probability applied after every hidden ReLU.
        num_classes: Number of output logits.
    """

    def __init__(self, num_features, cat_cardinalities, hidden_dims=(64, 32),
            dropout=0.3, num_classes=2):
        super().__init__()
        # Embeddings are created first and the linear layers afterwards, in
        # order, so the sequence of random initialisations is unchanged when
        # the defaults are used.
        self.embedding_layers = torch.nn.ModuleList(modules=[
            torch.nn.Embedding(num_embeddings=c,
                embedding_dim=self._embedding_dim(c),
                padding_idx=0)
            for c in cat_cardinalities])

        in_features = num_features + sum(e.embedding_dim for e in self.embedding_layers)

        layers = []
        for h in hidden_dims:
            layers.append(torch.nn.Linear(in_features=in_features, out_features=h))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(p=dropout))
            in_features = h
        layers.append(torch.nn.Linear(in_features, num_classes))
        self.fc = torch.nn.Sequential(*layers)

    @staticmethod
    def _embedding_dim(cardinality):
        """Return the embedding width: fourth root of the cardinality, clamped to [1, 8]."""
        return int(min(8, max(1, round(cardinality**0.25))))

    def forward(self, X_num, X_cat):
        """Return class logits for a batch.

        Args:
            X_num: Numeric features; concatenated along dim 1.
            X_cat: Integer category indices; column ``i`` is looked up in
                embedding table ``i``. Ignored when the model has no
                embedding tables.

        Returns:
            The output of the final linear layer (one logit per class).
        """
        # torch.cat rejects an empty list, so a model built without categorical
        # columns feeds the numeric features straight into the MLP.
        if len(self.embedding_layers) == 0:
            return self.fc(X_num)

        X_emb = [emb(X_cat[:, i]) for i, emb in enumerate(self.embedding_layers)]
        X = torch.cat(tensors=[X_num, *X_emb], dim=1)
        return self.fc(X)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the classifier from the shapes computed during preprocessing and move
# it to the selected device in one go.
model = CensusClassifier(num_features=num_features,
    cat_cardinalities=cat_cardinalities).to(device)

# Count every parameter element registered on the model.
total_params = sum(param.numel() for param in model.parameters())
print('Total parameters:', total_params)

In [None]:
import torch
# Reset PyTorch's global RNG seed at the top of this cell.
torch.manual_seed(42)

# Cross-entropy over the model's logits, optimised with SGD + momentum.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and report loss and accuracy.

    Args:
        dataloader: Iterable yielding ``(X_num, X_cat, y)`` tensor batches.
        model: Module called as ``model(X_num=..., X_cat=...)`` that returns
            per-class scores.
        loss_fn: Callable invoked as ``loss_fn(input=scores, target=y)``.
        optimizer: Optimizer that owns the model's parameters.

    Returns:
        ``(train_loss, train_acc)``: the mean of the per-batch losses and the
        fraction of processed samples whose argmax prediction equals the label.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.train()

    # Send each batch to wherever the model's parameters live, so the function
    # does not rely on a module-level ``device`` staying in sync with the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Drop any gradients left over from earlier backward passes; otherwise they
    # would be added to the first batch's gradients.
    optimizer.zero_grad()

    train_loss, num_correct = 0.0, 0
    num_batches, num_samples = 0, 0
    for X_num, X_cat, y in dataloader:
        X_num, X_cat, y = X_num.to(device), X_cat.to(device), y.to(device)

        y_pred = model(X_num=X_num, X_cat=X_cat)
        loss = loss_fn(input=y_pred, target=y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        num_correct += (y_pred.argmax(dim=-1) == y).sum().item()
        # Count what was actually processed so the averages stay correct when
        # the loader drops or subsamples data.
        num_batches += 1
        num_samples += y.size(0)

    train_loss /= num_batches
    train_acc = num_correct / num_samples
    return train_loss, train_acc

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` with gradients disabled.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        dataloader: Iterable yielding ``(X_num, X_cat, y)`` tensor batches.
        model: Module called as ``model(X_num=..., X_cat=...)`` that returns
            per-class scores. It is switched to eval mode.
        loss_fn: Callable invoked as ``loss_fn(input=scores, target=y)``.

    Returns:
        ``(test_loss, test_acc)``: the mean of the per-batch losses and the
        fraction of processed samples whose argmax prediction equals the label.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()
    test_loss, num_correct = 0.0, 0
    num_batches, num_samples = 0, 0
    with torch.no_grad():
        for X_num, X_cat, y in dataloader:
            X_num, X_cat, y = X_num.to(device), X_cat.to(device), y.to(device)
            y_pred = model(X_num=X_num, X_cat=X_cat)
            test_loss += loss_fn(input=y_pred, target=y).item()
            num_correct += (y_pred.argmax(dim=-1) == y).sum().item()
            # Count what was actually evaluated so the averages stay correct
            # when the loader drops or subsamples data.
            num_batches += 1
            num_samples += y.size(0)
    test_loss /= num_batches
    test_acc = num_correct / num_samples
    return test_loss, test_acc

In [None]:
import numpy as np
import torch
# Reset PyTorch's global RNG seed before the training run below.
torch.manual_seed(42)

def train_model(model, dl_train, dl_val, loss_fn, optimizer,
        epochs, patience, save_path):
    """Train ``model`` for ``epochs`` epochs and restore its best checkpoint.

    After every epoch the model is evaluated on ``dl_val``. The validation
    loss drives a ``ReduceLROnPlateau`` scheduler (factor 0.5), and the model's
    state dict is written to ``save_path`` whenever the validation loss
    improves on the best value seen so far.

    Args:
        model: Module to train; updated in place.
        dl_train: Training batches, passed to ``train``.
        dl_val: Validation batches, passed to ``test``.
        loss_fn: Loss callable forwarded to ``train`` and ``test``.
        optimizer: Optimizer forwarded to ``train`` and wrapped by the scheduler.
        epochs: Number of epochs to run.
        patience: Scheduler patience, i.e. epochs without improvement before
            the learning rate is halved. It does not stop training early.
        save_path: File the best state dict is written to.

    Returns:
        ``(model, history)`` where ``history`` maps 'train_loss', 'train_acc',
        'val_loss' and 'val_acc' to per-epoch lists. The model carries the best
        checkpoint's weights if one was written during this call; otherwise it
        keeps the weights it ended with.
    """
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min',
        factor=0.5, patience=patience)

    best_val_loss = float('inf')
    # Tracks whether this call wrote save_path, so a missing or stale file is
    # never loaded (e.g. epochs == 0, or a validation loss that is always NaN).
    checkpoint_saved = False

    history = {
        'train_loss': [], 'train_acc':[],
        'val_loss': [], 'val_acc': []
    }
    for epoch in range(epochs):
        train_loss, train_acc = train(dataloader=dl_train, model=model, loss_fn=loss_fn,
            optimizer=optimizer)
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        val_loss, val_acc = test(dataloader=dl_val, model=model, loss_fn=loss_fn)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        scheduler.step(val_loss)

        # Six decimals keep a halved learning rate visible; three decimals
        # cannot distinguish 0.001 from 0.0005.
        print("Epoch: %d, LR: %.6f, val_acc: %.3f"%(
            epoch, optimizer.param_groups[0]['lr'], val_acc))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), save_path)
            checkpoint_saved = True

    if checkpoint_saved:
        model.load_state_dict(torch.load(save_path, weights_only=True))
    return model, history

# Train for 10 epochs with a scheduler patience of 3; the best state dict is
# written to 'census_classifier_best.pth'.
model, history = train_model(
    model=model,
    dl_train=dl_train,
    dl_val=dl_val,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=10,
    patience=3,
    save_path='census_classifier_best.pth')