In [1]:
import pandas as pd

# Column names applied to both files; they are read with header=None, so the
# names are supplied here rather than taken from the files.
CSV_HEADERS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket',
]

# Source locations of the train and test splits.
train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

df_train = pd.read_csv(train_url, names=CSV_HEADERS, header=None)
# skiprows=1 drops the first line of the test file
# (presumably a non-data banner line -- TODO confirm against the file).
df_test = pd.read_csv(test_url, names=CSV_HEADERS, header=None, skiprows=1)

def load_data(df):
    """Split a raw census frame into numeric features, categorical features and labels.

    Args:
        df: DataFrame containing at least the columns 'fnlwgt', 'education_num',
            'capital_gain', 'capital_loss', 'hours_per_week' and 'income_bracket'.
            The frame is not modified.

    Returns:
        Tuple ``(X_num, X_cat, y)``:
            X_num: float32 DataFrame with 'capital_gain', 'capital_loss' and
                'hours_per_week'.
            X_cat: DataFrame of whitespace-stripped strings holding every
                remaining column except the label.
            y: Series of whitespace-stripped label strings with every '.'
                removed, so that e.g. '>50K.' and '>50K' become the same label.
    """
    # drop() already returns a new frame, so the caller's frame stays untouched.
    df = df.drop(columns=['fnlwgt', 'education_num']).reset_index(drop=True)

    numeric_cols = ['capital_gain', 'capital_loss', 'hours_per_week']
    X_num = df[numeric_cols].astype('float32')

    # Everything not listed in numeric_cols is handled as a categorical string.
    # NOTE(review): for the census frame this includes 'age' -- confirm that
    # treating age as a category (rather than a number) is intended.
    categoric_cols = [c for c in df.columns if c not in numeric_cols]
    X_cat = df[categoric_cols].astype(str).apply(lambda s: s.str.strip())

    # regex=False is essential: as a regular expression '.' matches every
    # character, which (on pandas versions where regex defaults to True)
    # would wipe the labels to empty strings.
    y = X_cat['income_bracket'].str.replace('.', '', regex=False)
    X_cat = X_cat.drop(columns=['income_bracket'])

    return X_num, X_cat, y

# Split each raw frame into numeric features, categorical features and labels.
X_num_train, X_cat_train, y_train = load_data(df_train)
X_num_test, X_cat_test, y_test = load_data(df_test)

In [3]:
from sklearn import preprocessing

# Every preprocessor is fitted on the training split only.
y_lb = preprocessing.LabelBinarizer().fit(y_train)

num_scaler = preprocessing.StandardScaler().fit(X_num_train)
num_features = num_scaler.n_features_in_

# Categories not seen during fitting are encoded as -1.
cat_encoder = preprocessing.OrdinalEncoder(
    handle_unknown='use_encoded_value', unknown_value=-1,
).fit(X_cat_train)
# One extra slot per column beyond the known categories; CensusDataset shifts
# all codes by +1, so the unknown code -1 ends up at index 0.
cat_cardinalities = [len(categories) + 1 for categories in cat_encoder.categories_]

In [8]:
import torch
# Fix torch's global RNG so this cell is reproducible on re-run.
torch.manual_seed(42)

class CensusDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset yielding (numeric features, categorical codes, label).

    The three inputs are transformed once, at construction time, with the
    preprocessors passed in; the results are stored as tensors.
    """

    def __init__(self, X_num, X_cat, y, num_scaler, cat_encoder, y_lb):
        """Transform the raw inputs and store them as tensors.

        Args:
            X_num: numeric features, passed to ``num_scaler.transform``.
            X_cat: categorical features, passed to ``cat_encoder.transform``.
            y: labels, passed to ``y_lb.transform``.
            num_scaler: object exposing ``transform`` for the numeric features.
            cat_encoder: object exposing ``transform`` for the categorical features.
            y_lb: object exposing ``transform`` for the labels.
        """
        scaled = num_scaler.transform(X_num)
        # Codes are shifted up by one; with an encoder that emits -1 for
        # unknown categories this maps unknowns to index 0.
        codes = cat_encoder.transform(X_cat) + 1
        # Drop singleton axes from the binarizer output to get a flat label vector.
        labels = y_lb.transform(y).astype('int').squeeze()

        self.X_num = torch.tensor(scaled, dtype=torch.float32)
        self.X_cat = torch.tensor(codes, dtype=torch.long)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(X_num, X_cat, y)`` entries at position ``idx``."""
        sample = (self.X_num[idx], self.X_cat[idx], self.y[idx])
        return sample

# Full training data, encoded with the preprocessors fitted earlier.
ds_temp = CensusDataset(X_num_train, X_cat_train, y_train,
                        num_scaler, cat_encoder, y_lb)

# 80/20 train/validation split driven by a dedicated, seeded generator.
ds_train, ds_val = torch.utils.data.random_split(
    ds_temp, [0.8, 0.2], generator=torch.Generator().manual_seed(42))

# Only the training loader shuffles.
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=256, shuffle=True)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=256)

# Test data goes through the same preprocessors as the training data.
ds_test = CensusDataset(X_num_test, X_cat_test, y_test,
                        num_scaler, cat_encoder, y_lb)
dl_test = torch.utils.data.DataLoader(ds_test, batch_size=256)

In [24]:
import torch
# Re-seed torch's global RNG before the model is built in this cell.
torch.manual_seed(42)

class CensusClassifier(torch.nn.Module):
    """Two-class classifier over numeric features plus one embedding per categorical column.

    Each categorical column gets its own ``torch.nn.Embedding``; the embedded
    columns are concatenated with the numeric features and passed through a
    small fully connected network with two outputs.
    """

    def __init__(self, num_features, cat_cardinalities):
        """Build the embedding tables and the fully connected head.

        Args:
            num_features: number of numeric input features.
            cat_cardinalities: iterable with the number of distinct indices of
                each categorical column (one embedding table per entry).
        """
        super().__init__()
        # Embedding width grows with the fourth root of the cardinality,
        # clamped to the range [1, 50].
        self.embedding_layers = torch.nn.ModuleList(modules=[
            torch.nn.Embedding(num_embeddings=c,
                embedding_dim=int(min(50, max(1, round(c**0.25)))))
            for c in cat_cardinalities])

        in_features = num_features + sum(e.embedding_dim for e in self.embedding_layers)
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=in_features, out_features=64),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=64, out_features=2)
        )

    def forward(self, X_num, X_cat):
        """Return raw (unnormalized) class logits of shape ``(batch, 2)``.

        Args:
            X_num: float tensor of numeric features, one row per sample.
            X_cat: integer tensor of category indices; column ``i`` is looked
                up in ``self.embedding_layers[i]``.

        Returns:
            Logits straight from the last linear layer. No sigmoid/softmax is
            applied here: losses such as ``torch.nn.CrossEntropyLoss`` apply
            log-softmax themselves and expect unnormalized scores. Squashing
            the scores into (0, 1) first would cap how confident the model can
            become. Apply ``softmax`` to the output when probabilities are needed.
        """
        X_emb = [emb(X_cat[:, i]) for i, emb in enumerate(self.embedding_layers)]
        # A single concatenation also works when there are no categorical columns.
        X = torch.cat(tensors=[X_num, *X_emb], dim=1)
        return self.fc(X)

# Use the current accelerator when one is available, otherwise the CPU.
if torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
else:
    device = 'cpu'

model = CensusClassifier(num_features=num_features,
                         cat_cardinalities=cat_cardinalities)
model = model.to(device)

# Count every element of every parameter tensor.
total_params = sum(param.numel() for param in model.parameters())
print('Total parameters:', total_params)

Total parameters: 2084


In [None]:
import torch
# Re-seed torch's global RNG before training starts in this cell.
torch.manual_seed(42)

# Cross-entropy over the model's two outputs, optimized with plain SGD.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: iterable yielding ``(X_num, X_cat, y)`` batches.
        model: module called as ``model(X_num=..., X_cat=...)``.
        loss_fn: callable invoked as ``loss_fn(input=predictions, target=y)``.
        optimizer: optimizer holding the model's parameters.

    Returns:
        The sample-weighted mean loss over the pass as a float, or NaN when
        the dataloader yields no batches.
    """
    # Follow the model's own device so inputs AND targets land where the
    # parameters live, without depending on a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else 'cpu'

    model.train()
    loss_sum, n_samples = 0.0, 0
    for X_num, X_cat, y in dataloader:
        X_num = X_num.to(target_device)
        X_cat = X_cat.to(target_device)
        y = y.to(target_device)

        optimizer.zero_grad()
        y_pred = model(X_num=X_num, X_cat=X_cat)
        loss = loss_fn(input=y_pred, target=y)
        # Without backward() no gradients exist and optimizer.step() is a no-op.
        loss.backward()
        optimizer.step()

        batch_size = len(y)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    return loss_sum / n_samples if n_samples else float('nan')

# One training pass over the training loader.
train(dl_train, model, loss_fn, optimizer)
        
# def test(dataloader, model, loss_fn):
#     model.eval()
#     with torch.no_grad():
#         for x_num, x_cat, y in dataloader:
#             if torch