In [3]:
'''
This code shows how to write an ANN to predict whether the income of a person is <=50k or vice versa (>50k),
other remaining features are used for training
'''

import torch

print(torch.__version__)
# 2.11.0.dev20251216+cu130

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# cuda

2.11.0.dev20251216+cu130
cuda


In [17]:
######################
## Data preparation ##
######################

import polars as pl
from polars import col as c

pl_adult = (
    pl.read_csv("https://raw.githubusercontent.com/saravrajavelu/Adult-Income-Analysis/refs/heads/master/adult.csv")
    .to_dummies(columns=pl.selectors.string(), drop_first=True)
    .cast(pl.Float32)
)

print(pl_adult)
# shape: (48_842, 101)
# ┌──────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
# │ age  ┆ workclass_ ┆ workclass_ ┆ workclass_ ┆ … ┆ native-cou ┆ native-co ┆ native-co ┆ income_>5 │
# │ ---  ┆ ?          ┆ Federal-go ┆ Local-gov  ┆   ┆ ntry_Trina ┆ untry_Vie ┆ untry_Yug ┆ 0K        │
# │ f32  ┆ ---        ┆ v          ┆ ---        ┆   ┆ dad&Tobago ┆ tnam      ┆ oslavia   ┆ ---       │
# │      ┆ f32        ┆ ---        ┆ f32        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ f32       │
# │      ┆            ┆ f32        ┆            ┆   ┆ f32        ┆ f32       ┆ f32       ┆           │
# ╞══════╪════════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
# │ 25.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
# │ 38.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
# │ 28.0 ┆ 0.0        ┆ 0.0        ┆ 1.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 1.0       │
# │ 44.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 1.0       │
# │ 18.0 ┆ 1.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
# │ …    ┆ …          ┆ …          ┆ …          ┆ … ┆ …          ┆ …         ┆ …         ┆ …         │
# │ 27.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
# │ 40.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 1.0       │
# │ 58.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
# │ 22.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
# │ 52.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 1.0       │
# └──────┴────────────┴────────────┴────────────┴───┴────────────┴───────────┴───────────┴───────────┘

shape: (48_842, 101)
┌──────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ age  ┆ workclass_ ┆ workclass_ ┆ workclass_ ┆ … ┆ native-cou ┆ native-co ┆ native-co ┆ income_>5 │
│ ---  ┆ ?          ┆ Federal-go ┆ Local-gov  ┆   ┆ ntry_Trina ┆ untry_Vie ┆ untry_Yug ┆ 0K        │
│ f32  ┆ ---        ┆ v          ┆ ---        ┆   ┆ dad&Tobago ┆ tnam      ┆ oslavia   ┆ ---       │
│      ┆ f32        ┆ ---        ┆ f32        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ f32       │
│      ┆            ┆ f32        ┆            ┆   ┆ f32        ┆ f32       ┆ f32       ┆           │
╞══════╪════════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 25.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
│ 38.0 ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ 0.0       │
│ 28.0 ┆ 0.0        ┆ 0.0        ┆ 1.0        ┆ … ┆ 0.0        ┆ 0.0  

In [18]:
'''----- X ------'''
from sklearn.preprocessing import StandardScaler

X_raw = pl_adult.select(c("*").exclude("income_>50K")).to_numpy()

X_scaler = StandardScaler().fit(X_raw)

X_scaled = torch.tensor(
    data=X_scaler.transform(X_raw),
    device=device
)

print(X_scaled)
# tensor([[-0.9951, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
#         [-0.0469, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
#         [-0.7763, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
#         ...,
#         [ 1.4118, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
#         [-1.2139, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
#         [ 0.9742, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217]],
#        device='cuda:0')

print(X_scaled.shape)
# torch.Size([48842, 100])
# Here, 100 will be the in_features

'''----- y ------'''
from sklearn.preprocessing import RobustScaler

y_raw = pl_adult.select("income_>50K").to_numpy()

y_scaler = StandardScaler().fit(y_raw)

y_scaled = torch.tensor(
    data=y_scaler.transform(y_raw),
    device=device
)

print(y_scaled)
# tensor([[-0.5608],
#         [-0.5608],
#         [ 1.7830],
#         ...,
#         [-0.5608],
#         [-0.5608],
#         [ 1.7830]], device='cuda:0')

print(y_scaled.shape)
# torch.Size([48842, 1])
# Here, 1 will be the final out_features

tensor([[-0.9951, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
        [-0.0469, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
        [-0.7763, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
        ...,
        [ 1.4118, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
        [-1.2139, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217],
        [ 0.9742, -0.2466, -0.1738,  ..., -0.0235, -0.0420, -0.0217]],
       device='cuda:0')
torch.Size([48842, 100])
tensor([[-0.5608],
        [-0.5608],
        [ 1.7830],
        ...,
        [-0.5608],
        [-0.5608],
        [ 1.7830]], device='cuda:0')
torch.Size([48842, 1])


In [None]:
#######################
## Dataset splitting ##
#######################

train_len = int(0.8 * len(X_scaled)) # MUST be INTEGER
val_len = int(0.1 * len(X_scaled))
test_len = len(X_scaled) - (train_len + val_len)

from torch.utils.data import DataLoader, TensorDataset, random_split

full_dataset = TensorDataset(X_scaled, y_scaled)
train_split, val_split, test_split = random_split(dataset=full_dataset, lengths=[train_len, val_len, test_len])

train_set = DataLoader(train_split, batch_size=2**11, shuffle=True)
val_set = DataLoader(val_split, batch_size=2**11, shuffle=True)
test_set = DataLoader(test_split, batch_size=2**11, shuffle=True)