In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Column names for the two CSV files, which are read without a header row.
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income"
]

# Training data
train_df = pd.read_csv(
    "adult.data",
    names=columns,
    sep=",",
    skipinitialspace=True,  # drop the blank that follows each comma
)

# Test data
test_df = pd.read_csv(
    "adult.test",
    names=columns,
    sep=",",
    skipinitialspace=True,
    skiprows=1,  # skip the header line at the top of the test file
)

# Test labels carry a trailing dot; strip it so both files use the same labels.
test_df["income"] = test_df["income"].str.replace(".", "", regex=False)

# Pool both files (the notebook draws its own split later).
# ignore_index=True gives every row a unique label; without it the two
# frames contribute overlapping 0..n indices.
adult_raw = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Fail loudly on unexpected labels: .map() would turn them into NaN and the
# dropna() below would then discard those rows without any warning.
income_map = {">50K": 1, "<=50K": 0}
unknown_labels = set(adult_raw["income"].dropna().unique()) - set(income_map)
if unknown_labels:
    raise ValueError(f"Unexpected income labels: {sorted(unknown_labels)}")

# Encode the target, mark "?" placeholders as missing and drop incomplete
# rows. Built as a new frame (no inplace=True) so re-running the cell always
# starts from the freshly loaded data.
adult_clean = (
    adult_raw
    .assign(income=adult_raw["income"].map(income_map))
    .replace("?", np.nan)
    .dropna()
)

# One-hot encode categorical columns. dtype=float keeps the frame purely
# numeric: pandas 2.x emits boolean dummies by default, and mixing those with
# the integer columns makes `.values` an object array.
df = pd.get_dummies(adult_clean, drop_first=True, dtype=float)

# Split X and y
X = df.drop("income", axis=1).values
y = df["income"].values


In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [4]:
# Feature matrices become float32 tensors.
X_train, X_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_test)
)

# Label vectors become float32 tensors reshaped to a single column (N, 1).
y_train, y_test = (
    torch.tensor(labels, dtype=torch.float32).view(-1, 1)
    for labels in (y_train, y_test)
)


In [5]:
class MLP(nn.Module):
    """Fully connected binary classifier ending in a Sigmoid.

    The network is a stack of ``Linear -> ReLU`` pairs, one per entry in
    ``hidden_dims``, followed by ``Linear(..., 1) -> Sigmoid``.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden layer, in order. The default
            ``(64, 32)`` reproduces the original fixed architecture; an empty
            sequence yields a single ``Linear(input_dim, 1)`` plus Sigmoid.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32)):
        super().__init__()

        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width

        # Single output unit squashed into [0, 1].
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())

        # Same attribute name and layer order as before, so state_dict keys
        # (net.0.*, net.2.*, net.4.*) are unchanged for the default widths.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Sigmoid output for ``x``; the last dimension has size 1."""
        return self.net(x)


In [6]:
# Seed torch's global RNG before the model is built so the initial weights are
# the same on every Restart & Run All (42 matches the split's random_state).
torch.manual_seed(42)

# Input width is the number of feature columns in the training tensor.
model = MLP(X_train.shape[1])

# BCELoss takes probabilities, which is what the model's final Sigmoid emits.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:
epochs = 3000

# Full-batch training: each iteration runs the whole training tensor through
# the model and takes one optimizer step.
model.train()  # nothing inside the loop changes the mode, so set it once
for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()

    # Report the loss every 750 epochs.
    if not (epoch + 1) % 750:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


Epoch [750/3000], Loss: 0.2506
Epoch [1500/3000], Loss: 0.2288
Epoch [2250/3000], Loss: 0.2187
Epoch [3000/3000], Loss: 0.2111


In [8]:
# Evaluate on the held-out rows with the model in eval mode and autograd off.
model.eval()
with torch.no_grad():
    y_prob = model(X_test)

# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
y_pred = (y_prob >= 0.5).int()

y_test_np, y_pred_np = y_test.numpy(), y_pred.numpy()

# Print each metric with the same label and order as before.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test_np, y_pred_np))


Accuracy: 0.8255389718076285
Precision: 0.6774827925270404
Recall: 0.5991304347826087
F1-score: 0.6359021688970927
