In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the income CSV and add a numeric `target` column derived from the
# string labels in the `income` column ('<=50K' -> 0, '>50K' -> 1).
income = pd.read_csv('data/income.csv')
income_to_target = {'<=50K': 0, '>50K': 1}
income['target'] = income['income'].map(income_to_target)
# Last expression of the cell: show the first rows.
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,0


In [3]:
# Column names used by the preprocessing and modelling cells.

# Nominal (categorical) input columns.
nom_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'native-country',
]

# Numeric input columns.
num_features = [
    'educational-num',
]

# Name of the binary label column.
target = 'target'

# Columns listed as groups; they are not part of the model inputs above.
groups = [
    'age',
    'race',
    'gender',
]

In [4]:
# Column-wise preprocessing: one-hot encode the nominal columns and standardise
# the numeric column.
# - handle_unknown='ignore': a category that appears only in the test split is
#   encoded as an all-zero row instead of raising an error in transform().
# - sparse_threshold=0: always return a dense ndarray, so no `.toarray()` call
#   (which only exists on sparse matrices) is needed afterwards.
preprocessing = ColumnTransformer(
    [('onehotencoder', OneHotEncoder(handle_unknown='ignore'), nom_features),
     ('standardscaler', StandardScaler(), num_features)
     ],
    sparse_threshold=0)

income_train, income_test = train_test_split(income, test_size=0.2, random_state=42)

# Fit the encoder/scaler on the training split only, then reuse the fitted
# transformer on the test split.
X_train = preprocessing.fit_transform(income_train)
y_train = income_train[target].to_numpy()

X_test = preprocessing.transform(income_test)
y_test = income_test[target].to_numpy()

print('Training data shape:', X_train.shape, '\ttarget shape:', y_train.shape)
print('Test data shape:', X_test.shape, '\t\ttarget shape:', y_test.shape)

Training data shape: (39073, 96) 	target shape: (39073,)
Test data shape: (9769, 96) 		target shape: (9769,)


In [5]:
# Fit an unpenalised scikit-learn logistic regression on the training split
# (fit() returns the estimator itself, so the call can be chained).
logreg = LogisticRegression(
    penalty=None,
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Last expression of the cell: mean accuracy on the test split.
logreg.score(X_test, y_test)

0.8394922714709796

In [6]:
class BinaryLogisticRegression(nn.Module):
    """Logistic regression as a PyTorch module: one linear unit followed by a sigmoid."""

    def __init__(self, in_dim: int):
        """Create the module.

        Args:
            in_dim: number of input features per sample.
        """
        super().__init__()

        # Affine map from `in_dim` features to a single logit.
        self.layer = nn.Linear(in_dim, 1)
        # Squashes the logit into the (0, 1) range.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(linear(x)); the trailing dimension of the output has size 1."""
        logits = self.layer(x)
        probabilities = self.sigmoid(logits)
        return probabilities

In [7]:
# Seed torch's global RNG so that weight initialisation and the DataLoader's
# shuffling order repeat across runs.
torch.manual_seed(42)

tensordata_train = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
trainloader = DataLoader(tensordata_train, batch_size=64, shuffle=True)

# as_tensor accepts NumPy arrays as well as tensors, so this cell can be re-run
# after X_test / y_test have already been converted.
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.float32)

model = BinaryLogisticRegression(X_train.shape[1])

n_epochs = 10

# The model's forward pass already applies a sigmoid, so BCELoss (which expects
# probabilities) is the matching loss.
loss_fn = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

batch_losses = []  # training loss of every mini-batch, across all epochs
losses = []        # mean training loss per epoch
val_losses = []    # test-set loss after each epoch
accs = []          # test-set accuracy after each epoch

# Number of mini-batches per epoch; used to slice this epoch's batch losses.
n_batches = len(trainloader)

for epoch in range(n_epochs):
    print(epoch)
    model.train()
    for x, y in trainloader:
        # Drop only the trailing size-1 dimension: (batch, 1) -> (batch,).
        # A dim-less squeeze would also collapse a batch of one to a scalar.
        y_hat = model(x).squeeze(-1)
        loss = loss_fn(y_hat, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Average over exactly the batches of this epoch. (Slicing by the sample
    # count of the last batch would average over an unrelated window.)
    losses.append(np.mean(batch_losses[-n_batches:]))

    # Evaluate on the full test set without tracking gradients.
    model.eval()
    with torch.no_grad():
        y_hat = model(X_test).squeeze(-1)
        loss = loss_fn(y_hat, y_test)
        # Threshold probabilities at 0.5 via rounding before scoring.
        acc = accuracy_score(y_test.numpy(), y_hat.round().numpy())
        print('loss:', loss.item(), 'accuracy_score', acc)
        val_losses.append(loss.item())
        accs.append(acc)
        

0
loss: 0.3972487151622772 accuracy_score 0.8297676323062749
1
loss: 0.3704058825969696 accuracy_score 0.8353976865595251
2
loss: 0.36311179399490356 accuracy_score 0.8377520728836114
3
loss: 0.36017677187919617 accuracy_score 0.8376497082608251
4
loss: 0.35887977480888367 accuracy_score 0.8386733544886887
5
loss: 0.35790929198265076 accuracy_score 0.8372402497696796
6
loss: 0.3575206398963928 accuracy_score 0.8381615313747569
7
loss: 0.3573370575904846 accuracy_score 0.8393899068481933
8
loss: 0.35712429881095886 accuracy_score 0.8385709898659024
9
loss: 0.3570325970649719 accuracy_score 0.8393899068481933


In [8]:
# np.asarray works whether y_test is still a torch tensor or already an
# ndarray, so re-running this cell does not fail on a missing `.detach()`.
y_test = np.asarray(y_test)

# Predict without building an autograd graph. squeeze(-1) turns the (N, 1)
# model output into (N,), matching the shape of y_test; rounding thresholds
# the probabilities at 0.5.
model.eval()
with torch.no_grad():
    preds = model(X_test).squeeze(-1).numpy().round()

In [9]:
# Print plain accuracy, then class-balanced accuracy, of the rounded
# predictions against the test labels (one value per line).
for metric in (accuracy_score, balanced_accuracy_score):
    print(metric(y_test, preds))

0.8393899068481933
0.7343788517601832
