# Algorithmic Fairness, Accountability, and Ethics, Spring 2025

## Mandatory Assignment 2

Please use the following code to prepare the dataset.
 

In [7]:
from folktables.acs import adult_filter
from folktables import ACSDataSource
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.optimize import fmin_tnc
import pandas as pd

data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=True)

feature_names = ['AGEP', # Age
                 "CIT", # Citizenship status
                 'COW', # Class of worker
                 "ENG", # Ability to speak English
                 'SCHL', # Educational attainment
                 'MAR', # Marital status
                 "HINS1", # Insurance through a current or former employer or union
                 "HINS2", # Insurance purchased directly from an insurance company
                 "HINS4", # Medicaid
                 "RAC1P", # Recoded detailed race code
                 'SEX']

target_name = "PINCP" # Total person's income

def data_processing(data, features, target_name:str, threshold: float = 35000):
    df = data
    ### Adult Filter (STARTS) (from Foltktables)
    df = df[~df["SEX"].isnull()]
    df = df[~df["RAC1P"].isnull()]
    df = df[df['AGEP'] > 16]
    df = df[df['PINCP'] > 100]
    df = df[df['WKHP'] > 0]
    df = df[df['PWGTP'] >= 1]
    ### Adult Filter (ENDS)
    ### Groups of interest
    sex = df["SEX"].values
    ### Target
    df["target"] = df[target_name] > threshold
    target = df["target"].values
    df = df[features + ["target", target_name]] ##we want to keep df before one_hot encoding to make Bias Analysis
    df_processed = df[features].copy()
    cols = [ "HINS1", "HINS2", "HINS4", "CIT", "COW", "SCHL", "MAR", "SEX", "RAC1P"]
    df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=False, columns=cols, drop_first=True)
    df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=True, columns=["ENG"], drop_first=True)
    return df_processed, df, target, sex

data, data_original, target, group = data_processing(acs_data, feature_names, target_name)

X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    data, target, group, test_size=0.2, random_state=0)

## Logistic regression model without any fairness

In [None]:
# Sigmoid function
def sigmoid(beta, X):
    return 1/(1+np.exp(-(X @ beta)))

# Logistic loss 
def logistic_loss(beta, X, y, lambda_, gamma_):
    m = len(y)
    g = sigmoid(beta, X)
    return 1/m* np.sum(-y * np.log(g) - (1-y) * np.log(1-g))

# Objective function to minimiza
def objective_function(beta, X, y, lambda_, gamma_):
    lloss=logistic_loss(beta, X, y, lambda_,gamma_)
    f=0 # not including any fairness constraint thus f=0
    l2loss=np.sum(beta**2)
    return lloss+gamma_*l2loss+lambda_*f

# Function for the given f prime
def fprime(beta, X, y, lambda_, gamma_):
    m = len(y)
    g = sigmoid(beta, X)
    return 1/m * np.dot(X.T,(g-y))+2*gamma_*beta

# The given evaluation error function
def evaluate_error(prediction_score,labels_):
    prediction_score[prediction_score > 0] = 1
    prediction_score[prediction_score <= 0] = 0
    y_pred=prediction_score
    accuracy = 1-np.sum(np.abs(prediction_score - labels_)) / float(np.size(labels_))
    return y_pred, accuracy

# Initial values
beta0 = np.zeros(X_train.shape[1])
lambda1=1
gamma1=0.1

X_train=X_train.astype(float)
y_train=y_train.astype(float)

optimal_beta, nfeval, rc = fmin_tnc(func=objective_function,x0=beta0,fprime=fprime, args=(X_train,y_train, lambda1, gamma1), ftol=1e-5)
print("Optimized beta:", optimal_beta)

prediction_scoretest = X_train @ optimal_beta
print(prediction_scoretest)

predictions, accuracy = evaluate_error(prediction_scoretest, y_test)
print("the accuracy is:", accuracy)


Optimized beta: [ 1.43690055e-02 -3.21315027e-01 -9.34715434e-02  7.79466966e-02
 -1.98150080e-04  2.27901411e-04  1.06861867e-03 -9.75289035e-02
  1.06535371e-02  2.25413383e-02  1.39631833e-02  1.25888324e-02
 -5.33917889e-02  1.66002872e-02 -4.17488081e-03 -2.80408705e-04
 -2.87124947e-04 -1.08651799e-03 -2.43964801e-03 -4.82361825e-03
 -2.95793026e-03 -4.52203117e-03 -2.50498689e-02 -3.41349125e-03
 -8.89900190e-03 -1.46969391e-02 -1.06738462e-02 -2.41576660e-02
 -2.55998214e-02 -1.24477732e-01 -1.24163203e-02 -3.38826661e-02
 -7.58127324e-02 -8.37025231e-03  1.17942301e-01  1.00865999e-01
  3.41123646e-02  2.50675110e-02 -4.15113384e-03  5.22849494e-03
 -1.08138533e-02 -2.33878161e-01 -1.63206839e-01 -1.60089936e-02
 -3.86488612e-03 -3.58295754e-05 -1.79205787e-03  1.48117047e-02
 -9.62856806e-04 -8.60347992e-02 -1.27995979e-02 -4.05833022e-02
 -7.01432521e-02 -3.42981463e-02  4.97098604e-02]
358945    0.630672
275788    0.640754
141517    0.295973
66729     0.873194
268579   -0.0

ValueError: operands could not be broadcast together with shapes (156532,) (39133,) 

In [12]:
d={'features': X_train.columns.tolist(), 'weight':optimal_beta}
df = pd.DataFrame(data=d)
print("Estimated betas:\n" ,df)

Estimated betas:
      features    weight
0        AGEP  0.014369
1     HINS1_2 -0.321315
2     HINS2_2 -0.093472
3     HINS4_2  0.077947
4       CIT_2 -0.000198
5       CIT_3  0.000228
6       CIT_4  0.001069
7       CIT_5 -0.097529
8     COW_2.0  0.010654
9     COW_3.0  0.022541
10    COW_4.0  0.013963
11    COW_5.0  0.012589
12    COW_6.0 -0.053392
13    COW_7.0  0.016600
14    COW_8.0 -0.004175
15   SCHL_2.0 -0.000280
16   SCHL_3.0 -0.000287
17   SCHL_4.0 -0.001087
18   SCHL_5.0 -0.002440
19   SCHL_6.0 -0.004824
20   SCHL_7.0 -0.002958
21   SCHL_8.0 -0.004522
22   SCHL_9.0 -0.025050
23  SCHL_10.0 -0.003413
24  SCHL_11.0 -0.008899
25  SCHL_12.0 -0.014697
26  SCHL_13.0 -0.010674
27  SCHL_14.0 -0.024158
28  SCHL_15.0 -0.025600
29  SCHL_16.0 -0.124478
30  SCHL_17.0 -0.012416
31  SCHL_18.0 -0.033883
32  SCHL_19.0 -0.075813
33  SCHL_20.0 -0.008370
34  SCHL_21.0  0.117942
35  SCHL_22.0  0.100866
36  SCHL_23.0  0.034112
37  SCHL_24.0  0.025068
38      MAR_2 -0.004151
39      MAR_3  0.00522

## Logistic regression model SEX fairness 

## Logistic regression model RAC1P fairness

SyntaxError: invalid syntax (3354970551.py, line 1)