# Algoithmic Fairness, Accountability and Ethics
## Assignment 2 (Template)

In [1]:
from folktables import ACSDataSource, BasicProblem, generate_categories
import numpy as np
import scipy.optimize as opt

### 1. Load and Preprocess the data
We are going to work with the [Folktables](https://github.com/socialfoundations/folktables#quick-start-examples) dataset (*you have already worked with it*).

1. As last week, we are still predicting the *Total person's income*  (I've digitized  it in  `target_transform=lambda x: x > 25000`).
2. Today, we are going to implement two methods for data debiasing: [Fair PCA](https://deepai.org/publication/efficient-fair-pca-for-fair-representation-learning) and [A Geometric Solution to Fair Representations](https://dl.acm.org/doi/10.1145/3375627.3375864).
3. We are going to evaluate the performance on two sensitive features: `SEX` (i.e. *Males* and *Females*) and `RAC1P` (we will consider only *Whites* and *African-Americans*)
4. I updated the filtering method `adult_filter` to keep the specified groups.

In [2]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=True)

def adult_filter(data):
    """Mimic the filters in place for Adult data.
    Adult documentation notes: Extraction was done by Barry Becker from
    the 1994 Census database. A set of reasonably clean records was extracted
    using the following conditions:
    ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
    """
    df = data
    df = df[df['AGEP'] > 16]
    df = df[df['PINCP'] > 100]
    df = df[df['WKHP'] > 0]
    df = df[df['PWGTP'] >= 1]
    df = df[df["RAC1P"] < 3] ## keep only Whites and African-Americans
    return df


ACSIncomeNew = BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'CIT',
        'RELP',
        'WKHP',
        'PWGTP',
        'SEX',
        'RAC1P'
    ],
    target='PINCP',
    target_transform=lambda x: x > 25000,    
    group=['SEX', "RAC1P"],
    preprocess=adult_filter,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

Downloading data for 2018 1-Year person survey for CA...


## Task 1

In [3]:
######
## YOUR CODE
#####

## Task 2

### Convert categorical to one-hot encoding

In [4]:
definition_df = data_source.get_definitions(download=True)
categories = generate_categories(features=ACSIncomeNew.features, definition_df=definition_df)
features, labels, groups = ACSIncomeNew.df_to_pandas(acs_data, categories=categories, dummies=True)
### groups now contain information about SEX and RAC1P
features.head()

Downloading the attribute definition file...


Unnamed: 0,AGEP,WKHP,PWGTP,"COW_Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions","COW_Employee of a private not-for-profit, tax-exempt, or charitable organization",COW_Federal government employee,"COW_Local government employee (city, county, etc.)","COW_Self-employed in own incorporated business, professional practice or farm","COW_Self-employed in own not incorporated business, professional practice, or farm",COW_State government employee,...,RELP_Parent-in-law,RELP_Reference person,RELP_Roomer or boarder,RELP_Son-in-law or daughter-in-law,RELP_Stepson or stepdaughter,RELP_Unmarried partner,SEX_Female,SEX_Male,RAC1P_Black or African American alone,RAC1P_White alone
0,21.0,20.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,65.0,8.0,33.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,33.0,40.0,53.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,18.0,18.0,106.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,27.0,50.0,23.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [5]:
# Drop the "redundant" columns
features = features.drop(["RELP_Unmarried partner",
                          "CIT_U.S. citizen by naturalization",
                          "SEX_Male",
                          "SCHL_1 or more years of college credit, no degree",  
                          "MAR_Divorced", 
                          "RELP_Adopted son or daughter",
                          'COW_Working without pay in family business or farm', 
                          "RAC1P_White alone" ], axis = 1) 

print("Columns with the protected features:")
for i, f in enumerate(features.columns):
    if ("RAC1P" in f) or ("SEX" in f):
        print("Column ID: %s" %i, "(%s)"%f)

Columns with the protected features:
Column ID: 57 (SEX_Female)
Column ID: 58 (RAC1P_Black or African American alone)


In [8]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features, labels, groups, test_size=0.3, random_state=0, shuffle=True)

N = 500 ### I am subsampling because it is slow on my machine

X_train = X_train[:N]
y_train = y_train[:N]
group_train = group_train[:N]
X_test = X_test[:N]
y_test = y_test[:N]
group_test = group_test[:N]

### Build your own implementation of the Logistic Regression with L2-penalty (aka Ridge Regression).

In [126]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import sklearn
import pandas as pd

def sigmoid(x, eps=1e-10):
    """
    This is logistic regression
    f = 1/(1+exp(-beta^T * x))
    This function assumes as input that you have already multiplied beta and X together
    """
    return 1 / (1 + np.exp(x+eps))


def logistic_loss(y_true, y_pred, eps=1e-10):
    """
    Loss for the logistic regression, y_preds are probabilities
    eps: epsilon for stability
    """
    return np.mean(
        -y_true * np.log(y_pred + eps) - (1 - y_true) * np.log(1 - y_pred + eps)
    )


def l2_loss(beta):
    """
    L2-Regularisation
    """
    return np.sum(beta[1:]**2)


def group_fair_loss(y, y_pred, all_groups):
    """
    Group fairness Loss
    """
    fairness = 0
    for groups in all_groups.T:
        y, y_pred, groups = [
            pd.Series(np.squeeze(data), name=name)
            for name, data in [("y", y), ("y_pred", y_pred), ("groups", groups)]
        ]
        X_groups = pd.concat([y, y_pred], axis=1).groupby(groups)
        X_g1, X_g2 = X_groups.get_group(1), X_groups.get_group(2)

        cross_pairs = X_g1.merge(X_g2, how="cross", suffixes=["1", "2"])
        assert len(cross_pairs) == len(X_g1) * len(X_g2)

        df = cross_pairs
        fairness += np.mean((df["y1"] == df["y2"]) * (df["y_pred1"] - df["y_pred2"])**2)
    return fairness


def compute_gradient(beta, X, y, groups, _lambda, _gamma):
    """Calculate the gradient - used for finding the best beta values.
    You do not need to use groups and lambda (fmin_tnc expects same input as in func, that's why they are included here)"""
    y_pred = predict(X, beta)

    grad = (y_pred - y) @ X + 2 * _gamma * beta
    assert grad.shape == beta.shape, "gradient shape doesnt match beta shape"
    return grad


def compute_cost(beta, X, y, groups, _lambda, _gamma):
    """Computes cost function with constraints"""
    y_pred = predict(X, beta)

    loss = logistic_loss(y, y_pred) + _lambda*group_fair_loss(y, y_pred, groups) + _gamma*l2_loss(beta)
    return loss


def predict(X, beta):
    return sigmoid(X @ beta)

class LogReg(BaseEstimator):
    def __init__(self, gamma):
        self.gamma = gamma
        self.lambda_ = 1e3
        self.scaler = MinMaxScaler().set_output(transform='pandas')

    def _add_bias(self, X_train):
        bias = pd.Series(np.ones_like(X_train[X_train.columns[0]]), name="bias", index=X_train.index, dtype=float)
        return pd.concat([bias, X_train], axis=1)
    
    def fit(self, X_train, y_train):
        X_train = self.scaler.fit_transform(X_train)
        X_train = self._add_bias(X_train)


        X = X_train.values
        y = y_train.astype(float).squeeze().values
        train_func = lambda: opt.fmin_tnc(
            func=compute_cost,
            x0=self.init_beta,
            fprime=compute_gradient,
            # approx_grad=True,
            maxfun=500,
            args=(X, y, groups.values, self.lambda_, self.gamma),
            xtol=1e-5,
            ftol=1e-5,
        )

        for i in range(5):
            self.init_beta = np.random.rand(len(X_train.columns))

            self.beta, _, state = train_func()

            weights_changed = np.abs(self.beta - self.init_beta).max() > 1e-6
            if state in [0, 1, 2] and weights_changed:
                break

            elif i == 4:
                raise RuntimeError(f'{state = } {weights_changed = }')

    
    def predict(self, X_train):
        X_train = self.scaler.transform(X_train)
        X_train = self._add_bias(X_train)
        return predict(X_train.values, self.beta)
    
    def score(self, X_train, y_train):
        X_train = self.scaler.transform(X_train)
        X_train = self._add_bias(X_train)
        X = X_train.values
        y = y_train.astype(float).squeeze().values
        return -compute_cost(self.beta, X, y, groups.values, self.lambda_, self.gamma)

In [127]:
# Check all functions work
clf = LogReg(1e-5)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)
clf.predict(X_train);

### Use Cross-Validation to find the most optimal value for L2-penalty (you should implement it yourself).


#### Task 2.2.
Use the following arguments in the `opt.fmin_funct`: `xtol=1e-4, ftol=1e-4,  maxfun=1000`

In [130]:
gammas = np.logspace(1e-5,1e-2,10)
cv = GridSearchCV(
    estimator=clf,
    param_grid={"gamma": gammas},
    verbose=2,
    n_jobs=-1
)
cv.fit(X_train, y_train.squeeze())
cv.cv_results_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


19 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Christoffer\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Christoffer\AppData\Local\Temp\ipykernel_4800\2179330235.py", line 118, in fit
RuntimeError: state = 4 weights_changed = False

--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Christoffer\AppData\Local\Programs\Python\Python310\lib\site-p

RuntimeError: state = 1 weights_changed = False

#### Task 2.3
Use the following arguments in the `opt.fmin_funct`: ` xtol=1e-3, ftol=1e-3, approx_grad=True, maxfun=1000`

In [None]:
lambdas = np.array([1e-3, 5e-3, 1e-2, 5e-2, 0.1, 1])
###########
# YOUR CODE
###########