In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from fairness_evals import positive_rates, true_postive_rates, true_negative_rate

# Data

In [2]:

# Toy census-style dataset: five hand-written records tiled three times (15 rows).
toy_educ = 3 * ['HS-grad', 'Doctorate', 'Masters', 'Bachelors', 'Doctorate']
toy_mar = 3 * ['widowed', 'divorced', 'married', 'single', 'married']
toy_sex = 3 * ['Male', 'Male', 'Male', 'Female', 'Female']
toy_inc = 3 * [1, 0, 1, 0, 1]
# Unlike the tiled columns above, education.num is spelled out once per row.
toy_educ_num = [3, 4, 7, 2, 3, 2, 1, 5, 2, 2, 3, 4, 6, 7, 8]

toy_data = pd.DataFrame(
    {
        'education': toy_educ,
        'marital.status': toy_mar,
        'sex': toy_sex,
        'income': toy_inc,
        'education.num': toy_educ_num,
    }
)
toy_data


Unnamed: 0,education,marital.status,sex,income,education.num
0,HS-grad,widowed,Male,1,3
1,Doctorate,divorced,Male,0,4
2,Masters,married,Male,1,7
3,Bachelors,single,Female,0,2
4,Doctorate,married,Female,1,3
5,HS-grad,widowed,Male,1,2
6,Doctorate,divorced,Male,0,1
7,Masters,married,Male,1,5
8,Bachelors,single,Female,0,2
9,Doctorate,married,Female,1,2


In [3]:
# DATA is the frame that the encoder-fitting and train/test-split cells read from.
DATA = toy_data

# The features we are interested in (used to trim the results table down to
# these columns before the predictions are attached).
FEAT_OF_INT = [
    'education',
    'marital.status',
    'sex',
    'income',
]

## Encoder for categorical values

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Categorical features are the object-dtype columns of DATA. The integer
# 'income' label is not object-typed, so it is left as is without any
# explicit removal from the list.
cat_feats = DATA.select_dtypes("object").columns.tolist()
print(f"{len(cat_feats)} categorical features found: {cat_feats}")

# Dense one-hot encoder that returns a pandas DataFrame; two-valued features
# are collapsed to a single indicator column (drop='if_binary').
cat_feat_encoder = OneHotEncoder(drop='if_binary', sparse_output=False)
cat_feat_encoder.set_output(transform="pandas")
# Fitted on the full dataset, i.e. before the train/test split.
cat_feat_encoder.fit(DATA[cat_feats])

3 categorical features found: ['education', 'marital.status', 'sex']


## Split train and test

In [5]:
# Hold out 30% of the rows for evaluation. The split is seeded so that the
# partition -- and every table computed from it in the later cells -- is the
# same on each Restart & Run All instead of changing with every execution.
# NOTE(review): with this few rows a test partition can end up containing a
# single income class; consider also passing stratify=DATA['income'] --
# confirm that this suits the analysis before enabling it.
train, test = train_test_split(DATA, test_size=0.3, random_state=42)
print(f"Train size: {len(train)}, test size: {len(test)}")

Train size: 10, test size: 5


In [6]:
# Separate the 'income' label from the predictor columns in each partition.
x_train, y_train = train.drop('income', axis=1), train['income']
x_test, y_test = test.drop('income', axis=1), test['income']

In [7]:
from data_transform import cat_to_one_hot

# Run both partitions (train first, then test) through the project helper,
# passing the categorical column names and the already-fitted encoder.
# NOTE(review): presumably the helper swaps the categorical columns for their
# one-hot encodings -- confirm against data_transform.cat_to_one_hot.
x_train_encoded, x_test_encoded = (
    cat_to_one_hot(features, cat_feats, cat_feat_encoder)
    for features in (x_train, x_test)
)

# Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the encoded training features and
# predict the label of every held-out row.
logreg = LogisticRegression()
logreg.fit(x_train_encoded, y_train)
y_pred = list(logreg.predict(x_test_encoded))

# Results table: the test rows restricted to the features of interest, with
# the model's predictions attached as a 'y_pred' column. `test` itself is
# not modified.
logreg_results = test.filter(items=FEAT_OF_INT).assign(y_pred=y_pred)
logreg_results

Unnamed: 0,education,marital.status,sex,income,y_pred
10,HS-grad,widowed,Male,1,1
9,Doctorate,married,Female,1,0
5,HS-grad,widowed,Male,1,0
4,Doctorate,married,Female,1,0
2,Masters,married,Male,1,1


In [9]:
# Score the predictions against the gold test labels with each metric in turn.
logreg_acc, logreg_rec, logreg_pre, logreg_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

# One-row summary table, indexed by the model name.
logreg_eval = pd.DataFrame(
    [[logreg_acc, logreg_rec, logreg_pre, logreg_f1]],
    columns=['accuracy', 'recall', 'precision', 'f1'],
    index=['LogReg'],
)
logreg_eval

Unnamed: 0,accuracy,recall,precision,f1
LogReg,0.4,0.4,1.0,0.571429


In [10]:
# Fairness metrics for the logistic-regression results, computed by the
# project's fairness_evals helpers with 'sex' as the grouping column.
pos_rates = positive_rates(logreg_results, 'sex')
true_pos_rates = true_postive_rates(logreg_results, 'sex')
true_neg_rates = true_negative_rate(logreg_results, 'sex')

# Collect the three metrics side by side, one column per metric.
logreg_fair_eval = pd.DataFrame(
    {
        'positive rates': pos_rates,
        'true positive rates': true_pos_rates,
        'true negative rates': true_neg_rates,
    }
)
logreg_fair_eval


No gold false instances found for Male
No gold false instances found for Female


Unnamed: 0,positive rates,true positive rates,true negative rates
Male,0.666667,0.666667,
Female,0.0,0.0,
