In [1]:
import pandas as pd
import sklearn
import random, os 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from fairness_evals import positive_rates, true_postive_rates, true_negative_rate, false_postive_rates

# Data

In [2]:

toy_educ = ['HS-grad', 'Doctorate', 'Masters', 'Bachelors', 'Doctorate'] * 3 
random.shuffle(toy_educ)
toy_mar = ['widowed', 'divorced', 'married', 'single', 'married'] * 3
random.shuffle(toy_mar)
toy_sex = ['Male', 'Male', 'Male', 'Female', 'Female'] * 3
random.shuffle(toy_sex)
toy_inc = [1, 0, 1, 0, 1] * 3
random.shuffle(toy_inc)
toy_educ_num = [31,41,17,24, 36,23,41,57,25,72,33,54,56,27,18]
random.shuffle(toy_educ_num)
toy_data = pd.DataFrame({'education':toy_educ , 'marital.status':toy_mar, 'sex':toy_sex, 'income':toy_inc, 'age':toy_educ_num})
toy_data

Unnamed: 0,education,marital.status,sex,income,age
0,Masters,divorced,Male,0,17
1,Bachelors,married,Female,1,72
2,Doctorate,married,Female,1,24
3,HS-grad,single,Male,1,36
4,Doctorate,single,Male,1,41
5,Doctorate,widowed,Female,0,54
6,HS-grad,widowed,Female,1,31
7,Bachelors,single,Male,1,23
8,Masters,married,Female,1,18
9,Doctorate,divorced,Male,0,33


In [3]:
# reconst_data = pd.read_csv('Dataverwerking/Adult_Reconstruction/adult_reconstruction.csv')

In [4]:
data_old = pd.read_csv(os.path.join('Processed_data', 'adult.csv'))

In [5]:
data_old.keys()

Index(['age', 'workclass', 'education', 'marital.status', 'race', 'sex',
       'hours.per.week', 'income'],
      dtype='object')

In [6]:
data_old

Unnamed: 0,age,workclass,education,marital.status,race,sex,hours.per.week,income
0,82,Private,HS-grad,Widowed,White,Female,18,0
1,54,Private,7th-8th,Divorced,White,Female,40,0
2,41,Private,Some-college,Separated,White,Female,40,0
3,34,Private,HS-grad,Divorced,White,Female,45,0
4,38,Private,10th,Separated,White,Male,40,0
...,...,...,...,...,...,...,...,...
30157,22,Private,Some-college,Never-married,White,Male,40,0
30158,27,Private,Assoc,Married-civ-spouse,White,Female,38,0
30159,40,Private,HS-grad,Married,White,Male,40,1
30160,58,Private,HS-grad,Widowed,White,Female,40,0


In [7]:
data_2018 = pd.read_csv(os.path.join('Processed_data', 'data_2018.csv'))

In [8]:
data_2018.keys()

Index(['age', 'workclass', 'education', 'marital.status', 'hours.per.week',
       'sex', 'race', 'income'],
      dtype='object')

In [9]:
data_2018

Unnamed: 0,age,workclass,education,marital.status,hours.per.week,sex,race,income
0,18,Private,Some-college,Never-married,21,Female,Black,0
1,53,Federal-gov,HS-grad,Never-married,40,Male,White,0
2,41,Private,HS-grad,Never-married,40,Male,White,0
3,18,Self-emp-not-inc,Some-college,Never-married,2,Female,White,0
4,21,Federal-gov,Some-college,Never-married,50,Male,White,0
...,...,...,...,...,...,...,...,...
22316,20,Self-emp-not-inc,Some-college,Never-married,25,Male,White,0
22317,63,Private,HS-grad,Married,48,Male,White,1
22318,65,Private,Bachelors,Never-married,40,Female,White,1
22319,37,Private,HS-grad,Separated,50,Female,White,0


In [10]:
DATA = data_old 
DATA_NAME = 'old' #! don't forget to change this accordingly 
FEAT_OF_INT = ['education', 'marital.status', 'sex', 'income'] #The features we are interested in
DATA.head()

Unnamed: 0,age,workclass,education,marital.status,race,sex,hours.per.week,income
0,82,Private,HS-grad,Widowed,White,Female,18,0
1,54,Private,7th-8th,Divorced,White,Female,40,0
2,41,Private,Some-college,Separated,White,Female,40,0
3,34,Private,HS-grad,Divorced,White,Female,45,0
4,38,Private,10th,Separated,White,Male,40,0


## Encoder for categorical values

In [11]:
from sklearn.preprocessing import OneHotEncoder
cat_feats = list(DATA.select_dtypes("object").keys())  #names of all categorical features
# cat_feats.remove('income') #keep these values as is 
print(f"{len(cat_feats)} categorical features found: {cat_feats}")

cat_feat_encoder = OneHotEncoder(sparse_output=False, drop='if_binary').set_output(transform="pandas")
cat_feat_encoder = cat_feat_encoder.fit(DATA[cat_feats])

5 categorical features found: ['workclass', 'education', 'marital.status', 'race', 'sex']


## Split train, dev and test

In [12]:
train, dev = train_test_split(DATA, test_size=0.3)
test = data_2018
print(f"Train size: {len(train)}, dev size: {len(dev)}, test size: {len(test)}")

Train size: 21113, dev size: 9049, test size: 22321


In [13]:
x_train = train.drop(columns=['income'])
y_train = train['income']

x_dev  = dev.drop(columns=['income'])
y_dev  = dev['income']

x_test  = test.drop(columns=['income'])
y_test  = test['income']

In [14]:
from Dataverwerking.data_transform import cat_to_one_hot
x_train_encoded = cat_to_one_hot(x_train, cat_feats, cat_feat_encoder)
x_dev_encoded = cat_to_one_hot(x_dev, cat_feats, cat_feat_encoder)
x_test_encoded = cat_to_one_hot(x_test, cat_feats, cat_feat_encoder)

In [15]:
x_train_encoded

Unnamed: 0,age,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,marital.status_Married-spouse-absent,marital.status_Never-married,marital.status_Separated,marital.status_Widowed,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
26817,32,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
27826,37,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25614,50,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
9827,37,38,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
11845,34,50,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12822,20,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
26104,22,19,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6206,39,48,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1630,79,35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [16]:
x_test_encoded

Unnamed: 0,age,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,marital.status_Married-spouse-absent,marital.status_Never-married,marital.status_Separated,marital.status_Widowed,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
0,18,21,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,53,40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,41,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,18,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,21,50,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22316,20,25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
22317,63,48,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
22318,65,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22319,37,50,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Logistic regression

**params logreg**
* Penalty (=L2)
* C (=1): regularization strenght -> set with dev set 
* solver: ?

In [17]:
# from sklearn.linear_model import LogisticRegression
# logreg = LogisticRegression(solver = 'sag').fit(x_train_encoded, y_train)



In [18]:
from sklearn.svm import SVC
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
logreg = make_pipeline(StandardScaler(), SVC(gamma='auto'))
logreg.fit(x_train_encoded, y_train)

## Dev evaluation

In [19]:
y_pred_dev= list(logreg.predict(x_dev_encoded))

logreg_results_dev = dev.copy() #get all the data of the dev set

logreg_results_dev = logreg_results_dev.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_dev['y_pred'] = y_pred_dev
logreg_results_dev

Unnamed: 0,education,marital.status,sex,income,y_pred
24156,Some-college,Married,Male,1,0
1556,Prof-school,Married,Male,1,1
12252,HS-grad,Never-married,Female,0,0
10696,Bachelors,Never-married,Female,0,0
10222,Some-college,Divorced,Male,0,0
...,...,...,...,...,...
28920,Bachelors,Married,Male,1,1
9428,HS-grad,Never-married,Male,0,0
18127,11th,Married,Male,0,0
5164,Some-college,Married,Male,0,0


In [20]:
logreg_acc_dev = accuracy_score(y_dev, y_pred_dev)
logreg_rec_dev = recall_score(y_dev, y_pred_dev)
logreg_pre_dev = precision_score(y_dev, y_pred_dev)
logreg_f1_dev = f1_score(y_dev, y_pred_dev)
logreg_eval_dev = pd.DataFrame({'accuracy': [logreg_acc_dev], 'recall': [logreg_rec_dev], 'precision': [logreg_pre_dev], 'f1': [logreg_f1_dev]}, index=['dev'])
logreg_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_eval.csv'),sep='\t')
logreg_eval_dev

Unnamed: 0,accuracy,recall,precision,f1
dev,0.822411,0.507482,0.70262,0.589318


In [21]:
logreg_fair_eval_dev = pd.DataFrame()
logreg_fair_eval_dev['positive rates'] = positive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true positive rates'] = true_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true negative rates'] = true_negative_rate(logreg_results_dev, 'sex')
logreg_fair_eval_dev['false positive rates'] = false_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_fair_eval.csv'),sep='\t')
logreg_fair_eval_dev


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates
Male,0.236284,0.529382,0.899574,0.100426
Female,0.063066,0.371429,0.974951,0.025049


## Test evaluation

In [22]:
y_pred_test= list(logreg.predict(x_test_encoded))

logreg_results_test = test.copy() #get all the data of the dev set

logreg_results_test = logreg_results_test.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_test['y_pred'] = y_pred_test
logreg_results_test

Unnamed: 0,education,marital.status,sex,income,y_pred
0,Some-college,Never-married,Female,0,0
1,HS-grad,Never-married,Male,0,0
2,HS-grad,Never-married,Male,0,0
3,Some-college,Never-married,Female,0,0
4,Some-college,Never-married,Male,0,0
...,...,...,...,...,...
22316,Some-college,Never-married,Male,0,0
22317,HS-grad,Married,Male,1,0
22318,Bachelors,Never-married,Female,1,0
22319,HS-grad,Separated,Female,0,0


In [23]:
logreg_acc_test = accuracy_score(y_test, y_pred_test)
logreg_rec_test = recall_score(y_test, y_pred_test)
logreg_pre_test = precision_score(y_test, y_pred_test)
logreg_f1_test = f1_score(y_test, y_pred_test)
logreg_eval_test = pd.DataFrame({'accuracy': [logreg_acc_test], 'recall': [logreg_rec_test], 'precision': [logreg_pre_test], 'f1': [logreg_f1_test]}, index=['test'])
logreg_eval_test.to_csv(os.path.join('Results', 'test_2018_eval.csv'),sep='\t')
logreg_eval_test

Unnamed: 0,accuracy,recall,precision,f1
test,0.731508,0.43014,0.59232,0.498368


In [24]:
logreg_fair_eval_test = pd.DataFrame()
logreg_fair_eval_test['positive rates'] = positive_rates(logreg_results_test, 'sex')
logreg_fair_eval_test['true positive rates'] = true_postive_rates(logreg_results_test, 'sex')
logreg_fair_eval_test['true negative rates'] = true_negative_rate(logreg_results_test, 'sex')
logreg_fair_eval_test['false positive rates'] = false_postive_rates(logreg_results_test, 'sex')
logreg_fair_eval_test.to_csv(os.path.join('Results', 'test_2018_fair_eval.csv'),sep='\t')
logreg_fair_eval_test


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates
Female,0.216849,0.432688,0.839882,0.160118
Male,0.23279,0.428936,0.899856,0.100144


# Reweighing