In [1]:
import pandas as pd
import sklearn
import random, os 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from fairness_evals import positive_rates, true_postive_rates, true_negative_rate, false_postive_rates

# Data

In [2]:

toy_educ = ['HS-grad', 'Doctorate', 'Masters', 'Bachelors', 'Doctorate'] * 3 
random.shuffle(toy_educ)
toy_mar = ['widowed', 'divorced', 'married', 'single', 'married'] * 3
random.shuffle(toy_mar)
toy_sex = ['Male', 'Male', 'Male', 'Female', 'Female'] * 3
random.shuffle(toy_sex)
toy_inc = [1, 0, 1, 0, 1] * 3
random.shuffle(toy_inc)
toy_educ_num = [31,41,17,24, 36,23,41,57,25,72,33,54,56,27,18]
random.shuffle(toy_educ_num)
toy_data = pd.DataFrame({'education':toy_educ , 'marital.status':toy_mar, 'sex':toy_sex, 'income':toy_inc, 'age':toy_educ_num})
toy_data

Unnamed: 0,education,marital.status,sex,income,age
0,Doctorate,married,Male,1,57
1,Bachelors,divorced,Female,1,41
2,Doctorate,single,Female,0,24
3,Doctorate,married,Female,1,72
4,HS-grad,married,Male,0,36
5,HS-grad,single,Male,0,18
6,HS-grad,widowed,Male,1,25
7,Doctorate,divorced,Female,1,31
8,Masters,widowed,Male,1,23
9,Masters,divorced,Female,0,56


In [3]:
# reconst_data = pd.read_csv('Dataverwerking/Adult_Reconstruction/adult_reconstruction.csv')

In [4]:
data_old = pd.read_csv(os.path.join('Processed_data', 'adult.csv'))

In [5]:
data_old.keys()

Index(['age', 'workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'hours.per.week', 'income'],
      dtype='object')

In [6]:
data_old

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,hours.per.week,income
0,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,18,0
1,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,40,0
2,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,40,0
3,34,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,45,0
4,38,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,40,0
...,...,...,...,...,...,...,...,...,...,...
30157,22,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,40,0
30158,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,0
30159,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,1
30160,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,0


In [7]:
data_2018 = pd.read_csv(os.path.join('Processed_data', 'data_2018.csv'))

In [8]:
data_2018.keys()

Index(['age', 'workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'hours.per.week', 'sex', 'race', 'income'],
      dtype='object')

In [9]:
data_2018

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,hours.per.week,sex,race,income
0,18,Private,Some-college,Never-married,4720,17,21,Female,Black,0
1,53,Federal-gov,HS-grad,Never-married,3605,16,40,Male,White,0
2,41,Private,HS-grad,Never-married,7330,17,40,Male,White,0
3,18,Self-emp-not-inc,Some-college,Never-married,2722,17,2,Female,White,0
4,21,Federal-gov,Some-college,Never-married,3870,17,50,Male,White,0
...,...,...,...,...,...,...,...,...,...,...
22316,20,Self-emp-not-inc,Some-college,Never-married,4251,4,25,Male,White,0
22317,63,Private,HS-grad,Married,440,0,48,Male,White,1
22318,65,Private,Bachelors,Never-married,420,2,40,Female,White,1
22319,37,Private,HS-grad,Separated,340,0,50,Female,White,0


In [10]:
DATA = data_old 
DATA_NAME = 'old' #! don't forget to change this accordingly 
FEAT_OF_INT = ['education', 'marital.status', 'sex', 'income'] #The features we are interested in
DATA.head()

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,hours.per.week,income
0,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,18,0
1,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,40,0
2,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,40,0
3,34,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,45,0
4,38,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,40,0


## Encoder for categorical values

In [11]:
from sklearn.preprocessing import OneHotEncoder
cat_feats = list(DATA.select_dtypes("object").keys())  #names of all categorical features
# cat_feats.remove('income') #keep these values as is 
print(f"{len(cat_feats)} categorical features found: {cat_feats}")

cat_feat_encoder = OneHotEncoder(sparse_output=False, drop='if_binary').set_output(transform="pandas")
cat_feat_encoder = cat_feat_encoder.fit(DATA[cat_feats])

7 categorical features found: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex']


## Split train, dev and test

In [12]:
train, dev = train_test_split(DATA, test_size=0.3)
test = data_2018
print(f"Train size: {len(train)}, dev size: {len(dev)}, test size: {len(test)}")

Train size: 21113, dev size: 9049, test size: 22321


In [13]:
x_train = train.drop(columns=['income'])
y_train = train['income']

x_dev  = dev.drop(columns=['income'])
y_dev  = dev['income']

x_test  = test.drop(columns=['income'])
y_test  = test['income']

In [14]:
from Dataverwerking.data_transform import cat_to_one_hot
x_train_encoded = cat_to_one_hot(x_train, cat_feats, cat_feat_encoder)
x_dev_encoded = cat_to_one_hot(x_dev, cat_feats, cat_feat_encoder)
x_test_encoded = cat_to_one_hot(x_test, cat_feats, cat_feat_encoder)

ValueError: Found unknown categories ['Assoc'] in column 1 during transform

In [None]:
x_train_encoded

Unnamed: 0,age,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
20594,32,40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
15071,56,40,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
14927,34,35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
23627,44,45,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3813,29,40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10918,22,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
26155,19,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
12375,48,50,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3312,72,20,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [None]:
x_test_encoded

# Logistic regression

**params logreg**
* Penalty (=L2)
* C (=1): regularization strenght -> set with dev set 
* solver: ?

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver = 'sag').fit(x_train_encoded, y_train)



## Dev evaluation

In [None]:
y_pred_dev= list(logreg.predict(x_dev_encoded))

logreg_results_dev = dev.copy() #get all the data of the dev set

logreg_results_dev = logreg_results_dev.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_dev['y_pred'] = y_pred_dev
logreg_results_dev

Unnamed: 0,education,marital.status,sex,income,y_pred
1248,12th,Married-civ-spouse,Male,0,0
14529,HS-grad,Divorced,Female,0,0
17394,Some-college,Never-married,Male,0,0
13005,HS-grad,Never-married,Female,0,0
21219,Some-college,Never-married,Female,0,0
...,...,...,...,...,...
3812,HS-grad,Never-married,Male,0,0
13543,HS-grad,Married-civ-spouse,Male,0,0
2618,HS-grad,Married-civ-spouse,Male,1,0
2205,HS-grad,Widowed,Male,1,0


In [None]:
logreg_acc_dev = accuracy_score(y_dev, y_pred_dev)
logreg_rec_dev = recall_score(y_dev, y_pred_dev)
logreg_pre_dev = precision_score(y_dev, y_pred_dev)
logreg_f1_dev = f1_score(y_dev, y_pred_dev)
logreg_eval_dev = pd.DataFrame({'accuracy': [logreg_acc_dev], 'recall': [logreg_rec_dev], 'precision': [logreg_pre_dev], 'f1': [logreg_f1_dev]}, index=['dev'])
logreg_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_eval.csv'),sep='\t')
logreg_eval_dev

Unnamed: 0,accuracy,recall,precision,f1
LogReg,0.835672,0.556381,0.729513,0.631292


In [None]:
logreg_fair_eval_dev = pd.DataFrame()
logreg_fair_eval_dev['positive rates'] = positive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true positive rates'] = true_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true negative rates'] = true_negative_rate(logreg_results_dev, 'sex')
logreg_fair_eval_dev['false positive rates'] = false_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_fair_eval.csv'),sep='\t')
logreg_fair_eval_dev


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates
Male,0.253748,0.58244,0.900407,0.099593
Female,0.064538,0.401216,0.978328,0.021672


## Test evaluation

In [None]:
y_pred_test= list(logreg.predict(x_test_encoded))

logreg_results_test = test.copy() #get all the data of the dev set

logreg_results_test = logreg_results_test.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_test['y_pred'] = y_pred_test
logreg_results_test

In [None]:
logreg_acc_test = accuracy_score(y_test, y_pred_test)
logreg_rec_test = recall_score(y_test, y_pred_test)
logreg_pre_test = precision_score(y_test, y_pred_test)
logreg_f1_test = f1_score(y_test, y_pred_test)
logreg_eval_test = pd.DataFrame({'accuracy': [logreg_acc_test], 'recall': [logreg_rec_test], 'precision': [logreg_pre_test], 'f1': [logreg_f1_test]}, index=['test'])
logreg_eval_test.to_csv(os.path.join('Results', 'test_2018_eval.csv'),sep='\t')
logreg_eval_test

In [None]:
logreg_fair_eval_test = pd.DataFrame()
logreg_fair_eval_test['positive rates'] = positive_rates(logreg_eval_test, 'sex')
logreg_fair_eval_test['true positive rates'] = true_postive_rates(logreg_eval_test, 'sex')
logreg_fair_eval_test['true negative rates'] = true_negative_rate(logreg_eval_test, 'sex')
logreg_fair_eval_test['false positive rates'] = false_postive_rates(logreg_eval_test, 'sex')
logreg_fair_eval_test.to_csv(os.path.join('Results', 'test_2018_fair_eval.csv'),sep='\t')
logreg_fair_eval_test


# Reweighing