In [1]:
import pandas as pd
import sklearn
import random, os 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from fairness_evals import positive_rates, true_postive_rates, true_negative_rate, false_postive_rates
from Dataverwerking.reweighing import getweights

# Data

In [2]:
data_old = pd.read_csv(os.path.join('Processed_data', 'adult.csv'))

In [3]:
weights = getweights(os.path.join('Processed_data', 'adult.csv'))
data_old['weights'] = weights
data_old

Unnamed: 0,age,workclass,education,marital.status,race,sex,hours.per.week,income,weights
0,82,Private,HS-grad,Widowed,White,Female,18,0,0.561784
1,54,Private,7th-8th,Divorced,White,Female,40,0,0.733710
2,41,Private,Some-college,Separated,White,Female,40,0,0.621668
3,34,Private,HS-grad,Divorced,White,Female,45,0,0.733710
4,38,Private,10th,Separated,White,Male,40,0,0.696527
...,...,...,...,...,...,...,...,...,...
30157,22,Private,Some-college,Never-married,White,Male,40,0,1.041887
30158,27,Private,Assoc,Married,White,Female,38,0,4.033258
30159,40,Private,HS-grad,Married,White,Male,40,1,0.660456
30160,58,Private,HS-grad,Widowed,White,Female,40,0,0.561784


In [4]:
data_2018 = pd.read_csv(os.path.join('Processed_data', 'data_2018.csv'))

In [5]:
DATA = data_old 
# DATA.drop(columns=['occupation'])
DATA_NAME = 'old_weighted' #! don't forget to change this accordingly 
FEAT_OF_INT = ['education', 'marital.status', 'sex', 'income'] #The features we are interested in
DATA.head()

Unnamed: 0,age,workclass,education,marital.status,race,sex,hours.per.week,income,weights
0,82,Private,HS-grad,Widowed,White,Female,18,0,0.561784
1,54,Private,7th-8th,Divorced,White,Female,40,0,0.73371
2,41,Private,Some-college,Separated,White,Female,40,0,0.621668
3,34,Private,HS-grad,Divorced,White,Female,45,0,0.73371
4,38,Private,10th,Separated,White,Male,40,0,0.696527


## Encoder for categorical values

In [6]:
from sklearn.preprocessing import OneHotEncoder
cat_feats = list(DATA.select_dtypes("object").keys())  #names of all categorical features
# cat_feats.remove('income') #keep these values as is 
print(f"{len(cat_feats)} categorical features found: {cat_feats}")

cat_feat_encoder = OneHotEncoder(sparse_output=False, drop='first').set_output(transform="pandas")
cat_feat_encoder = cat_feat_encoder.fit(DATA[cat_feats])

5 categorical features found: ['workclass', 'education', 'marital.status', 'race', 'sex']


## Split train, dev and test

In [7]:
train, rest = train_test_split(DATA, test_size=0.3, random_state= 64819)
dev, test_old = train_test_split(rest, test_size=2/3, random_state= 8910)
test_2018 = data_2018
print(f"Train size: {len(train)}, dev size: {len(dev)}, test old size: {len(test_old)}, test 2018 size: {len(test_2018)}")

Train size: 21113, dev size: 3016, test old size: 6033, test 2018 size: 1667195


In [8]:
x_train = train.drop(columns=['income', 'weights'])
y_train = train['income']
weights_train = train['weights']

x_dev  = dev.drop(columns=['income', 'weights'])
y_dev  = dev['income']

x_test_old  = test_old.drop(columns=['income', 'weights'])
y_test_old  = test_old['income']

x_test_2018  = test_2018.drop(columns=['income'])
y_test_2018  = test_2018['income']

In [9]:
from Dataverwerking.data_transform import cat_to_one_hot
x_train_encoded = cat_to_one_hot(x_train, cat_feats, cat_feat_encoder)
x_dev_encoded = cat_to_one_hot(x_dev, cat_feats, cat_feat_encoder)
x_test_old_encoded = cat_to_one_hot(x_test_old, cat_feats, cat_feat_encoder)
x_test_2018_encoded = cat_to_one_hot(x_test_2018, cat_feats, cat_feat_encoder)

# Logistic regression

**params logreg**
* Penalty (=L2)
* C (=1): regularization strenght -> set with dev set 
* solver: ?

In [10]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver = 'lbfgs', max_iter=250, class_weight={0: 0.32, 1: 0.68}).fit(x_train_encoded, y_train, sample_weight= weights_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Dev evaluation

In [11]:
y_pred_dev= list(logreg.predict(x_dev_encoded))

logreg_results_dev = dev.copy() #get all the data of the dev set

logreg_results_dev = logreg_results_dev.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_dev['y_pred'] = y_pred_dev
logreg_results_dev

Unnamed: 0,education,marital.status,sex,income,y_pred
25299,Some-college,Separated,Female,0,0
938,HS-grad,Married,Male,1,1
9113,HS-grad,Never-married,Male,0,0
19694,10th,Married,Male,1,0
22532,Bachelors,Divorced,Female,0,0
...,...,...,...,...,...
12403,Bachelors,Never-married,Female,0,0
15435,12th,Widowed,Female,0,0
26831,HS-grad,Married,Male,1,0
11994,Some-college,Never-married,Male,0,0


In [12]:
logreg_acc_dev = accuracy_score(y_dev, y_pred_dev)
logreg_rec_dev = recall_score(y_dev, y_pred_dev)
logreg_pre_dev = precision_score(y_dev, y_pred_dev)
logreg_f1_dev = f1_score(y_dev, y_pred_dev)
logreg_eval_dev = pd.DataFrame({'accuracy': [logreg_acc_dev], 'recall': [logreg_rec_dev], 'precision': [logreg_pre_dev], 'f1': [logreg_f1_dev]}, index=['dev'])
# logreg_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_eval.csv'),sep='\t')
logreg_eval_dev

Unnamed: 0,accuracy,recall,precision,f1
dev,0.785809,0.764398,0.55619,0.643881


In [13]:
logreg_fair_eval_dev = pd.DataFrame()
logreg_fair_eval_dev['positive rates'] = positive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true positive rates'] = true_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true negative rates'] = true_negative_rate(logreg_results_dev, 'sex')
logreg_fair_eval_dev['false positive rates'] = false_postive_rates(logreg_results_dev, 'sex')
# logreg_fair_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_fair_eval.csv'),sep='\t')
logreg_fair_eval_dev


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates
Female,0.119675,0.612069,0.945977,0.054023
Male,0.459113,0.791667,0.696816,0.303184


## Test evaluation

In [14]:
y_pred_test_2018 = list(logreg.predict(x_test_2018_encoded))

logreg_results_test_2018 = test_2018.copy() #get all the data of the dev set

logreg_results_test_2018 = logreg_results_test_2018.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_test_2018['y_pred'] = y_pred_test_2018
logreg_results_test_2018

Unnamed: 0,education,marital.status,sex,income,y_pred
0,Some-college,Never-married,Female,0,0
1,HS-grad,Never-married,Male,0,0
2,HS-grad,Never-married,Male,0,0
3,Some-college,Never-married,Female,0,0
4,Some-college,Never-married,Male,0,0
...,...,...,...,...,...
1667190,HS-grad,Never-married,Male,0,0
1667191,11th,Never-married,Male,0,0
1667192,Some-college,Divorced,Female,0,0
1667193,HS-grad,Married,Male,0,0


In [15]:
logreg_acc_test_2018 = accuracy_score(y_test_2018, y_pred_test_2018)
logreg_rec_test_2018 = recall_score(y_test_2018, y_pred_test_2018)
logreg_pre_test_2018 = precision_score(y_test_2018, y_pred_test_2018)
logreg_f1_test_2018 = f1_score(y_test_2018, y_pred_test_2018)
logreg_eval_test_2018 = pd.DataFrame({'accuracy': [logreg_acc_test_2018], 'recall': [logreg_rec_test_2018], 'precision': [logreg_pre_test_2018], 'f1': [logreg_f1_test_2018]}, index=['test'])
logreg_eval_test_2018.to_csv(os.path.join('Results', 'weighted', 'test_weighted_2018_eval.csv'),sep='\t')
logreg_eval_test_2018

Unnamed: 0,accuracy,recall,precision,f1
test,0.691928,0.698271,0.566174,0.625322


In [16]:
logreg_fair_eval_test_2018 = pd.DataFrame()
logreg_fair_eval_test_2018['positive rates'] = positive_rates(logreg_results_test_2018, 'sex')
logreg_fair_eval_test_2018['true positive rates'] = true_postive_rates(logreg_results_test_2018, 'sex')
logreg_fair_eval_test_2018['true negative rates'] = true_negative_rate(logreg_results_test_2018, 'sex')
logreg_fair_eval_test_2018['false positive rates'] = false_postive_rates(logreg_results_test_2018, 'sex')

logreg_fair_eval_test_2018['equalized odds ratio'] = logreg_fair_eval_test_2018['true positive rates']['Male']/logreg_fair_eval_test_2018['true positive rates']['Female']
logreg_fair_eval_test_2018['equalized odds diff'] = logreg_fair_eval_test_2018['true positive rates']['Male']-logreg_fair_eval_test_2018['true positive rates']['Female']

logreg_fair_eval_test_2018.to_csv(os.path.join('Results', 'weighted', 'test_weighted_2018_fair_eval.csv'),sep='\t')
logreg_fair_eval_test_2018


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates,equalized odds ratio,equalized odds diff
Female,0.43075,0.65936,0.66231,0.33769,1.094663,0.062417
Male,0.475524,0.721777,0.718552,0.281448,1.094663,0.062417
