In [1]:
import pandas as pd
import sklearn
import random, os 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from fairness_evals import positive_rates, true_postive_rates, true_negative_rate, false_postive_rates
from Dataverwerking.reweighing import getweights

# Data

In [2]:
data_old = pd.read_csv(os.path.join('Processed_data', 'adult.csv'))

In [3]:
weights = getweights(os.path.join('Processed_data', 'adult.csv'))
data_old['weights'] = weights
data_old

Unnamed: 0,age,workclass,education,marital.status,race,sex,hours.per.week,income,weights
0,82,Private,HS-grad,Widowed,White,Female,18,0,0.561784
1,54,Private,7th-8th,Divorced,White,Female,40,0,0.733710
2,41,Private,Some-college,Separated,White,Female,40,0,0.621668
3,34,Private,HS-grad,Divorced,White,Female,45,0,0.733710
4,38,Private,10th,Separated,White,Male,40,0,0.696527
...,...,...,...,...,...,...,...,...,...
30157,22,Private,Some-college,Never-married,White,Male,40,0,1.041887
30158,27,Private,Assoc,Married-civ-spouse,White,Female,38,0,4.033258
30159,40,Private,HS-grad,Married,White,Male,40,1,0.660456
30160,58,Private,HS-grad,Widowed,White,Female,40,0,0.561784


In [4]:
data_2018 = pd.read_csv(os.path.join('Processed_data', 'data_2018.csv'))

In [5]:
DATA = data_old 
# DATA.drop(columns=['occupation'])
DATA_NAME = 'old_weighted' #! don't forget to change this accordingly 
FEAT_OF_INT = ['education', 'marital.status', 'sex', 'income'] #The features we are interested in
DATA.head()

Unnamed: 0,age,workclass,education,marital.status,race,sex,hours.per.week,income,weights
0,82,Private,HS-grad,Widowed,White,Female,18,0,0.561784
1,54,Private,7th-8th,Divorced,White,Female,40,0,0.73371
2,41,Private,Some-college,Separated,White,Female,40,0,0.621668
3,34,Private,HS-grad,Divorced,White,Female,45,0,0.73371
4,38,Private,10th,Separated,White,Male,40,0,0.696527


## Encoder for categorical values

In [6]:
from sklearn.preprocessing import OneHotEncoder
cat_feats = list(DATA.select_dtypes("object").keys())  #names of all categorical features
# cat_feats.remove('income') #keep these values as is 
print(f"{len(cat_feats)} categorical features found: {cat_feats}")

cat_feat_encoder = OneHotEncoder(sparse_output=False, drop='first').set_output(transform="pandas")
cat_feat_encoder = cat_feat_encoder.fit(DATA[cat_feats])

5 categorical features found: ['workclass', 'education', 'marital.status', 'race', 'sex']


## Split train, dev and test

In [7]:
train, dev = train_test_split(DATA, random_state=1989)
test = data_2018
print(f"Train size: {len(train)}, dev size: {len(dev)}, test size: {len(test)}")

Train size: 22621, dev size: 7541, test size: 1667195


In [8]:
x_train = train.drop(columns=['income', 'weights'])
y_train = train['income']
weights_train = train['weights']

x_dev  = dev.drop(columns=['income', 'weights'])
y_dev  = dev['income']

x_test  = test.drop(columns=['income'])
y_test  = test['income']

In [9]:
from Dataverwerking.data_transform import cat_to_one_hot
x_train_encoded = cat_to_one_hot(x_train, cat_feats, cat_feat_encoder)
x_dev_encoded = cat_to_one_hot(x_dev, cat_feats, cat_feat_encoder)
x_test_encoded = cat_to_one_hot(x_test, cat_feats, cat_feat_encoder)

# Logistic regression

**params logreg**
* Penalty (=L2)
* C (=1): regularization strenght -> set with dev set 
* solver: ?

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import FixedThresholdClassifier
logreg = LogisticRegression(solver = 'lbfgs', max_iter=250, class_weight={0: 0.32, 1: 0.68}).fit(x_train_encoded, y_train, sample_weight= weights_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Dev evaluation

In [11]:
y_pred_dev= list(logreg.predict(x_dev_encoded))

logreg_results_dev = dev.copy() #get all the data of the dev set

logreg_results_dev = logreg_results_dev.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_dev['y_pred'] = y_pred_dev
logreg_results_dev

Unnamed: 0,education,marital.status,sex,income,y_pred
15688,Bachelors,Divorced,Female,0,0
4719,HS-grad,Married,Male,1,1
12320,HS-grad,Married,Male,0,0
9038,HS-grad,Divorced,Male,0,0
22761,HS-grad,Married,Male,0,1
...,...,...,...,...,...
1489,Masters,Divorced,Male,1,1
13261,Masters,Married,Male,1,1
29103,11th,Married,Male,0,0
26107,Bachelors,Married,Male,0,1


In [12]:
logreg_acc_dev = accuracy_score(y_dev, y_pred_dev)
logreg_rec_dev = recall_score(y_dev, y_pred_dev)
logreg_pre_dev = precision_score(y_dev, y_pred_dev)
logreg_f1_dev = f1_score(y_dev, y_pred_dev)
logreg_eval_dev = pd.DataFrame({'accuracy': [logreg_acc_dev], 'recall': [logreg_rec_dev], 'precision': [logreg_pre_dev], 'f1': [logreg_f1_dev]}, index=['dev'])
logreg_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_eval.csv'),sep='\t')
logreg_eval_dev

Unnamed: 0,accuracy,recall,precision,f1
dev,0.798833,0.766072,0.566747,0.651505


In [13]:
logreg_fair_eval_dev = pd.DataFrame()
logreg_fair_eval_dev['positive rates'] = positive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true positive rates'] = true_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev['true negative rates'] = true_negative_rate(logreg_results_dev, 'sex')
logreg_fair_eval_dev['false positive rates'] = false_postive_rates(logreg_results_dev, 'sex')
logreg_fair_eval_dev.to_csv(os.path.join('Results', f'dev_{DATA_NAME}_fair_eval.csv'),sep='\t')
logreg_fair_eval_dev


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates
Female,0.118644,0.565836,0.940131,0.059869
Male,0.432448,0.801911,0.730856,0.269144


## Test evaluation

In [14]:
y_pred_test= list(logreg.predict(x_test_encoded))

logreg_results_test = test.copy() #get all the data of the dev set

logreg_results_test = logreg_results_test.filter(items=FEAT_OF_INT) #keep only relevant info
logreg_results_test['y_pred'] = y_pred_test
logreg_results_test

Unnamed: 0,education,marital.status,sex,income,y_pred
0,Some-college,Never-married,Female,0,0
1,HS-grad,Never-married,Male,0,0
2,HS-grad,Never-married,Male,0,0
3,Some-college,Never-married,Female,0,0
4,Some-college,Never-married,Male,0,0
...,...,...,...,...,...
1667190,HS-grad,Never-married,Male,0,0
1667191,11th,Never-married,Male,0,0
1667192,Some-college,Divorced,Female,0,0
1667193,HS-grad,Married,Male,0,0


In [15]:
logreg_acc_test = accuracy_score(y_test, y_pred_test)
logreg_rec_test = recall_score(y_test, y_pred_test)
logreg_pre_test = precision_score(y_test, y_pred_test)
logreg_f1_test = f1_score(y_test, y_pred_test)
logreg_eval_test = pd.DataFrame({'accuracy': [logreg_acc_test], 'recall': [logreg_rec_test], 'precision': [logreg_pre_test], 'f1': [logreg_f1_test]}, index=['test'])
logreg_eval_test.to_csv(os.path.join('Results', 'test_weighted_2018_eval.csv'),sep='\t')
logreg_eval_test

Unnamed: 0,accuracy,recall,precision,f1
test,0.690234,0.696479,0.564255,0.623433


In [16]:
logreg_fair_eval_test = pd.DataFrame()
logreg_fair_eval_test['positive rates'] = positive_rates(logreg_results_test, 'sex')
logreg_fair_eval_test['true positive rates'] = true_postive_rates(logreg_results_test, 'sex')
logreg_fair_eval_test['true negative rates'] = true_negative_rate(logreg_results_test, 'sex')
logreg_fair_eval_test['false positive rates'] = false_postive_rates(logreg_results_test, 'sex')
logreg_fair_eval_test.to_csv(os.path.join('Results', 'test_weighted_2018_eval.csv'),sep='\t')
logreg_fair_eval_test


Unnamed: 0,positive rates,true positive rates,true negative rates,false positive rates
Female,0.432979,0.660186,0.65951,0.34049
Male,0.474193,0.718403,0.718274,0.281726
