In [1]:
#general imports 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
import random
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# imports for FMCLP algorithm
from final_fmclp_26_09_2022 import cuae, fmclp, synthetic_dataset

In [2]:
# preprocessing one of compas datasets

# Read the raw COMPAS scores file and remove, in a single call, every column
# that is not kept for the rest of the notebook.
# NOTE(review): RawScore / DecileScore look like the numeric form of ScoreText
# (the later target) - presumably dropped to avoid leakage; confirm.
d = pd.read_csv('compas-scores-raw.csv').drop(
    columns=[
        'Person_ID',
        'AssessmentID',
        'Case_ID',
        'LastName',
        'MiddleName',
        'FirstName',
        'RawScore',
        'DecileScore',
        'IsCompleted',
        'IsDeleted',
        'AssessmentReason',
        'RecSupervisionLevelText',
        'DisplayText',
        'Screening_Date',
        'DateOfBirth',
    ]
)

def race(x):
    """Binary-encode an ethnicity label.

    Returns 1 when ``x`` equals 'African-American' and 0 for any other value.
    """
    return int(x == 'African-American')
# Swap the textual ethnicity column for the binary 'attr' column
# (pop removes the source column and hands it to apply in one step).
d['attr'] = d.pop('Ethnic_Code_Text').apply(race)

def targeter(x):
    """Map a score label to an integer class.

    'Low' -> 0, 'Medium' -> 1, and every other value (including missing
    values) -> 2.
    """
    if x == 'Low':
        return 0
    return 1 if x == 'Medium' else 2
# Swap the textual score column for the integer 'target' column, then
# one-hot encode whatever categorical columns remain (first level dropped).
d['target'] = d.pop('ScoreText').apply(targeter)
d = pd.get_dummies(d, drop_first=True)

In [3]:
#fit LGBM classifier
# NOTE: the names are inverted relative to the usual convention -
# `y` holds the feature matrix and `x` holds the target labels.
# They are kept as-is because other cells may refer to them.
y = d.drop('target',axis=1)
x = d['target']

# A fixed random_state makes the 70/30 hold-out split, and therefore the
# accuracy displayed below, reproducible across kernel restarts.
y_train,y_test,x_train,x_test = train_test_split(y,x,test_size = 0.3, random_state = 0)

estimator = LGBMClassifier()
estimator.fit(y_train,x_train)
estimator_pred= estimator.predict(y_test)
# accuracy_score is documented as (y_true, y_pred): true labels go first.
accuracy_score(x_test, estimator_pred)

0.8314249712376047

In [4]:
%%time

#experiment
# Experiment settings are defined once so that the fmclp call, the per-trial
# reports and the summary report always describe the same configuration.
N_TRIALS = 100
NUMBER_ITERATIONS = 20
MULTIPLIER = 35
INTERIOR_CLASSIFIER = 'knn'

# Per-trial metrics of the fair classifier returned by fmclp ...
fair_diff = []
fair_ratio = []
fair_variation = []
fair_accuracy = []
# ... and of the initial classifier, for comparison.
unfair_diff = []
unfair_ratio = []
unfair_variation = []
unfair_accuracy = []

results = []
for i in range(N_TRIALS):
    # A fresh random state per trial; it is written to the trial report below
    # so that an individual trial can be re-run.
    main_state = random.choice(range(1000))
    res = fmclp(dataset = d, estimator = estimator, number_iterations = NUMBER_ITERATIONS, prefit = False,
                interior_classifier = INTERIOR_CLASSIFIER, random_state = main_state,
                verbose = False, multiplier = MULTIPLIER)
    results.append(res)

    initial_fairness = res['fairness_of_initial_classifier']
    fair_fairness = res['fairness_of_fair_classifier']

    unfair_diff.append(initial_fairness['diff'])
    unfair_ratio.append(initial_fairness['ratio'])
    unfair_variation.append(initial_fairness['variation'])
    unfair_accuracy.append(res['accuracy_of_initial_classifier'])

    fair_diff.append(fair_fairness['diff'])
    fair_ratio.append(fair_fairness['ratio'])
    fair_variation.append(fair_fairness['variation'])
    fair_accuracy.append(res['accuracy_of_fair_classifier'])

    # Per-trial text report; the context manager closes the file even if the
    # write raises. The target directory is expected to exist already.
    name = f"compas_trials/trial_№{i+1}.txt"
    with open(name,'w') as file:
        file.write(f"""unfair_total_diff: {initial_fairness['diff']}
unfair_ratio: {initial_fairness['ratio']}
unfair_variation: {initial_fairness['variation']}
unfair_accuracy: {res['accuracy_of_initial_classifier']}
fair_diff: {fair_fairness['diff']}
fair_ratio: {fair_fairness['ratio']}
fair_variation: {fair_fairness['variation']}
fair_accuracy: {res['accuracy_of_fair_classifier']}
interior_classifier: {INTERIOR_CLASSIFIER}
multiplier: {MULTIPLIER}
main_state: {main_state}
    """)
    # Per-trial metric tables for the fair and the initial classifier.
    fair_fairness['df'].to_csv(f"compas_trials/compas_trial_№{i+1} cuae-metric-fair.csv")
    initial_fairness['df'].to_csv(f"compas_trials/compas_trial_№{i+1} cuae-metric-unfair.csv")
    print(i+1)

# Convert to arrays so later cells can call .mean() on them.
fair_diff = np.array(fair_diff)
fair_ratio = np.array(fair_ratio)
fair_variation = np.array(fair_variation)
fair_accuracy = np.array(fair_accuracy)
unfair_diff = np.array(unfair_diff)
unfair_ratio = np.array(unfair_ratio)
unfair_variation = np.array(unfair_variation)
unfair_accuracy = np.array(unfair_accuracy)

# Summary report. number_iterations / multiplier / interior_classifier are
# taken from the settings actually passed to fmclp (they were previously
# hard-coded to values that did not match the run).
# NOTE(review): the "200" on the first line is not derived from anything in
# this cell - confirm it is still accurate.
with open('compas_trials/compas_trials.txt','w') as file:
    file.write(
f"""dataset for initial classifier training: 200 
classifier: LGBMClassifier()
number_iterations: {NUMBER_ITERATIONS}
multiplier:{MULTIPLIER}
interior_classifier: {INTERIOR_CLASSIFIER}
fair_diff: {fair_diff}
fair_ratio: {fair_ratio}
fair_variation: {fair_variation}
fair_accuracy: {fair_accuracy}
unfair_diff: {unfair_diff}
unfair_ratio: {unfair_ratio}
unfair_variation: {unfair_variation}
unfair_accuracy: {unfair_accuracy}""")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
CPU times: user 2h 51min 6s, sys: 41min 29s, total: 3h 32min 35s
Wall time: 2h 9min 59s


In [5]:
# Average 'diff' value over all trials: (fair classifier, initial classifier).
np.mean(fair_diff), np.mean(unfair_diff)

(0.15045373668539327, 0.1467769521024428)

In [6]:
# Average 'ratio' value over all trials: (fair classifier, initial classifier).
np.mean(fair_ratio), np.mean(unfair_ratio)

(1.3323821901573902, 1.3537608015695406)

In [7]:
# Average 'variation' value over all trials: (fair classifier, initial classifier).
np.mean(fair_variation), np.mean(unfair_variation)

(0.5048978810015841, 0.5701714157738056)

In [9]:
# Average accuracy over all trials: (fair classifier, initial classifier).
np.mean(fair_accuracy), np.mean(unfair_accuracy)

(0.8262152389717967, 0.8327006771415424)