In [39]:
import pandas as pd
import os
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer




# reproduce DivExplorer


In [40]:


inputDir=os.path.join(".", "datasets")


df= pd.read_csv(os.path.join(inputDir, "compas_discretized.csv"))
class_map={'N': 0, 'P': 1}
df.head()



Unnamed: 0,age,charge,race,sex,#prior,stay,class,predicted
0,>45,F,Other,Male,0,<week,0,0
1,25-45,F,Afr-Am,Male,0,1w-3M,1,0
2,<25,F,Afr-Am,Male,>3,<week,1,0
3,25-45,M,Other,Male,0,<week,0,0
4,25-45,F,Cauc,Male,>3,<week,1,0


In [41]:

len(df)


6172

In [42]:

import time

time1 = time.time()
min_sup=0.01 # 61.72 size threshold
# Input: a discretized dataframe with the true class and the predicted class.
# We specify their column names in the dataframe
# The class_map is a dictionary to specify the positive and the negative class (e.g. {"P":1, "N":0})
fp_diver=FP_DivergenceExplorer(df,"class", "predicted", class_map=class_map)
#Extract frequent patterns (FP) and compute divergence
##min_support: minimum support threshold
##metrics: metrics=["d_fpr", "d_fnr"]
# (default metric of interest: False Positive Rate (FPR) d_fpr, False Negative Rate (FNR) d_fnr, Accuracy divergence)
FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_fpr"])
time2 = time.time()
print("running time = {}s".format(time2 - time1))

running time = 1.1769459247589111s


In [43]:
print(f"Number of frequent patterns: {len(FP_fm)}")

Number of frequent patterns: 969


In [44]:
FP_fm.head()

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
0,1.0,(),3066,297,1962,847,0,6172.0,0.088314,0.0,0.0
1,0.809624,(sex=Male),2357,244,1647,749,1,4997.0,0.09381,0.005496,0.738697
2,0.772683,(stay=<week),2589,201,1487,492,1,4769.0,0.072043,-0.016271,2.339352
3,0.643227,(charge=F),1772,214,1307,677,1,3970.0,0.107754,0.01944,2.301183
4,0.614226,"(stay=<week, sex=Male)",1959,162,1236,434,2,3791.0,0.076379,-0.011935,1.555571


In [45]:

FP_fm_unfair = FP_fm[FP_fm["d_fpr"] > 0.1]
len(FP_fm_unfair)


196

In [46]:
FP_fm_unfair[:10]

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
28,0.293422,(#prior=>3),470,132,670,539,1,1811.0,0.219269,0.130955,7.503321
35,0.256643,"(sex=Male, #prior=>3)",401,115,588,480,2,1584.0,0.222868,0.134554,7.1467
43,0.21954,"(#prior=>3, charge=F)",331,97,488,439,2,1355.0,0.226636,0.138322,6.702442
49,0.207226,"(#prior=>3, age=25-45)",297,100,469,413,2,1279.0,0.251889,0.163575,7.384973
55,0.196695,"(race=Afr-Am, #prior=>3)",283,100,403,428,2,1214.0,0.261097,0.172783,7.582039
56,0.194426,"(stay=<week, #prior=>3)",349,82,457,312,2,1200.0,0.190255,0.101941,5.283437
58,0.190376,"(sex=Male, #prior=>3, charge=F)",277,83,423,392,3,1175.0,0.230556,0.142242,6.323414
62,0.180655,"(sex=Male, #prior=>3, age=25-45)",254,86,405,370,3,1115.0,0.252941,0.164627,6.903692
64,0.178386,"(sex=Male, age=<25)",357,83,414,247,2,1101.0,0.188636,0.100322,5.265747
67,0.175308,"(sex=Male, race=Afr-Am, #prior=>3)",239,87,369,387,3,1082.0,0.266871,0.178557,7.214382


## use our algorithm on this dataset

In [47]:

inputDir=os.path.join(".", "datasets")


df= pd.read_csv(os.path.join(inputDir, "compas_discretized.csv"))
print(len(df))
df[:5]

6172


Unnamed: 0,age,charge,race,sex,#prior,stay,class,predicted
0,>45,F,Other,Male,0,<week,0,0
1,25-45,F,Afr-Am,Male,0,1w-3M,1,0
2,<25,F,Afr-Am,Male,>3,<week,1,0
3,25-45,M,Other,Male,0,<week,0,0
4,25-45,F,Cauc,Male,>3,<week,1,0


In [48]:

TP = df[(df['class'] == 1) & (df['predicted'] == 1)]
FP = df[(df['class'] == 0) & (df['predicted'] == 1)]
TN = df[(df['class'] == 0) & (df['predicted'] == 0)]
FN = df[(df['class'] == 1) & (df['predicted'] == 0)]

print(len(TP) + len(FP) + len(TN) + len(FN))

selected_attributes = ['age', 'charge', 'race', 'sex', '#prior', 'stay']
df = df[selected_attributes]
TP = TP[selected_attributes]
TN = TN[selected_attributes]
FP = FP[selected_attributes]
FN = FN[selected_attributes]

6172


In [49]:
%reload_ext autoreload

%autoreload 2

from Algorithms import NewAlgGeneral_SizeFairnessValue_2_20210528 as newalg

thc = 61.72
time_limit = 5 * 60
fairness_definition = 1  # FPR = FP/(FP+TN) False_positive_error_rate_balance, but for those treated too well
delta_thf = 0.1
pattern_with_low_fairness1, sizes_of_patterns, fairness_values_of_patterns, \
num_patterns, t1_ = newalg.GraphTraverse(df,
                                         TP, TN, FP, FN, delta_thf,
                                         thc, time_limit, fairness_definition)


print("newalg, time = {} s, num_calculation = {}\n".format(t1_, num_patterns))
print("num of patterns detected = {}".format(len(pattern_with_low_fairness1)))
for i in range(len(pattern_with_low_fairness1)):
    print("{} {} {}\n".format(str(pattern_with_low_fairness1[i]),
                              sizes_of_patterns[i], fairness_values_of_patterns[i]))



False_positive_error_rate_balance, original_thf = 0.08831400535236396, Thf = 0.18831400535236398
newalg, time = 0.33303213119506836 s, num_calculation = 1343

num of patterns detected = 10
[-1, -1, -1, -1, -1, '>3Months'] 310 0.224

[-1, -1, -1, -1, '>3', -1] 1811 0.21926910299003322

[-1, -1, 'Afr-Am', -1, -1, '1w-3M'] 585 0.20772946859903382

[-1, -1, 'Hispanic', 'Male', -1, '1w-3M'] 68 0.2

['<25', -1, -1, -1, -1, '1w-3M'] 232 0.35064935064935066

['<25', -1, -1, -1, '[1,3]', -1] 595 0.21395348837209302

['<25', -1, -1, 'Male', -1, -1] 1101 0.18863636363636363

['<25', -1, 'Afr-Am', -1, -1, -1] 809 0.20754716981132076

['<25', -1, 'Hispanic', -1, -1, -1] 109 0.21428571428571427

['<25', 'F', -1, -1, -1, -1] 968 0.20347394540942929



## whether FP_fm_unfair are descendents of pattern_with_low_fairness1

In [57]:

def P1DominatedByP2(P1, P2):
    length = len(P1)
    for i in range(length):
        if P1[i] == -1:
            if P2[i] != -1:
                return False
        if P1[i] != -1:
            if P2[i] != P1[i] and P2[i] != -1:
                return False
    return True

# whether a pattern P is dominated by MUP M
# except from P itself
def PDominatedByM(P, M):
    for m in M:
        if PatternEqual(m, P):
            return True, m
        if P1DominatedByP2(P, m):
            return True, m
    return False, None



def PatternEqual(m, P):
    length = len(m)
    if len(P) != length:
        return False
    for i in range(length):
        if m[i] != P[i]:
            return False
    return True


for p in FP_fm_unfair:
    if PDominatedByM(p, pattern_with_low_fairness1):
        continue
    else:
        raise Exception("noooo")







# reproduce ProPublica by DivExplorer and our algorithm

## reproduce ProPublica by DivExplorer

In [50]:


original_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed_7214rows_cat.csv"
TP_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-TP-cat.csv"
FP_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-FP-cat.csv"
TN_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-TN-cat.csv"
FN_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-FN-cat.csv"




In [51]:

data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed_7214rows_with_labels.csv"

df= pd.read_csv(data_file)
class_map={'N': 0, 'P': 1}
print(len(df))
df.head()


7214


Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,predicted,ground_truth
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,0,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,Medium,2013-01-13,,,1,0,1174,0,1,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,Low,2013-03-26,,,2,0,1102,0,0,0


In [52]:

ignore_columns = df.columns.to_list()
ignore_columns.remove("sex")
ignore_columns.remove("race")
ignore_columns.remove("age_cat")
ignore_columns.remove("predicted")
ignore_columns.remove("ground_truth")


In [53]:

# def __init__(
#     self,
#     X_discrete,
#     true_class_name,
#     predicted_class_name=None,
#     class_map={},
#     ignore_cols=[],
#     log_loss_values=None,
#     clf=None,
#     dataset_name="",
#     type_cl="",
# ):


min_sup=0.01 # 61.72 size threshold
# Input: a discretized dataframe with the true class and the predicted class.
# We specify their column names in the dataframe
# The class_map is a dictionary to specify the positive and the negative class (e.g. {"P":1, "N":0})

import time
time1 = time.time()
fp_diver=FP_DivergenceExplorer(df,"ground_truth", "predicted", class_map=class_map, ignore_cols=ignore_columns)
#Extract frequent patterns (FP) and compute divergence
##min_support: minimum support threshold
##metrics: metrics=["d_fpr", "d_fnr"]
# (default metric of interest: False Positive Rate (FPR) d_fpr, False Negative Rate (FNR) d_fnr, Accuracy divergence)
FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_fpr"])
FP_fm_unfair = FP_fm[FP_fm["d_fpr"] > 0.1]
time2 = time.time()
print(f"Number of frequent patterns: {len(FP_fm)}")
print("number of unfair patterns = {}".format(len(FP_fm_unfair)))
print("running time = {}".format(time2 - time1))
print("{}".format(FP_fm_unfair["itemsets"]))


Number of frequent patterns: 51
number of unfair patterns = 15
running time = 0.11346006393432617
3                               (race=African-American)
5                     (race=African-American, sex=Male)
7              (race=African-American, age_cat=25 - 45)
9     (race=African-American, sex=Male, age_cat=25 -...
11                               (age_cat=Less than 25)
15                     (sex=Male, age_cat=Less than 25)
17        (race=African-American, age_cat=Less than 25)
20    (race=African-American, sex=Male, age_cat=Less...
29               (age_cat=Less than 25, race=Caucasian)
37                   (sex=Female, age_cat=Less than 25)
41    (race=African-American, sex=Female, age_cat=Le...
43                (age_cat=Less than 25, race=Hispanic)
45      (sex=Male, age_cat=Less than 25, race=Hispanic)
48    (sex=Female, age_cat=Less than 25, race=Caucas...
50                   (age_cat=Less than 25, race=Other)
Name: itemsets, dtype: object


In [56]:

FP_fm_unfair


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
3,0.512337,(race=African-American),990,805,532,1369,1,3696.0,0.448468,0.124976,8.99931
5,0.421957,"(race=African-American, sex=Male)",749,641,458,1196,2,3044.0,0.461151,0.137659,9.005014
7,0.304131,"(race=African-American, age_cat=25 - 45)",606,478,307,803,2,2194.0,0.440959,0.117467,6.996436
9,0.249376,"(race=African-American, sex=Male, age_cat=25 -...",451,389,255,704,3,1799.0,0.463095,0.139603,7.460524
11,0.211949,(age_cat=Less than 25),305,360,225,639,1,1529.0,0.541353,0.217861,10.534022
15,0.172027,"(sex=Male, age_cat=Less than 25)",238,252,205,546,2,1241.0,0.514286,0.190793,8.042809
17,0.12753,"(race=African-American, age_cat=Less than 25)",144,215,130,431,2,920.0,0.598886,0.275393,10.246106
20,0.104103,"(race=African-American, sex=Male, age_cat=Less...",110,156,115,370,3,751.0,0.586466,0.262974,8.476221
29,0.054062,"(age_cat=Less than 25, race=Caucasian)",102,97,51,140,2,390.0,0.487437,0.163945,4.561988
37,0.039922,"(sex=Female, age_cat=Less than 25)",67,108,20,93,2,288.0,0.617143,0.293651,7.854487


## reproduce ProPublica by our algorithm (FPR)

In [55]:

import pandas as pd
from Algorithms import NewAlgGeneral_SizeFairnessValue_2_20210528 as newalg

"""
cox['sex'].replace(to_replace=['Male', 'Female'], value=[0, 1], inplace=True)


cox['age_cat'].replace(to_replace=['Less than 25', '25 - 45', 'Greater than 45'], value=[0, 1, 2], inplace=True)

cox['race'].replace(to_replace=['African-American', 'Asian', 'Caucasian', 'Hispanic', 'Native American', 'Other'], value=[0, 1, 2, 3, 4, 5], inplace=True)


"""

selected_attributes = ["sex", "age_cat", "race"]

original_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed_7214rows_cat.csv"
TP_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-TP-cat.csv"
FP_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-FP-cat.csv"
TN_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-TN-cat.csv"
FN_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-FN-cat.csv"

output_path = r'../../../../OutputData/CaseStudy/ProPublica/fp_greater_than_trying.txt'
output_file = open(output_path, "w")

output_file.write("selected_attributes: {}\n".format(selected_attributes))


def read_with_att(original_data_file, selected_attributes):
    original_data = pd.read_csv(original_data_file)
    less_attribute_data = original_data[selected_attributes]
    return less_attribute_data


less_attribute_data = read_with_att(original_data_file, selected_attributes)
TP = read_with_att(TP_data_file, selected_attributes)
FP = read_with_att(FP_data_file, selected_attributes)
TN = read_with_att(TN_data_file, selected_attributes)
FN = read_with_att(FN_data_file, selected_attributes)


# thc = 3696 this is the max thc to find black [-1, -1, 0]
thc = 20
time_limit = 5 * 60
# fairness_definition = 1 # FPR = FP/(FP+TN) False_positive_error_rate_balance

fairness_definition = 1  # FPR = FP/(FP+TN) False_positive_error_rate_balance, but for those treated too well
delta_thf = 0.1

output_file.write("fairness_definition = {}, thc = {}, delta_thf = {}\n".format(fairness_definition, thc, delta_thf))


pattern_with_low_fairness1, sizes_of_patterns, fairness_values_of_patterns, \
num_patterns, t1_ = newalg.GraphTraverse(less_attribute_data,
                                         TP, TN, FP, FN, delta_thf,
                                         thc, time_limit, fairness_definition)

print("newalg, time = {} s, num_calculation = {}\n".format(t1_, num_patterns))
print("num of patterns detected = {}".format(len(pattern_with_low_fairness1)))
for i in range(len(pattern_with_low_fairness1)):
    print("{} {} {}\n".format(str(pattern_with_low_fairness1[i]),
                              sizes_of_patterns[i], fairness_values_of_patterns[i]))


output_file.write("newalg, time = {} s, num_calculation = {}\n".format(t1_, num_patterns))
output_file.write("num of patterns detected = {}\n".format(len(pattern_with_low_fairness1)))
for i in range(len(pattern_with_low_fairness1)):
    output_file.write("{} {} {}\n".format(str(pattern_with_low_fairness1[i]),
                                          sizes_of_patterns[i], fairness_values_of_patterns[i]))

# pattern_with_low_accuracy2, calculation2_, t2_ = naivealg.NaiveAlg(less_attribute_data,
#                                                                    mis_class_data, tha,
#                                                                    thc, time_limit)
# print("naivealg, time = {} s, num_calculation = {}".format(t2_, calculation2_), "\n",
#       pattern_with_low_accuracy2)





False_positive_error_rate_balance, original_thf = 0.32349230381024474, Thf = 0.4234923038102447
newalg, time = 0.21779084205627441 s, num_calculation = 65

num of patterns detected = 2
[-1, -1, 0] 3696 0.44846796657381616

[-1, 0, -1] 1529 0.5413533834586466






# use data from ProPublica but do FNR

## our algorithm

In [16]:
import pandas as pd
from Algorithms import NewAlgGeneral_1_20210528 as newalg

"""
cox['sex'].replace(to_replace=['Male', 'Female'], value=[0, 1], inplace=True)


cox['age_cat'].replace(to_replace=['Less than 25', '25 - 45', 'Greater than 45'], value=[0, 1, 2], inplace=True)

cox['race'].replace(to_replace=['African-American', 'Asian', 'Caucasian', 'Hispanic', 'Native American', 'Other'], value=[0, 1, 2, 3, 4, 5], inplace=True)


"""

selected_attributes = ["sex", "age_cat", "race"]

original_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed_7214rows_cat.csv"
TP_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-TP-cat.csv"
FP_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-FP-cat.csv"
TN_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-TN-cat.csv"
FN_data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed-FN-cat.csv"

output_path = r'../../../../OutputData/CaseStudy/ProPublica/fp_greater_than_trying.txt'
output_file = open(output_path, "w")

output_file.write("selected_attributes: {}\n".format(selected_attributes))



def read_with_att(original_data_file, selected_attributes):
    original_data = pd.read_csv(original_data_file)
    less_attribute_data = original_data[selected_attributes]
    return less_attribute_data


less_attribute_data = read_with_att(original_data_file, selected_attributes)
TP = read_with_att(TP_data_file, selected_attributes)
FP = read_with_att(FP_data_file, selected_attributes)
TN = read_with_att(TN_data_file, selected_attributes)
FN = read_with_att(FN_data_file, selected_attributes)




thc = 20
time_limit = 5 * 60
# fairness_definition = 1 # FPR = FP/(FP+TN) False_positive_error_rate_balance

fairness_definition = 2 # FNR = FP/(FP+TN) False_positive_error_rate_balance, but for those treated too well
delta_thf = 0.1



output_file.write("fairness_definition = {}, thc = {}, delta_thf = {}\n".format(fairness_definition, thc, delta_thf))

print("less_attribute_data")
print(less_attribute_data)

pattern_with_low_fairness1, calculation1_, t1_ = newalg.GraphTraverse(less_attribute_data,
                                                                      TP, TN, FP, FN, delta_thf,
                                                                      thc, time_limit, fairness_definition)


print("newalg, time = {} s, num_calculation = {}\n".format(t1_, calculation1_))
print("num of patterns detected = {}".format(len(pattern_with_low_fairness1)))
for p in pattern_with_low_fairness1:
    print(p)

output_file.write("newalg, time = {} s, num_calculation = {}\n".format(t1_, calculation1_))
output_file.write("num of patterns detected = {}\n".format(len(pattern_with_low_fairness1)))
for p in pattern_with_low_fairness1:
    output_file.write(str(p))
    output_file.write("\n")

# pattern_with_low_accuracy2, calculation2_, t2_ = naivealg.NaiveAlg(less_attribute_data,
#                                                                    mis_class_data, tha,
#                                                                    thc, time_limit)
# print("naivealg, time = {} s, num_calculation = {}".format(t2_, calculation2_), "\n",
#       pattern_with_low_accuracy2)

less_attribute_data
      sex  age_cat  race
0       0        2     5
1       0        1     0
2       0        0     0
3       0        0     0
4       0        1     5
...   ...      ...   ...
7209    0        0     0
7210    0        0     0
7211    0        2     5
7212    1        1     0
7213    1        0     3

[7214 rows x 3 columns]
False_negative_error_rate_balance, original_thf = 0.3740387573054445, Thf = 0.47403875730544454
newalg, time = 0.053254127502441406 s, num_calculation = 65

num of patterns detected = 4
[-1, -1, 5]
[-1, -1, 3]
[-1, -1, 2]
[-1, 2, -1]


## DivExplorer

In [25]:

# def __init__(
#     self,
#     X_discrete,
#     true_class_name,
#     predicted_class_name=None,
#     class_map={},
#     ignore_cols=[],
#     log_loss_values=None,
#     clf=None,
#     dataset_name="",
#     type_cl="",
# ):


min_sup=0.01 # 61.72 size threshold
# Input: a discretized dataframe with the true class and the predicted class.
# We specify their column names in the dataframe
# The class_map is a dictionary to specify the positive and the negative class (e.g. {"P":1, "N":0})

import time
time1 = time.time()
fp_diver=FP_DivergenceExplorer(df,"ground_truth", "predicted", class_map=class_map, ignore_cols=ignore_columns)
#Extract frequent patterns (FP) and compute divergence
##min_support: minimum support threshold
##metrics: metrics=["d_fpr", "d_fnr"]
# (default metric of interest: False Positive Rate (FPR) d_fpr, False Negative Rate (FNR) d_fnr, Accuracy divergence)
FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_accuracy"])
FP_fm_unfair = FP_fm[FP_fm["d_accuracy"] > -0.1]
time2 = time.time()
print(f"Number of frequent patterns: {len(FP_fm)}")
print("number of unfair patterns = {}".format(len(FP_fm_unfair)))
print("running time = {}".format(time2 - time1))
print("{}".format(FP_fm_unfair["itemsets"]))

Number of frequent patterns: 51
number of unfair patterns = 49
running time = 0.11466479301452637
0                                                    ()
1                                            (sex=Male)
2                                     (age_cat=25 - 45)
3                               (race=African-American)
4                           (age_cat=25 - 45, sex=Male)
5                     (sex=Male, race=African-American)
6                                      (race=Caucasian)
7              (age_cat=25 - 45, race=African-American)
8                            (sex=Male, race=Caucasian)
9     (age_cat=25 - 45, sex=Male, race=African-Ameri...
10                            (age_cat=Greater than 45)
11                               (age_cat=Less than 25)
12                                         (sex=Female)
13                    (age_cat=25 - 45, race=Caucasian)
14                  (sex=Male, age_cat=Greater than 45)
15                     (sex=Male, age_cat=Less than 25)
16    

# Medical dataset

## DivExplorer

In [38]:

data_file = r"../../../../InputData/MedicalDataset/train/train_41att_with_labels.csv"
df= pd.read_csv(data_file)
TP = df[(df['ground_truth'] == 1) & (df['predicted'] == 1)]
TN = df[(df['ground_truth'] == 0) & (df['predicted'] == 0)]
FP = df[(df['ground_truth'] == 0) & (df['predicted'] == 1)]
FN = df[(df['ground_truth'] == 1) & (df['predicted'] == 0)]
print(len(df), len(TP), len(TN), len(FP), len(FN))



7915 942 5576 1014 383


In [None]:

data_file = r"../../../../InputData/MedicalDataset/train/train_41att_with_labels.csv"

df= pd.read_csv(data_file)
class_map={'N': 0, 'P': 1}
print(len(df))
df.head()


In [36]:

ignore_columns = df.columns.to_list()
# 13 attributes
selected_attributes = ["REGION", "SEX", "MARRY", "RACE", "FTSTU", "ACTDTY", "HONRDC", "RTHLTH", "MNHLTH", "HIBPDX", "CHDDX", "ANGIDX", "MIDX"]
ignore_columns.remove("predicted")
ignore_columns.remove("ground_truth")

for att in selected_attributes[:10]:
    ignore_columns.remove(att)


min_sup=0.014 # 110 size threshold
# Input: a discretized dataframe with the true class and the predicted class.
# We specify their column names in the dataframe
# The class_map is a dictionary to specify the positive and the negative class (e.g. {"P":1, "N":0})

import time
time1 = time.time()
fp_diver=FP_DivergenceExplorer(df,"ground_truth", "predicted", class_map=class_map, ignore_cols=ignore_columns)
#Extract frequent patterns (FP) and compute divergence
##min_support: minimum support threshold
##metrics: metrics=["d_fpr", "d_fnr"]
# (default metric of interest: False Positive Rate (FPR) d_fpr, False Negative Rate (FNR) d_fnr, Accuracy divergence)
FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_accuracy"])
FP_fm_unfair = FP_fm[FP_fm["d_accuracy"] > 0.1]
time2 = time.time()
print(f"Number of frequent patterns: {len(FP_fm)}")
print("number of unfair patterns = {}".format(len(FP_fm_unfair)))
print("running time = {}".format(time2 - time1))
print("{}".format(FP_fm_unfair["itemsets"]))



ValueError: list.remove(x): x not in list