In [None]:
import pandas as pd

# Raw table as whitespace-separated text: one header line followed by one record per line.
data = """Occupation City Criminal Record Loan Approved
retailer Köln no yes
retailer Aachen no no
retailer Aachen yes no
doctor Aachen yes no
doctor Aachen no yes
doctor Düsseldorf no yes"""

# Separate the header line from the record lines.
header_line, *record_lines = data.splitlines()
header_tokens = header_line.split()
# Splitting on whitespace broke the two-word column names apart, so tokens
# 2-3 and 4-5 are joined back together with a space.
columns = header_tokens[:2] + [" ".join(header_tokens[2:4]), " ".join(header_tokens[4:6])]

data = pd.DataFrame([line.split() for line in record_lines], columns=columns)

# Translate the textual flags to booleans; anything that is neither "no" nor "yes" becomes pd.NA.
map_bool = {"no":False,"yes":True}
for bool_ in ("Criminal Record","Loan Approved"):
    data[bool_] = data[bool_].map(lambda x: map_bool.get(x, pd.NA))

# The remaining two columns are stored as categoricals.
data = data.astype({"Occupation":"category","City":"category"})

# Show the table with each column's dtype appended to its header.
labelled_columns = {col:f"{col} ({data[col].dtype})" for col in data.columns}
display(data.rename(columns=labelled_columns))

In [None]:
def count_conditional(data, conditions:list[tuple[str,str]], return_df=False):
    """Select the rows of ``data`` that satisfy every equality condition.

    Args:
        data: DataFrame to filter. It is copied, so the caller's frame is
            left untouched.
        conditions: ``(column, value)`` pairs; a row matches when
            ``row[column] == value`` holds for all of them. An empty list
            matches every row.
        return_df: If true, return the matching rows instead of their count.

    Returns:
        The number of matching rows, or (with ``return_df``) the matching
        rows as a DataFrame. The index is reset both before and after
        filtering without dropping it, so the returned frame carries the
        earlier index values as extra columns.
    """
    frame = data.copy().reset_index()
    # Start from an all-True mask on the fresh 0..n-1 index and narrow it
    # down with one equality test per condition.
    mask = pd.Series([True]*len(frame))
    for column, wanted in conditions:
        mask = mask & (frame[column] == wanted)
    matches = frame[mask].reset_index()
    return matches if return_df else len(matches)

# Sanity check: show the rows whose City is "Aachen".
count_conditional(data, [("City","Aachen")], return_df=True)

# Q9
## (a)
Base rule = Query of all relevant entries  
PD rule = Query that matches only potentially discriminated entries

<!--  -->
<details>
    <summary>My answer b_1</summary>
    b_1 = {Occupation = Retailer} -> {Loan Approved = False}
</details>
<details>
    <summary>My answer r_1</summary>
    r_1 = {Occupation = Retailer, City = Aachen} -> {Loan Approved = False}
</details>

In [None]:
# Extended lift is the confidence of the PD rule divided by the confidence of the base rule.
# The confidence of A -> B is support(A and B) / support(A).
# Support is either the count or the fraction of rows containing the item set; the lectures and
# instructions flip-flop between the two, but it is irrelevant here because it cancels in the ratio.
# Aside: we really should be allowed a formula collection; there are a lot of terms to keep track of.

In [None]:
# Each rule is a (premise, conclusion) pair; both sides are lists of (column, value) conditions.
b2 = ([("Occupation","doctor"),("City","Aachen")],[("Loan Approved",False)])
r2 = ([("Criminal Record",True),("Occupation","doctor"),("City","Aachen")],[("Loan Approved",False)])


def _rule_confidence(rule):
    """Rows matching premise and conclusion, divided by rows matching the premise alone."""
    premise, conclusion = rule
    return count_conditional(data, premise + conclusion) / count_conditional(data, premise)


conf_b2 = _rule_confidence(b2)
conf_r2 = _rule_confidence(r2)
# Extended lift: confidence of r2 relative to the confidence of b2.
exlift = conf_r2 / conf_b2

In [None]:
# Report the extended lift computed for the rule pair (b2, r2).
print(f"The extended lift of b2 and r2 is {exlift}")

In [None]:
# r2 counts as alpha-discriminatory when its extended lift reaches the threshold alpha.
alpha = 3
print(f"r2 is {alpha}-discriminatory." if exlift>=alpha else f"r2 is not {alpha}-discriminatory.")

In [None]:
def get_equivalence_classes(data,sensitive_attributes):
    """Group the row labels of ``data`` by their values in the given columns.

    Args:
        data: DataFrame whose rows are partitioned. It is only read.
        sensitive_attributes: Column names that define an equivalence class
            (the caller in this notebook passes the quasi-identifier
            columns). A single column name may be given as a plain string.

    Returns:
        A dict mapping each distinct value combination, as a tuple in the
        order of ``sensitive_attributes``, to the list of index labels of
        the rows that share it. Classes and labels appear in row order.
    """
    # A bare string would otherwise be treated as a scalar lookup and the
    # key tuple would be built from the characters of the cell value.
    if isinstance(sensitive_attributes, str):
        sensitive_attributes = [sensitive_attributes]
    else:
        sensitive_attributes = list(sensitive_attributes)

    equivalence_classes = {}
    # Iterate only over the key columns; this avoids copying the frame and
    # avoids row-wise dtype coercion caused by unrelated columns.
    keys = data[sensitive_attributes].itertuples(index=False, name=None)
    for index, key in zip(data.index, keys):
        equivalence_classes.setdefault(key, []).append(index)
    return equivalence_classes

def is_k_anonymous(data:pd.DataFrame,sensitive_attributes:list[str],k:int):
    """Check every equivalence class of ``data`` for a minimum size of ``k``.

    Args:
        data: DataFrame to partition.
        sensitive_attributes: Columns passed to ``get_equivalence_classes``
            to form the classes.
        k: Minimum number of rows a class must contain.

    Returns:
        A dict mapping each class key to ``True`` when the class has at
        least ``k`` rows and ``False`` otherwise.
    """
    verdicts = {}
    for class_key, members in get_equivalence_classes(data,sensitive_attributes).items():
        verdicts[class_key] = len(members) >= k
    return verdicts

def is_class_k_anonymous(data:pd.DataFrame,equivalence_class:list[int],k:int):
    """Tell whether one equivalence class contains at least ``k`` rows.

    Args:
        data: Not used by this check.
        equivalence_class: Row labels belonging to the class.
        k: Minimum number of rows required.

    Returns:
        ``True`` if the class has ``k`` or more members, else ``False``.
    """
    class_size = len(equivalence_class)
    return class_size >= k

def is_l_diverse(data:pd.DataFrame,sensitive_attributes:list[str],sensitive_attribute:str,l:int):
    """Check every equivalence class for at least ``l`` distinct values of one column.

    Args:
        data: DataFrame to partition.
        sensitive_attributes: Columns passed to ``get_equivalence_classes``
            to form the classes.
        sensitive_attribute: Column whose distinct values are counted
            within each class.
        l: Minimum number of distinct values required.

    Returns:
        A dict mapping each class key to ``True`` when the class holds at
        least ``l`` distinct values of ``sensitive_attribute``.
    """
    verdicts = {}
    for class_key, members in get_equivalence_classes(data,sensitive_attributes).items():
        distinct_values = set(data.loc[members][sensitive_attribute])
        verdicts[class_key] = len(distinct_values) >= l
    return verdicts

def is_class_l_diverse(data:pd.DataFrame,equivalence_class:list[int],sensitive_attribute:str,l:int):
    """Tell whether one equivalence class holds at least ``l`` distinct values.

    Args:
        data: DataFrame the class was taken from.
        equivalence_class: Row labels belonging to the class.
        sensitive_attribute: Column whose distinct values are counted.
        l: Minimum number of distinct values required.

    Returns:
        ``True`` if the rows of the class contain ``l`` or more distinct
        values in ``sensitive_attribute``, else ``False``.
    """
    class_rows = data.loc[equivalence_class]
    distinct_values = set(class_rows[sensitive_attribute])
    return len(distinct_values) >= l

In [None]:
quasi_id = ["Occupation","City"]
sens_attr = ["Criminal Record","Loan Approved"]

# Partition the rows by their quasi-identifier values once, then report on each class:
# its size, the number of distinct values per sensitive column, and the rows themselves.
equivalence_classes = get_equivalence_classes(data,quasi_id)
for eq_class in equivalence_classes:
    rows = equivalence_classes[eq_class]
    print(f"The equivalence class {eq_class} is {len(rows)}-anonymous.")
    for sensitive_attribute in sens_attr:
        print(f"The equivalence class {eq_class} is {len(set(data.loc[rows][sensitive_attribute]))}-diverse for {sensitive_attribute}.")
    display(data.loc[rows])

<details>
<summary>My Answer (c)</summary>
You didn't ask for it to be a good operation, so since the equivalence class ('doctor','Aachen') is already 2-diverse, we simply eliminate all other rows.
</details>