In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings("ignore")

#### Here, we load the dataset and drop some unused columns

In [None]:
# Read the training table and drop the ID and efs_time columns in one chain.
df = pd.read_parquet("train.parquet").drop(columns=["ID", "efs_time"])
print(df.shape)
df.head()

In [None]:
# Shape of every (ethnicity, efs) subgroup, printed in the order:
# Not Hispanic (efs=1, efs=0), then Hispanic (efs=1, efs=0).
for group in ("Not Hispanic or Latino", "Hispanic or Latino"):
    in_group = df["ethnicity"] == group
    for outcome in (1, 0):
        print(df[in_group & (df["efs"] == outcome)].shape)

#### Here we fill in missing values and label-encode the categorical columns as a simple feature-engineering step

In [None]:
TARGET = ["efs"]
FEATURES = [c for c in df.columns if c not in TARGET]


def _is_categorical(dtype):
    """True when the dtype compares equal to "object" or "category"."""
    return dtype == "object" or dtype == "category"


# Partition the feature columns by dtype; both lists keep the order of FEATURES.
cat_cols = [c for c in FEATURES if _is_categorical(df[c].dtype)]
num_cols = [c for c in FEATURES if not _is_categorical(df[c].dtype)]
print(f"In these features, there are {len(cat_cols)} CATEGORICAL FEATURES: {cat_cols}")


def update(df, cat_cols, num_cols=None):
    """Fill missing values and downcast dtypes, modifying ``df`` in place.

    Categorical columns: missing entries become the literal label "Unknown",
    every value is stringified, and the column is stored as ``category``.
    Numeric columns: ``float64`` becomes ``float32`` and ``int64`` becomes
    ``int32``, with missing entries replaced by 0 first. Columns of any other
    dtype are left untouched.

    Args:
        df: DataFrame to clean; it is mutated and also returned.
        cat_cols: names of the columns to treat as categorical.
        num_cols: names of the columns to treat as numeric. When omitted,
            every column of ``df`` that is not listed in ``cat_cols`` is
            considered (only float64/int64 ones are actually changed).

    Returns:
        The same ``df`` object that was passed in.
    """
    if num_cols is None:
        # Derive the list from the frame rather than reading a notebook-level
        # global, so the function also works when called on its own.
        categorical = set(cat_cols)
        num_cols = [c for c in df.columns if c not in categorical]

    for c in cat_cols:
        # Fill *before* stringifying: astype(str) turns NaN into the text
        # "nan", after which fillna() would have nothing left to replace.
        values = df[c].astype(object)
        df[c] = values.where(values.notna(), "Unknown").astype(str).astype("category")

    for c in num_cols:
        if df[c].dtype == "float64":
            df[c] = df[c].fillna(0).astype("float32")
        elif df[c].dtype == "int64":
            df[c] = df[c].fillna(0).astype("int32")
    return df

# Cast the target to a compact integer dtype, then clean every feature column.
df["efs"] = df["efs"].astype("int32")
df = update(df, cat_cols)

# Replace each categorical column with integer codes (a fresh encoder per column).
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Shuffle the rows. sample() and reset_index() return new frames, so the result
# has to be assigned back; calling reset_index(inplace=True) on the temporary
# would leave df in its original order. The fixed seed keeps reruns reproducible.
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
df.head()

#### Here we set the sensitive attribute to be "ethnicity" and split the dataset into training and test sets

In [None]:
# Number of rows per encoded ethnicity label.
ethnicity_counts = df["ethnicity"].value_counts()
ethnicity_counts

In [None]:
target = ["efs"]
features = [c for c in df.columns if c not in TARGET]

# Column position of the sensitive attribute inside the feature matrix.
sensitive_attribute = features.index("ethnicity")
print(sensitive_attribute)

X, y = df[features], df[target]

# 90/10 split with a fixed seed; each of the four pieces becomes a NumPy array.
X_train, X_test, y_train, y_test = (
    part.to_numpy()
    for part in train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)
)
# y was selected as a one-column frame, so flatten the labels to 1-D.
y_train, y_test = y_train.ravel(), y_test.ravel()

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

#### Here we report the per-group prevalence for z=0 and z=1

In [None]:
# Boolean mask over the test rows: True where the sensitive column equals 1.
# NOTE(review): 1 is whichever ethnicity label the encoder mapped to that
# integer; confirm it is the group this comparison is meant to single out.
z_test = X_test[:, sensitive_attribute] == 1

# Mean of y_test inside each group (z=0 is the complement of the mask).
prevalence_0 = y_test[~z_test].mean()
prevalence_1 = y_test[z_test].mean()

(prevalence_0, prevalence_1)

#### Here we train the basic model and report the demographic parity on the test set

In [None]:
# Baseline classifier; fit() returns the estimator itself, so it can be chained.
model = LogisticRegression(C=1, max_iter=10000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

z_test = X_test[:, sensitive_attribute] == 1

# Mean predicted label per group (z=0 is the complement of the mask).
positive_pred_z_0 = y_pred[~z_test].mean()
positive_pred_z_1 = y_pred[z_test].mean()

# Ratio of the z=0 rate to the z=1 rate.
parity = positive_pred_z_0 / positive_pred_z_1
parity

#### Here we report TPR_0, TPR_1, FPR_0, and FPR_1

In [None]:
z_test = X_test[:, sensitive_attribute] == 1


def _confusion_counts(group_mask):
    """Return (tp, fn, fp, tn) counted over the test rows where group_mask is True."""
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    actual_pos, actual_neg = (y_test == 1), (y_test == 0)
    return (
        np.sum(pred_pos & actual_pos & group_mask),
        np.sum(pred_neg & actual_pos & group_mask),
        np.sum(pred_pos & actual_neg & group_mask),
        np.sum(pred_neg & actual_neg & group_mask),
    )


# Group z=0 is the complement of the mask; group z=1 is the mask itself.
tp_0, fn_0, fp_0, tn_0 = _confusion_counts(~z_test)
tp_1, fn_1, fp_1, tn_1 = _confusion_counts(z_test)

# True-positive rate: tp / (tp + fn).
TPR_0 = tp_0 / (tp_0 + fn_0)
TPR_1 = tp_1 / (tp_1 + fn_1)

# False-positive rate: fp / (fp + tn).
FPR_0 = fp_0 / (fp_0 + tn_0)
FPR_1 = fp_1 / (fp_1 + tn_1)

(TPR_0, TPR_1, FPR_0, FPR_1)