# Demographic Parity

# Gender

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv('/Users/pedrojosetrujillomejia/Desktop/airplane_train_processed.csv')


X = df.drop(['satisfaction'], axis=1)
y = df['satisfaction']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)


y_pred = rfc.predict(X_val)


female_mask = X_val['Gender_Female'] == 1
male_mask = X_val['Gender_Male'] == 1
female_proportion = sum(y_pred[female_mask] == 'satisfied') / sum(female_mask)
male_proportion = sum(y_pred[male_mask] == 'satisfied') / sum(male_mask)


print(f"Female proportion: {female_proportion:.2f}")
print(f"Male proportion: {male_proportion:.2f}")



Female proportion: 0.41
Male proportion: 0.43


Based on the results, the proportion of positive predictions (i.e., predicted proportion of satisfied customers) is slightly higher for males (0.43) than for females (0.41). This suggests that there may be some gender bias in the model, where the model is more likely to predict satisfaction for male passengers than for female passengers.

However, it's important to note that this is just one evaluation metric and may not tell the whole story. It's possible that there are other factors that could explain the difference in predicted satisfaction rates between males and females, such as differences in flight preferences or experiences.

# AGE

In [28]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


df = pd.read_csv('/Users/pedrojosetrujillomejia/Desktop/airplane_train_processed.csv')


X = df.drop(['satisfaction'], axis=1)
y = df['satisfaction']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_val)


young_mask = X_val['Age'] <= 25
middle_age_mask = (X_val['Age'] > 25) & (X_val['Age'] <= 50)
old_mask = X_val['Age'] > 50
young_proportion = sum(y_pred[young_mask] == 'satisfied') / sum(young_mask)
middle_age_proportion = sum(y_pred[middle_age_mask] == 'satisfied') / sum(middle_age_mask)
old_proportion = sum(y_pred[old_mask] == 'satisfied') / sum(old_mask)


print(f"Young proportion: {young_proportion:.2f}")
print(f"Middle-age proportion: {middle_age_proportion:.2f}")
print(f"Old proportion: {old_proportion:.2f}")


Young proportion: 0.25
Middle-age proportion: 0.47
Old proportion: 0.45


the proportion of positive predictions (i.e., the predicted proportion of satisfied customers) is highest for the middle-age group (0.47), followed by the old age group (0.45), and then the young age group (0.25). This suggests that there may be some age bias in the model, where the model is more likely to predict satisfaction for middle-aged and older passengers than for younger passengers.

# True Positive Rate

In [29]:
import pandas as pd
import numpy as np


df = pd.DataFrame({'A': [True, False, np.nan, True]})


df['A'] = df['A'].fillna(False).astype(bool)


df['A_inv'] = ~df['A']

# Gender

In [30]:
from sklearn.metrics import confusion_matrix

y_pred = rfc.predict(X_val)


conf_matrix = confusion_matrix(y_val, y_pred)


female_indices = X_val['Gender_Female'] == 1
male_indices = X_val['Gender_Male'] == 1
X_val_female, y_val_female = X_val[female_indices], y_val[female_indices]
X_val_male, y_val_male = X_val[male_indices], y_val[male_indices]
y_pred_female, y_pred_male = y_pred[female_indices], y_pred[male_indices]


conf_matrix_female = confusion_matrix(y_val_female, y_pred_female)
conf_matrix_male = confusion_matrix(y_val_male, y_pred_male)


tpr_female = conf_matrix_female[1, 1] / (conf_matrix_female[1, 0] + conf_matrix_female[1, 1])
tpr_male = conf_matrix_male[1, 1] / (conf_matrix_male[1, 0] + conf_matrix_male[1, 1])

print("TPR for female customers: {:.3f}".format(tpr_female))
print("TPR for male customers: {:.3f}".format(tpr_male))


TPR for female customers: 0.925
TPR for male customers: 0.932


It seems that the model has a slightly higher true positive rate (TPR) for male customers compared to female customers. However, the difference is relatively small and may not be statistically significant.

# Age

In [34]:
young_mask = X_val['Age_Group'] == 'Young'
middle_age_mask = X_val['Age_Group'] == 'Middle-Aged'
old_mask = X_val['Age_Group'] == 'Old'
y_pred_age = y_pred == 'satisfied'
y_val_age = y_val == 'satisfied'
young_tpr = sum(y_pred_age[young_mask] & y_val_age[young_mask]) / sum(y_val_age[young_mask])
middle_age_tpr = sum(y_pred_age[middle_age_mask] & y_val_age[middle_age_mask]) / sum(y_val_age[middle_age_mask])
old_tpr = sum(y_pred_age[old_mask] & y_val_age[old_mask]) / sum(y_val_age[old_mask])


print(f"Young TPR: {young_tpr:.2f}")
print(f"Middle-aged TPR: {middle_age_tpr:.2f}")
print(f"Old TPR: {old_tpr:.2f}")


Young TPR: 0.85
Middle-aged TPR: 0.94
Old TPR: 0.93


It looks like there is a significant difference in the TPR between the young age group and the middle-aged/old age groups. This could indicate that the model is performing better for middle-aged and old customers compared to young customers, which could suggest a bias in the model.

# False Positive Rate

In [38]:

female_mask = X_val['Gender_Female'] == 1
male_mask = X_val['Gender_Male'] == 1


y_val_female = y_val[female_mask]
y_pred_female = y_pred[female_mask]
if sum(y_val_female == 'not satisfied') == 0:
    female_fpr = 0
else:
    female_fpr = sum((y_val_female == 'not satisfied') & (y_pred_female == 'satisfied')) / sum(y_val_female == 'not satisfied')


y_val_male = y_val[male_mask]
y_pred_male = y_pred[male_mask]
if sum(y_val_male == 'not satisfied') == 0:
    male_fpr = 0
else:
    male_fpr = sum((y_val_male == 'not satisfied') & (y_pred_male == 'satisfied')) / sum(y_val_male == 'not satisfied')


print(f"Female FPR: {female_fpr:.2f}")
print(f"Male FPR: {male_fpr:.2f}")


Female FPR: 0.00
Male FPR: 0.00


In [39]:

young_mask = X_val['Age_Group'] == 'Young'
middle_age_mask = X_val['Age_Group'] == 'Middle-Aged'
old_mask = X_val['Age_Group'] == 'Old'


y_val_young = y_val[young_mask]
y_pred_young = y_pred[young_mask]
if sum(y_val_young == 'not satisfied') == 0:
    young_fpr = 0
else:
    young_fpr = sum((y_val_young == 'not satisfied') & (y_pred_young == 'satisfied')) / sum(y_val_young == 'not satisfied')


y_val_middle_age = y_val[middle_age_mask]
y_pred_middle_age = y_pred[middle_age_mask]
if sum(y_val_middle_age == 'not satisfied') == 0:
    middle_age_fpr = 0
else:
    middle_age_fpr = sum((y_val_middle_age == 'not satisfied') & (y_pred_middle_age == 'satisfied')) / sum(y_val_middle_age == 'not satisfied')


y_val_old = y_val[old_mask]
y_pred_old = y_pred[old_mask]
if sum(y_val_old == 'not satisfied') == 0:
    old_fpr = 0
else:
    old_fpr = sum((y_val_old == 'not satisfied') & (y_pred_old == 'satisfied')) / sum(y_val_old == 'not satisfied')


print(f"Young FPR: {young_fpr:.2f}")
print(f"Middle-aged FPR: {middle_age_fpr:.2f}")
print(f"Old FPR: {old_fpr:.2f}")


Young FPR: 0.00
Middle-aged FPR: 0.00
Old FPR: 0.00
