In [1]:
!pip install "aequitas==1.0.0" &> /dev/null
import pandas as pd
import numpy as np
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.plotting import Plot

import seaborn as sns
from aequitas.audit import Audit
from aequitas.fairness import Fairness
import aequitas.plot as ap

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [35]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.drop_duplicates()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [22]:
print("Min age: ", min(df['Age']))
print("Max age: ", max(df['Age']))

print("Min BMI: ", min(df['BMI']))
print("Max BMI: ", max(df['BMI']))

print("Min Pregnancies: ", min(df['Pregnancies']))
print("Max Pregnancies: ", max(df['Pregnancies']))

Min age:  21
Max age:  81
Min BMI:  0.0
Max BMI:  67.1
Min Pregnancies:  0
Max Pregnancies:  17


In [12]:
from sklearn.model_selection import train_test_split

features = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
label = 'Outcome'
X, y = df[features].values, df[label].values

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

print ('Training cases: %d\nTest cases: %d' % (X_train.shape[0], X_test.shape[0]))

Training cases: 537
Test cases: 231


In [15]:
from sklearn.linear_model import LogisticRegression
reg = 0.01
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

In [18]:
from sklearn.metrics import accuracy_score
predictions = model.predict(X_test)
print('Predicted labels: ', predictions)
print('Actual labels:    ' ,y_test)
print('Accuracy: ', accuracy_score(y_test, predictions))

Predicted labels:  [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0
 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0
 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 1 1 0 0 0 0 0]
Actual labels:     [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0
 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0
 0 1 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 1
 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0
 1 1 0 0 1 1 0 0 0]
Accuracy:  0.779220779

In [36]:
predictions = model.predict(X)
df['score'] = predictions
df.head(15)
print(df['Pregnancies'])

0       6
1       1
2       8
3       1
4       0
       ..
763    10
764     2
765     5
766     1
767     1
Name: Pregnancies, Length: 768, dtype: int64


In [39]:
def categorize_age(age):
  if age < 30:
    return "Less than 30"
  elif age < 60:
    return "30 - 60"
  else:
    return "Greater than 60"

def categorize_pregnancies(pregnancies):
  if pregnancies < 5:
    return "Less than 5"
  elif pregnancies < 10:
    return "5 - 10"
  else:
    return "Greater than 10"

new_df = df.drop(columns=["Glucose", "BloodPressure",	"SkinThickness", "Insulin",	"DiabetesPedigreeFunction"])
# new_df.head()
new_df['Pregnancies'] = df['Pregnancies'].apply(categorize_pregnancies)
new_df['Age'] = df['Age'].apply(categorize_age)
new_df.head(15)

Unnamed: 0,Pregnancies,BMI,Age,Outcome,score
0,5 - 10,33.6,30 - 60,1,1
1,Less than 5,26.6,30 - 60,0,0
2,5 - 10,23.3,30 - 60,1,1
3,Less than 5,28.1,Less than 30,0,0
4,Less than 5,43.1,30 - 60,1,1
5,5 - 10,25.6,30 - 60,0,0
6,Less than 5,31.0,Less than 30,1,0
7,Greater than 10,35.3,Less than 30,0,1
8,Less than 5,30.5,30 - 60,1,1
9,5 - 10,0.0,30 - 60,1,0


In [42]:
audit = Audit(new_df.drop(columns=["BMI"]), label_column="Outcome")
audit.audit()
audit.metrics.round(2)
# Pregnancies greater than 10 are more likely to be labeled as having diabetes

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev
attribute_name,attribute_value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Pregnancies,5 - 10,0.72,0.61,0.81,0.3,0.26,0.19,0.39,0.7,0.74,0.39,0.39,0.47
Pregnancies,Greater than 10,0.62,0.67,0.57,0.38,0.38,0.43,0.33,0.62,0.62,0.15,0.55,0.52
Pregnancies,Less than 5,0.81,0.52,0.92,0.17,0.28,0.08,0.48,0.83,0.72,0.46,0.2,0.28
Age,30 - 60,0.69,0.6,0.78,0.35,0.26,0.22,0.4,0.65,0.74,0.66,0.41,0.51
Age,Greater than 60,0.66,0.78,0.61,0.12,0.56,0.39,0.22,0.88,0.44,0.07,0.5,0.28
Age,Less than 30,0.85,0.49,0.95,0.13,0.29,0.05,0.51,0.87,0.71,0.27,0.15,0.21


In [52]:
new_df = new_df.rename(columns={"Outcome": "label_value"})
audit = Audit(new_df.drop(columns=["BMI"]), label_column="label_value",
              reference_groups={'Pregnancies':'Greater than 10', 'Age':'Greater than 60'})

audit.audit(bias_args={
    "alpha": 0.05,
    "check_significance": True,
    "mask_significance": True
})
audit.disparities.style

  W = numer / denom
  W = numer / denom
  W = numer / denom


Unnamed: 0_level_0,Unnamed: 1_level_0,fdr_disparity,fnr_disparity,for_disparity,fpr_disparity,npv_disparity,ppr_disparity,pprev_disparity,precision_disparity,tnr_disparity,tpr_disparity,label_value_significance,score_significance,fdr_significance,fnr_significance,for_significance,fpr_significance,npv_significance,ppr_significance,pprev_significance,precision_significance,tnr_significance,tpr_significance
attribute_name,attribute_value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Pregnancies,5 - 10,0.698413,1.176471,0.776119,0.442529,1.139925,2.625,0.698394,1.180952,1.418103,0.911765,False,True,False,False,False,False,False,True,True,False,False,False
Pregnancies,Greater than 10,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False
Pregnancies,Less than 5,0.754209,1.433824,0.430025,0.183521,1.356234,3.09375,0.36471,1.147475,1.61236,0.783088,True,True,False,True,True,False,True,True,True,False,False,True
Age,30 - 60,0.453901,1.8,2.81407,0.557576,0.740847,8.8125,0.829412,1.702128,1.284416,0.771429,True,False,True,False,False,True,False,False,False,True,True,False
Age,Greater than 60,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False
Age,Less than 30,0.521073,2.303571,1.017751,0.139245,0.997464,3.625,0.292929,1.615764,1.553342,0.627551,False,True,True,False,False,True,False,True,True,True,True,False


In [53]:
metrics = ['fpr','fdr']
disparity_tolerance = 1.25

In [54]:
audit.summary_plot(metrics=metrics, fairness_threshold=disparity_tolerance)

In [56]:
audit.disparity_plot(metrics=metrics, attribute='Pregnancies', fairness_threshold=disparity_tolerance)

In [2]:
!pip install dash
!pip install plotly

Collecting dash
  Downloading dash-2.18.1-py3-none-any.whl.metadata (10 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-2.18.1-py3-none-any.whl (7.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: dash-table, dash-html-comp

In [3]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd


data = {
    'attribute_name': ['Pregnancies', 'Pregnancies', 'Pregnancies', 'Age', 'Age', 'Age'],
    'attribute_value': ['5 - 10', 'Greater than 10', 'Less than 5', '30 - 60', 'Greater than 60', 'Less than 30'],
    'fdr_disparity': [0.698413, 1.000000, 0.754209, 0.453901, 1.000000, 0.521073],
    'fpr_disparity': [0.442529, 1.000000, 0.183521, 0.557576, 1.000000, 0.139245],
    'fnr_disparity': [1.176471, 1.000000, 1.433824, 1.800000, 1.000000, 2.303571],
    'ppr_disparity': [2.625000, 1.000000, 3.093750, 8.812500, 1.000000, 3.625000]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Initialize Dash App
app = dash.Dash(__name__)

# Define layout of the dashboard
app.layout = html.Div(children=[
    html.H1(children='Fairness Metrics Dashboard'),

    html.Div(children='''
        Select fairness metric to visualize:
    '''),

    # Dropdown to select which fairness metric to visualize
    dcc.Dropdown(
        id='metric-dropdown',
        options=[
            {'label': 'False Discovery Rate Disparity', 'value': 'fdr_disparity'},
            {'label': 'False Positive Rate Disparity', 'value': 'fpr_disparity'},
            {'label': 'False Negative Rate Disparity', 'value': 'fnr_disparity'},
            {'label': 'Positive Predictive Rate Disparity', 'value': 'ppr_disparity'}
        ],
        value='fdr_disparity'  # Default selection
    ),

    # Area to display the chart
    dcc.Graph(id='disparity-graph')
])

# Callback to update the graph based on dropdown selection
@app.callback(
    Output('disparity-graph', 'figure'),
    [Input('metric-dropdown', 'value')]
)
def update_graph(selected_metric):
    # Filter the data based on the selected metric
    fig = px.bar(
        df,
        x='attribute_value',
        y=selected_metric,
        color='attribute_name',
        labels={'attribute_value': 'Group', selected_metric: 'Disparity'},
        title=f'{selected_metric.replace("_", " ").title()} by Group'
    )

    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


<IPython.core.display.Javascript object>