## Base imports

In [123]:
import sklearn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

import sys
sys.path.append('../../holisticai')

In [124]:
def load_student_data():
    """Fetch the UCI student archive (if needed) and load 'student-mat.csv'.

    The archive is stored as './student.zip' in the current working
    directory and extracted there. If that file already exists it is reused
    and no network request is made.

    Returns:
        pandas.DataFrame: contents of 'student-mat.csv', parsed with ';' as
        the field delimiter.

    Raises:
        requests.HTTPError: if the download returns an HTTP error status.
    """
    import os, zipfile

    archive_path = './student.zip'
    if not os.path.exists(archive_path):
        # Imported lazily so a cached archive can be used without requests/network.
        import requests

        response = requests.get(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip',
            timeout=60,
        )
        # Fail loudly rather than saving an HTML error page as a ".zip".
        response.raise_for_status()
        # The response body must be persisted; ZipFile below reads it from disk.
        with open(archive_path, 'wb') as archive_file:
            archive_file.write(response.content)

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('.')

    data = pd.read_csv('student-mat.csv', delimiter=';')

    return data

def preprocess_data(data):
    """One-hot encode the categorical feature columns of ``data``.

    Every object-dtype column other than 'G3' and 'sex' is removed and
    replaced by the columns produced by ``OneHotEncoder``. All remaining
    columns (including 'sex' and 'G3') are kept unchanged.

    Args:
        data (pandas.DataFrame): input table; not modified.

    Returns:
        pandas.DataFrame: a new frame with the same index as ``data``. All
        column labels are strings; the encoded columns are labelled
        '0', '1', ... in the encoder's output order.
    """
    categoricals = [
        col for col in data.columns
        if col not in ('G3', 'sex') and data[col].dtype == object
    ]

    cat_encoder = OneHotEncoder()
    enc = cat_encoder.fit_transform(data[categoricals])
    # concat(axis=1) aligns on index labels, so the encoded rows must carry
    # the input's index; a fresh RangeIndex would misalign (and NaN-pad) any
    # frame that was filtered, shuffled or otherwise re-indexed.
    enc = pd.DataFrame(enc.toarray(), index=data.index)
    df = pd.concat([data, enc], axis=1).drop(columns=categoricals)  # add encoded columns

    # Encoded columns have integer labels; make every label a string.
    df = df.rename(str, axis='columns')
    return df

# Measuring bias

## Loading

In [125]:
# Fetch the student dataset and one-hot encode its categorical columns
data = preprocess_data(load_student_data())

# Hold out 30% of the rows for evaluation (fixed seed keeps the split repeatable)
train, test = train_test_split(data, test_size=0.3, random_state=42)

# G3 is the student's final grade: it is the target, everything else is a feature
y_train, y_test = train['G3'], test['G3']
X_train, X_test = train.drop(columns=['G3']), test.drop(columns=['G3'])

# Fit a plain linear regression; 'sex' is withheld from the model's inputs
LR = LinearRegression()
model = LR.fit(X_train.drop(columns=['sex']), y_train)

# Predicted final grades for the held-out rows
y_pred = model.predict(X_test.drop(columns=['sex']))

### Bias analysis

In [112]:
from holisticai.bias.metrics import regression_bias_metrics

# Boolean row masks over the test set, one per value of the 'sex' column
group_a = X_test['sex'].eq('M')
group_b = X_test['sex'].eq('F')

# Table of regression bias metrics comparing predictions across the two groups
regression_bias_metrics(group_a, group_b, y_pred, y_test)

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,4.754098,0
Disparate Impact Q80,1.901639,0
Disparate Impact Q50,1.331148,0
No Disparate Impact Level,12.633841,-
Average Score Difference,1.786258,0
Average Score Difference Q80,1.267822,0
Z Score Difference,0.413114,0
Z Score Difference Q80,0.655295,0
Max Statistical Parity,0.19983,0
Statistical Parity AUC,0.10662,0
