In [1]:
!pip list

Package               Version
--------------------- -----------
argon2-cffi           21.3.0
argon2-cffi-bindings  21.2.0
asttokens             2.0.8
attrs                 22.1.0
backcall              0.2.0
beautifulsoup4        4.11.1
bleach                5.0.1
certifi               2022.6.15
cffi                  1.15.1
charset-normalizer    2.1.1
colorama              0.4.5
cycler                0.11.0
debugpy               1.6.3
decorator             5.1.1
defusedxml            0.7.1
entrypoints           0.4
executing             0.10.0
fastjsonschema        2.16.1
fonttools             4.37.0
htmlmin               0.1.12
idna                  3.3
ImageHash             4.2.1
importlib-metadata    4.12.0
ipykernel             6.15.1
ipython               8.4.0
ipython-genutils      0.2.0
ipywidgets            8.0.1
jedi                  0.18.1
Jinja2                3.1.2
joblib                1.1.0
jsonschema            4.14.0
jupyter-client        7.3.4
jupyter-core          4.11


[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Functions

In [2]:
%%writefile ./scripts/python/get_metrics.py

import numpy as np
import pandas as pd

# POST PROCESSING BIAS METRICS FOR THE TRAINED MODEL

def get_PPL(data_frame, column_to_group_by, column_to_count):
    """
    :param data_frame: 
    :param column_to_group_by: 
    :param column_to_count: 
    :return: 
    """
    
    ppl = []
    
    counts_total = data_frame.groupby([column_to_group_by]) \
    .size() \
    .rename('count_total') \
    .reset_index() \
    .sort_values(by=column_to_group_by, ascending=False) \
    .set_index(column_to_group_by)
    
    
    counts_grouped = data_frame.groupby([column_to_group_by, column_to_count]) \
    .size() \
    .rename('count') \
    .reset_index() \
    .sort_values(by=column_to_group_by, ascending=False) \
    .set_index(column_to_group_by)

    positive_counts = counts_grouped.loc[counts_grouped[column_to_count] == '1']
    df = pd.merge(positive_counts, counts_total, left_index=True, right_index=True)
    df['PPL'] = df['count'] / df['count_total']
    
    return df['PPL']


def get_DPPL(data_frame, column_to_group_by, column_to_count):
    """
    :param data_frame: 
    :param column_to_group_by: 
    :param column_to_count: 
    :return: 
    """
    
    ppl = []
    
    df = get_PPL(data_frame, column_to_group_by, column_to_count)

    for i in df.iteritems():
        ppl.append(i[1])
        
    return sorted(set([i - j for i in ppl for j in ppl if i != j]))


def get_CA(data_frame, column_to_group_by, column_to_count_acctuals, column_to_count_predictions):
    """
    :param data_frame: 
    :param column_to_group_by: 
    :param column_to_count_acctuals: 
    :param column_to_count_predictions: 
    :return: 
    """
    
    counts_total_labels = data_frame.groupby([column_to_group_by]) \
    .size() \
    .rename('count_total') \
    .reset_index() \
    .sort_values(by=column_to_group_by, ascending=False) \
    .set_index(column_to_group_by)
    
    
    counts_grouped_acctuals = data_frame.groupby([column_to_group_by, column_to_count_acctuals]) \
    .size() \
    .rename('count_grouped_acctuals') \
    .reset_index() \
    .sort_values(by=column_to_group_by, ascending=False) \
    .set_index(column_to_group_by)
    
    positive_counts_grouped_acctuals = counts_grouped_acctuals.loc[counts_grouped_acctuals[column_to_count_acctuals] == '1']
    
    
    counts_grouped_predictions = data_frame.groupby([column_to_group_by, column_to_count_predictions]) \
    .size() \
    .rename('count_gruped_predictions') \
    .reset_index() \
    .sort_values(by=column_to_group_by, ascending=False) \
    .set_index(column_to_group_by)

    positive_counts_grouped_predictions= counts_grouped_predictions.loc[counts_grouped_predictions[column_to_count_predictions] == '1']


    df = pd.merge(positive_counts_grouped_acctuals, positive_counts_grouped_predictions, left_index=True, right_index=True)
    df['DCA'] = df['count_grouped_acctuals'] / df['count_gruped_predictions']
            
    return df['DCA']


def get_DCA(data_frame, column_to_group_by, column_to_count_acctuals, column_to_count_predictions):
    """
    :param data_frame: 
    :param column_to_group_by: 
    :param column_to_count_acctuals: 
    :param column_to_count_predictions: 
    :return: 
    """
    
    ca = []
    
    df = get_CA(data_frame, column_to_group_by, column_to_count_acctuals, column_to_count_predictions)
    
    for i in df.iteritems():
        ca.append(i[1])
            
    return sorted(set([i - j for i in ca for j in ca if i != j]))


Overwriting ./scripts/python/get_metrics.py


# Libraries

In [3]:
# import sys
# print(sys.path)
# sys.path.insert(0, "C:\\Users\\KonuTech\\gfk_data_analyst\\gfk_ml_ops_data_analyst\\scripts\\python\\")

In [4]:
import os
# import sys
import json
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from scripts.python.get_metrics import get_PPL, get_DPPL, get_CA, get_DCA

In [5]:
INPUT_PATH="input"
OUTPUT_PATH="output"
CONFIG_PATH="config"

INPUT_DATA="test_data2"
INPUT_DATA_CONFIG="input_config"
OUTPUT_DATA="output"

INPUT_EXTENSION="csv"
INPUT_CONFIG_EXTENSION="json"
OUTPUT_EXTENSION="csv"

INPUT_FILE=f"{INPUT_DATA}.{INPUT_EXTENSION}"
INPUT_CONFIG_FILE=f"{INPUT_DATA_CONFIG}.{INPUT_CONFIG_EXTENSION}"
OUTPUT_FILE=f"{OUTPUT_DATA}.{OUTPUT_EXTENSION}"

INPUT_ABS_APTH=os.path.abspath(os.path.join(INPUT_PATH, INPUT_FILE))
INPUT_FILE_CONFIG=os.path.abspath(os.path.join(CONFIG_PATH, INPUT_CONFIG_FILE))
OUTPUT_ABS_APTH=os.path.abspath(os.path.join(OUTPUT_PATH, OUTPUT_FILE))

In [6]:
with open(INPUT_FILE_CONFIG, encoding='utf-8') as f:
    CONFIG = json.load(f)

In [7]:
CONFIG

{'INPUTS': {'FILE_NAME': ['test_data2.csv'],
  'SEPARATOR': ',',
  'DECIMAL': None,
  'ENCODING': 'utf-8',
  'FLOAT_PRECISION': 'high',
  'INDEXES': ['period_end_date', 'translated_when'],
  'DATE_COLUMNS': ['period_end_date', 'translated_when'],
  'DTYPE': {'if_data_corrected': 'object',
   'prod_gr_id': 'object',
   'country_id_n': 'object',
   'delivery_type_id': 'object',
   'freq_id': 'object',
   'retailer_id': 'object',
   'brand_id': 'object',
   'predict_automatch': 'object',
   'class_acctual': 'object'},
  'COLUMNS_WITH_NAN_VALUES': [None]},
 'MODEL': [None],
 'OUTPUTS': {'NAME': ['output.csv']}}

# Load data

In [8]:
# LOAD CSV
df = pd.read_csv(
    INPUT_ABS_APTH,
#     index_col=CONFIG['INPUTS']['INDEXES'],
    sep=CONFIG['INPUTS']['SEPARATOR'],
    # quotechar='"',
    # thousands=',',
#     decimal=CONFIG['INPUTS']['DECIMAL'],
    encoding=CONFIG['INPUTS']['ENCODING'],
    infer_datetime_format=True,
    parse_dates=CONFIG['INPUTS']['DATE_COLUMNS'],
    engine="c",
    low_memory=False,
#     float_precision=CONFIG['INPUTS']['DATE_COLUMNS'],
    skipinitialspace=True,
    dtype=CONFIG['INPUTS']['DTYPE']
)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19697 entries, 0 to 19696
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   period_end_date    19640 non-null  datetime64[ns, UTC]
 1   translated_when    19697 non-null  datetime64[ns, UTC]
 2   if_data_corrected  19697 non-null  object             
 3   prod_gr_id         19697 non-null  object             
 4   country_id_n       18405 non-null  object             
 5   delivery_type_id   18362 non-null  object             
 6   freq_id            19697 non-null  object             
 7   retailer_id        19697 non-null  object             
 8   brand_id           19697 non-null  object             
 9   predict_automatch  19368 non-null  object             
 10  class_acctual      19697 non-null  object             
dtypes: datetime64[ns, UTC](2), object(9)
memory usage: 1.7+ MB


In [10]:
df.head(3)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
0,NaT,2020-10-15 06:58:28+00:00,0,426,121.0,,2,96,111,1,1
1,NaT,2020-10-15 06:38:34+00:00,0,426,,,2,92,95,0,1
2,2020-09-20 00:00:00+00:00,2020-09-23 12:27:51+00:00,0,426,121.0,42730.0,2,30,37,1,1


In [11]:
df.describe(
    include='all',
    datetime_is_numeric=True
)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
count,19640,19697,19697.0,19697.0,18405.0,18362.0,19697.0,19697.0,19697.0,19368.0,19697.0
unique,,,2.0,3.0,34.0,914.0,2.0,52.0,199.0,2.0,2.0
top,,,0.0,426.0,121.0,31480.0,2.0,30.0,33.0,1.0,1.0
freq,,,17085.0,11844.0,4153.0,998.0,11934.0,1197.0,587.0,12324.0,14639.0
mean,2020-10-14 23:30:22.729124096+00:00,2020-10-22 06:01:21.525206784+00:00,,,,,,,,,
min,2020-08-30 00:00:00+00:00,2020-09-01 03:05:51+00:00,,,,,,,,,
25%,2020-09-27 00:00:00+00:00,2020-09-29 13:45:29+00:00,,,,,,,,,
50%,2020-10-18 00:00:00+00:00,2020-10-22 04:30:49+00:00,,,,,,,,,
75%,2020-11-08 00:00:00+00:00,2020-11-13 11:57:53+00:00,,,,,,,,,
max,2020-12-01 00:00:00+00:00,2021-02-01 14:50:49+00:00,,,,,,,,,


# Pandas data profiler

In [12]:
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    pool_size=0,
    infer_dtypes=False
)

In [13]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

# Post-Processing Bias Metrics for the Trained Model

## Difference in positive proportion in predicted labels (DPPL)

![image info](./docs/images/DPPL.png)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)


### CHARACTERISTICS OF METRIC:
> DPPPL > -1 AND DPPPL < 1

> For example, if the model grants loans to 50% of class 2 and to 60% of class 1, then it may be biased against class 2.
We would have to decide whether a 10% difference is material.

## VARIABLE: prod_gr_id

### prod_gr_id total count

In [14]:
total_counts = df.groupby(['prod_gr_id']) \
    .size() \
    .rename('count_total') \
    .reset_index() \
    .sort_values(by='prod_gr_id', ascending=False) \
    .set_index('prod_gr_id')

# total_counts

### prod_gr_id by predicted classes

In [15]:
grouped_counts = df.groupby(['prod_gr_id', 'predict_automatch']) \
    .size() \
    .rename('count') \
    .reset_index() \
    .sort_values(by='prod_gr_id', ascending=False) \
    .set_index('prod_gr_id')

# grouped_counts

In [16]:
positive_grouped_counts = grouped_counts.loc[grouped_counts['predict_automatch'] == '1']

# positive_grouped_counts

In [17]:
merged_counts = pd.merge(positive_grouped_counts, total_counts, left_index=True, right_index=True)

# merged_counts

### (DPPL) for prod_gr_id

In [18]:
ppl_prod_gr_id = get_PPL(
    data_frame=df,
    column_to_group_by='prod_gr_id',
    column_to_count='predict_automatch'
)

ppl_prod_gr_id

prod_gr_id
427    0.663796
426    0.616177
413    0.622158
Name: PPL, dtype: float64

In [19]:
dppl_prod_gr_id = get_DPPL(
    data_frame=df,
    column_to_group_by='prod_gr_id',
    column_to_count='predict_automatch'
)

dppl_prod_gr_id

[-0.047618696554866724,
 -0.04163783945326527,
 -0.0059808571016014556,
 0.0059808571016014556,
 0.04163783945326527,
 0.047618696554866724]

#### Bucketizing dppl_prod_gr_id

In [20]:
print(pd.cut(dppl_prod_gr_id, bins=3).value_counts())

(-0.0477, -0.0159]    2
(-0.0159, 0.0159]     2
(0.0159, 0.0476]      2
dtype: int64


#### CONCLUSIONS:

> Basing solely on **DPPL** we can loosely assume that there is no disparity impact among **prod_gr_id** Classes

## VARIABLE: country_id_n

### country_id_n total count

In [21]:
total_counts = df.groupby(['country_id_n']) \
    .size() \
    .rename('count_total') \
    .reset_index() \
    .sort_values(by='country_id_n', ascending=False) \
    .set_index('country_id_n')

# total_counts

### country_id_n by predicted classes

In [22]:
grouped_counts = df.groupby(['country_id_n', 'predict_automatch']) \
    .size() \
    .rename('count') \
    .reset_index() \
    .sort_values(by='country_id_n', ascending=False) \
    .set_index('country_id_n')

# grouped_counts

In [23]:
positive_grouped_counts = grouped_counts.loc[grouped_counts['predict_automatch'] == '1']

# positive_grouped_counts

In [24]:
merged_counts = pd.merge(positive_grouped_counts, total_counts, left_index=True, right_index=True)

# merged_counts

### (DPPL) for country_id_n

In [25]:
ppl_country_id_n = get_PPL(
    data_frame=df,
    column_to_group_by='country_id_n',
    column_to_count='predict_automatch'
)

ppl_country_id_n

country_id_n
177     0.681818
176     0.664384
160     0.680851
139     0.568493
138     0.634615
136     0.689655
126     0.680945
121     0.649892
116     0.649402
114     0.688312
113     0.665432
110     0.649778
109     0.386667
108     0.690469
107     0.672269
106     0.430536
105     0.670455
104     0.699856
103     0.711691
1011    0.731844
1010    0.680288
1002    0.733333
Name: PPL, dtype: float64

In [26]:
dppl_country_id_n = get_DPPL(
    data_frame=df,
    column_to_group_by='country_id_n',
    column_to_count='predict_automatch'
)

dppl_country_id_n

[-0.3466666666666666,
 -0.3451769087523277,
 -0.32502459326522887,
 -0.3131890331890332,
 -0.30380193390713,
 -0.3029885057471265,
 -0.30279688216414485,
 -0.30164502164502166,
 -0.30130712424980594,
 -0.2951515151515151,
 -0.29427868045297884,
 -0.29418439716312056,
 -0.2936217948717949,
 -0.2856022408963586,
 -0.28378787878787876,
 -0.2811548087627071,
 -0.2787654320987654,
 -0.27771689497716895,
 -0.2693192486865114,
 -0.2632249779276025,
 -0.26311111111111113,
 -0.26273572377158033,
 -0.25993214940460824,
 -0.2591187212446047,
 -0.2577752371424999,
 -0.25128173064899334,
 -0.25040889595045707,
 -0.2503146126605988,
 -0.24975201036927314,
 -0.24794871794871792,
 -0.24173245639383684,
 -0.239918094285357,
 -0.23489564759624365,
 -0.23384711047464718,
 -0.21935519342508075,
 -0.21924132660858936,
 -0.21886593926905856,
 -0.20407893344619615,
 -0.1818264840182649,
 -0.16484018264840172,
 -0.1633504247340628,
 -0.14319810924696397,
 -0.13795669951574313,
 -0.13136254917076828,
 -0.12197

#### Bucketizing dppl_country_id_n

In [27]:
print(pd.cut(dppl_country_id_n, bins=3).value_counts())

(-0.347, -0.116]     47
(-0.116, 0.116]     368
(0.116, 0.347]       47
dtype: int64


#### CONCLUSIONS:

> Although it is likely. Basing solely on **DPPL** we can't assume if there is a disparity impact among **prod_gr_id** Classes yet. 

## Disparate (Adverse) Impact (DI)

![image info](./docs/images/DI.png)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

### CHARACTERISTICS OF METRIC:
> **DI >= 0**

> This measure may be considered fair if it resides in the **<0.8; 1.2>** range.

## (DI) for Variable: prod_gr_id

In [28]:
ppl_prod_gr_id

prod_gr_id
427    0.663796
426    0.616177
413    0.622158
Name: PPL, dtype: float64

In [29]:
di_prod_gr_id = sorted([abs(i / j) for i in ppl_prod_gr_id for j in ppl_prod_gr_id if i != j])

#### Bucketizing di_prod_gr_id

In [30]:
print(pd.cut(di_prod_gr_id, bins=4).value_counts())

(0.928, 0.966]    2
(0.966, 1.003]    1
(1.003, 1.04]     1
(1.04, 1.077]     2
dtype: int64


#### CONCLUSIONS:

> For each possible relation between Classes the metric resides between **<0.8; 1.2>**

> We can loosely assume that there is no disparity impact between Classes of **prod_gr_id** Variable

## (DI) for Variable: country_id_n

In [31]:
ppl_country_id_n

country_id_n
177     0.681818
176     0.664384
160     0.680851
139     0.568493
138     0.634615
136     0.689655
126     0.680945
121     0.649892
116     0.649402
114     0.688312
113     0.665432
110     0.649778
109     0.386667
108     0.690469
107     0.672269
106     0.430536
105     0.670455
104     0.699856
103     0.711691
1011    0.731844
1010    0.680288
1002    0.733333
Name: PPL, dtype: float64

In [32]:
di_country_id_n = sorted([abs(i / j) for i in ppl_country_id_n for j in ppl_country_id_n if i != j])

#### Bucketizing di_country_id_n

In [33]:
print(pd.cut(di_country_id_n, bins=4).value_counts())

(0.526, 0.87]      57
(0.87, 1.212]     359
(1.212, 1.554]     14
(1.554, 1.897]     32
dtype: int64


In [34]:
pd.Series(di_country_id_n).describe()

count    462.000000
mean       1.025311
std        0.238095
min        0.527273
25%        0.943997
50%        1.000000
75%        1.059325
max        1.896552
dtype: float64

#### CONCLUSIONS:

> There is more than 100 Classes of Variable **country_id_n** out of **<0.8; 1.2>** bounds.

> At this moment we can loosely assume that there are some examples of Disparity Impact in case of **country_id_n** Variable.

## Difference in Conditional Outcome (DCO)
> Type 1: **Difference in Conditional Acceptance (DCA)**

> Type 2: **Difference in Conditional Rejection (DCR)**

### Difference in Conditional Acceptance (DCA)

![image info](./docs/images/DCA.jpg)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> Unbounded

> Zero denominator is possible. In such case the allocations to each Class are too small and a warning should be issued.

## VARIABLE: prod_gr_id

In [35]:
ca_country_id_n = get_CA(
    data_frame=df,
    column_to_group_by='prod_gr_id',
    column_to_count_acctuals='class_acctual',
    column_to_count_predictions='predict_automatch'
)

ca_country_id_n

prod_gr_id
427    1.114989
426    1.203069
413    1.206378
Name: DCA, dtype: float64

In [36]:
dca_country_id_n = get_DCA(
    data_frame=df,
    column_to_group_by='prod_gr_id',
    column_to_count_acctuals='class_acctual',
    column_to_count_predictions='predict_automatch'
)

dca_country_id_n

[-0.09138882810439752,
 -0.08808051974645381,
 -0.0033083083579437123,
 0.0033083083579437123,
 0.08808051974645381,
 0.09138882810439752]

#### Bucketizing dca_country_id_n

In [37]:
print(pd.cut(di_country_id_n, bins=4).value_counts())

(0.526, 0.87]      57
(0.87, 1.212]     359
(1.212, 1.554]     14
(1.554, 1.897]     32
dtype: int64


### Difference in Conditional Rejection (DCR)

![image info](./docs/images/DCR.jpg)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> Unbounded

> Zero denominator is possible. In such case the allocations to each Class are too small and a warning should be issued.