In [1]:
!pip list

Package               Version
--------------------- -----------
argon2-cffi           21.3.0
argon2-cffi-bindings  21.2.0
asttokens             2.0.8
attrs                 22.1.0
backcall              0.2.0
beautifulsoup4        4.11.1
bleach                5.0.1
certifi               2022.6.15
cffi                  1.15.1
charset-normalizer    2.1.1
colorama              0.4.5
cycler                0.11.0
debugpy               1.6.3
decorator             5.1.1
defusedxml            0.7.1
entrypoints           0.4
executing             0.10.0
fastjsonschema        2.16.1
fonttools             4.37.0
htmlmin               0.1.12
idna                  3.3
ImageHash             4.2.1
importlib-metadata    4.12.0
ipykernel             6.15.1
ipython               8.4.0
ipython-genutils      0.2.0
ipywidgets            8.0.1
jedi                  0.18.1
Jinja2                3.1.2
joblib                1.1.0
jsonschema            4.14.0
jupyter-client        7.3.4
jupyter-core          4.11


[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import json
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [3]:
INPUT_PATH="input"
OUTPUT_PATH="output"
CONFIG_PATH="config"

INPUT_DATA="test_data2"
INPUT_DATA_CONFIG="input_config"
OUTPUT_DATA="output"

INPUT_EXTENSION="csv"
INPUT_CONFIG_EXTENSION="json"
OUTPUT_EXTENSION="csv"

INPUT_FILE=f"{INPUT_DATA}.{INPUT_EXTENSION}"
INPUT_CONFIG_FILE=f"{INPUT_DATA_CONFIG}.{INPUT_CONFIG_EXTENSION}"
OUTPUT_FILE=f"{OUTPUT_DATA}.{OUTPUT_EXTENSION}"

INPUT_ABS_APTH=os.path.abspath(os.path.join(INPUT_PATH, INPUT_FILE))
INPUT_FILE_CONFIG=os.path.abspath(os.path.join(CONFIG_PATH, INPUT_CONFIG_FILE))
OUTPUT_ABS_APTH=os.path.abspath(os.path.join(OUTPUT_PATH, OUTPUT_FILE))

In [4]:
with open(INPUT_FILE_CONFIG, encoding='utf-8') as f:
    CONFIG = json.load(f)

In [5]:
CONFIG

{'INPUTS': {'FILE_NAME': ['test_data2.csv'],
  'SEPARATOR': ',',
  'DECIMAL': None,
  'ENCODING': 'utf-8',
  'FLOAT_PRECISION': 'high',
  'INDEXES': ['period_end_date', 'translated_when'],
  'DATE_COLUMNS': ['period_end_date', 'translated_when'],
  'DTYPE': {'if_data_corrected': 'object',
   'prod_gr_id': 'object',
   'country_id_n': 'object',
   'delivery_type_id': 'object',
   'freq_id': 'object',
   'retailer_id': 'object',
   'brand_id': 'object',
   'predict_automatch': 'object',
   'class_acctual': 'object'},
  'COLUMNS_WITH_NAN_VALUES': [None]},
 'MODEL': [None],
 'OUTPUTS': {'NAME': ['output.csv']}}

# Load data

In [6]:
# LOAD CSV
df = pd.read_csv(
    INPUT_ABS_APTH,
#     index_col=CONFIG['INPUTS']['INDEXES'],
    sep=CONFIG['INPUTS']['SEPARATOR'],
    # quotechar='"',
    # thousands=',',
#     decimal=CONFIG['INPUTS']['DECIMAL'],
    encoding=CONFIG['INPUTS']['ENCODING'],
    infer_datetime_format=True,
    parse_dates=CONFIG['INPUTS']['DATE_COLUMNS'],
    engine="c",
    low_memory=False,
#     float_precision=CONFIG['INPUTS']['DATE_COLUMNS'],
    skipinitialspace=True,
    dtype=CONFIG['INPUTS']['DTYPE']
)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19697 entries, 0 to 19696
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   period_end_date    19640 non-null  datetime64[ns, UTC]
 1   translated_when    19697 non-null  datetime64[ns, UTC]
 2   if_data_corrected  19697 non-null  object             
 3   prod_gr_id         19697 non-null  object             
 4   country_id_n       18405 non-null  object             
 5   delivery_type_id   18362 non-null  object             
 6   freq_id            19697 non-null  object             
 7   retailer_id        19697 non-null  object             
 8   brand_id           19697 non-null  object             
 9   predict_automatch  19368 non-null  object             
 10  class_acctual      19697 non-null  object             
dtypes: datetime64[ns, UTC](2), object(9)
memory usage: 1.7+ MB


In [8]:
df.head(3)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
0,NaT,2020-10-15 06:58:28+00:00,0,426,121.0,,2,96,111,1,1
1,NaT,2020-10-15 06:38:34+00:00,0,426,,,2,92,95,0,1
2,2020-09-20 00:00:00+00:00,2020-09-23 12:27:51+00:00,0,426,121.0,42730.0,2,30,37,1,1


In [9]:
df.describe(
    include='all',
    datetime_is_numeric=True
)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
count,19640,19697,19697.0,19697.0,18405.0,18362.0,19697.0,19697.0,19697.0,19368.0,19697.0
unique,,,2.0,3.0,34.0,914.0,2.0,52.0,199.0,2.0,2.0
top,,,0.0,426.0,121.0,31480.0,2.0,30.0,33.0,1.0,1.0
freq,,,17085.0,11844.0,4153.0,998.0,11934.0,1197.0,587.0,12324.0,14639.0
mean,2020-10-14 23:30:22.729124096+00:00,2020-10-22 06:01:21.525206784+00:00,,,,,,,,,
min,2020-08-30 00:00:00+00:00,2020-09-01 03:05:51+00:00,,,,,,,,,
25%,2020-09-27 00:00:00+00:00,2020-09-29 13:45:29+00:00,,,,,,,,,
50%,2020-10-18 00:00:00+00:00,2020-10-22 04:30:49+00:00,,,,,,,,,
75%,2020-11-08 00:00:00+00:00,2020-11-13 11:57:53+00:00,,,,,,,,,
max,2020-12-01 00:00:00+00:00,2021-02-01 14:50:49+00:00,,,,,,,,,


# Pandas data profiler

In [10]:
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    pool_size=0,
    infer_dtypes=False
)

In [11]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

# Post-Processing Bias Metrics for the Trained Model

## Difference in positive proportion in predicted labels (DPPL)

![image info](./docs/images/DPPL.png)

### Variable: predict_automatch

### predicted classes

In [12]:
df['predict_automatch'].value_counts()

1    12324
0     7044
Name: predict_automatch, dtype: int64

### Variable: prod_gr_id

### prod_gr_id by predicted classes

In [49]:
prod_gr_id_counts = df.groupby(['prod_gr_id', 'predict_automatch']) \
    .size() \
    .rename('count') \
    .reset_index() \
    .sort_values(by='prod_gr_id', ascending=False) \
    .set_index('prod_gr_id')

prod_gr_id_counts

Unnamed: 0_level_0,predict_automatch,count
prod_gr_id,Unnamed: 1_level_1,Unnamed: 2_level_1
427,0,1116
427,1,2235
426,0,4278
426,1,7298
413,0,1650
413,1,2791


In [50]:
positive_prod_gr_id_counts = prod_gr_id_counts.loc[prod_gr_id_counts['predict_automatch'] == '1']
positive_prod_gr_id_counts

Unnamed: 0_level_0,predict_automatch,count
prod_gr_id,Unnamed: 1_level_1,Unnamed: 2_level_1
427,1,2235
426,1,7298
413,1,2791


In [244]:
def get_PPL(data_frame, column_to_count, predictions_column):
    """
    :param data_frame: 
    :param column_to_count: 
    :param predictions_column: 
    :return: 
    """
    
    pp = []
    
    counts = df.groupby([column_to_count, predictions_column]) \
    .size() \
    .rename('count') \
    .reset_index() \
    .sort_values(by=column_to_count, ascending=False) \
    .set_index(column_to_count)
    
    
    print('Predicted Labels counts: \n', counts, '\n')
    
    positive_counts = counts.loc[counts[predictions_column] == '1']
    total_actual_positive = df[predictions_column].value_counts()[0]

    for key, value in positive_counts.items():
        if key == 'count':
            print("Positive Proportion in Predicted Labels (PPL) for each Class of Variable [prod_gr_id]: \n")
            ppipl = value / total_actual_positive
            for i in ppipl.iteritems():
                print('Positive Proportion in Predicted Labels (PPL) for Variable [prod_gr_id] for Class ==', i[0],":", '\n', f'{i[1]:.0%}', '\n')
                pp.append(i[1])

            unique_abs_diff = sorted(set([abs(i - j) for i in pp for j in pp if i != j]))

            print("Differences in Positive Proportion in Predicted Labels (PPL) for Variable [prod_gr_id]")
            for j in unique_abs_diff:
                print(f'{j:.0%}')


In [245]:
get_PPL(
    data_frame=df,
    column_to_count='prod_gr_id',
    predictions_column='predict_automatch'
)

Predicted Labels counts: 
            predict_automatch  count
prod_gr_id                         
427                        0   1116
427                        1   2235
426                        0   4278
426                        1   7298
413                        0   1650
413                        1   2791 

Positive Proportion in Predicted Labels (PPL) for each Class of Variable [prod_gr_id]: 

Positive Proportion in Predicted Labels (PPL) for Variable [prod_gr_id] for Class == 427 : 
 18% 

Positive Proportion in Predicted Labels (PPL) for Variable [prod_gr_id] for Class == 426 : 
 59% 

Positive Proportion in Predicted Labels (PPL) for Variable [prod_gr_id] for Class == 413 : 
 23% 

Differences in Positive Proportion in Predicted Labels (PPL) for Variable [prod_gr_id]
5%
37%
41%


### CONCLUSIONS:

> The class prod_gr_id == 426 might be overrepresented in training sample.

> Develop separate model for prod_gr_id == 426

> Perform downsamplig to mitigate overrepresentation of prod_gr_id == 426

## Disparate (Adverse) Impact (DI)

![image info](./docs/images/DI.png)

In [22]:
x = abs(2791 / df['predict_automatch'].value_counts()[0] - 2235 / df['predict_automatch'].value_counts()[0])
f'{x:.3f}'

'0.045'

In [23]:
df['class_acctual'].value_counts()

1    14639
0     5058
Name: class_acctual, dtype: int64

### Positive prediction over Positive Actual

In [24]:
df['predict_automatch'].value_counts()[0] / df['class_acctual'].value_counts()[0]

0.8418607828403579

### Negative prediction over Negative Actual

In [25]:
df['predict_automatch'].value_counts()[1] / df['class_acctual'].value_counts()[0]

0.4811804084978482

#### Disparate (Adverse) Impact (DI)