In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# !pip list

# Libraries

In [3]:
import os
import json
import numpy as np
import pandas as pd
from scripts.python.get_metrics import *

In [4]:
from contextlib import redirect_stdout

In [5]:
INPUT_PATH="input"
OUTPUT_PATH="output"
CONFIG_PATH="config"

INPUT_DATA="test_data2"
INPUT_DATA_CONFIG="input_config"
OUTPUT_DATA="output"

INPUT_EXTENSION="csv"
INPUT_CONFIG_EXTENSION="json"
OUTPUT_EXTENSION="csv"

INPUT_FILE=f"{INPUT_DATA}.{INPUT_EXTENSION}"
INPUT_CONFIG_FILE=f"{INPUT_DATA_CONFIG}.{INPUT_CONFIG_EXTENSION}"
OUTPUT_FILE=f"{OUTPUT_DATA}.{OUTPUT_EXTENSION}"

INPUT_ABS_APTH=os.path.abspath(os.path.join(INPUT_PATH, INPUT_FILE))
INPUT_FILE_CONFIG=os.path.abspath(os.path.join(CONFIG_PATH, INPUT_CONFIG_FILE))
OUTPUT_ABS_APTH=os.path.abspath(os.path.join(OUTPUT_PATH, OUTPUT_FILE))

In [6]:
with open(INPUT_FILE_CONFIG, encoding='utf-8') as f:
    CONFIG = json.load(f)

In [7]:
CONFIG

{'INPUTS': {'FILE_NAME': ['test_data2.csv'],
  'SEPARATOR': ',',
  'DECIMAL': None,
  'ENCODING': 'utf-8',
  'FLOAT_PRECISION': 'high',
  'INDEXES': ['period_end_date', 'translated_when'],
  'DATE_COLUMNS': ['period_end_date', 'translated_when'],
  'DTYPE': {'if_data_corrected': 'object',
   'prod_gr_id': 'object',
   'country_id_n': 'object',
   'delivery_type_id': 'object',
   'freq_id': 'object',
   'retailer_id': 'object',
   'brand_id': 'object',
   'predict_automatch': 'float',
   'class_acctual': 'float'},
  'CATEGORICAL_FEATURES': ['country_id_n',
   'prod_gr_id',
   'retailer_id',
   'brand_id',
   'delivery_type_id',
   'week_number'],
  'COLUMNS_WITH_NAN_VALUES': [None]},
 'MODEL': {'TARGET': 'class_acctual',
  'PREDICTION': 'predict_automatch',
  'DATETIME': 'translated_when'},
 'OUTPUTS': {'NAME': [None],
  'COLUMNS_TO_EXCLUDE': ['if_data_corrected', 'freq_id'],
  'BREAKING_POINT_DT': '2020-11-28 00:00:00+00:00'}}

# Load data

In [8]:
df = pd.read_csv(
    INPUT_ABS_APTH,
    sep=CONFIG['INPUTS']['SEPARATOR'],
    encoding=CONFIG['INPUTS']['ENCODING'],
    infer_datetime_format=True,
    parse_dates=CONFIG['INPUTS']['DATE_COLUMNS'],
    engine="c",
    low_memory=False,
    skipinitialspace=True,
    dtype=CONFIG['INPUTS']['DTYPE']
)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19697 entries, 0 to 19696
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   period_end_date    19640 non-null  datetime64[ns, UTC]
 1   translated_when    19697 non-null  datetime64[ns, UTC]
 2   if_data_corrected  19697 non-null  object             
 3   prod_gr_id         19697 non-null  object             
 4   country_id_n       18405 non-null  object             
 5   delivery_type_id   18362 non-null  object             
 6   freq_id            19697 non-null  object             
 7   retailer_id        19697 non-null  object             
 8   brand_id           19697 non-null  object             
 9   predict_automatch  19368 non-null  float64            
 10  class_acctual      19697 non-null  float64            
dtypes: datetime64[ns, UTC](2), float64(2), object(7)
memory usage: 1.7+ MB


In [10]:
df.sort_values(by=['translated_when'], inplace=True)

In [11]:
df.describe(
    include='all',
    datetime_is_numeric=True
)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
count,19640,19697,19697.0,19697.0,18405.0,18362.0,19697.0,19697.0,19697.0,19368.0,19697.0
unique,,,2.0,3.0,34.0,914.0,2.0,52.0,199.0,,
top,,,0.0,426.0,121.0,31480.0,2.0,30.0,33.0,,
freq,,,17085.0,11844.0,4153.0,998.0,11934.0,1197.0,587.0,,
mean,2020-10-14 23:30:22.729124096+00:00,2020-10-22 06:01:21.525206784+00:00,,,,,,,,0.636307,0.74321
min,2020-08-30 00:00:00+00:00,2020-09-01 03:05:51+00:00,,,,,,,,0.0,0.0
25%,2020-09-27 00:00:00+00:00,2020-09-29 13:45:29+00:00,,,,,,,,0.0,0.0
50%,2020-10-18 00:00:00+00:00,2020-10-22 04:30:49+00:00,,,,,,,,1.0,1.0
75%,2020-11-08 00:00:00+00:00,2020-11-13 11:57:53+00:00,,,,,,,,1.0,1.0
max,2020-12-01 00:00:00+00:00,2021-02-01 14:50:49+00:00,,,,,,,,1.0,1.0


# Count

In [12]:
# clear a file
with open('output/agg_counts.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    output = df.groupby(column) \
        .size() \
        .rename('count') \
        .reset_index() \
        .sort_values(by=column, ascending=False) \
        .set_index(column)

    print(f"COLUMN: {column}\n\n{output}\n\n")
    
    # save output to log txt
    with open('output/agg_counts.txt', 'a') as f:
        with redirect_stdout(f):
            print(f"{output.to_string()}\n")

COLUMN: period_end_date

                           count
period_end_date                 
2020-12-01 00:00:00+00:00      3
2020-11-29 00:00:00+00:00    241
2020-11-22 00:00:00+00:00   2192
2020-11-15 00:00:00+00:00   1775
2020-11-08 00:00:00+00:00    966
2020-11-01 00:00:00+00:00   2063
2020-10-25 00:00:00+00:00   1373
2020-10-18 00:00:00+00:00   1442
2020-10-11 00:00:00+00:00   1296
2020-10-04 00:00:00+00:00   1218
2020-10-01 00:00:00+00:00    945
2020-09-27 00:00:00+00:00   1341
2020-09-20 00:00:00+00:00   1025
2020-09-13 00:00:00+00:00   1242
2020-09-06 00:00:00+00:00   1339
2020-09-01 00:00:00+00:00    876
2020-08-30 00:00:00+00:00    303


COLUMN: translated_when

                           count
translated_when                 
2021-02-01 14:50:49+00:00      1
2021-01-29 09:56:17+00:00      1
2021-01-28 13:48:19+00:00      1
2021-01-28 10:44:57+00:00      1
2021-01-27 17:35:47+00:00      1
...                          ...
2020-09-01 06:15:50+00:00      1
2020-09-01 06:15:32+00:0

# Sum

In [13]:
# clear a file
with open('output/agg_sum.txt', 'a') as f:
        f.truncate(0)

# get positive predictions
data_frame = df.loc[df['predict_automatch'] == 1]

# loop over columns and aggregate
for column in data_frame.columns:
    output = data_frame.groupby(column) \
        .size() \
        .rename('count_total') \
        .reset_index() \
        .sort_values(by=column, ascending=False) \
        .set_index(column)

    print(f"COLUMN: {column}\n\n{output}\n\n")
    
    # save output to log txt
    with open('output/agg_sum.txt', 'a') as f:
        with redirect_stdout(f):
            print(f"{output.to_string()}\n")

COLUMN: period_end_date

                           count_total
period_end_date                       
2020-12-01 00:00:00+00:00            1
2020-11-29 00:00:00+00:00          139
2020-11-22 00:00:00+00:00         1358
2020-11-15 00:00:00+00:00         1130
2020-11-08 00:00:00+00:00          570
2020-11-01 00:00:00+00:00         1291
2020-10-25 00:00:00+00:00          885
2020-10-18 00:00:00+00:00          908
2020-10-11 00:00:00+00:00          805
2020-10-04 00:00:00+00:00          744
2020-10-01 00:00:00+00:00          632
2020-09-27 00:00:00+00:00          822
2020-09-20 00:00:00+00:00          642
2020-09-13 00:00:00+00:00          764
2020-09-06 00:00:00+00:00          807
2020-09-01 00:00:00+00:00          577
2020-08-30 00:00:00+00:00          209


COLUMN: translated_when

                           count_total
translated_when                       
2021-02-01 14:50:49+00:00            1
2021-01-28 13:48:19+00:00            1
2021-01-28 10:44:57+00:00            1
2021-01-27 1

# Post-Processing Bias Metrics for the Trained Model

## Difference in positive proportion in predicted labels (DPPL)

![image info](./docs/images/metrics/DPPL.png)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)


#### CHARACTERISTICS OF METRIC:
> DPPPL > -1 AND DPPPL < 1

> For example, if the model grants loans to 50% of class 2 and to 60% of class 1, then it may be biased against class 2.
We would have to decide whether a 10% difference is material.

### PPL

In [14]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/PPL.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_PPL(
            data_frame=df,
            column_to_group_by=column,
            column_to_count='predict_automatch'
        )

        print(f"COLUMN: {column}\n\n{output}\n\n")

        # save output to log txt
        with open('output/PPL.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

prod_gr_id
427    0.663796
426    0.616177
413    0.622158
Name: PPL, dtype: float64


COLUMN: country_id_n

country_id_n
177     0.681818
176     0.664384
160     0.680851
139     0.568493
138     0.634615
136     0.689655
126     0.680945
121     0.649892
116     0.649402
114     0.688312
113     0.665432
110     0.649778
109     0.386667
108     0.690469
107     0.672269
106     0.430536
105     0.670455
104     0.699856
103     0.711691
1011    0.731844
1010    0.680288
1002    0.733333
Name: PPL, dtype: float64


COLUMN: delivery_type_id

delivery_type_id
9974     0.222222
9949     0.333333
9158     0.750000
8705     0.500000
8088     0.372881
           ...   
1218     0.558824
11553    0.200000
11528    0.685484
10741    0.333333
10511    0.777778
Name: PPL, Length: 767, dtype: float64


COLUMN: retailer_id

retailer_id
96     0.613480
95     0.654649
94     0.623693
93     0.599437
92     0.669884
91     0.629393
90     0.623022
9      0.682927
88     0.6861

### DPPL

In [15]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DPPL.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DPPL(
            data_frame=df,
            column_to_group_by=column,
            column_to_count='predict_automatch'
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/DPPL.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.047619
1   -0.041638
2   -0.005981
3    0.005981
4    0.041638
5    0.047619
dtype: float64


COLUMN: country_id_n

0     -0.346667
1     -0.345177
2     -0.325025
3     -0.313189
4     -0.303802
         ...   
457    0.303802
458    0.313189
459    0.325025
460    0.345177
461    0.346667
Length: 462, dtype: float64


COLUMN: delivery_type_id

0       -0.909091
1       -0.864865
2       -0.857143
3       -0.833333
4       -0.812500
           ...   
28097    0.812500
28098    0.833333
28099    0.857143
28100    0.864865
28101    0.909091
Length: 28102, dtype: float64


COLUMN: retailer_id

0      -0.304911
1      -0.295732
2      -0.289233
3      -0.285687
4      -0.284443
          ...   
2445    0.284443
2446    0.285687
2447    0.289233
2448    0.295732
2449    0.304911
Length: 2450, dtype: float64


COLUMN: brand_id

0       -0.698413
1       -0.647059
2       -0.615385
3       -0.608696
4       -0.607504
           ...   
24323    0.607504
24324    0.6

### DPPL bucketized

In [16]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DPPL_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DPPL(
            data_frame=df,
            column_to_group_by=column,
            column_to_count='predict_automatch'
        )
        
        bucketized_output = pd.cut(output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/DPPL_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(-0.0477, -0.0159]    2
(-0.0159, 0.0159]     2
(0.0159, 0.0476]      2
dtype: int64


COLUMN: country_id_n

(-0.347, -0.116]     47
(-0.116, 0.116]     368
(0.116, 0.347]       47
dtype: int64


COLUMN: delivery_type_id

(-0.911, -0.303]     2480
(-0.303, 0.303]     23143
(0.303, 0.909]       2479
dtype: int64


COLUMN: retailer_id

(-0.306, -0.102]     319
(-0.102, 0.102]     1812
(0.102, 0.305]       319
dtype: int64


COLUMN: brand_id

(-0.7, -0.233]      1398
(-0.233, 0.233]    21532
(0.233, 0.698]      1398
dtype: int64




#### CONCLUSIONS:

> Basing solely on **DPPL** we can loosely assume that there is no disparity impact among **prod_gr_id** Classes
>
> Although it is likely. Basing solely on **DPPL** we can't assume if there is a disparity impact among **prod_gr_id** Classes yet. 
>
> TODO:

#### CONCLUSIONS:

## Disparate (Adverse) Impact (DI)

![image info](./docs/images/metrics/DI.png)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> **DI >= 0**

> This measure may be considered fair if it resides in the **<0.8; 1.2>** range.

### DI

In [17]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DI.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_PPL(
            data_frame=df,
            column_to_group_by=column,
            column_to_count='predict_automatch'
        )
        
        di_output = sorted([(i / j) for i in output for j in output if i != j])
        
        di_output = pd.Series(di_output)
        print(f"COLUMN: {column}\n\n{di_output}\n\n")
        
        # save output to log txt
        with open('output/DI.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{di_output.to_string()}\n")

COLUMN: prod_gr_id

0    0.928263
1    0.937273
2    0.990387
3    1.009706
4    1.066925
5    1.077281
dtype: float64


COLUMN: country_id_n

0      0.527273
1      0.528346
2      0.543307
3      0.552495
4      0.560006
         ...   
457    1.785695
458    1.809972
459    1.840581
460    1.892699
461    1.896552
Length: 462, dtype: float64


COLUMN: delivery_type_id

0          0.090909
1          0.090909
2          0.090909
3          0.090909
4          0.090909
            ...    
521475    11.000000
521476    11.000000
521477    11.000000
521478    11.000000
521479    11.000000
Length: 521480, dtype: float64


COLUMN: retailer_id

0       0.583509
1       0.596047
2       0.596278
3       0.599245
4       0.600292
          ...   
2445    1.665856
2446    1.668767
2447    1.677069
2448    1.677719
2449    1.713769
Length: 2450, dtype: float64


COLUMN: brand_id

0        0.301587
1        0.301587
2        0.301587
3        0.331746
4        0.339286
           ...   
38081  

### DI bucketized

In [18]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DI_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_PPL(
            data_frame=df,
            column_to_group_by=column,
            column_to_count='predict_automatch'
        )
        
        di_output = sorted([(i / j) for i in output for j in output if i != j])
        
        bucketized_output = pd.cut(di_output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/DI_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(0.928, 0.978]    2
(0.978, 1.028]    2
(1.028, 1.077]    2
dtype: int64


COLUMN: country_id_n

(0.526, 0.984]    191
(0.984, 1.44]     232
(1.44, 1.897]      39
dtype: int64


COLUMN: delivery_type_id

(0.08, 3.727]     512863
(3.727, 7.364]      7966
(7.364, 11.0]        651
dtype: int64


COLUMN: retailer_id

(0.582, 0.96]      918
(0.96, 1.337]     1358
(1.337, 1.714]     174
dtype: int64


COLUMN: brand_id

(0.299, 1.306]    33916
(1.306, 2.311]     4099
(2.311, 3.316]       71
dtype: int64




#### CONCLUSIONS:

> For each possible relation between Classes the metric resides between **<0.8; 1.2>**

> We can loosely assume that there is no disparity impact between Classes of **prod_gr_id** Variable

> At this moment we can loosely assume that there are some examples of Disparity Impact in case of **country_id_n** Variable.

> Seems like **DI** tries to tell that distributions between countrie **106** and **1011** are diffrent from each other. If that is true I would recommend reducing dimmensionality by feature engineering. I would use **WoE** (**Weight of Evidence**) to do so.

> Dividing Ratios by Ratios, so values close to 1 are good

# Difference in Conditional Outcome (DCO)
> Type 1: **Difference in Conditional Acceptance (DCA)**

> Type 2: **Difference in Conditional Rejection (DCR)**

> When both **DCA** and **DCR** are very close to **0**, we can conclude that the proportion of qualified (as suggested by observed labels) applicants accepted by the model and the proportion of unqualified applicants rejected are nearly equal across both classes.

### Difference in Conditional Acceptance (DCA)

![image info](./docs/images/metrics/DCA.jpg)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> Unbounded

> Zero denominator is possible. In such case the allocations to each Class are too small and a warning should be issued.

### CA

In [19]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/CA.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_CA(
            data_frame=df,
            column_to_group_by=column,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch'
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/CA.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

prod_gr_id
427    1.114989
426    1.203069
413    1.206378
Name: CA, dtype: float64


COLUMN: country_id_n

country_id_n
177     1.040000
176     1.120766
160     1.031250
139     1.317269
138     1.212121
136     1.050000
126     1.090022
121     1.133012
116     1.147239
114     1.085954
113     1.123377
110     1.136799
109     2.000000
108     1.072022
107     1.079167
106     1.686901
105     1.105932
104     1.045361
103     1.063796
1011    1.091603
1010    1.116608
1002    1.090909
Name: CA, dtype: float64


COLUMN: delivery_type_id

delivery_type_id
9974     3.500000
9949     2.000000
9158     1.333333
8705     2.000000
8088     2.227273
           ...   
1218     1.157895
11553    4.000000
11528    1.164706
10741    2.000000
10511    1.142857
Name: CA, Length: 749, dtype: float64


COLUMN: retailer_id

retailer_id
96     1.233184
95     1.107246
94     1.189944
93     1.220657
92     1.121037
91     1.131980
90     1.182448
9      1.087912
88     1.102500


### DCA

In [20]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DCA.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DCA(
            data_frame=df,
            column_to_group_by=column,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch'
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/DCA.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.091389
1   -0.088081
2   -0.003308
3    0.003308
4    0.088081
5    0.091389
dtype: float64


COLUMN: country_id_n

0     -0.968750
1     -0.960000
2     -0.954639
3     -0.950000
4     -0.936204
         ...   
457    0.936204
458    0.950000
459    0.954639
460    0.960000
461    0.968750
Length: 462, dtype: float64


COLUMN: delivery_type_id

0       -9.666667
1       -9.500000
2       -9.333333
3       -9.285714
4       -9.250000
           ...   
21523    9.250000
21524    9.285714
21525    9.333333
21526    9.500000
21527    9.666667
Length: 21528, dtype: float64


COLUMN: retailer_id

0      -0.594697
1      -0.584076
2      -0.583333
3      -0.577381
4      -0.572712
          ...   
2445    0.572712
2446    0.577381
2447    0.583333
2448    0.584076
2449    0.594697
Length: 2450, dtype: float64


COLUMN: brand_id

0       -1.583333
1       -1.500000
2       -1.458333
3       -1.444444
4       -1.424242
           ...   
18973    1.424242
18974    1.4

### DCA bucketized

In [21]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DCA_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DCA(
            data_frame=df,
            column_to_group_by=column,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch'
        )
        
        bucketized_output = pd.cut(output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/DCA_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(-0.0916, -0.0305]    2
(-0.0305, 0.0305]     2
(0.0305, 0.0914]      2
dtype: int64


COLUMN: country_id_n

(-0.971, -0.323]     40
(-0.323, 0.323]     382
(0.323, 0.969]       40
dtype: int64


COLUMN: delivery_type_id

(-9.686, -3.222]      737
(-3.222, 3.222]     20053
(3.222, 9.667]        738
dtype: int64


COLUMN: retailer_id

(-0.596, -0.198]     279
(-0.198, 0.198]     1892
(0.198, 0.595]       279
dtype: int64


COLUMN: brand_id

(-1.587, -0.528]      900
(-0.528, 0.528]     17178
(0.528, 1.583]        900
dtype: int64




#### CONCLUSIONS:

> TODO: Diffrances are really small

## Difference in Conditional Rejection (DCR)

![image info](./docs/images/metrics/DCR.jpg)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> Unbounded

> Zero denominator is possible. In such case the allocations to each Class are too small and a warning should be issued.

### CR

In [22]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/CR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_CR(
            data_frame=df,
            column_to_group_by=column,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch'
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/CR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

prod_gr_id
427    0.784050
426    0.716223
413    0.678182
Name: CR, dtype: float64


COLUMN: country_id_n

country_id_n
177     0.914286
176     0.765396
160     0.933333
139     0.582011
138     0.648649
136     0.888889
126     0.817330
121     0.784384
116     0.920863
114     0.817757
113     0.768797
110     0.942308
109     0.372807
108     0.849844
107     0.844828
106     0.481840
105     0.798246
104     0.916256
103     0.856000
1011    0.765957
1010    0.763359
1002    0.750000
Name: CR, dtype: float64


COLUMN: delivery_type_id

delivery_type_id
9974     0.285714
9966     1.000000
9949     0.500000
9617     1.000000
8632     1.000000
           ...   
1218     0.857143
11553    0.250000
11528    0.657895
10741    0.500000
10511    0.500000
Name: CR, Length: 545, dtype: float64


COLUMN: retailer_id

retailer_id
96     0.638989
95     0.801105
94     0.691589
93     0.671362
92     0.763314
91     0.782609
90     0.701149
9      0.825301
88     0.780220


### DCR

In [23]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DCR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DCR(
            data_frame=df,
            column_to_group_by=column,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch'
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/DCR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.105868
1   -0.067828
2   -0.038041
3    0.038041
4    0.067828
5    0.105868
dtype: float64


COLUMN: country_id_n

0     -0.569501
1     -0.560526
2     -0.548056
3     -0.543449
4     -0.541479
         ...   
457    0.541479
458    0.543449
459    0.548056
460    0.560526
461    0.569501
Length: 462, dtype: float64


COLUMN: delivery_type_id

0       -2.900000
1       -2.875000
2       -2.857143
3       -2.843750
4       -2.833333
           ...   
11923    2.833333
11924    2.843750
11925    2.857143
11926    2.875000
11927    2.900000
Length: 11928, dtype: float64


COLUMN: retailer_id

0      -1.020408
1      -0.962366
2      -0.925000
3      -0.884615
4      -0.882514
          ...   
2155    0.882514
2156    0.884615
2157    0.925000
2158    0.962366
2159    1.020408
Length: 2160, dtype: float64


COLUMN: brand_id

0       -1.750000
1       -1.636364
2       -1.610000
3       -1.600000
4       -1.560976
           ...   
12207    1.560976
12208    1.6

### DCR bucketized

In [24]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DCR_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DCR(
            data_frame=df,
            column_to_group_by=column,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch'
        )
        
        bucketized_output = pd.cut(output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/DCR_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(-0.106, -0.0353]    3
(-0.0353, 0.0353]    0
(0.0353, 0.106]      3
dtype: int64


COLUMN: country_id_n

(-0.571, -0.19]     61
(-0.19, 0.19]      340
(0.19, 0.57]        61
dtype: int64


COLUMN: delivery_type_id

(-2.906, -0.967]      445
(-0.967, 0.967]     11039
(0.967, 2.9]          444
dtype: int64


COLUMN: retailer_id

(-1.022, -0.34]      83
(-0.34, 0.34]      1994
(0.34, 1.02]         83
dtype: int64


COLUMN: brand_id

(-1.754, -0.583]      552
(-0.583, 0.583]     11110
(0.583, 1.75]         550
dtype: int64




#### CONCLUSIONS:

> TODO:

## Recall Difference (RD)

![image info](./docs/images/metrics/RD.jpg)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> TODO:

### CONFUSION MATRIX (by Class)

In [25]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/CM.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_class_cm(
            data_frame=df,
            acctuals='class_acctual',
            predictions='predict_automatch',
            column_to_group_by=column
        )
        
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/CM.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

  Class  TN   FN    FP    TP
0   426  75  193  2989  8587
1   413  11   34  1108  3333
2   427   4   12   871  2480


COLUMN: country_id_n

   Class  TN  FN    FP    TP
0    126   1   4   348  1001
1    113   1   9   408  1202
2    114   1   1   174   517
3    110  25  57   269   774
4    139   0   0   110   328
5    108   3   9   812  2313
6    121  17  41  1078  3017
7    109   0   2    85   288
8    107   0   1    98   258
9    103   2   2   212   665
10   106   0   1   199   527
11   176   0   2   261   759
12   116  11  26   117   348
13   105   0   2    91   259
14   104   3   2   183   505
15   177   0   0    32    78
16  1010   0   2   100   314
17   136   0   0     8    21
18   138   1   0    23    80
19  1011   0   1    36   142
20   160   0   0    14    33
21   128   1   1     0     0
22   118   1   2     0     0
23   119   1   6     0     0
24   131   2   8     0     0
25   150   2   8     0     0
26   170   1   4     0     0
27   141   2   7     0     0

### RD

In [26]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/RD.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_RD(
            data_frame=df,
            acctuals='class_acctual',
            predictions='predict_automatch',
            column_to_group_by=column
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/RD.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.017166
1   -0.011884
2   -0.005283
3    0.005283
4    0.011884
5    0.017166
dtype: float64


COLUMN: country_id_n

0     -1.000000
1     -0.998106
2     -0.998069
3     -0.997372
4     -0.997001
         ...   
299    0.997001
300    0.997372
301    0.998069
302    0.998106
303    1.000000
Length: 304, dtype: float64


COLUMN: delivery_type_id

Series([], dtype: float64)


COLUMN: retailer_id

0      -1.000000
1      -0.998047
2      -0.997930
3      -0.997602
4      -0.997382
          ...   
1715    0.997382
1716    0.997602
1717    0.997930
1718    0.998047
1719    1.000000
Length: 1720, dtype: float64


COLUMN: brand_id

0      -1.000000
1      -0.996324
2      -0.996000
3      -0.995781
4      -0.995455
          ...   
3155    0.995455
3156    0.995781
3157    0.996000
3158    0.996324
3159    1.000000
Length: 3160, dtype: float64




### RD bucketized

In [27]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/RD_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_RD(
            data_frame=df,
            acctuals='class_acctual',
            predictions='predict_automatch',
            column_to_group_by=column
        )
        
        # To bypass some empty matrices
        try:
            bucketized_output = pd.cut(output, bins=3).value_counts()
            print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")

            # save output to log txt
            with open('output/RD_bucketized.txt', 'a') as f:
                with redirect_stdout(f):
                    print(f"{column}\n{bucketized_output.to_string()}\n")
        except Exception:
            pass

COLUMN: prod_gr_id

(-0.0172, -0.00572]    2
(-0.00572, 0.00572]    2
(0.00572, 0.0172]      2
dtype: int64


COLUMN: country_id_n

(-1.002, -0.333]     17
(-0.333, 0.333]     270
(0.333, 1.0]         17
dtype: int64


COLUMN: retailer_id

(-1.002, -0.333]      80
(-0.333, 0.333]     1560
(0.333, 1.0]          80
dtype: int64


COLUMN: brand_id

(-1.002, -0.333]     238
(-0.333, 0.333]     2684
(0.333, 1.0]         238
dtype: int64




#### CONCLUSIONS:

> TODO:

## Difference in label rates (DLR)

![image info](./docs/images/metrics/DAR.png)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> DAR > -1 AND DAR < 1

> DRR > -1 AND DRR < 1

> DAR is the same as precision difference between the first and second classes.

### AR

In [28]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/AR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_AR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/AR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

   class_acctual  count_grouped_acctuals  predict_automatch  \
0            1.0                    2492                1.0   
1            1.0                    8780                1.0   
2            1.0                    3367                1.0   

   count_grouped_predictions Class  TN   FN    FP    TP  
0                       2235   427   4   12   871  2480  
1                       7298   426  75  193  2989  8587  
2                       2791   413  11   34  1108  3333  


COLUMN: country_id_n

    class_acctual  count_grouped_acctuals  predict_automatch  \
0             1.0                      78                1.0   
1             1.0                     761                1.0   
2             1.0                      33                1.0   
3             1.0                     328                1.0   
4             1.0                      80                1.0   
5             1.0                      21                1.0   
6             1.0      

COLUMN: brand_id

     class_acctual  count_grouped_acctuals  predict_automatch  \
0              1.0                     384                1.0   
1              1.0                     421                1.0   
2              1.0                     272                1.0   
3              1.0                      30                1.0   
4              1.0                      35                1.0   
..             ...                     ...                ...   
191            1.0                     121                1.0   
192            1.0                      70                1.0   
193            1.0                      79                1.0   
194            1.0                     273                1.0   
195            1.0                     237                1.0   

     count_grouped_predictions Class TN FN   FP   TP  
0                          320    99  1  2  142  382  
1                          366    96  1  0  157  421  
2                          237    95

### DAR

In [29]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DAR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DAR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/DAR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.084576
1   -0.067004
2   -0.017572
3    0.017572
4    0.067004
5    0.084576
dtype: float64


COLUMN: country_id_n

0     -0.954957
1     -0.946207
2     -0.944970
3     -0.936207
4     -0.927383
         ...   
457    0.927383
458    0.936207
459    0.944970
460    0.946207
461    0.954957
Length: 462, dtype: float64


COLUMN: delivery_type_id

0       -249.000000
1       -248.800000
2       -248.750000
3       -248.666667
4       -248.600000
            ...    
31275    248.600000
31276    248.666667
31277    248.750000
31278    248.800000
31279    249.000000
Length: 31280, dtype: float64


COLUMN: retailer_id

0      -0.534031
1      -0.514424
2      -0.476740
3      -0.460585
4      -0.451614
          ...   
2247    0.451614
2248    0.460585
2249    0.476740
2250    0.514424
2251    0.534031
Length: 2252, dtype: float64


COLUMN: brand_id

0       -20.916667
1       -20.833333
2       -20.791667
3       -20.777778
4       -20.768707
           ...    
20

### DAR bucketized

In [30]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DAR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DAR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        bucketized_output = pd.cut(output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/DAR_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(-0.0847, -0.0282]    2
(-0.0282, 0.0282]     2
(0.0282, 0.0846]      2
dtype: int64


COLUMN: country_id_n

(-0.957, -0.318]     40
(-0.318, 0.318]     382
(0.318, 0.955]       40
dtype: int64


COLUMN: delivery_type_id

(-249.498, -83.0]      367
(-83.0, 83.0]        30547
(83.0, 249.0]          366
dtype: int64


COLUMN: retailer_id

(-0.535, -0.178]     204
(-0.178, 0.178]     1844
(0.178, 0.534]       204
dtype: int64


COLUMN: brand_id

(-20.958, -6.972]      430
(-6.972, 6.972]      19318
(6.972, 20.917]        430
dtype: int64




### RR

In [31]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/RR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_RR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/RR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

   class_acctual  count_grouped_acctuals  predict_automatch  \
0            0.0                     875                0.0   
1            0.0                    3064                0.0   
2            0.0                    1119                0.0   

   count_grouped_predictions Class  TN   FN    FP    TP  
0                       1116   427   4   12   871  2480  
1                       4278   426  75  193  2989  8587  
2                       1650   413  11   34  1108  3333  


COLUMN: country_id_n

    class_acctual  count_grouped_acctuals  predict_automatch  \
0             0.0                      32                0.0   
1             0.0                     261                0.0   
2             0.0                      14                0.0   
3             0.0                     110                0.0   
4             0.0                      24                0.0   
5             0.0                       8                0.0   
6             0.0      

COLUMN: brand_id

     class_acctual  count_grouped_acctuals  predict_automatch  \
0              0.0                     143                0.0   
1              0.0                     158                0.0   
2              0.0                      87                0.0   
3              0.0                       8                0.0   
4              0.0                       5                0.0   
..             ...                     ...                ...   
189            0.0                      42                0.0   
190            0.0                      28                0.0   
191            0.0                      19                0.0   
192            0.0                      96                0.0   
193            0.0                      83                0.0   

     count_grouped_predictions Class TN FN   FP   TP  
0                          204    99  1  2  142  382  
1                          212    96  1  0  157  421  
2                          120    95

### DRR

In [32]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DRR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DRR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/DRR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.008487
1   -0.006336
2   -0.002152
3    0.002152
4    0.006336
5    0.008487
dtype: float64


COLUMN: country_id_n

0     -0.034200
1     -0.033742
2     -0.033272
3     -0.033115
4     -0.032815
         ...   
105    0.032815
106    0.033115
107    0.033272
108    0.033742
109    0.034200
Length: 110, dtype: float64


COLUMN: delivery_type_id

0      -8.000000
1      -7.998536
2      -7.997664
3      -7.997076
4      -7.993939
          ...   
1525    7.993939
1526    7.997076
1527    7.997664
1528    7.998536
1529    8.000000
Length: 1530, dtype: float64


COLUMN: retailer_id

0     -0.204545
1     -0.203105
2     -0.202981
3     -0.202683
4     -0.202348
         ...   
695    0.202348
696    0.202683
697    0.202981
698    0.203105
699    0.204545
Length: 700, dtype: float64


COLUMN: brand_id

0      -0.473684
1      -0.470952
2      -0.470559
3      -0.469778
4      -0.469465
          ...   
1309    0.469465
1310    0.469778
1311    0.470559
1312    0

### DRR bucketized

In [33]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/DRR_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_DRR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        bucketized_output = pd.cut(output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/DRR_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(-0.0085, -0.00283]    2
(-0.00283, 0.00283]    2
(0.00283, 0.00849]     2
dtype: int64


COLUMN: country_id_n

(-0.0343, -0.0114]    24
(-0.0114, 0.0114]     62
(0.0114, 0.0342]      24
dtype: int64


COLUMN: delivery_type_id

(-8.016, -2.667]      39
(-2.667, 2.667]     1452
(2.667, 8.0]          39
dtype: int64


COLUMN: retailer_id

(-0.205, -0.0682]     70
(-0.0682, 0.0682]    560
(0.0682, 0.205]       70
dtype: int64


COLUMN: brand_id

(-0.475, -0.158]      66
(-0.158, 0.158]     1182
(0.158, 0.474]        66
dtype: int64




#### CONCLUSIONS:

> TODO:

## Accuracy DIfference (AD)

![image info](./docs/images/metrics/AD.jpg)

[SOURCE](https://pages.awscloud.com/rs/112-TZM-766/images/Amazon.AI.Fairness.and.Explainability.Whitepaper.pdf)

#### CHARACTERISTICS OF METRIC:
> Unbounded metric

> Care needed when FN = 0

### AD

In [34]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/AR.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_AR(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/AR.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

   class_acctual  count_grouped_acctuals  predict_automatch  \
0            1.0                    2492                1.0   
1            1.0                    8780                1.0   
2            1.0                    3367                1.0   

   count_grouped_predictions Class  TN   FN    FP    TP  
0                       2235   427   4   12   871  2480  
1                       7298   426  75  193  2989  8587  
2                       2791   413  11   34  1108  3333  


COLUMN: country_id_n

    class_acctual  count_grouped_acctuals  predict_automatch  \
0             1.0                      78                1.0   
1             1.0                     761                1.0   
2             1.0                      33                1.0   
3             1.0                     328                1.0   
4             1.0                      80                1.0   
5             1.0                      21                1.0   
6             1.0      

COLUMN: brand_id

     class_acctual  count_grouped_acctuals  predict_automatch  \
0              1.0                     384                1.0   
1              1.0                     421                1.0   
2              1.0                     272                1.0   
3              1.0                      30                1.0   
4              1.0                      35                1.0   
..             ...                     ...                ...   
191            1.0                     121                1.0   
192            1.0                      70                1.0   
193            1.0                      79                1.0   
194            1.0                     273                1.0   
195            1.0                     237                1.0   

     count_grouped_predictions Class TN FN   FP   TP  
0                          320    99  1  2  142  382  
1                          366    96  1  0  157  421  
2                          237    95

# AD

In [35]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/AD.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_AD(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/AD.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.014089
1   -0.007681
2   -0.006408
3    0.006408
4    0.007681
5    0.014089
dtype: float64


COLUMN: country_id_n

0     -0.097872
1     -0.091168
2     -0.090909
3     -0.089778
4     -0.084861
         ...   
457    0.084861
458    0.089778
459    0.090909
460    0.091168
461    0.097872
Length: 462, dtype: float64


COLUMN: delivery_type_id

0       -1.000000
1       -0.916667
2       -0.909091
3       -0.900000
4       -0.888889
           ...   
20915    0.888889
20916    0.900000
20917    0.909091
20918    0.916667
20919    1.000000
Length: 20920, dtype: float64


COLUMN: retailer_id

0      -0.268271
1      -0.256526
2      -0.255606
3      -0.233816
4      -0.230056
          ...   
2445    0.230056
2446    0.233816
2447    0.255606
2448    0.256526
2449    0.268271
Length: 2450, dtype: float64


COLUMN: brand_id

0       -0.428571
1       -0.399160
2       -0.388889
3       -0.375940
4       -0.375000
           ...   
21415    0.375000
21416    0.3

# A

In [36]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/A.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_A(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/A.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

   class_acctual  count_grouped_acctuals  predict_automatch  \
0            0.0                     875                0.0   
1            0.0                    3064                0.0   
2            0.0                    1119                0.0   

   count_grouped_predictions Class  TN   FN    FP    TP  
0                       1116   427   4   12   871  2480  
1                       4278   426  75  193  2989  8587  
2                       1650   413  11   34  1108  3333  


COLUMN: country_id_n

    class_acctual  count_grouped_acctuals  predict_automatch  \
0             0.0                      32                0.0   
1             0.0                     261                0.0   
2             0.0                      14                0.0   
3             0.0                     110                0.0   
4             0.0                      24                0.0   
5             0.0                       8                0.0   
6             0.0      

COLUMN: brand_id

     class_acctual  count_grouped_acctuals  predict_automatch  \
0              0.0                     143                0.0   
1              0.0                     158                0.0   
2              0.0                      87                0.0   
3              0.0                       8                0.0   
4              0.0                       5                0.0   
..             ...                     ...                ...   
189            0.0                      42                0.0   
190            0.0                      28                0.0   
191            0.0                      19                0.0   
192            0.0                      96                0.0   
193            0.0                      83                0.0   

     count_grouped_predictions Class TN FN   FP   TP  
0                          204    99  1  2  142  382  
1                          212    96  1  0  157  421  
2                          120    95

# AD

In [37]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/AD.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_AD(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        output = pd.Series(output)
        print(f"COLUMN: {column}\n\n{output}\n\n")
        
        # save output to log txt
        with open('output/AD.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{output.to_string()}\n")

COLUMN: prod_gr_id

0   -0.014089
1   -0.007681
2   -0.006408
3    0.006408
4    0.007681
5    0.014089
dtype: float64


COLUMN: country_id_n

0     -0.097872
1     -0.091168
2     -0.090909
3     -0.089778
4     -0.084861
         ...   
457    0.084861
458    0.089778
459    0.090909
460    0.091168
461    0.097872
Length: 462, dtype: float64


COLUMN: delivery_type_id

0       -1.000000
1       -0.916667
2       -0.909091
3       -0.900000
4       -0.888889
           ...   
20915    0.888889
20916    0.900000
20917    0.909091
20918    0.916667
20919    1.000000
Length: 20920, dtype: float64


COLUMN: retailer_id

0      -0.268271
1      -0.256526
2      -0.255606
3      -0.233816
4      -0.230056
          ...   
2445    0.230056
2446    0.233816
2447    0.255606
2448    0.256526
2449    0.268271
Length: 2450, dtype: float64


COLUMN: brand_id

0       -0.428571
1       -0.399160
2       -0.388889
3       -0.375940
4       -0.375000
           ...   
21415    0.375000
21416    0.3

# AD bucketized

In [38]:
columns_to_exclude = [
    'period_end_date',
    'translated_when',
    'if_data_corrected',
    'freq_id',
    'predict_automatch',
    'class_acctual'
]

# clear a file
with open('output/AD_bucketized.txt', 'a') as f:
        f.truncate(0)

# loop over columns and aggregate
for column in df.columns:
    if column not in columns_to_exclude:
        output = get_AD(
            data_frame=df,
            column_to_count_acctuals='class_acctual',
            column_to_count_predictions='predict_automatch',
            column_to_group_by=column
        )
        
        bucketized_output = pd.cut(output, bins=3).value_counts()
        print(f"COLUMN: {column}\n\n{bucketized_output}\n\n")
        
        # save output to log txt
        with open('output/AD_bucketized.txt', 'a') as f:
            with redirect_stdout(f):
                print(f"{column}\n{bucketized_output.to_string()}\n")

COLUMN: prod_gr_id

(-0.0141, -0.0047]    3
(-0.0047, 0.0047]     0
(0.0047, 0.0141]      3
dtype: int64


COLUMN: country_id_n

(-0.0981, -0.0326]     83
(-0.0326, 0.0326]     296
(0.0326, 0.0979]       83
dtype: int64


COLUMN: delivery_type_id

(-1.002, -0.333]     1161
(-0.333, 0.333]     18597
(0.333, 1.0]         1162
dtype: int64


COLUMN: retailer_id

(-0.269, -0.0894]     179
(-0.0894, 0.0894]    2092
(0.0894, 0.268]       179
dtype: int64


COLUMN: brand_id

(-0.429, -0.143]     1493
(-0.143, 0.143]     18435
(0.143, 0.429]       1492
dtype: int64




#### CONCLUSIONS:

> TODO: