In [1]:
import warnings
# Silence every warning raised by subsequently executed code: no category or
# message filter is given, so library deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
!pip list

Package               Version
--------------------- -----------
altair                4.2.0
argon2-cffi           21.3.0
argon2-cffi-bindings  21.2.0
asttokens             2.0.8
attrs                 22.1.0
backcall              0.2.0
beautifulsoup4        4.11.1
bleach                5.0.1
bokeh                 2.4.3
certifi               2022.6.15
cffi                  1.15.1
charset-normalizer    2.1.1
click                 8.1.3
colorama              0.4.5
colorcet              3.0.0
cycler                0.11.0
dataclasses           0.6
debugpy               1.6.3
decorator             5.1.1
defusedxml            0.7.1
emoji                 2.0.0
entrypoints           0.4
evidently             0.1.56.dev0
executing             0.10.0
fastjsonschema        2.16.1
fonttools             4.37.0
fsspec                2022.7.1
greenlet              1.1.3
holoviews             1.14.9
htmlmin               0.1.12
hvplot                0.8.0
idna                  3.3
ImageHash             


[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Libraries

In [3]:
import os
import json
import pandas as pd
import pandasql as ps

In [4]:
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query ``q`` through pandasql's ``sqldf``.

    The notebook's global namespace is passed as the lookup environment, so
    DataFrames defined in any cell can be referenced by name in the query.
    ``globals()`` is evaluated on every call, which means frames created
    after this definition are visible too.

    Parameters
    ----------
    q : str
        SQL query text.

    Returns
    -------
    Whatever ``sqldf`` returns for the query.
    """
    return sqldf(q, globals())

In [5]:
# Directory names; they are resolved against the current working directory
# when the absolute paths are built below.
INPUT_PATH = "input"
OUTPUT_PATH = "output"
CONFIG_PATH = "config"

# Base file names (without extension).
INPUT_DATA = "test_data2"
INPUT_DATA_CONFIG = "input_config"
OUTPUT_DATA = "output"

# File extensions.
INPUT_EXTENSION = "csv"
INPUT_CONFIG_EXTENSION = "json"
OUTPUT_EXTENSION = "csv"

# Full file names: "<base>.<extension>".
INPUT_FILE = f"{INPUT_DATA}.{INPUT_EXTENSION}"
INPUT_CONFIG_FILE = f"{INPUT_DATA_CONFIG}.{INPUT_CONFIG_EXTENSION}"
OUTPUT_FILE = f"{OUTPUT_DATA}.{OUTPUT_EXTENSION}"

# Absolute paths to the input data, its JSON configuration and the output file.
INPUT_ABS_PATH = os.path.abspath(os.path.join(INPUT_PATH, INPUT_FILE))
INPUT_FILE_CONFIG = os.path.abspath(os.path.join(CONFIG_PATH, INPUT_CONFIG_FILE))
OUTPUT_ABS_PATH = os.path.abspath(os.path.join(OUTPUT_PATH, OUTPUT_FILE))

# Backward-compatible aliases for the original, misspelled names ("APTH").
# INPUT_ABS_APTH is what the data-loading cell passes to pd.read_csv.
INPUT_ABS_APTH = INPUT_ABS_PATH
OUTPUT_ABS_APTH = OUTPUT_ABS_PATH

In [6]:
# Parse the JSON configuration file into the CONFIG dictionary.
with open(INPUT_FILE_CONFIG, encoding='utf-8') as f:
    CONFIG = json.loads(f.read())

In [7]:
# Display the parsed configuration for inspection.
CONFIG

{'INPUTS': {'FILE_NAME': ['test_data2.csv'],
  'SEPARATOR': ',',
  'DECIMAL': None,
  'ENCODING': 'utf-8',
  'FLOAT_PRECISION': 'high',
  'INDEXES': ['period_end_date', 'translated_when'],
  'DATE_COLUMNS': ['period_end_date', 'translated_when'],
  'DTYPE': {'if_data_corrected': 'object',
   'prod_gr_id': 'object',
   'country_id_n': 'object',
   'delivery_type_id': 'object',
   'freq_id': 'object',
   'retailer_id': 'object',
   'brand_id': 'object',
   'predict_automatch': 'float',
   'class_acctual': 'float'},
  'CATEGORICAL_FEATURES': ['country_id_n',
   'prod_gr_id',
   'retailer_id',
   'brand_id',
   'delivery_type_id',
   'week_number'],
  'COLUMNS_WITH_NAN_VALUES': [None]},
 'MODEL': {'TARGET': 'class_acctual',
  'PREDICTION': 'predict_automatch',
  'DATETIME': 'translated_when'},
 'OUTPUTS': {'NAME': [None],
  'COLUMNS_TO_EXCLUDE': ['if_data_corrected', 'freq_id'],
  'BREAKING_POINT_DT': '2020-11-28 00:00:00+00:00'}}

# Load data

In [8]:
# Read the input CSV. Separator, encoding, date columns and per-column dtypes
# all come from the INPUTS section of CONFIG.
df = pd.read_csv(
    INPUT_ABS_APTH,
    # Parser behaviour
    engine="c",
    low_memory=False,
    skipinitialspace=True,
    # File format
    sep=CONFIG['INPUTS']['SEPARATOR'],
    encoding=CONFIG['INPUTS']['ENCODING'],
    # Column typing
    dtype=CONFIG['INPUTS']['DTYPE'],
    parse_dates=CONFIG['INPUTS']['DATE_COLUMNS'],
    infer_datetime_format=True,
)

In [9]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19697 entries, 0 to 19696
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   period_end_date    19640 non-null  datetime64[ns, UTC]
 1   translated_when    19697 non-null  datetime64[ns, UTC]
 2   if_data_corrected  19697 non-null  object             
 3   prod_gr_id         19697 non-null  object             
 4   country_id_n       18405 non-null  object             
 5   delivery_type_id   18362 non-null  object             
 6   freq_id            19697 non-null  object             
 7   retailer_id        19697 non-null  object             
 8   brand_id           19697 non-null  object             
 9   predict_automatch  19368 non-null  float64            
 10  class_acctual      19697 non-null  float64            
dtypes: datetime64[ns, UTC](2), float64(2), object(7)
memory usage: 1.7+ MB


In [10]:
# Order rows chronologically by translation timestamp. The result is
# reassigned instead of using inplace=True, so the step reads as an explicit
# transformation and stays chainable.
df = df.sort_values(by=['translated_when'])

In [11]:
# Preview the first 10 rows of df.
df.head(10)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
1908,2020-08-30 00:00:00+00:00,2020-09-01 03:05:51+00:00,0,426,126,22866,1,174,280,1.0,1.0
1907,2020-08-30 00:00:00+00:00,2020-09-01 03:35:26+00:00,0,426,126,22866,2,93,96,1.0,1.0
4503,2020-08-30 00:00:00+00:00,2020-09-01 03:44:29+00:00,0,413,113,12731,2,95,182,1.0,1.0
8009,2020-09-01 00:00:00+00:00,2020-09-01 06:15:32+00:00,1,426,114,18614,1,173,279,1.0,1.0
5091,2020-09-01 00:00:00+00:00,2020-09-01 06:15:50+00:00,1,426,114,18614,1,34,234,1.0,1.0
8010,2020-09-01 00:00:00+00:00,2020-09-01 06:21:31+00:00,1,426,114,18614,2,93,96,1.0,1.0
9945,2020-09-01 00:00:00+00:00,2020-09-01 06:24:04+00:00,1,426,114,18614,1,117,243,1.0,1.0
9944,2020-09-01 00:00:00+00:00,2020-09-01 06:24:13+00:00,1,426,114,18614,2,11,33,1.0,1.0
10999,2020-09-01 00:00:00+00:00,2020-09-01 06:25:53+00:00,1,426,114,18614,1,26,169,0.0,0.0
9947,2020-09-01 00:00:00+00:00,2020-09-01 06:26:45+00:00,1,426,114,18614,1,33,379,0.0,0.0


In [12]:
# Summary statistics for every column; datetime columns are summarised
# numerically (mean, min, quantiles, max).
df.describe(include='all', datetime_is_numeric=True)

Unnamed: 0,period_end_date,translated_when,if_data_corrected,prod_gr_id,country_id_n,delivery_type_id,freq_id,retailer_id,brand_id,predict_automatch,class_acctual
count,19640,19697,19697.0,19697.0,18405.0,18362.0,19697.0,19697.0,19697.0,19368.0,19697.0
unique,,,2.0,3.0,34.0,914.0,2.0,52.0,199.0,,
top,,,0.0,426.0,121.0,31480.0,2.0,30.0,33.0,,
freq,,,17085.0,11844.0,4153.0,998.0,11934.0,1197.0,587.0,,
mean,2020-10-14 23:30:22.729124096+00:00,2020-10-22 06:01:21.525206784+00:00,,,,,,,,0.636307,0.74321
min,2020-08-30 00:00:00+00:00,2020-09-01 03:05:51+00:00,,,,,,,,0.0,0.0
25%,2020-09-27 00:00:00+00:00,2020-09-29 13:45:29+00:00,,,,,,,,0.0,0.0
50%,2020-10-18 00:00:00+00:00,2020-10-22 04:30:49+00:00,,,,,,,,1.0,1.0
75%,2020-11-08 00:00:00+00:00,2020-11-13 11:57:53+00:00,,,,,,,,1.0,1.0
max,2020-12-01 00:00:00+00:00,2021-02-01 14:50:49+00:00,,,,,,,,1.0,1.0


**An example of model's underperformance:**

country_id_n == '106'

./docs/images/monthly_stability/class_acctual/country_id_n/CLASS_106_monthly_stability_grouped.jpg

Observed:

![image info](./docs/images/monthly_stability/class_acctual/country_id_n/CLASS_106_monthly_stability_grouped.jpg)


./docs/images/monthly_stability/class_acctual/country_id_n/CLASS_106_monthly_stability_grouped.jpg

Predicted:

![image info](./docs/images/monthly_stability/class_acctual/country_id_n/CLASS_106_monthly_stability_grouped.jpg)

In [13]:
# Subset of df holding only the columns referenced by the SQL queries on df_sub.
df_sub = df[
    [
        'period_end_date',
        'translated_when',
        'country_id_n',
        'prod_gr_id',
        'predict_automatch',
        'class_acctual',
    ]
]

In [14]:
# Predicted values for country_id_n '106': sum of predict_automatch per month
# of period_end_date, with one row per distinct predict_automatch value.
pysqldf(
    """
    SELECT
        'PREDICTED' AS type,
        strftime('%Y-%m', period_end_date) AS month_year
        ,country_id_n
        ,predict_automatch
        --,class_acctual
        ,SUM(predict_automatch) AS predict_automatch_sum
        --,SUM(class_acctual) AS class_acctual_sum
    FROM df_sub
    WHERE
        country_id_n = '106'
        --AND month_year = '2020-11'
    GROUP BY
        month_year
        ,country_id_n
        ,predict_automatch
        --,class_acctual
    ;
    """
)

Unnamed: 0,type,month_year,country_id_n,predict_automatch,predict_automatch_sum
0,PREDICTED,2020-08,106,0.0,0.0
1,PREDICTED,2020-08,106,1.0,1.0
2,PREDICTED,2020-09,106,0.0,0.0
3,PREDICTED,2020-09,106,1.0,115.0
4,PREDICTED,2020-10,106,,
5,PREDICTED,2020-10,106,0.0,0.0
6,PREDICTED,2020-10,106,1.0,108.0
7,PREDICTED,2020-11,106,0.0,0.0
8,PREDICTED,2020-11,106,1.0,89.0


In [15]:
# Observed values for country_id_n '106': sum of class_acctual per month of
# period_end_date, with one row per distinct class_acctual value.
pysqldf(
    """
    SELECT
        'OBSERVED' AS type,
        strftime('%Y-%m', period_end_date) AS month_year
        ,country_id_n
        --,predict_automatch
        ,class_acctual
        --,SUM(predict_automatch) AS predict_automatch_sum
        ,SUM(class_acctual) AS class_acctual_sum
    FROM df_sub
    WHERE
        country_id_n = '106'
        --AND month_year = '2020-11'
    GROUP BY
        month_year
        ,country_id_n
        --,predict_automatch
        ,class_acctual
    ;
    """
)

Unnamed: 0,type,month_year,country_id_n,class_acctual,class_acctual_sum
0,OBSERVED,2020-08,106,0.0,0.0
1,OBSERVED,2020-08,106,1.0,2.0
2,OBSERVED,2020-09,106,0.0,0.0
3,OBSERVED,2020-09,106,1.0,189.0
4,OBSERVED,2020-10,106,0.0,0.0
5,OBSERVED,2020-10,106,1.0,166.0
6,OBSERVED,2020-11,106,0.0,0.0
7,OBSERVED,2020-11,106,1.0,171.0


In [16]:
# Observed totals over all rows of df_sub: sum of class_acctual per month of
# period_end_date (no country filter).
pysqldf(
    """
    SELECT
        'OBSERVED' AS type,
        strftime('%Y-%m', period_end_date) AS month_year
        --,country_id_n
        --,predict_automatch
        --,class_acctual
        --,SUM(predict_automatch) AS predict_automatch_sum
        ,SUM(class_acctual) AS class_acctual_sum
    FROM df_sub
    --WHERE
        --country_id_n = '106'
        --AND month_year = '2020-11'
    GROUP BY
        month_year
        --,country_id_n
        --,predict_automatch
        --,class_acctual
    ;
    """
)

Unnamed: 0,type,month_year,class_acctual_sum
0,OBSERVED,,47.0
1,OBSERVED,2020-08,240.0
2,OBSERVED,2020-09,4306.0
3,OBSERVED,2020-10,4675.0
4,OBSERVED,2020-11,5369.0
5,OBSERVED,2020-12,2.0


In [17]:
# Predicted totals over all rows of df_sub: sum of predict_automatch per month
# of period_end_date (no country filter).
pysqldf(
    """
    SELECT
        'PREDICTED' AS type,
        strftime('%Y-%m', period_end_date) AS month_year
        --,country_id_n
        --,predict_automatch
        --,class_acctual
        ,SUM(predict_automatch) AS predict_automatch_sum
        --,SUM(class_acctual) AS class_acctual_sum
    FROM df_sub
    --WHERE
        --country_id_n = '106'
        --AND month_year = '2020-11'
    GROUP BY
        month_year
        --,country_id_n
        --,predict_automatch
        --,class_acctual
    ;
    """
)

Unnamed: 0,type,month_year,predict_automatch_sum
0,PREDICTED,,40.0
1,PREDICTED,2020-08,209.0
2,PREDICTED,2020-09,3612.0
3,PREDICTED,2020-10,3974.0
4,PREDICTED,2020-11,4488.0
5,PREDICTED,2020-12,1.0


In [18]:
# Observed totals over all rows of df_sub, bucketed by the month of
# translated_when: sum of class_acctual per month.
pysqldf(
    """
    SELECT
        'OBSERVED' AS type,
        strftime('%Y-%m', translated_when) AS month_year_translated
        --,country_id_n
        --,predict_automatch
        --,class_acctual
        --,SUM(predict_automatch) AS predict_automatch_sum
        ,SUM(class_acctual) AS class_acctual_sum
    FROM df_sub
    --WHERE
        --country_id_n = '106'
        --AND month_year = '2020-11'
    GROUP BY
        month_year_translated
        --,country_id_n
        --,predict_automatch
        --,class_acctual
    ;
    """
)

Unnamed: 0,type,month_year_translated,class_acctual_sum
0,OBSERVED,2020-09,3800.0
1,OBSERVED,2020-10,4823.0
2,OBSERVED,2020-11,5320.0
3,OBSERVED,2020-12,583.0
4,OBSERVED,2021-01,112.0
5,OBSERVED,2021-02,1.0


# Something fishy

# Did demand or supply drop? Did a product go out of fashion?

In [19]:
# Number of non-null prod_gr_id rows per product group, restricted to rows
# whose translated_when year is before 2021.
# GROUP BY already yields one row per prod_gr_id, so neither DISTINCT nor a
# wrapping subquery is needed; the result and its column names are unchanged.
pysqldf(
    """
    SELECT
        prod_gr_id
        ,COUNT(prod_gr_id)
    FROM df_sub
    WHERE
        strftime('%Y', translated_when) < '2021'
    GROUP BY
        prod_gr_id
    ;
    """
)

Unnamed: 0,prod_gr_id,COUNT(prod_gr_id)
0,413,4482
1,426,11701
2,427,3362


In [20]:
# Number of non-null prod_gr_id rows per product group, restricted to rows
# whose translated_when year is 2021 or later.
# GROUP BY already yields one row per prod_gr_id, so neither DISTINCT nor a
# wrapping subquery is needed; the result and its column names are unchanged.
pysqldf(
    """
    SELECT
        prod_gr_id
        ,COUNT(prod_gr_id)
    FROM df_sub
    WHERE
        strftime('%Y', translated_when) >= '2021'
    GROUP BY
        prod_gr_id
    ;
    """
)

Unnamed: 0,prod_gr_id,COUNT(prod_gr_id)
0,413,4
1,426,143
2,427,5


In [21]:
# pysqldf(
#     """
#     SELECT
#         strftime('%Y-%m', period_end_date) AS month_year
#         ,country_id_n
#         ,predict_automatch
#         ,class_acctual
#         --,SUM(predict_automatch) AS predict_automatch_sum
#         --,SUM(class_acctual) AS class_acctual_sum
#     FROM df_sub
#     WHERE
#         country_id_n = '105'
#         AND month_year = '2020-11'
#     --GROUP BY
#         --month_year
#         --,country_id_n
#         --,predict_automatch
#         --,class_acctual
#     ;
#     """
# )