In [71]:
import pickle
import pandas as pd
from io import BytesIO
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col,lit
from pyspark.sql import functions as F,Window
from pyspark.sql.types import BooleanType, IntegerType, StringType
from pyspark.sql.types import *
from pyspark import SparkContext, SparkConf
from functools import reduce

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.svm import LinearSVC, LinearSVR
from statsmodels.regression.linear_model import OLS

from statsmodels.tools.tools import add_constant
from sklearn.metrics import classification_report, cohen_kappa_score
import scipy.stats as stats

from joblib import dump, load

import numpy as np
import os

from tqdm import tqdm
import traceback

from pandas.api.types import is_string_dtype, is_numeric_dtype

from typing import Any
import json

In [2]:
# Code used by the path cell below to pick the data directory and to name the run log.
so_code = "BC"
# Part number of the current run; it is embedded in the run-log file name.
current_part = 1

In [3]:
# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate() 
# Display the session so the cell output shows what we are connected to.
spark

In [80]:
# Locations of the pickled inputs for the current `so_code`.
# NOTE(review): absolute, machine-specific Windows paths -- consider making the
# data and output directories configurable.
root_path = "D:\\Projects\\JTI\\Romania\\data\\"
pickle_dir = root_path + so_code + "\\pickles\\"

# NOTE(review): the file names embed "BC" literally while the directory follows
# `so_code`; confirm whether the file names should follow `so_code` as well.
visit_plan_path = pickle_dir + "visit_plan_BC.pkl"
shipment_data_path = pickle_dir + "shipments_BC.pkl"
date_analysis_data_path = pickle_dir + "bank_holidays_BC.pkl"
shipment_split_data_path = pickle_dir + "shipment_split_vp_BC.pkl"
sr_loading_data_path = pickle_dir + "sr_loading_BC.pkl"
sr_unloading_data_path = pickle_dir + "sr_unloading_BC.pkl"
stock_collection_data_path = pickle_dir + "stock_collection_BC.pkl"
credit_requests_data_path = pickle_dir + "credit_requests_BC.pkl"
pre_easter_effect_data_path = pickle_dir + "pre_easter_week_data.pkl"

# First day of the test period; predictions start on the same date.
test_period_start_date = pd.Timestamp(year=2018, month=1, day=1)
pred_start_date = test_period_start_date

# Name of the target column.
y_col = 'shipments'

# `so_code` is deliberately NOT re-assigned here: the paths above were already
# built from it, and resetting it mid-cell would make the run-log name disagree
# with the data directory whenever another code is selected.
output_path = "D:\\Romania\\"
run_log_path = os.path.join(output_path,
                            'run_log_{}_{}.txt'.format(so_code, current_part))

remove_first_month_from_training = False

# Load Data

In [5]:
# Columns kept from the shipments pickle (see load_data).
take_shipment_data_cols = ['invoice_date', 'agent_code', 'product_code',
                           'quantity', 'pos_code', 'promo_id', 'return']

# Columns kept for the visit plan on/after the prediction start date; the
# first three of these become its index in load_data.
take_future_visit_plan_data_cols = ['visit_date', 'agent_code',
                                    'pos_code', 'visit_order_day']

# Columns kept for the per-POS visit plan, which load_data indexes by pos_code.
take_pos_visit_plan_data_cols = ['visit_date', 'pos_code',
                                 'pre_nonreplacement_holiday',
                                 'double_sell', 'triple_sell']

# Columns kept from the date-analysis pickle after it is indexed by 'Date'.
take_holidays_data_cols = ['DayNumberOfWeek', 'VisitPlanWeek', 'IsWorkingDate',
                           'NationalDoubleSell', 'NationalTripleSell',
                           'PricelistChgAnn', 'PricelistChg']

# Columns kept from the credit-requests pickle before it is flattened to one
# row per day.
take_credit_req_data_cols = ['PlannedStartDate', 'PlannedClosedDate',
                             'ReqType', 'IncrCoef', 'AgentCode']

# Columns that extract_ts forward-/back-fills after re-indexing a POS/product
# series onto the visit-plan dates.
preprocess_constant_cols = ['product_code', 'pos_code', 'product_cat']

In [6]:
# Reconcile agent replacements, holiday adjustments in visit plan data
def __visit_plan_reconciliation(visit_plan_data: pd.DataFrame,
                                weekday_holiday_dates: list) -> pd.DataFrame:
    """Add holiday/sell markers to the visit plan and drop non-visits.

    The plan is sorted by ``visit_date`` and, per ``pos_code``, each row gets:

    * ``weekday_holiday`` -- the visit date is in ``weekday_holiday_dates``;
    * ``pre_nonreplacement_holiday`` -- the next planned visit of the same POS
      has type ``'Holiday without replacement'``;
    * ``triple_sell`` -- the next two planned visits of the same POS both fall
      on weekday holidays;
    * ``double_sell`` -- the next planned visit falls on a weekday holiday and
      the row is not a ``triple_sell`` row.

    Rows of type ``'Holiday without replacement'`` and rows on weekday
    holidays are then removed, and ``'Replacement'`` rows take their
    ``agent_code`` from ``backupsalesagent_code`` (cast to ``int``).

    :param visit_plan_data: visit plan with at least ``visit_date``,
        ``pos_code``, ``type``, ``agent_code`` and ``backupsalesagent_code``.
    :param weekday_holiday_dates: dates treated as weekday holidays.
    :return: a new frame; ``visit_plan_data`` itself is not modified.
    """
    visit_plan_df: pd.DataFrame = visit_plan_data.reset_index()
    visit_plan_df = visit_plan_df.sort_values(['visit_date'])

    def _raise_flag(column: str, mask: pd.Series) -> None:
        # Store an explicit boolean column. If the input already carries the
        # column, values that are already truthy are kept.
        if column in visit_plan_df.columns:
            existing = visit_plan_df[column]
            mask = mask | (existing.notna() & existing.astype(bool))
        visit_plan_df[column] = mask.astype(bool)

    _raise_flag('weekday_holiday',
                visit_plan_df['visit_date'].isin(weekday_holiday_dates))

    # Look ahead within each POS. Shifting an integer view with an explicit
    # fill value keeps the result free of NaN, so no fillna/downcast is needed.
    pos_key = visit_plan_df['pos_code']
    holiday_as_int = visit_plan_df['weekday_holiday'].astype(int)
    next_visit_type = visit_plan_df['type'].groupby(pos_key).shift(-1)
    next_is_holiday = holiday_as_int.groupby(pos_key).shift(
        -1, fill_value=0).astype(bool)
    second_next_is_holiday = holiday_as_int.groupby(pos_key).shift(
        -2, fill_value=0).astype(bool)

    _raise_flag('pre_nonreplacement_holiday',
                next_visit_type == 'Holiday without replacement')
    triple_sell = next_is_holiday & second_next_is_holiday
    # A triple-sell visit is not additionally counted as a double-sell visit.
    visit_plan_df['double_sell'] = next_is_holiday & ~triple_sell
    visit_plan_df['triple_sell'] = triple_sell

    # Holiday without replacement means no agent actually went there
    visit_plan_df = visit_plan_df.loc[
        visit_plan_df['type'] != 'Holiday without replacement']
    # Take only non-weekday-holiday dates; copy so the assignment below writes
    # to an independent frame rather than to a slice.
    visit_plan_df = visit_plan_df.loc[~visit_plan_df['weekday_holiday']].copy()
    is_replacement = visit_plan_df['type'] == 'Replacement'
    visit_plan_df.loc[is_replacement, 'agent_code'] = \
        visit_plan_df.loc[is_replacement, 'backupsalesagent_code'].astype(int)

    return visit_plan_df


# Add derived columns as required to date_analysis_data
def __date_analysis_additions(date_analysis_data: pd.DataFrame) -> pd.DataFrame:
    """Derive the weekday-holiday flag and price-change day counters.

    Adds to the calendar frame:

    * ``WeekdayHoliday`` -- ``DayNumberOfWeek <= 5`` and ``IsWorkingDate == 0``;
    * ``daysSincePriceChgAnn`` -- position of the day inside its price-change
      period, counted from 0 at the period's first day, ``-1`` outside;
    * ``daysFromPriceChg`` -- the same position counted down to 0 at the
      period's last day, ``-1`` outside.

    A day belongs to a price-change period when the running count of
    ``PricelistChgAnn``/``PricelistChg`` events up to that day is odd, or when
    the day itself is a ``PricelistChg`` day.

    :param date_analysis_data: calendar indexed by date. It is modified in
        place (2018-12-07 correction plus helper columns), as before.
    :return: a new frame without ``PricelistChgAnn`` and the helper columns.
    """
    # manual correction according to seen data; skipped when the calendar
    # does not contain that date instead of raising KeyError
    correction_date = pd.Timestamp(day=7, month=12, year=2018)
    if correction_date in date_analysis_data.index:
        date_analysis_data.loc[[correction_date], 'IsWorkingDate'] = 0

    date_analysis_data['WeekdayHoliday'] = ((date_analysis_data['DayNumberOfWeek'] <= 5)
                                            & (date_analysis_data['IsWorkingDate'] == 0))

    is_change_day = date_analysis_data['PricelistChg'].astype(bool)
    is_event_day = date_analysis_data['PricelistChgAnn'].astype(bool) | is_change_day
    # Odd running count => between an event and the event that follows it.
    date_analysis_data['PriceChgPeriod'] = is_event_day.astype(int).cumsum()
    date_analysis_data['PriceChgEffect'] = date_analysis_data['PriceChgPeriod'] % 2 != 0
    date_analysis_data['PriceChgPeriod'] = (date_analysis_data['PriceChgEffect']
                                            | is_change_day)

    # Locate the runs of consecutive in-period days positionally. This replaces
    # the SparseSeries block index, which no longer exists in pandas >= 1.0,
    # and it also yields the counter columns when there is no run at all.
    in_period = date_analysis_data['PriceChgPeriod'].values.astype(bool)
    row_count = len(in_period)
    days_since_start = np.full(row_count, -1, dtype=int)
    days_until_end = np.full(row_count, -1, dtype=int)
    edges = np.diff(np.concatenate(([0], in_period.astype(np.int8), [0])))
    run_starts = np.flatnonzero(edges == 1)
    run_stops = np.flatnonzero(edges == -1)  # exclusive end positions
    for start, stop in zip(run_starts, run_stops):
        offsets = np.arange(stop - start)
        days_since_start[start:stop] = offsets
        days_until_end[start:stop] = offsets[::-1]
    date_analysis_data['daysSincePriceChgAnn'] = days_since_start
    date_analysis_data['daysFromPriceChg'] = days_until_end

    return date_analysis_data.drop(columns=['PricelistChgAnn',
                                            'PriceChgPeriod',
                                            'PriceChgEffect'])


# prepare shipment data into consumable format
def __prep_consumable_shipments_data(shipment_data: pd.DataFrame,
                                     train_period_shipments: pd.DataFrame) -> pd.DataFrame:
    """Aggregate shipments per (invoice_date, pos_code, product_code).

    Rows with ``return == 0`` are aggregated (first ``agent_code``, summed
    ``quantity``, first ``promo_id``). Each aggregated row also gets:

    * ``return`` -- ``True`` when ``shipment_data`` has a ``return == 1`` row
      for the same (invoice_date, pos_code, product_code);
    * ``product_cat`` -- ``'small'`` for the lowest-contribution products of
      the training period, ``'large'`` otherwise.

    :param shipment_data: shipment rows with ``invoice_date``, ``pos_code``,
        ``product_code``, ``agent_code``, ``quantity``, ``promo_id``, ``return``.
    :param train_period_shipments: rows used to rank products by contribution.
    :return: aggregated records indexed by ``invoice_date``, sorted by index.
    """
    record_keys = ['invoice_date', 'pos_code', 'product_code']

    # Share of each product in the mean daily shipped quantity (training rows).
    daily_product_qty: pd.DataFrame = train_period_shipments.groupby(
        ['invoice_date', 'product_code']).agg({'quantity': 'sum'})
    product_share = daily_product_qty.reset_index().groupby(
        ['product_code']).agg({'quantity': 'mean'})
    product_share['contribution%'] = (product_share['quantity']
                                      / product_share['quantity'].sum()) * 100
    product_share = product_share.sort_values(['contribution%'])
    product_share['contribution%_cumsum'] = product_share['contribution%'].cumsum()
    # take products which make up 5% or a little more than 5% of the total quantity
    # (the +1 includes the product that crosses the 5% line)
    num_small_products = int((product_share['contribution%_cumsum'] < 5).sum())
    small_products = product_share.index[:num_small_products + 1]

    shipment_records: pd.DataFrame = (
        shipment_data.loc[shipment_data['return'] == 0]
        .groupby(record_keys)
        .agg({'agent_code': 'first', 'quantity': 'sum', 'promo_id': 'first'}))

    # One flag per key that has at least one return row.
    return_flags: pd.Series = (
        shipment_data.loc[shipment_data['return'] == 1]
        .groupby(record_keys)['return'].first().astype('bool'))
    # Align on the shared key index; keys without a return row become False.
    # The explicit cast keeps the column boolean without relying on fillna
    # downcasting.
    shipment_records['return'] = return_flags.reindex(
        shipment_records.index, fill_value=False).astype(bool)

    shipment_records = shipment_records.reset_index()
    shipment_records['product_cat'] = np.where(
        shipment_records['product_code'].isin(small_products), 'small', 'large')
    return shipment_records.set_index('invoice_date').sort_index()


def __prep_credit_req_data(credit_req_data: pd.DataFrame,
                           pred_start_date):
    """Expand credit requests to one row per covered day.

    Every request contributes one row for each day from ``PlannedStartDate``
    to ``PlannedClosedDate`` (both inclusive); exact duplicate rows are then
    dropped.

    :param credit_req_data: requests with ``PlannedStartDate``,
        ``PlannedClosedDate``, ``ReqType``, ``IncrCoef`` and ``AgentCode``.
    :param pred_start_date: not used by this function; kept for callers.
    :return: frame with columns ``visit_date``, ``request_type``,
        ``increment_coeff`` and ``agent_code`` (integer-formatted string).
        An empty request table gives an empty frame with those columns.
    """
    output_columns = ['visit_date', 'request_type', 'increment_coeff', 'agent_code']

    per_request_days = [
        pd.DataFrame({'request_type': request_type,
                      'increment_coeff': increment_coeff,
                      'agent_code': agent_code},
                     index=pd.date_range(start_date, end_date))
        for start_date, end_date, request_type, increment_coeff, agent_code in zip(
            credit_req_data['PlannedStartDate'],
            credit_req_data['PlannedClosedDate'],
            credit_req_data['ReqType'],
            credit_req_data['IncrCoef'],
            credit_req_data['AgentCode'])
    ]
    if not per_request_days:
        # pd.concat refuses an empty list, so return the empty result directly.
        return pd.DataFrame(columns=output_columns)

    processed_credit_req_data = pd.concat(per_request_days)
    # Go through int first so a float code such as 12.0 is written as '12'.
    processed_credit_req_data['agent_code'] = processed_credit_req_data[
        'agent_code'].astype('int').astype('str')
    processed_credit_req_data.index.name = 'visit_date'
    processed_credit_req_data = processed_credit_req_data.reset_index()

    return processed_credit_req_data.drop_duplicates().reset_index(drop=True)

In [7]:
def _read_agent_product_pickle(pickle_path: str) -> pd.DataFrame:
    """Read a pickle and index it by (visit_date, agent_code, product_code).

    ``agent_code`` is cast to ``str`` before it becomes an index level.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    frame: pd.DataFrame = pd.read_pickle(pickle_path)
    frame['agent_code'] = frame['agent_code'].astype(str)
    return frame.set_index(['visit_date', 'agent_code', 'product_code'])


def load_data(visit_plan_path: str,
              shipment_data_path: str,
              date_analysis_data_path: str,
              shipment_split_data_path: str,
              sr_loading_data_path: str,
              sr_unloading_data_path: str,
              stock_collection_data_path: str,
              credit_requests_data_path: str,
              pre_easter_effect_data_path: str,
              pred_start_date: pd.Timestamp) -> tuple:
    """Read every input pickle and return the prepared frames.

    All ``*_path`` arguments are pickle files read with ``pd.read_pickle``.

    :param pred_start_date: visits on or after this date form the future
        visit plan; it is also passed on to the credit-request preparation.
    :return: an 11-tuple, in this order: date-analysis data, shipment records,
        visit plan, future visit plan (indexed by visit_date, agent_code,
        pos_code), per-POS visit plan (indexed by pos_code), shipment split
        data, SR loading data, SR unloading data, stock collection data,
        credit requests (one row per day), pre-Easter effect data.
    """
    # NOTE(review): pd.read_pickle can execute arbitrary code -- only point
    # these paths at trusted files.
    # NOTE(review): this cutoff is hard-coded rather than derived from
    # pred_start_date; confirm whether the two are meant to stay in sync.
    history_cutoff = pd.Timestamp(day=1, month=1, year=2018)

    shipment_data: pd.DataFrame = pd.read_pickle(shipment_data_path)
    # Copy so the casts below write to an independent frame, not to a slice.
    shipment_data = shipment_data[take_shipment_data_cols].copy()
    shipment_data['agent_code'] = shipment_data['agent_code'].astype(str)
    shipment_data['pos_code'] = shipment_data['pos_code'].astype(str)
    # Keep only products that still ship on or after the cutoff.
    products_to_consider = shipment_data.loc[
        shipment_data['invoice_date'] >= history_cutoff, 'product_code'].unique()
    shipment_data = shipment_data.loc[
        shipment_data['product_code'].isin(products_to_consider)]
    shipment_records = __prep_consumable_shipments_data(
        shipment_data,
        shipment_data[shipment_data['invoice_date'] < history_cutoff])
    print('Completed loading shipment data.')

    date_analysis_data: pd.DataFrame = pd.read_pickle(date_analysis_data_path)
    date_analysis_data = date_analysis_data.set_index('Date')
    # Copy: the helper below adds columns to the frame it is given.
    date_analysis_data = date_analysis_data[take_holidays_data_cols].copy()
    date_analysis_data = __date_analysis_additions(date_analysis_data)
    print('Completed loading different date features data.')

    visit_plan_data: pd.DataFrame = pd.read_pickle(visit_plan_path)
    weekday_holiday_dates = date_analysis_data[date_analysis_data['WeekdayHoliday']].index.tolist()
    visit_plan_data = __visit_plan_reconciliation(visit_plan_data, weekday_holiday_dates)
    visit_plan_data['agent_code'] = visit_plan_data['agent_code'].astype(str)
    visit_plan_data['pos_code'] = visit_plan_data['pos_code'].astype(str)
    # remove any weekend dates, if any, from visit plan
    working_dates = date_analysis_data[date_analysis_data['IsWorkingDate'] == 1].index
    visit_plan_data = visit_plan_data[visit_plan_data['visit_date'].isin(working_dates)]

    future_visit_plan_data = (
        visit_plan_data.loc[visit_plan_data['visit_date'] >= pred_start_date,
                            take_future_visit_plan_data_cols]
        .set_index(['visit_date', 'agent_code', 'pos_code'])
        .sort_index())

    pos_visit_plan_data = visit_plan_data[take_pos_visit_plan_data_cols].set_index(['pos_code'])
    print('Completed loading visit plan related data.')

    shipment_split_data = _read_agent_product_pickle(shipment_split_data_path)
    print('Completed loading ER, non-ER shipment split data.')

    sr_loading_data = _read_agent_product_pickle(sr_loading_data_path)
    print('Completed loading SR loading data.')

    sr_unloading_data = _read_agent_product_pickle(sr_unloading_data_path)
    print('Completed loading SR unloading data.')

    stock_collection_data: pd.DataFrame = pd.read_pickle(stock_collection_data_path)
    stock_collection_data = stock_collection_data.rename(
        columns={'stock_date': 'invoice_date'})
    stock_collection_data['pos_code'] = stock_collection_data['pos_code'].astype(str)
    print('Completed loading stock collection data.')

    credit_requests_data: pd.DataFrame = pd.read_pickle(credit_requests_data_path)
    credit_requests_data = credit_requests_data[take_credit_req_data_cols]
    credit_requests_data = __prep_credit_req_data(credit_requests_data, pred_start_date)
    print('Completed loading credit requests data.')

    pre_easter_effect_data: pd.DataFrame = pd.read_pickle(pre_easter_effect_data_path)
    pre_easter_effect_data = pre_easter_effect_data.set_index('dates')
    print('Completed loading pre easter effect data.')

    return date_analysis_data, shipment_records, visit_plan_data, \
       future_visit_plan_data, pos_visit_plan_data, shipment_split_data, \
       sr_loading_data, sr_unloading_data, stock_collection_data, \
       credit_requests_data, pre_easter_effect_data

In [8]:
# Load every input frame once; the names on the left follow the order of the
# tuple returned by load_data.
(bank_holidays_df, shipment_data_df, visit_plan_df, future_visit_plan_df,
 pos_visit_plan_df, shipment_split_df, sr_loading_df, sr_unloading_df,
 stock_collection_df, credit_requests_df, pre_easter_effect_df) = load_data(
    visit_plan_path=visit_plan_path,
    shipment_data_path=shipment_data_path,
    date_analysis_data_path=date_analysis_data_path,
    shipment_split_data_path=shipment_split_data_path,
    sr_loading_data_path=sr_loading_data_path,
    sr_unloading_data_path=sr_unloading_data_path,
    stock_collection_data_path=stock_collection_data_path,
    credit_requests_data_path=credit_requests_data_path,
    pre_easter_effect_data_path=pre_easter_effect_data_path,
    pred_start_date=test_period_start_date,
)

Completed loading shipment data.
Completed loading different date features data.
Completed loading visit plan related data.
Completed loading ER, non-ER shipment split data.
Completed loading SR loading data.
Completed loading SR unloading data.
Completed loading stock collection data.
Completed loading credit requests data.
Completed loading pre easter effect data.


In [9]:
# we need to take all poses visited by the agent on a particular date,
# irrespective of product. So we are kind of taking the union of pos sets from all
# products on a particular date for each agent.
shipment_temp: pd.DataFrame = (
    shipment_data_df
    .reset_index()
    .rename(columns={'invoice_date': 'visit_date'})
    .loc[:, ['visit_date', 'pos_code', 'agent_code']]
    .drop_duplicates()
)

# Attach the POS visited by the requesting agent on each request day. Rows with
# any missing value are dropped, and for every (visit_date, pos_code) only the
# row that sorts first by descending visit_date / increment_coeff is kept.
credit_requests_df['agent_code'] = credit_requests_df['agent_code'].astype(str)
credit_requests_pos_map: pd.DataFrame = (
    credit_requests_df
    .merge(shipment_temp, how='left', on=['visit_date', 'agent_code'])
    .dropna()
    .sort_values(['visit_date', 'increment_coeff'], ascending=False)
    .drop_duplicates(subset=['visit_date', 'pos_code'], keep='first')
)

In [10]:
# Get the running SparkContext (or create one); it is used below to broadcast the frames.
sc = SparkContext.getOrCreate()

In [11]:
# Display the SparkContext details.
sc

In [12]:
# Parse the stock dates into datetimes (the column is replaced in place).
stock_collection_df['invoice_date'] = pd.to_datetime(
    stock_collection_df['invoice_date'])

In [29]:

# (label, frame) pairs shared with extract_ts through the broadcast below.
# extract_ts reads them by POSITION (value[0][1] ... value[5][1]), so the
# order of this list must not change.
list_dataframes = [("shipment_data", shipment_data_df),
                   ("pos_visit_plan", pos_visit_plan_df),
                   ("bank_holidays", bank_holidays_df),
                   ("stock_collection", stock_collection_df),
                   ("credit_requests", credit_requests_pos_map),
                   ("pre_easter_effect", pre_easter_effect_df)]

In [30]:
# Broadcast the frame list; extract_ts reads it back through broadcasted_df.value.
broadcasted_df = sc.broadcast(list_dataframes)

In [31]:
# Distinct values of each index level of the future visit plan.
visit_dates, agent_codes, pos_codes = (
    future_visit_plan_df.reset_index()[level_name].unique()
    for level_name in ('visit_date', 'agent_code', 'pos_code')
)

# Product code of every shipment record (one entry per record, not de-duplicated).
product_codes = shipment_data_df['product_code']

In [32]:
# Product code that is paired with every POS when the task list is built below.
pro_code = "TBHBK"
# pos_code = '10068415'

In [33]:
# One (product_code, pos_code) task per POS of the future visit plan, all for
# the single product selected in `pro_code`.
extract_timeseries_tasks = [] # Extract timeseries tasks
for pos_code in pos_codes:
# for pro_code in product_codes[:3]:
    # Append in place: rebuilding the list with `+` would copy it on every
    # iteration and make the loop quadratic.
    extract_timeseries_tasks.append((pro_code, pos_code))

In [34]:
# extract_timeseries_tasks = []
# extract_timeseries_tasks = extract_timeseries_tasks + [(pro_code, pos_code)]

In [35]:
# Display the generated (product_code, pos_code) tasks.
extract_timeseries_tasks

[('TBHBK', '10067501'),
 ('TBHBK', '10067671'),
 ('TBHBK', '10067673'),
 ('TBHBK', '10067676'),
 ('TBHBK', '10067680'),
 ('TBHBK', '10067863'),
 ('TBHBK', '10068296'),
 ('TBHBK', '10068297'),
 ('TBHBK', '10068390'),
 ('TBHBK', '10068399'),
 ('TBHBK', '10068400'),
 ('TBHBK', '10068401'),
 ('TBHBK', '10068415'),
 ('TBHBK', '10068571'),
 ('TBHBK', '10068669'),
 ('TBHBK', '10068684'),
 ('TBHBK', '10068715'),
 ('TBHBK', '20761568'),
 ('TBHBK', '20919020'),
 ('TBHBK', '22347169'),
 ('TBHBK', '23199805'),
 ('TBHBK', '23555218'),
 ('TBHBK', '28008078'),
 ('TBHBK', '10030899'),
 ('TBHBK', '10067614'),
 ('TBHBK', '10067616'),
 ('TBHBK', '10067620'),
 ('TBHBK', '10067624'),
 ('TBHBK', '10067728'),
 ('TBHBK', '10068007'),
 ('TBHBK', '10068492'),
 ('TBHBK', '10068494'),
 ('TBHBK', '10068516'),
 ('TBHBK', '10068531'),
 ('TBHBK', '10068667'),
 ('TBHBK', '10068670'),
 ('TBHBK', '10068787'),
 ('TBHBK', '20069141'),
 ('TBHBK', '20347355'),
 ('TBHBK', '21942877'),
 ('TBHBK', '23441505'),
 ('TBHBK', '2350

In [36]:
def extract_ts(pro_code, pos_code):
    shipment_data_local = broadcasted_df.value[0][1].copy()
    pos_visit_plan_data = broadcasted_df.value[1][1].copy()
    inflated_demand_marker_df = broadcasted_df.value[2][1].copy()
    stock_collection_data = broadcasted_df.value[3][1].copy()
    credit_requests_data = broadcasted_df.value[4][1].copy()
    pre_easter_effect_data = broadcasted_df.value[5][1].copy()

    inflated_demand_marker_df.index = pd.to_datetime(inflated_demand_marker_df.index)

    # pos_visit_plan_data.index = pos_visit_plan_data.index.astype(int)
    pos_visit_plan_data.visit_date = pd.to_datetime(pos_visit_plan_data.visit_date)

    # stock_collection_data.invoice_date = pd.to_datetime(stock_collection_data.invoice_date)
    # pre_easter_effect_data.index = pd.to_datetime(pre_easter_effect_data.index)

    last_visit_date = pos_visit_plan_data[pos_visit_plan_data.index == pos_code].visit_date.max()

    pos_pro_df = shipment_data_local[(shipment_data_local.index <= pd.to_datetime(last_visit_date)) & 
                                         (shipment_data_local.pos_code == pos_code) & 
                                         (shipment_data_local.product_code == pro_code)]

    if len(pos_pro_df):
        pos_pro_df.pos_code = pos_pro_df.pos_code.astype(int)
        credit_requests_data.pos_code = credit_requests_data.pos_code.astype(int)

        current_pos_code = pos_code
        visit_plan_dates = pos_visit_plan_data[pos_visit_plan_data.index == current_pos_code]
        visit_plan_dates = visit_plan_dates.reset_index().set_index('visit_date')
        visit_plan_dates = visit_plan_dates[~visit_plan_dates.index.duplicated(keep='last')]
        visit_plan_dates['plan_date'] = True

        current_product_code = pro_code
        stock_df = stock_collection_data.loc[
            (stock_collection_data['pos_code'] == current_pos_code) &
            (stock_collection_data['product_code'] == current_product_code)]
        stock_df = stock_df.set_index('invoice_date')

        credit_requests_df = credit_requests_data.loc[
            credit_requests_data['pos_code'] == current_pos_code]
        credit_requests_df = credit_requests_df.set_index('visit_date')

        pos_pro_df.sort_index(inplace=True)

        pos_pro_df.rename(columns={'quantity': 'shipments'}, inplace=True)

        first_nonzero_sale_date = pos_pro_df.index[0]
        if len(pos_pro_df) < 2:
            second_nonzero_sale_date = None
        else:
            second_nonzero_sale_date = pos_pro_df.index[1]
        if len(pos_pro_df) < 3:
            third_nonzero_sale_date = None
        else:
            third_nonzero_sale_date = pos_pro_df.index[2]
        if len(pos_pro_df) < 4:
            fourth_nonzero_sale_date = None
        else:
            fourth_nonzero_sale_date = pos_pro_df.index[3]
        if len(pos_pro_df) < 5:
            fifth_nonzero_sale_date = None
        else:
            fifth_nonzero_sale_date = pos_pro_df.index[4]
        if len(pos_pro_df) < 6:
            sixth_nonzero_sale_date = None
        else:
            sixth_nonzero_sale_date = pos_pro_df.index[5]
        nonzero_shipment_dates: pd.Series = pd.Series(pos_pro_df.index)
        pos_pro_df.at[pos_pro_df.index[0], 'is_first_nonzero_sale_date'] = True
        pos_pro_df.at[pos_pro_df.index[-1], 'is_last_sale_date'] = True
        pos_pro_df['nonzero_Shipments_1'] = pos_pro_df['shipments'].shift(1)
        overflow_shipment_1 = pos_pro_df['shipments'].iloc[-1]
        pos_pro_df['nonzero_Shipments_2'] = pos_pro_df['nonzero_Shipments_1'].shift(1)
        overflow_shipment_2 = pos_pro_df['nonzero_Shipments_1'].iloc[-1]
        pos_pro_df['nonzero_Shipments_3'] = pos_pro_df['nonzero_Shipments_2'].shift(1)
        overflow_shipment_3 = pos_pro_df['nonzero_Shipments_2'].iloc[-1]
        pos_pro_df['nonzero_Shipments_4'] = pos_pro_df['nonzero_Shipments_3'].shift(1)
        overflow_shipment_4 = pos_pro_df['nonzero_Shipments_3'].iloc[-1]
        pos_pro_df['nonzero_Shipments_5'] = pos_pro_df['nonzero_Shipments_4'].shift(1)
        overflow_shipment_5 = pos_pro_df['nonzero_Shipments_4'].iloc[-1]
        pos_pro_df['nonzero_Shipments_6'] = pos_pro_df['nonzero_Shipments_5'].shift(1)
        overflow_shipment_6 = pos_pro_df['nonzero_Shipments_5'].iloc[-1]
        pos_pro_df['days_since_last_nonzero_sale'] = nonzero_shipment_dates.diff().dt.days.values
        pos_pro_df['days_since_last_nonzero_sale_1'] = (nonzero_shipment_dates.shift(1)
                                                        - nonzero_shipment_dates.shift(2)).dt.days.values
        overflow_days_since_last_nonzero_sale_1 = pos_pro_df['days_since_last_nonzero_sale'].iloc[-1]
        pos_pro_df['days_since_last_nonzero_sale_2'] = (nonzero_shipment_dates.shift(2)
                                                        - nonzero_shipment_dates.shift(3)).dt.days.values
        overflow_days_since_last_nonzero_sale_2 = pos_pro_df[
            'days_since_last_nonzero_sale'].shift(1).iloc[-1]
        pos_pro_df['days_since_last_nonzero_sale_3'] = (nonzero_shipment_dates.shift(3)
                                                        - nonzero_shipment_dates.shift(4)).dt.days.values
        overflow_days_since_last_nonzero_sale_3 = pos_pro_df[
            'days_since_last_nonzero_sale'].shift(2).iloc[-1]
        # print(pos_pro_df.head(10))

        new_index = visit_plan_dates.index.union(pos_pro_df.index)
        new_index = new_index.union([last_visit_date])
        pos_pro_df = pos_pro_df.reindex(new_index)

        global preprocess_constant_cols
        pos_pro_df.loc[:, preprocess_constant_cols] = \
            pos_pro_df.loc[:, preprocess_constant_cols].fillna(method='ffill')
        pos_pro_df.loc[:, preprocess_constant_cols] = \
            pos_pro_df.loc[:, preprocess_constant_cols].fillna(method='bfill')
        pos_pro_df.index.names = ['invoice_date']

        pos_pro_df['is_first_nonzero_sale_date'] = pos_pro_df['is_first_nonzero_sale_date'].fillna(False)
        pos_pro_df['is_last_sale_date'] = pos_pro_df['is_last_sale_date'].fillna(False)
        pos_pro_df['nonzero_Shipments_1'] = pos_pro_df['nonzero_Shipments_1'].fillna(method='bfill')
        pos_pro_df.loc[pos_pro_df.index <= first_nonzero_sale_date,
                       'nonzero_Shipments_1'] = 0
        pos_pro_df['nonzero_Shipments_1'] = pos_pro_df['nonzero_Shipments_1'].fillna(overflow_shipment_1)
        pos_pro_df.loc[pos_pro_df.index <= first_nonzero_sale_date,
                       'days_since_last_nonzero_sale'] = -1
        if second_nonzero_sale_date:
            pos_pro_df['nonzero_Shipments_2'] = pos_pro_df['nonzero_Shipments_2'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= second_nonzero_sale_date,
                           'nonzero_Shipments_2'] = 0
            pos_pro_df['nonzero_Shipments_2'] = pos_pro_df['nonzero_Shipments_2'].fillna(overflow_shipment_2)
            pos_pro_df['days_since_last_nonzero_sale_1'] = \
                pos_pro_df['days_since_last_nonzero_sale_1'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= second_nonzero_sale_date,
                           'days_since_last_nonzero_sale_1'] = -1
            pos_pro_df['days_since_last_nonzero_sale_1'] = \
                pos_pro_df['days_since_last_nonzero_sale_1'].fillna(overflow_days_since_last_nonzero_sale_1)
        else:
            pos_pro_df['nonzero_Shipments_2'] = 0
            pos_pro_df['days_since_last_nonzero_sale_1'] = -1
        if third_nonzero_sale_date:
            pos_pro_df['nonzero_Shipments_3'] = pos_pro_df['nonzero_Shipments_3'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= third_nonzero_sale_date,
                           'nonzero_Shipments_3'] = 0
            pos_pro_df['nonzero_Shipments_3'] = pos_pro_df['nonzero_Shipments_3'].fillna(overflow_shipment_3)
            pos_pro_df['days_since_last_nonzero_sale_2'] = \
                pos_pro_df['days_since_last_nonzero_sale_2'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= third_nonzero_sale_date,
                           'days_since_last_nonzero_sale_2'] = -1
            pos_pro_df['days_since_last_nonzero_sale_2'] = \
                pos_pro_df['days_since_last_nonzero_sale_2'].fillna(overflow_days_since_last_nonzero_sale_2)
        else:
            pos_pro_df['nonzero_Shipments_3'] = 0
            pos_pro_df['days_since_last_nonzero_sale_2'] = -1
        if fourth_nonzero_sale_date:
            pos_pro_df['nonzero_Shipments_4'] = pos_pro_df['nonzero_Shipments_4'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= fourth_nonzero_sale_date,
                           'nonzero_Shipments_4'] = 0
            pos_pro_df['nonzero_Shipments_4'] = pos_pro_df['nonzero_Shipments_4'].fillna(overflow_shipment_4)
            pos_pro_df['days_since_last_nonzero_sale_3'] = \
                pos_pro_df['days_since_last_nonzero_sale_3'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= fourth_nonzero_sale_date,
                           'days_since_last_nonzero_sale_3'] = -1
            pos_pro_df['days_since_last_nonzero_sale_3'] = \
                pos_pro_df['days_since_last_nonzero_sale_3'].fillna(overflow_days_since_last_nonzero_sale_3)
        else:
            pos_pro_df['nonzero_Shipments_4'] = 0
            pos_pro_df['days_since_last_nonzero_sale_3'] = -1
        if fifth_nonzero_sale_date:
            pos_pro_df['nonzero_Shipments_5'] = pos_pro_df['nonzero_Shipments_5'].fillna(method='bfill')
            pos_pro_df.loc[pos_pro_df.index <= fifth_nonzero_sale_date,
                           'nonzero_Shipments_5'] = 0
            pos_pro_df['nonzero_Shipments_5'] = pos_pro_df['nonzero_Shipments_5'].fillna(overflow_shipment_5)
        else:
            pos_pro_df['nonzero_Shipments_5'] = 0
        if sixth_nonzero_sale_date:
            pos_pro_df['nonzero_Shipments_6'] = pos_pro_df['nonzero_Shipments_6'].fillna(method='bfill')
            pos_pro_df['nonzero_Shipments_6'] = pos_pro_df['nonzero_Shipments_6'].fillna(overflow_shipment_6)
            pos_pro_df.loc[pos_pro_df.index <= sixth_nonzero_sale_date,
                           'nonzero_Shipments_6'] = 0
        else:
            pos_pro_df['nonzero_Shipments_6'] = 0
        pos_pro_df['return_1'] = pos_pro_df['return'].shift()
        pos_pro_df['return_1'] = pos_pro_df['return_1'].fillna(False)
        pos_pro_df['shipments'] = pos_pro_df['shipments'].fillna(0)
        pos_pro_df['promo_id'] = pos_pro_df['promo_id'].fillna(0).astype('int')

        pos_pro_df.at[pos_pro_df.index[0], 'is_first_plan_date'] = True
        pos_pro_df['is_first_plan_date'] = pos_pro_df['is_first_plan_date'].fillna(False)
        pos_pro_df['days_since_first_sale'] = (pos_pro_df.index
                                               - pos_pro_df.index[0]).days
        pos_pro_df['days_since_first_nonzero_sale'] = (pos_pro_df.index
                                                       - first_nonzero_sale_date).days
        pos_pro_df.loc[pos_pro_df.index < first_nonzero_sale_date,
                       'days_since_first_nonzero_sale'] = -1
        pos_pro_df['days_since_last_sale'] = pos_pro_df['days_since_first_sale'].diff().fillna(0)
        pos_pro_df['stock'] = stock_df['stock']
        pos_pro_df['stock'] = pos_pro_df['stock'].fillna(-999)
        pos_pro_df['week'] = pos_pro_df.index.week
        pos_pro_df['month'] = pos_pro_df.index.month
        pos_pro_df['is_odd_month'] = pos_pro_df['month'].apply(lambda x: x & 1)
        pos_pro_df['quarter'] = pos_pro_df.index.quarter
        pos_pro_df['year'] = pos_pro_df.index.year
        # This is the i-th year this pos is buying this product
        pos_pro_df['year_of_engagement'] = (pos_pro_df['year']
                                            - pos_pro_df['year'].min() + 1)
        pos_pro_df['VisitPlanWeek'] = inflated_demand_marker_df['VisitPlanWeek']
        pos_pro_df['DayNumberOfWeek'] = inflated_demand_marker_df['DayNumberOfWeek']
        pos_pro_df['IsWorkingDate'] = inflated_demand_marker_df['IsWorkingDate']
        pos_pro_df['weekday_holiday'] = inflated_demand_marker_df['WeekdayHoliday']
        pos_pro_df['days_since_price_chg_ann'] = inflated_demand_marker_df['daysSincePriceChgAnn']
        pos_pro_df['days_from_price_chg'] = inflated_demand_marker_df['daysFromPriceChg']
        pos_pro_df['PricelistChg'] = inflated_demand_marker_df['PricelistChg']
        pos_pro_df['double_sell'] = visit_plan_dates['double_sell']
        pos_pro_df['double_sell'] = pos_pro_df['double_sell'].fillna(False)
        pos_pro_df['triple_sell'] = visit_plan_dates['triple_sell']
        pos_pro_df['triple_sell'] = pos_pro_df['triple_sell'].fillna(False)
        pos_pro_df['isVisitPlan'] = visit_plan_dates['plan_date']
        pos_pro_df['isVisitPlan'] = pos_pro_df['isVisitPlan'].fillna(False)
        pos_pro_df['pre_nonreplacement_holiday'] = \
            visit_plan_dates['pre_nonreplacement_holiday']
        pos_pro_df['pre_nonreplacement_holiday'] = pos_pro_df['pre_nonreplacement_holiday'].fillna(False)
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']),
            'pre_nonreplacement_holiday'] = False
        pos_pro_df['credit_request_coeff'] = credit_requests_df['increment_coeff']
        pos_pro_df['credit_request_coeff'] = pos_pro_df['credit_request_coeff'].fillna(0)
        pos_pro_df['credit_request_type'] = credit_requests_df['request_type']
        pos_pro_df['credit_request_type'] = pos_pro_df['credit_request_type'].fillna('nill')
        pos_pro_df['days_from_easter'] = pre_easter_effect_data['days_from_easter']
        pos_pro_df['days_from_easter'] = pos_pro_df['days_from_easter'].fillna(-1)
        pos_pro_df['isMissedPlan'] = ((pos_pro_df['stock'] == -999)
                                      & (pos_pro_df['shipments'] == 0)
                                      & pos_pro_df['isVisitPlan'])
        pos_pro_df['is_zero_sale'] = (pos_pro_df['shipments'] == 0)
        # Convert to sparse then query index to find block locations
        is_nonstart_zero: pd.Series = pos_pro_df.loc[
            pos_pro_df['days_since_first_nonzero_sale'] != -1,
            'is_zero_sale']
        temp_ts: pd.SparseSeries = is_nonstart_zero.to_sparse(fill_value=False)
        block_locs = zip(temp_ts.sp_index.blocs, temp_ts.sp_index.blengths)
        # Map the sparse blocks back to the dense timeseries
        blocks = [is_nonstart_zero.iloc[start-1:(start + length)] for (start, length) in block_locs]
        blocks = [pd.Series((block.index - block.index[0]),
                            index=block.index).dt.days.iloc[1:]
                  for block in blocks if not block.empty]
        # Map the days since last nonzero sale blocks back to original df
        for block in blocks:
            pos_pro_df.loc[block.index, 'days_since_last_nonzero_sale'] = block

        pos_pro_df['shipments_1'] = pos_pro_df['shipments'].shift(1).fillna(0)
        pos_pro_df['days_since_last_sale_1'] = pos_pro_df['days_since_last_sale'].shift(1).fillna(-1)
        pos_pro_df['double_sell_1'] = pos_pro_df['double_sell'].shift(1).fillna(False)
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']), 'double_sell_1'] = False
        pos_pro_df['triple_sell_1'] = pos_pro_df['triple_sell'].shift(1).fillna(False)
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']), 'triple_sell_1'] = False
        pos_pro_df['isVisitPlan_1'] = pos_pro_df['isVisitPlan'].shift(1).fillna(False)  # After ER
        pos_pro_df['PricelistChg_1'] = pos_pro_df['PricelistChg'].shift(1).fillna(False)
        pos_pro_df['return_1'] = pos_pro_df['return'].shift(1).fillna(False)  # After return
        pos_pro_df['isMissedPlan_1'] = pos_pro_df['isMissedPlan'].shift(1).fillna(False)
        pos_pro_df['stock_1'] = pos_pro_df['stock'].shift(1).fillna(-999)
        pos_pro_df['is_zero_sale_1'] = pos_pro_df['is_zero_sale'].shift(1).fillna(False)
        is_zero_1: pd.Series = pos_pro_df['is_zero_sale_1']
        # Convert to sparse then query index to find block locations
        temp_ts: pd.SparseSeries = is_zero_1.to_sparse(fill_value=False)
        block_locs = zip(temp_ts.sp_index.blocs, temp_ts.sp_index.blengths)
        # Map the sparse blocks back to the dense timeseries
        blocks = [is_zero_1.iloc[start:(start + length)] for (start, length) in block_locs]
        blocks = [block.astype(int).cumsum() for block in blocks if not block.empty]
        # Map the cumsum blocks back to original df
        pos_pro_df['num_consecutive_zero_sales'] = 0
        for block in blocks:
            pos_pro_df.loc[block.index, 'num_consecutive_zero_sales'] = block
        is_nonzero_1: pd.Series = (pos_pro_df['shipments'] != 0).shift(1).fillna(False)
        # Convert to sparse then query index to find block locations
        temp_ts: pd.SparseSeries = is_nonzero_1.to_sparse(fill_value=False)
        block_locs = zip(temp_ts.sp_index.blocs, temp_ts.sp_index.blengths)
        # Map the sparse blocks back to the dense timeseries
        blocks = [is_nonzero_1.iloc[start:(start + length)] for (start, length) in block_locs]
        blocks = [block.astype(int).cumsum() for block in blocks if not block.empty]
        # Map the cumsum blocks back to original df
        pos_pro_df['num_consecutive_nonzero_sales'] = 0
        for block in blocks:
            pos_pro_df.loc[block.index, 'num_consecutive_nonzero_sales'] = block

        pos_pro_df['double_sell_lead_1'] = pos_pro_df[
            'double_sell'].shift(-1).fillna(False)  # Before double sell
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']
             | pos_pro_df['double_sell_1'] | pos_pro_df['triple_sell_1']),
            'double_sell_lead_1'] = False
        pos_pro_df['triple_sell_lead_1'] = pos_pro_df[
            'triple_sell'].shift(-1).fillna(False)  # Before triple sell
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']
             | pos_pro_df['double_sell_1'] | pos_pro_df['triple_sell_1']),
            'triple_sell_lead_1'] = False

        pos_pro_df['shipments_2'] = pos_pro_df['shipments_1'].shift(1).fillna(0)
        pos_pro_df['days_since_last_sale_2'] = pos_pro_df['days_since_last_sale_1'].shift(1).fillna(-1)
        pos_pro_df['double_sell_2'] = pos_pro_df['double_sell_1'].shift(1).fillna(False)
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']), 'double_sell_2'] = False
        pos_pro_df['triple_sell_2'] = pos_pro_df['triple_sell_1'].shift(1).fillna(False)
        pos_pro_df.loc[
            (pos_pro_df['double_sell'] | pos_pro_df['triple_sell']), 'triple_sell_2'] = False
        pos_pro_df['isVisitPlan_2'] = pos_pro_df['isVisitPlan_1'].shift(1).fillna(False)
        pos_pro_df['isMissedPlan_2'] = pos_pro_df['isMissedPlan_1'].shift(1).fillna(False)
        pos_pro_df['stock_2'] = pos_pro_df['stock_1'].shift(1).fillna(-999)
        pos_pro_df['is_zero_sale_2'] = pos_pro_df['is_zero_sale_1'].shift(1).fillna(False)

        pos_pro_df['shipments_3'] = pos_pro_df['shipments_2'].shift(1).fillna(0)
        pos_pro_df['days_since_last_sale_3'] = pos_pro_df['days_since_last_sale_2'].shift(1).fillna(-1)
        pos_pro_df['stock_3'] = pos_pro_df['stock_2'].shift(1).fillna(-999)
        pos_pro_df['is_zero_sale_3'] = pos_pro_df['is_zero_sale_2'].shift(1).fillna(False)
        pos_pro_df['shipments_4'] = pos_pro_df['shipments_3'].shift(1).fillna(0)
        pos_pro_df['stock_4'] = pos_pro_df['stock_3'].shift(1).fillna(-999)
        pos_pro_df['shipments_5'] = pos_pro_df['shipments_4'].shift(1).fillna(0)
        pos_pro_df['stock_5'] = pos_pro_df['stock_4'].shift(1).fillna(-999)
        pos_pro_df['shipments_6'] = pos_pro_df['shipments_5'].shift(1).fillna(0)
        pos_pro_df['stock_6'] = pos_pro_df['stock_5'].shift(1).fillna(-999)

        past_3_shipment_labels = ['shipments_{}'.format(i) for i in range(1, 4)]
        past_6_shipment_labels = ['shipments_{}'.format(i) for i in range(1, 7)]
        past_3_shipments_interval_labels = ['days_since_last_sale_{}'.format(i) for i in range(1, 4)]
        past_3_stock_labels = ['stock_{}'.format(i) for i in range(1, 4)]
        past_6_stock_labels = ['stock_{}'.format(i) for i in range(1, 7)]
        past_3_nonzero_shipment_labels = ['nonzero_Shipments_{}'.format(i) for i in range(1, 4)]
        past_6_nonzero_shipment_labels = ['nonzero_Shipments_{}'.format(i) for i in range(1, 7)]
        past_3_nonzero_shipments_interval_labels = ['days_since_last_nonzero_sale_{}'.format(i)
                                                    for i in range(1, 4)]
        pos_pro_df['ma_3'] = pos_pro_df[past_3_shipment_labels].mean(axis=1)
        pos_pro_df['ma_6'] = pos_pro_df[past_6_shipment_labels].mean(axis=1)
        pos_pro_df['mm_3'] = pos_pro_df[past_3_shipment_labels].median(axis=1)
        pos_pro_df['mm_6'] = pos_pro_df[past_6_shipment_labels].median(axis=1)
        pos_pro_df['ma_interval_3'] = pos_pro_df[past_3_shipments_interval_labels].mean(axis=1)
        pos_pro_df['ma_Stock_3'] = pos_pro_df[past_3_stock_labels].mean(axis=1)
        pos_pro_df['ma_Stock_6'] = pos_pro_df[past_6_stock_labels].mean(axis=1)
        pos_pro_df['ma_nonzero_3'] = pos_pro_df[past_3_nonzero_shipment_labels].mean(axis=1)
        pos_pro_df['ma_nonzero_6'] = pos_pro_df[past_6_nonzero_shipment_labels].mean(axis=1)
        pos_pro_df['mm_nonzero_3'] = pos_pro_df[past_3_nonzero_shipment_labels].median(axis=1)
        pos_pro_df['mm_nonzero_6'] = pos_pro_df[past_6_nonzero_shipment_labels].median(axis=1)
        pos_pro_df['ma_nonzero_interval_3'] = \
            pos_pro_df[past_3_nonzero_shipments_interval_labels].mean(axis=1)

        pos_pro_df['Chg_Shipments_1_from_ma_3'] = pos_pro_df['shipments_1'] - pos_pro_df['ma_3']
        pos_pro_df['Chg_Shipments_1_from_ma_6'] = pos_pro_df['shipments_1'] - pos_pro_df['ma_6']
        pos_pro_df['Chg_Shipments_1_from_mm_3'] = pos_pro_df['shipments_1'] - pos_pro_df['mm_3']
        pos_pro_df['Chg_Shipments_1_from_mm_6'] = pos_pro_df['shipments_1'] - pos_pro_df['mm_6']
        pos_pro_df['Chg_pct_Shipments_1_from_Shipments_2'] = ((pos_pro_df['shipments_1']
                                                               - pos_pro_df['shipments_2'])
                                                              / pos_pro_df['shipments_2']).clip(-5, 5)
        pos_pro_df['Chg_pct_Shipments_1_from_Shipments_2'] = \
            pos_pro_df['Chg_pct_Shipments_1_from_Shipments_2'].fillna(-999)
        pos_pro_df['Chg_Stock_1_from_ma_Stock_3'] = pos_pro_df['stock_1'] - pos_pro_df['ma_Stock_3']
        pos_pro_df['Chg_Stock_1_from_ma_Stock_6'] = pos_pro_df['stock_1'] - pos_pro_df['ma_Stock_6']
        pos_pro_df['Chg_pct_Stock_1_from_Stock_2'] = ((pos_pro_df['stock_1']
                                                       - pos_pro_df['stock_2'])
                                                      / pos_pro_df['stock_2']).clip(-5, 5)
        pos_pro_df['Chg_pct_Stock_1_from_Stock_2'] = \
            pos_pro_df['Chg_pct_Stock_1_from_Stock_2'].fillna(-999)
        pos_pro_df['Chg_nonzero_Shipments_1_from_ma_nonzero_3'] = \
            pos_pro_df['nonzero_Shipments_1'] - pos_pro_df['ma_nonzero_3']
        pos_pro_df['Chg_nonzero_Shipments_1_from_ma_nonzero_6'] = \
            pos_pro_df['nonzero_Shipments_1'] - pos_pro_df['ma_nonzero_6']
        pos_pro_df['Chg_nonzero_Shipments_1_from_mm_nonzero_3'] = \
            pos_pro_df['nonzero_Shipments_1'] - pos_pro_df['mm_nonzero_3']
        pos_pro_df['Chg_nonzero_Shipments_1_from_mm_nonzero_6'] = \
            pos_pro_df['nonzero_Shipments_1'] - pos_pro_df['mm_nonzero_6']
        pos_pro_df['Chg_pct_nonzero_Shipments_1_from_nonzero_Shipments_2'] = \
            ((pos_pro_df['nonzero_Shipments_1'] - pos_pro_df['nonzero_Shipments_2'])
             / pos_pro_df['nonzero_Shipments_2']).clip(-5, 5)
        pos_pro_df['Chg_pct_nonzero_Shipments_1_from_nonzero_Shipments_2'] = \
            pos_pro_df['Chg_pct_nonzero_Shipments_1_from_nonzero_Shipments_2'].fillna(-999)

        pos_pro_df = pos_pro_df.loc[(pos_pro_df['IsWorkingDate'] == 1) &
                                    (pos_pro_df['DayNumberOfWeek'] <= 5)]
        pos_pro_df.drop(columns=['return', 'IsWorkingDate', 'PricelistChg'],
                        inplace=True)

        # try:
        #     assert not pos_pro_df.isnull().any().any()
        # except AssertionError as ae:
        #     null_track = pos_pro_df.isnull().any()
        #     print(null_track[null_track])
        #     pos_pro_df.to_csv('temp.csv')
        #     exit()
        pos_pro_df.fillna(0, inplace=True)
        # print(pos_pro_df.head())

    return pos_pro_df


In [37]:
# Distribute the (first, second) task tuples over Spark, one slice per task,
# and lazily map each tuple onto extract_ts. Nothing runs until an action is called.
extract_ts_tasks_RDD = sc.parallelize(
    extract_timeseries_tasks,
    numSlices=len(extract_timeseries_tasks),
)
extracted_ts_data = extract_ts_tasks_RDD.map(
    lambda ts_task: extract_ts(ts_task[0], ts_task[1])
)

# Convert RDD to DataFrame

In [38]:
# count() is a Spark action: it forces every mapped extract_ts task to run.
# NOTE(review): the value is the number of RDD elements (one result per task), not the
# number of partitions; the two only coincide because the task RDD is created with
# numSlices=len(extract_timeseries_tasks).
rdd_partition_count = extracted_ts_data.count()

KeyboardInterrupt: 

In [None]:
# Bring the first rdd_partition_count results back to the driver and stack them
# row-wise into a single pandas DataFrame.
all_features_set = pd.concat(extracted_ts_data.take(rdd_partition_count))

In [None]:
# Display the 'stock' column of the combined feature set for a visual check.
all_features_set.stock

In [None]:
# Display the sum of the 'shipments' column over all rows of the combined feature set.
all_features_set.shipments.sum()

In [None]:
# Display the sum of the 'promo_id' column; a non-zero value shows at least one row
# carries a non-zero promo id.
all_features_set.promo_id.sum()

In [None]:
# Collect the per-task feature frames from Spark and stack them into one DataFrame.
#
# take() is a Spark action, so it is called exactly once here: calling it inside the
# loop re-ran the whole job on every iteration. The frames are concatenated in a single
# pd.concat call instead of growing the result with repeated DataFrame.append, which
# copies the accumulated frame each time.
extracted_frames = extracted_ts_data.take(rdd_partition_count)
for i in range(1, len(extracted_frames) + 1):
    print("partition no", i)
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
all_features_set = pd.concat(extracted_frames) if extracted_frames else pd.DataFrame()

In [None]:
# Write the combined feature set to "test_TWIBLH.csv" directly under root_path.
all_features_set.to_csv(root_path + "test_TWIBLH.csv")

In [None]:
# Display the data root directory currently in use.
root_path

In [None]:
# Plain alias, not a copy: both names refer to the same DataFrame object, so any
# in-place operation on prepared_ts_data also changes all_features_set.
prepared_ts_data = all_features_set

In [None]:
# reset_index() returns a new frame with the former index moved into a regular column;
# prepared_ts_data itself is left unchanged.
prepared_product_data = prepared_ts_data.reset_index()

In [None]:
# Sorts prepared_ts_data in place by product_code.
# NOTE(review): prepared_ts_data is assigned directly from all_features_set, so this also
# reorders all_features_set. prepared_product_data is built via reset_index() (a separate
# frame), so it is not affected by this sort; confirm whether this cell is still needed.
prepared_ts_data.sort_values(['product_code'], inplace=True)

In [None]:
# Index by (product_code, invoice_date), then order the rows by that index.
prepared_product_data = prepared_product_data.set_index(['product_code', 'invoice_date'])
prepared_product_data = prepared_product_data.sort_index()

In [None]:
# Move the index levels back into ordinary columns, modifying the frame in place.
prepared_product_data.reset_index(inplace=True)

In [19]:
# Product codes to process; each entry becomes one task in product_list_tasks.
product_codes =  ["TBHBK"]

In [20]:
# One task per product code. Each task is the bare code string (not a tuple).
product_list_tasks = []
for pro_code in product_codes:
    product_list_tasks.append(pro_code)

In [21]:
product_list_tasks

['TBHBK']

In [13]:
# Load the feature table from CSV, replacing any prepared_product_data built in memory.
# NOTE(review): absolute, machine-specific path that bypasses root_path; presumably a
# previously exported result of the feature-extraction step. Confirm.
prepared_product_data = pd.read_csv("D:\\Projects\\JTI\\Romania\\output\\feature_data\\feature_data_spark.csv")

In [14]:
# Restore dtypes lost in the CSV round trip: invoice_date becomes datetime and pos_code
# becomes a string. The agent_code column is removed.
prepared_product_data["invoice_date"] = pd.to_datetime(prepared_product_data["invoice_date"])
prepared_product_data = prepared_product_data.drop(columns=["agent_code"])
prepared_product_data["pos_code"] = prepared_product_data["pos_code"].astype(str)

In [15]:
# Preview the first rows of the loaded feature table.
prepared_product_data.head()

Unnamed: 0,invoice_date,pos_code,product_code,promo_id,shipments,product_cat,is_first_nonzero_sale_date,is_last_sale_date,nonzero_Shipments_1,nonzero_Shipments_2,...,Chg_Shipments_1_from_mm_6,Chg_pct_Shipments_1_from_Shipments_2,Chg_Stock_1_from_ma_Stock_3,Chg_Stock_1_from_ma_Stock_6,Chg_pct_Stock_1_from_Stock_2,Chg_nonzero_Shipments_1_from_ma_nonzero_3,Chg_nonzero_Shipments_1_from_ma_nonzero_6,Chg_nonzero_Shipments_1_from_mm_nonzero_3,Chg_nonzero_Shipments_1_from_mm_nonzero_6,Chg_pct_nonzero_Shipments_1_from_nonzero_Shipments_2
0,2016-01-06,10067501,TBHBKH,0,0.5,large,True,False,0.0,0.0,...,0.0,-999.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-999.0
1,2016-01-13,10067501,TBHBKH,149,0.6,large,False,False,0.5,0.0,...,0.5,5.0,666.066667,832.583333,-1.0001,0.333333,0.416667,0.5,0.5,5.0
2,2016-01-20,10067501,TBHBKH,0,0.0,large,False,False,0.6,0.5,...,0.6,0.2,333.033333,666.066667,0.0,0.233333,0.416667,0.1,0.6,0.2
3,2016-01-27,10067501,TBHBKH,0,0.0,large,False,False,0.6,0.5,...,0.0,-1.0,0.333333,499.966667,5.0,0.233333,0.416667,0.1,0.6,0.2
4,2016-02-03,10067501,TBHBKH,0,0.0,large,False,False,0.6,0.5,...,0.0,-999.0,0.166667,333.366667,0.0,0.233333,0.416667,0.1,0.6,0.2


In [46]:
# Alias (no copy) under the name used when building list_dataframes.
prepared_product_df = prepared_product_data

In [47]:
# (label, DataFrame) pairs that are shipped to the Spark workers as one broadcast value.
# The labels are lookup keys and do not always match the variable names.
list_dataframes = [
    ("shipment_data", shipment_data_df),
    ("future_visit_plan_df", future_visit_plan_df),
    ("prepared_product_data", prepared_product_df),
    ("bank_holidays", bank_holidays_df),
    ("shipment_split_df", shipment_split_df),
    ("sr_loading_df", sr_loading_df),
    ("sr_unloading_df", sr_unloading_df),
]

In [48]:
# Broadcast the list of (label, DataFrame) pairs; the list is read back on the executors
# through broadcasted_df.value.
broadcasted_df = sc.broadcast(list_dataframes)

In [81]:
# One Spark slice per product code; the mapping is lazy and only runs on an action.
product_list_tasks_RDD = sc.parallelize(
    product_list_tasks,
    numSlices=len(product_list_tasks),
)
product_list_result = product_list_tasks_RDD.map(
    lambda product_code: high_level_model_handler(product_code)
)

In [82]:
# count() is the action that triggers high_level_model_handler for every product task;
# the displayed value is the number of tasks processed.
product_list_result.count()

1

# High Level Model Config

In [25]:
# Date boundaries (ISO date strings) and the phase selector for the run.
# NOTE(review): presumably these split the data into train / validation / test periods,
# with run_phase choosing which period is evaluated; confirm against the model handler.
val_start_date = '2018-01-01'
test_start_date = '2018-07-01'
run_phase = 'test'
# run_phase = 'val'

# Threshold constants. Their consumers are outside this cell; the names suggest
# standard-deviation, density, visit-plan and sparsity cut-offs. TODO confirm units.
high_std_thr = 0.95
low_density_thr = 0.10
low_vp_thr = 0.75
high_sparsity_thr = 0.10

# Regression strategy selector (string flag read elsewhere).
regression_mode = 'ensemble'

# quantity_lower_limit is used by high_level_post_model_aggregator as the fallback value
# for limit_median_sale; the other two limits are consumed outside this cell.
quantity_lower_limit = 0.5
very_high_oos_threshold = 10
loading_size_thr = 5


# Feature columns listed for removal on the classifier side.
# NOTE(review): presumably dropped before fitting the zero / non-zero shipment classifier;
# confirm against the model handler. Entry order is identical to the original literal.
clf_drop_cols = (
    # calendar descriptors
    ['VisitPlanWeek', 'week', 'is_odd_month', 'DayNumberOfWeek',
     'quarter', 'year', 'year_of_engagement']
    # raw shipment lags 1..6
    + ['shipments_{}'.format(lag) for lag in range(1, 7)]
    # non-zero shipment lags 6..2 (lag 1 is not dropped)
    + ['nonzero_Shipments_{}'.format(lag) for lag in range(6, 1, -1)]
    # interval lags 3..1, for all sales and for non-zero sales
    + ['days_since_last_sale_{}'.format(lag) for lag in range(3, 0, -1)]
    + ['days_since_last_nonzero_sale_{}'.format(lag) for lag in range(3, 0, -1)]
    # 3-period moving medians and the deltas derived from them
    + ['mm_3', 'Chg_Shipments_1_from_mm_3',
       'mm_nonzero_3', 'Chg_nonzero_Shipments_1_from_mm_nonzero_3']
    # stock lags 6..4
    + ['stock_{}'.format(lag) for lag in range(6, 3, -1)]
    + ['credit_request_type']
    # 'is_bad_weather_last_three', 'is_bad_weather_last_six',
    # the entries below are features that shouldn't ever be included in model #
    + ['is_last_sale_date', 'isMissedPlan', 'stock', 'promo_id',
       'isVisitPlan', 'is_zero_sale', 'is_first_nonzero_sale_date']
)

# Columns carried through in the regression result frame.
# NOTE(review): consumer is outside this cell; confirm how the list is applied.
keep_in_reg_result = (
    ['invoice_date', 'known_shipment', 'predicted_loading', 'product_cat', 'pos_code']
    + ['VisitPlanWeek', 'month', 'year', 'week', 'days_from_easter']
    + ['pre_nonreplacement_holiday', 'pred_is_nonzero_shipments']
    + ['double_sell', 'triple_sell', 'credit_request_type',
       'double_sell_1', 'triple_sell_1']
    + ['days_since_price_chg_ann', 'days_from_price_chg', 'isVisitPlan_1']
)

# Switch between the keep lists below and reg_drop_cols.
# NOTE(review): presumably False means "use the keep lists"; confirm in the handler.
use_drop_reg_cols_instead_of_keep = False

# First regression keep list: includes lagged non-zero shipments and moving averages.
reg_keep_cols_1 = (
    ['pos_code', 'week', 'month']
    + ['nonzero_Shipments_{}'.format(lag) for lag in range(1, 7)]
    + ['days_since_last_nonzero_sale', 'ma_nonzero_interval_3',
       'is_first_plan_date', 'days_since_first_nonzero_sale']
    + ['double_sell', 'triple_sell', 'credit_request_coeff',
       'double_sell_1', 'triple_sell_1']
    + ['days_since_price_chg_ann', 'return_1',
       'pre_nonreplacement_holiday', 'isVisitPlan_1',
       'is_zero_sale_1', 'num_consecutive_zero_sales']
    + ['ma_3', 'ma_6', 'ma_nonzero_3', 'ma_nonzero_6']
)

# Second regression keep list: contains no lagged shipment or stock columns.
reg_keep_cols_2 = [
    'DayNumberOfWeek', 'pos_code',
    'pre_nonreplacement_holiday', 'month',
    'days_since_price_chg_ann',
    'triple_sell', 'double_sell',
    'credit_request_coeff', 'days_from_easter',
]
# 'is_bad_weather_last_three', 'is_bad_weather_last_six']

# Drop list for the regression side: the counterpart of the reg_keep_cols_* lists.
# NOTE(review): presumably only applied when use_drop_reg_cols_instead_of_keep is True;
# confirm in the model handler.
# Commented-out names sit next to the active ones on purpose: they are the features that
# are currently NOT dropped, kept in place so a feature can be toggled by (un)commenting.
reg_drop_cols = ['shipments_6', 'shipments_5', 'shipments_4',  #'shipments_3', 'shipments_2',#'shipments_1',
                 'Chg_pct_Shipments_1_from_Shipments_2',
                 'ma_6', 'ma_3',
                 # 'Chg_Shipments_1_from_ma_6', 'Chg_Shipments_1_from_ma_3',
                 'mm_6', 'mm_3',
                 'Chg_Shipments_1_from_mm_6', 'Chg_Shipments_1_from_mm_3',
                 'days_since_first_sale',  #'is_first_plan_date',
                 # 'ma_interval_3',
                 'days_since_last_sale_3', 'days_since_last_sale_2',
                 'days_since_last_sale_1', 'days_since_last_sale',
                 'double_sell_lead_1',  # 'triple_sell_lead_1',
                 # 'double_sell_2', 'double_sell_1', 'double_sell',
                 # 'triple_sell_2', 'triple_sell_1', 'triple_sell',
                 'credit_request_type',  # 'days_from_easter', 'credit_request_coeff',
                 'isVisitPlan_2',  #'isVisitPlan_1', 'return_1',
                 'days_from_price_chg', # 'days_since_price_chg_ann', 'PricelistChg_1',
                 'num_consecutive_nonzero_sales',  #'num_consecutive_zero_sales',
                 # 'nonzero_Shipments_6', 'nonzero_Shipments_5', 'nonzero_Shipments_4',
                 # 'nonzero_Shipments_3', 'nonzero_Shipments_2', 'nonzero_Shipments_1',
                 'Chg_pct_nonzero_Shipments_1_from_nonzero_Shipments_2',
                 # 'ma_nonzero_6', 'ma_nonzero_3',
                 'Chg_nonzero_Shipments_1_from_ma_nonzero_6',  #'Chg_nonzero_Shipments_1_from_ma_nonzero_3',
                 'mm_nonzero_6', 'mm_nonzero_3',
                 'Chg_nonzero_Shipments_1_from_mm_nonzero_6', 'Chg_nonzero_Shipments_1_from_mm_nonzero_3',
                 'isMissedPlan_2',  #'isMissedPlan_1',
                 'stock_6', 'stock_5', 'stock_4', 'stock_3',  #'stock_2', 'stock_1',
                 'Chg_pct_Stock_1_from_Stock_2',
                 'ma_Stock_6', 'ma_Stock_3',
                 'Chg_Stock_1_from_ma_Stock_6', 'Chg_Stock_1_from_ma_Stock_3',
                 # 'days_since_first_nonzero_sale',
                 # 'ma_nonzero_interval_3',
                 'days_since_last_nonzero_sale_3', 'days_since_last_nonzero_sale_2',
                 # 'days_since_last_nonzero_sale_1', 'days_since_last_nonzero_sale',
                 'week', 'weekday_holiday', 'is_odd_month',
                 # 'month', 'quarter', 'year', 'year_of_engagement', 'DayNumberOfWeek',
                 'promo_id', 'is_zero_sale_3', 'is_zero_sale_2',  #'is_zero_sale_1',
                 'product_cat',  #'pre_nonreplacement_holiday',
                 # 'is_bad_weather_last_three', 'is_bad_weather_last_six',
                 # below two lines are features that shouldn't ever be included in model #
                 'stock', 'isMissedPlan', 'is_last_sale_date', 'is_zero_sale',
                 'is_first_nonzero_sale_date', 'isVisitPlan']

# Summer event hosting markets related params
# Agent codes are stored as strings, not integers.
ct_event_agents = [str(agent_no) for agent_no in
                   (1810, 1811, 1815, 1816, 1822,
                    1823, 1825, 1826, 1828, 1829)]


# Aggregation

In [43]:

# aggregates predicted shipments at date, agent level for a product
# and applies dynamic buffering to get loading prediction
def high_level_post_model_aggregator(product_date_agent_forecasts_data: pd.DataFrame,
                                     train_data_with_agent: pd.DataFrame,
                                     past_test_data_with_agent: pd.DataFrame,
                                     inflated_demand_marker_data: pd.DataFrame,
                                     agg_buffer_pct: float,
                                     force_loading_upper_limit: bool) -> pd.DataFrame:
    current_date = product_date_agent_forecasts_data['visit_date'].iloc[0]
    current_agent = product_date_agent_forecasts_data['agent_code'].iloc[0]
    # print(current_date, current_agent)

    train_data_cols = ['visit_date', 'agent_code', 'pos_code', 'shipments']
    poswise_forecast = product_date_agent_forecasts_data.copy()
    current_pos_list = poswise_forecast['pos_code']
    train_df = train_data_with_agent.loc[
        train_data_with_agent['pos_code'].isin(current_pos_list), train_data_cols]
    # The pos-set wise data for days in latest week can paint an incomplete aggregate picture.
    # Thus pos-set wise data should only be taken till last week to current date
    past_test_data_with_agent = past_test_data_with_agent.loc[
        (past_test_data_with_agent['year'] < current_date.year)
        | (past_test_data_with_agent['week'] < current_date.weekofyear)]
    if len(past_test_data_with_agent) > 0:
        past_test_df = past_test_data_with_agent.loc[
            past_test_data_with_agent['pos_code'].isin(current_pos_list), train_data_cols]
        past_test_df = past_test_df.loc[past_test_df['visit_date'] < current_date]
        train_df = pd.concat([train_df, past_test_df], ignore_index=True)
    train_agent_df = train_data_with_agent.loc[
        train_data_with_agent['agent_code'] == current_agent, train_data_cols]
    if len(past_test_data_with_agent) > 0:
        past_test_agent_df = past_test_data_with_agent.loc[
            past_test_data_with_agent['agent_code'] == current_agent, train_data_cols]
        past_test_agent_df = past_test_agent_df.loc[past_test_agent_df['visit_date'] < current_date]
        train_agent_df = pd.concat([train_agent_df, past_test_agent_df], ignore_index=True)
    new_pos_list = current_pos_list[~current_pos_list.isin(train_df['pos_code'])]
    # print('new pos count:', glen(new_pos_list))
    new_pos_count = len(new_pos_list)
    train_pos_count = len(current_pos_list) - new_pos_count
    current_pos_count = len(current_pos_list)
    if train_pos_count != 0:
        extrapolation_factor: float = current_pos_count/train_pos_count
    else:
        extrapolation_factor: float = np.nan

    # Agg the results
    agg_forecast = poswise_forecast.groupby(['visit_date']).agg(
        {'predicted_loading': 'sum',
         'known_shipment': 'sum',
         'orig_pred_load': 'sum',
         'pred_is_nonzero_shipments': 'mean',
         'pre_nonreplacement_holiday': 'mean',
         'VisitPlanWeek': 'first',
         'week': 'first',
         'month': 'first',
         'year': 'first',
         'credit_request_type': 'first',
         'days_from_easter': 'first',
         'days_since_price_chg_ann': 'first',
         'days_from_price_chg': 'first',
         'agent_code': 'first',
         'product_cat': 'first'})
    assert len(agg_forecast) == 1
    agg_forecast['original_predicted_loading'] = agg_forecast['predicted_loading']
    agg_forecast['pred_is_nonzero_shipments'] = \
        agg_forecast['pred_is_nonzero_shipments'].astype('float')
    agg_forecast['pred_zero_pct'] = 1 - agg_forecast['pred_is_nonzero_shipments']
    agg_forecast = agg_forecast.drop(columns=['pred_is_nonzero_shipments'])
    agg_forecast['pre_nonreplacement_holiday'] = \
        agg_forecast['pre_nonreplacement_holiday'].astype('float')
    agg_forecast['NationalDoubleSell'] = inflated_demand_marker_data['NationalDoubleSell']
    agg_forecast['NationalTripleSell'] = inflated_demand_marker_data['NationalTripleSell']
    agg_forecast['curr_pos_count'] = [current_pos_count]
    agg_forecast['new_pos_count'] = [new_pos_count]
    assert len(agg_forecast) == 1

    nan_count = 0
    if train_pos_count == 0:
        limit_pos_set_max_sale = np.nan
        limit_pos_set_peak_sale = np.nan
        limit_pos_set_very_high_sale = np.nan
        limit_pos_set_high_sale = np.nan
        limit_pos_set_median_sale = np.nan
        train_shipments_mean = np.nan
        train_shipments_min = np.nan
        nan_count += 1
    else:
        # prepare
        shipments_train_sample = train_df.groupby(
            [pd.Grouper(key='visit_date', freq='W'), 'pos_code']).agg({'shipments': 'mean'})
        shipments_train_sample = shipments_train_sample.reset_index().groupby(
            'visit_date').agg({'shipments': 'sum', 'pos_code': 'count'})
        shipments_train_sample.columns = ['shipments_train_sum', 'pos_code_count']
        limit_pos_set_max_sale = (shipments_train_sample['shipments_train_sum'].quantile(1)
                                  * extrapolation_factor)
        limit_pos_set_peak_sale = (shipments_train_sample['shipments_train_sum'].quantile(0.999)
                                   * extrapolation_factor)
        limit_pos_set_very_high_sale = (shipments_train_sample['shipments_train_sum'].quantile(0.99)
                                        * extrapolation_factor)
        limit_pos_set_high_sale = (shipments_train_sample['shipments_train_sum'].quantile(0.98)
                                   * extrapolation_factor)
        limit_pos_set_median_sale = (shipments_train_sample['shipments_train_sum'].median()
                                     * extrapolation_factor)

        shipments_train_sample = shipments_train_sample.last('6M')
        train_shipments_mean = shipments_train_sample['shipments_train_sum'].mean()
        train_shipments_min = shipments_train_sample['shipments_train_sum'].min()
    if len(train_agent_df) > 0:
        train_agent_shipments = train_agent_df.groupby(
            'visit_date').agg({'shipments': 'sum'})
        train_agent_shipments.columns = ['agent_shipments_train_sum']
        limit_agent_max_sale = train_agent_shipments[
            'agent_shipments_train_sum'].quantile(1)
        limit_agent_peak_sale = train_agent_shipments[
            'agent_shipments_train_sum'].quantile(0.999)
        limit_agent_very_high_sale = train_agent_shipments[
            'agent_shipments_train_sum'].quantile(0.99)
        limit_agent_high_sale = train_agent_shipments[
            'agent_shipments_train_sum'].quantile(0.98)
        limit_agent_median_sale = train_agent_shipments[
            'agent_shipments_train_sum'].median()
    else:
        limit_agent_max_sale = np.nan
        limit_agent_peak_sale = np.nan
        limit_agent_very_high_sale = np.nan
        limit_agent_high_sale = np.nan
        limit_agent_median_sale = np.nan
        nan_count += 1
    if nan_count < 2:
        limit_max_sale = np.nanmax([limit_pos_set_max_sale, limit_agent_max_sale])
        limit_peak_sale = np.nanmax([limit_pos_set_peak_sale, limit_agent_peak_sale])
        limit_very_high_sale = np.nanmax([limit_pos_set_very_high_sale, limit_agent_very_high_sale])
        limit_high_sale = np.nanmax([limit_pos_set_high_sale, limit_agent_high_sale])
    else:
        # case where both agent and pos set is entirely new
        limit_max_sale = np.nan
        limit_peak_sale = np.nan
        limit_very_high_sale = np.nan
        limit_high_sale = np.nan
    if limit_pos_set_median_sale != np.nan and limit_agent_median_sale != np.nan:
        limit_median_sale = np.nanmean([limit_pos_set_median_sale, limit_agent_median_sale])
    elif limit_pos_set_median_sale != np.nan:
        limit_median_sale = limit_pos_set_median_sale
    elif limit_agent_median_sale != np.nan:
        limit_median_sale = limit_agent_median_sale
    else:
        limit_median_sale = quantity_lower_limit
    agg_forecast['train_shipments_mean'] = [train_shipments_mean]
    agg_forecast['train_shipments_min'] = [train_shipments_min]
    agg_forecast['limit_max_sale'] = [limit_max_sale]
    agg_forecast['limit_peak_sale'] = [limit_peak_sale]
    agg_forecast['limit_very_high_sale'] = [limit_very_high_sale]
    agg_forecast['limit_high_sale'] = [limit_high_sale]
    agg_forecast['limit_median_sale'] = [limit_median_sale]

    # agg loading buffer
    def _agg_loading_levels_list(predicted_loading):
        return [predicted_loading * (1 + agg_buffer_pct),
                predicted_loading + agg_loading_buffer]

    agg_forecast['nominal_agg_buffer'] = [train_shipments_mean*agg_buffer_pct]
    train_shipments_mean = train_shipments_mean*extrapolation_factor
    agg_loading_buffer = train_shipments_mean*agg_buffer_pct
    agg_forecast['used_agg_buffer'] = [agg_loading_buffer]
    # print(agg_loading_buffer)
    agg_forecast['predicted_loading'] = \
        agg_forecast['predicted_loading'].apply(lambda x:
                                                np.nanmean(_agg_loading_levels_list(x)))

    # Upper limit
    if force_loading_upper_limit:
        if (agg_forecast['NationalTripleSell'].iloc[0]
                or (agg_forecast['credit_request_type'].iloc[0] == 'TRIPLE')):
            agg_forecast['predicted_loading'] = [np.nanmin(
                [agg_forecast['predicted_loading'].iloc[0], limit_max_sale])]
        elif (agg_forecast['NationalDoubleSell'].iloc[0]
              or (agg_forecast['credit_request_type'].iloc[0] == 'DOUBLE')
              or (agg_forecast['days_from_easter'].iloc[0] >= 0)
              or agg_forecast['pre_nonreplacement_holiday'].iloc[0] > 0):
            agg_forecast['predicted_loading'] = [np.nanmin(
                [agg_forecast['predicted_loading'].iloc[0], limit_peak_sale])]
        elif agg_forecast['predicted_loading'].iloc[0] <= loading_size_thr:
            agg_forecast['predicted_loading'] = [np.nanmin(
                [agg_forecast['predicted_loading'].iloc[0], limit_very_high_sale])]
        else:
            agg_forecast['predicted_loading'] = [np.nanmin(
                [agg_forecast['predicted_loading'].iloc[0], limit_high_sale])]

    # if ((agg_forecast.index.month[0] == 12 and agg_forecast.index.day[0] < 20)
    #         or (agg_forecast.index.month[0] < 12)):
    agg_forecast['predicted_loading'] = \
        agg_forecast['predicted_loading'].clip(limit_median_sale, None)

    # Finalize
    agg_forecast['predicted_loading'] = \
        np.round(agg_forecast['predicted_loading'] + np.nanmin([2, train_shipments_min * 0.2]), 1)
    if agg_forecast['predicted_loading'].iloc[0] <= quantity_lower_limit:
        agg_forecast['predicted_loading'] = quantity_lower_limit + 0.1
    else:
        agg_forecast['predicted_loading'] = np.round(agg_forecast['predicted_loading'] + 0.5, 1)

    return agg_forecast


def high_level_post_agg_comparison_prep(agg_result_data: pd.DataFrame,
                                        product_code: str,
                                        shipment_data: pd.DataFrame,
                                        shipment_split_data: pd.DataFrame,
                                        sr_loading_data: pd.DataFrame,
                                        sr_unloading_data: pd.DataFrame) -> pd.DataFrame:
    """Put predicted loadings next to actual shipments and actual loadings.

    Args:
        agg_result_data: aggregated forecast; after ``reset_index()`` it must
            expose 'visit_date', 'agent_code', 'product_code' and
            'predicted_loading'.
        product_code: product whose rows of ``shipment_data`` are summed.
        shipment_data: shipments with 'invoice_date', 'agent_code',
            'product_code' and 'quantity' columns.
        shipment_split_data: source of 'non_vp_shipment' / 'vp_shipment',
            left-merged on the three key names.
        sr_loading_data: its 'quantity' becomes 'sr_loading'.
        sr_unloading_data: its 'quantity' becomes 'sr_unloading'.

    Returns:
        The merged frame with the added columns 'actual_shipment',
        'sr_loading', 'sr_unloading', 'asl_unloading', 'OOS_amt_indicative',
        'is_OOS', 'jti_unloading' and 'jti_OOS'.

    NOTE(review): the column assignments below align on the frame index, so
    the split / loading / unloading frames are presumably indexed by the same
    three keys -- confirm against the callers.
    """
    key_cols = ['visit_date', 'agent_code', 'product_code']
    split_cols = ['non_vp_shipment', 'vp_shipment']

    comparison = agg_result_data.reset_index().set_index(key_cols)

    # Shipment split (plan compliant & ER); keys without a split row count as 0.
    comparison = comparison.merge(shipment_split_data, how='left', on=key_cols)
    comparison.loc[:, split_cols] = comparison.loc[:, split_cols].fillna(0)

    # Actual shipped quantity of this product per date and agent, re-keyed from
    # invoice date to visit date so it lines up with the forecast rows.
    product_rows = shipment_data[shipment_data['product_code'] == product_code]
    shipped = (
        product_rows
        .groupby(['invoice_date', 'agent_code', 'product_code'])[['quantity']]
        .sum()
        .reset_index()
        .rename(columns={'invoice_date': 'visit_date'})
        .set_index(key_cols)
    )
    comparison['actual_shipment'] = shipped['quantity'].round(1)

    # Actual loading / unloading figures.
    comparison['sr_loading'] = sr_loading_data['quantity']
    comparison['sr_unloading'] = sr_unloading_data['quantity']

    # If everything loaded was unloaded again, loading is known but the
    # shipment is missing: treat that shipment as zero.
    was_loaded = comparison['sr_loading'].notna()
    comparison.loc[was_loaded, 'actual_shipment'] = \
        comparison.loc[was_loaded, 'actual_shipment'].fillna(0)

    def _surplus(diff):
        # quantity left over (loaded more than shipped), else 0
        return diff if diff > 0 else 0

    def _shortfall(diff):
        # quantity missing (shipped more than loaded), else 0
        return -diff if diff < 0 else 0

    def _ran_out(diff):
        # 1 when nothing was left over, else 0
        return 1 if diff <= 0 else 0

    # Performance of the prediction versus the recorded loading.
    predicted_gap = comparison['predicted_loading'] - comparison['actual_shipment']
    comparison['asl_unloading'] = predicted_gap.apply(_surplus)
    comparison['OOS_amt_indicative'] = predicted_gap.apply(_shortfall)
    comparison['is_OOS'] = predicted_gap.apply(_ran_out)

    recorded_gap = comparison['sr_loading'] - comparison['actual_shipment']
    comparison['jti_unloading'] = recorded_gap.apply(_surplus)
    comparison['jti_OOS'] = recorded_gap.apply(_ran_out)

    return comparison


def get_forecast_performance_figures(forecast_comparison_data: pd.DataFrame,) -> tuple:
    """Summarise unloading and out-of-stock rates as percentages.

    Only rows with a non-null 'sr_loading' are scored.

    Returns:
        ``(pred_unloading, pred_oos, jti_unloading, jti_oos)`` where
        * pred_unloading = 100 * sum('asl_unloading') / sum('predicted_loading')
        * pred_oos       = 100 * mean('is_OOS')
        * jti_unloading  = 100 * sum('jti_unloading') / sum('sr_loading')
        * jti_oos        = 100 * mean('jti_OOS')
    """
    has_loading = forecast_comparison_data['sr_loading'].notna()
    scored_rows = forecast_comparison_data[has_loading]

    def _share_pct(part_col: str, whole_col: str):
        # total of one column as a percentage of the total of another
        return (scored_rows[part_col].sum() / scored_rows[whole_col].sum())*100

    def _rate_pct(flag_col: str):
        # mean of a 0/1 indicator column as a percentage
        return scored_rows[flag_col].mean()*100

    return (_share_pct('asl_unloading', 'predicted_loading'),
            _rate_pct('is_OOS'),
            _share_pct('jti_unloading', 'sr_loading'),
            _rate_pct('jti_OOS'))

# Models

In [41]:
def rf_regressor(x_trn: pd.DataFrame,
                 y_trn: np.ndarray,
                 x_val: pd.DataFrame,
                 y_val: np.ndarray) -> tuple:
    """Fit a random forest regressor and score it on both data sets.

    The inputs are copied before use, so the caller's objects are not touched.

    Returns:
        ``(model, training_score, validation_score)``; both scores come from
        the fitted model's ``score`` method.
    """
    train_features, train_target = x_trn.copy(), y_trn.copy()
    val_features, val_target = x_val.copy(), y_val.copy()

    forest = RandomForestRegressor(
        n_estimators=400,
        min_samples_leaf=3,
        n_jobs=-1,
        random_state=7,
    )
    forest.fit(train_features, train_target)

    return (forest,
            forest.score(train_features, train_target),
            forest.score(val_features, val_target))

In [42]:
def rf_classifier(x_trn: pd.DataFrame,
                  y_trn: np.ndarray,
                  x_val: pd.DataFrame,
                  y_val: np.ndarray) -> tuple:
    """Fit a class-balanced random forest classifier and evaluate it.

    The inputs are copied before use, so the caller's objects are not touched.

    Returns:
        ``(model, training_score, validation_score, clf_report, ck_score)``
        where the scores come from the model's ``score`` method, ``clf_report``
        is the ``classification_report`` and ``ck_score`` the
        ``cohen_kappa_score`` of the validation predictions.
    """
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()

    model = RandomForestClassifier(n_estimators=400, min_samples_leaf=16,
                                   class_weight='balanced',
                                   n_jobs=1, random_state=7)
    model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)

    # Predict the validation set once and reuse it for both metrics instead of
    # running the whole forest a second time.
    val_predictions = model.predict(x_val)
    clf_report = classification_report(y_val, val_predictions)
    ck_score = cohen_kappa_score(y_val, val_predictions)

    return model, training_score, validation_score, clf_report, ck_score

# High Level Model Handler Tools

In [73]:
def single_date_split_model_forecast(product_data: pd.DataFrame,
                                     inflated_demand_marker_data: pd.DataFrame,
                                     present_agent_map_data: pd.DataFrame,
                                     past_agent_map_data: pd.DataFrame,
                                     pred_date: pd.Timestamp,
                                     remove_first_month_from_training: bool,
                                     y_col: str,
                                     model_period: str,
                                     hyperparams_dict: dict,
                                     product_code: str,
                                     so_code: str,
                                     model_pickle_path: str,
                                     run_log_path: str,
                                     int_result_dump_path: str):
    """Forecast one prediction date for one product and aggregate it per agent.

    Steps:
      1. split ``product_data`` around ``pred_date`` with ``train_test_split``;
      2. run ``model`` (retraining when ``pred_date`` is the first date of its
         period present in ``product_data``);
      3. attach agent codes to the forecast, the training rows and the
         left-over rows;
      4. apply ``high_level_post_model_operations`` per agent.

    Args:
        model_period: one of 'annual', 'monthly', 'weekly'; selects the
            ``pred_date`` attribute / ``product_data`` column ('year',
            'month', 'week') that delimits a retraining period.
        (remaining arguments are forwarded to the helpers named above)

    Returns:
        ``None`` when ``train_test_split`` returns ``None``; otherwise the
        tuple ``(forecast_result, forecast_df)`` of the per-POS forecast and
        the per-agent aggregated forecast.

    Raises:
        KeyError: if ``model_period`` is not one of the three known periods.

    Reads ``keep_in_reg_result`` from module scope.
    """
    period_column_switcher: dict = {
        'annual': 'year',
        'monthly': 'month',
        'weekly': 'week'}
    period_column = period_column_switcher[model_period]
    # Plain attribute lookup; no need to eval() a formatted string.
    period_designator = getattr(pred_date, period_column)

    # split data into train & test segments
    _train_test_split = train_test_split(
        product_data, pred_date, remove_first_month_from_training, model_period)
    if _train_test_split is None:
        return None
    df_train, df_test, df_leftover, extra_test_features_introduced = _train_test_split

    # the period model is re-trained when pred_date is the earliest date of
    # its period (same year, same period value) found in the data
    period_data = product_data.loc[product_data['year'] == pred_date.year]
    period_data = period_data.loc[period_data[period_column] == period_designator]
    dates_in_period_sorted = period_data.sort_index().index.unique()
    is_pred_date_first_in_period: bool = (dates_in_period_sorted[0] == pred_date)

    # model data & get forecast
    df_test_after_clf, forecast_result = model(
        df_train,
        df_test,
        y_col,
        is_pred_date_first_in_period,
        extra_test_features_introduced,
        model_period,
        product_code,
        so_code,
        model_pickle_path,
        run_log_path
    )

    # post model forecast data prep
    forecast_result = forecast_result[
        keep_in_reg_result + extra_test_features_introduced]
    forecast_result = forecast_result.rename(columns={'invoice_date': 'visit_date'})
    forecast_result = forecast_result.merge(
        present_agent_map_data,
        how='left',
        on=['visit_date', 'pos_code'])
    # keep the raw model output before any buffer / limit is applied downstream
    forecast_result['orig_pred_load'] = forecast_result['predicted_loading']

    train_data_with_agent = df_train.reset_index()
    train_data_with_agent = train_data_with_agent.merge(past_agent_map_data,
                                                        how='left',
                                                        on=['invoice_date', 'pos_code'])
    train_data_with_agent.rename(columns={'invoice_date': 'visit_date'}, inplace=True)

    past_data_not_in_train_with_agent = df_leftover.reset_index()
    past_data_not_in_train_with_agent = past_data_not_in_train_with_agent.merge(
        past_agent_map_data,
        how='left',
        on=['invoice_date', 'pos_code'])
    past_data_not_in_train_with_agent.rename(columns={'invoice_date': 'visit_date'}, inplace=True)

    # forecast aggregation & buffer addition
    forecast_df = forecast_result.groupby(
        'agent_code').apply(high_level_post_model_operations,
                            train_data_with_agent=train_data_with_agent,
                            past_test_data_with_agent=past_data_not_in_train_with_agent,
                            inflated_demand_marker_data=inflated_demand_marker_data,
                            model_type=model_period,
                            product_code=product_code,
                            hyperparams_dict=hyperparams_dict,
                            int_result_dump_path=int_result_dump_path,
                            run_log_path=run_log_path)
    forecast_df = forecast_df.drop(columns=['agent_code'])

    return forecast_result, forecast_df


def train_test_split(product_data: pd.DataFrame,
                     pred_date: pd.Timestamp,
                     remove_first_month_from_training: bool,
                     model_period = None):
    """Split ``product_data`` into training, test and in-between rows.

    ``product_data`` is indexed by date and carries 'year', 'month', 'week'
    and boolean 'isVisitPlan' columns.

    Training rows depend on ``model_period``:
      * 'annual'  -- rows of years before ``pred_date``'s year;
      * 'monthly' -- rows dated before the first day of ``pred_date``'s month;
      * 'weekly'  -- rows of earlier years, plus rows of ``pred_date``'s year
        whose week number is below ``pred_date``'s week;
      * anything else -- rows dated before ``pred_date``.

    Args:
        remove_first_month_from_training: when true, rows of the earliest
            (year, month) present in ``product_data`` are dropped from the
            training set.

    Returns:
        ``None`` when no visit-plan row exists on ``pred_date``; otherwise
        ``(df_train, df_test, df_leftover, extra_test_features_introduced)``
        where ``df_leftover`` holds the rows dated strictly between the last
        training date and the test date, and the last item is whatever
        ``_flag_problematic_pos`` reports as added test columns.
    """
    first_year_in_data = product_data['year'].min()
    # Earliest month *within the earliest year*: a global minimum over 'month'
    # would point at January even when the data starts mid-year, and the
    # removal below would then drop nothing.
    first_month_in_data = product_data.loc[
        product_data['year'] == first_year_in_data, 'month'].min()

    if model_period == 'annual':
        df_train = product_data.loc[product_data['year'] < pred_date.year].copy()
    elif model_period == 'monthly':
        month_start = pd.Timestamp(year=pred_date.year,
                                   month=pred_date.month,
                                   day=1)
        df_train = product_data.loc[product_data.index < month_start].copy()
    elif model_period == 'weekly':
        # The week comparison only makes sense inside pred_date's own year;
        # without the year guard, rows from later years with a smaller week
        # number would leak into the training set.
        in_earlier_year = product_data['year'] < pred_date.year
        earlier_week_same_year = ((product_data['year'] == pred_date.year)
                                  & (product_data['week'] < pred_date.weekofyear))
        df_train = product_data.loc[in_earlier_year | earlier_week_same_year].copy()
    else:
        df_train = product_data.loc[product_data.index < pred_date].copy()

    if remove_first_month_from_training:
        df_train = df_train.loc[
            ~((df_train['month'] == first_month_in_data)
              & (df_train['year'] == first_year_in_data))].copy()

    df_test = product_data.loc[
        product_data.index == pred_date]
    df_test = df_test.loc[df_test['isVisitPlan']].copy()
    if len(df_test) == 0:
        return None
    problematic_pos, df_test, extra_test_features_introduced = \
        _flag_problematic_pos(df_train, df_test.reset_index())
    df_test = df_test.set_index('invoice_date')

    # rows that are neither in the training window nor on the test date
    df_leftover = product_data.loc[(product_data.index > df_train.index.max())
                                   & (product_data.index < df_test.index.min())]

    return df_train, df_test, df_leftover, extra_test_features_introduced


def prepare_features_for_sklearn_modelling(x_train_df: pd.DataFrame,
                                           x_test_df: pd.DataFrame) -> tuple:
    """Convert train and test features to an sklearn-compatible format.

    The training features go through ``_categorize_train`` and then
    ``_data_preproc_train`` (with ``max_cat_count=0``); the dictionaries those
    steps return are replayed on the test features via ``_categorize_val`` and
    ``_data_preproc_val``.

    Returns:
        ``(x_train, x_test)``; missing values left in ``x_test`` are set to 0.
    """
    # Fit both transforms on the training features.
    categorize_state = _categorize_train(x_train_df)
    preproc_state = _data_preproc_train(categorize_state['x_train_df'],
                                        max_cat_count=0)
    train_features = preproc_state['x_train_df']

    # Replay the fitted transforms on the test features.
    test_features, _ = _categorize_val(categorize_state, x_test_df)
    test_features, _ = _data_preproc_val(preproc_state, test_features)

    # for cases where category variable seen for first time in x_test
    return train_features, test_features.fillna(0)


def nonzero_shipment_classification(df_train_clf: pd.DataFrame,
                                    df_test_clf: pd.DataFrame,
                                    y_col: str,
                                    clf_cols_to_remove: list,
                                    extra_test_features_introduced: list,
                                    product_code: str,
                                    rerun_model: bool,
                                    model_pickle_path: str,
                                    run_log_path: str) -> pd.DataFrame:
    """Classify the test rows with the (re)trained or stored classifier.

    Args:
        df_train_clf: training rows, including ``y_col``.
        df_test_clf: test rows, including ``y_col``; receives the new column
            'pred_is_nonzero_shipments' in place.
        y_col: name of the target column.
        clf_cols_to_remove: columns excluded from the features (not modified).
        extra_test_features_introduced: additional columns present only in the
            test frame, excluded from the test features.
        product_code: written to the run log when the model is retrained.
        rerun_model: when true, fit a new classifier through
            ``high_level_rf_clf_modeler``, store it at ``model_pickle_path``
            and append its metrics to ``run_log_path``; otherwise load the
            classifier from ``model_pickle_path``.

    Returns:
        ``df_test_clf`` with the 'pred_is_nonzero_shipments' column filled from
        the classifier's predictions.

    Raises:
        Whatever ``load`` raises when ``rerun_model`` is false and the stored
        model cannot be read (``model`` catches ``FileNotFoundError``).
    """
    # Build fresh lists: appending to the caller's list would grow it by one
    # `y_col` entry on every call.
    clf_train_features_to_remove = list(clf_cols_to_remove) + [y_col]
    y_train_clf = df_train_clf[y_col]
    x_train_clf = df_train_clf.drop(columns=clf_train_features_to_remove)

    clf_test_features_to_remove = (clf_train_features_to_remove
                                   + list(extra_test_features_introduced))
    y_test_clf = df_test_clf[y_col]
    x_test_clf = df_test_clf.drop(columns=clf_test_features_to_remove)

    x_train_clf_proc, x_test_clf_proc = \
        prepare_features_for_sklearn_modelling(x_train_clf, x_test_clf)

    if rerun_model:
        model_clf, clf_trn_true_pct, clf_test_true_pct, \
            clf_trn_score, clf_test_score, clf_report, clf_ck_score = \
            high_level_rf_clf_modeler(x_train_clf_proc,
                                      y_train_clf,
                                      x_test_clf_proc,
                                      y_test_clf)
        dump(model_clf, model_pickle_path)
        with open(run_log_path, 'a') as model_log:
            model_log.write(product_code + '\n')
            model_log.write('clf train true percent: ' + str(clf_trn_true_pct) + '\n')
            model_log.write('clf test true percent: ' + str(clf_test_true_pct) + '\n')
            model_log.write('clf train score: ' + str(clf_trn_score) + '\n')
            model_log.write('clf test score: ' + str(clf_test_score) + '\n')
            model_log.write('clf report:\n' + str(clf_report) + '\n')
            model_log.write('clf ck score: ' + str(clf_ck_score) + '\n')
    else:
        model_clf = load(model_pickle_path)

    is_nonzero_test = model_clf.predict(x_test_clf_proc)
    assert len(is_nonzero_test) == len(df_test_clf)
    df_test_clf.loc[:, 'pred_is_nonzero_shipments'] = is_nonzero_test

    return df_test_clf


def drop_cols_regression(df_train_reg: pd.DataFrame,
                         df_test_reg: pd.DataFrame,
                         y_col: str,
                         extra_test_features_introduced: list,
                         rerun_model: bool,
                         model_pickle_path: str) -> np.ndarray:
    """Predict ``y_col`` for the test rows using all columns except a drop list.

    Features are every column except the module-level ``reg_drop_cols`` and
    ``y_col``; for the test frame, ``extra_test_features_introduced`` and
    'pred_is_nonzero_shipments' are excluded as well.

    Args:
        rerun_model: when true, fit through ``high_level_rf_reg_modeler`` and
            store the model at ``model_pickle_path``; otherwise load the model
            from that path.

    Returns:
        The model's predictions for the processed test features.
    """
    train_exclusions = [*reg_drop_cols, y_col]
    test_exclusions = [*train_exclusions,
                       *extra_test_features_introduced,
                       'pred_is_nonzero_shipments']

    y_train_reg = df_train_reg[y_col]
    y_test_reg = df_test_reg[y_col]
    x_train_reg_proc, x_test_reg_proc = prepare_features_for_sklearn_modelling(
        df_train_reg.drop(columns=train_exclusions),
        df_test_reg.drop(columns=test_exclusions))

    if not rerun_model:
        model_reg = load(model_pickle_path)
    else:
        # NOTE(review): four values are unpacked here (the second one replaces
        # the processed test features), while multi_model_keep_cols_regression
        # unpacks three from the same helper -- confirm which arity
        # high_level_rf_reg_modeler actually returns.
        model_reg, x_test_reg_proc, _trn_score, _val_score = \
            high_level_rf_reg_modeler(x_train_reg_proc,
                                      y_train_reg,
                                      x_test_reg_proc,
                                      y_test_reg)
        dump(model_reg, model_pickle_path)

    return model_reg.predict(x_test_reg_proc)


def _keep_cols_model_forecast(df_train_reg: pd.DataFrame,
                              df_test_reg: pd.DataFrame,
                              y_col: str,
                              keep_cols: list,
                              rerun_model: bool,
                              model_pickle_path: str):
    """Fit (or load) one keep-columns regressor and predict the test rows."""
    y_train_reg = df_train_reg[y_col]
    y_test_reg = df_test_reg[y_col]
    x_train_reg_proc, x_test_reg_proc = prepare_features_for_sklearn_modelling(
        df_train_reg[keep_cols], df_test_reg[keep_cols])

    if rerun_model:
        model_reg, _trn_score, _val_score = \
            high_level_rf_reg_modeler(x_train_reg_proc,
                                      y_train_reg,
                                      x_test_reg_proc,
                                      y_test_reg)
        dump(model_reg, model_pickle_path)
    else:
        model_reg = load(model_pickle_path)

    return model_reg.predict(x_test_reg_proc)


def multi_model_keep_cols_regression(df_train_reg: pd.DataFrame,
                                     df_test_reg: pd.DataFrame,
                                     y_col: str,
                                     rerun_model: bool,
                                     model_pickle_path_1: str,
                                     model_pickle_path_2: str) -> np.ndarray:
    """Predict ``y_col`` with one or both keep-columns regressors.

    The module-level ``regression_mode`` selects the behaviour:
      * 'model_1'  -- model on ``reg_keep_cols_1`` only;
      * 'model_2'  -- model on ``reg_keep_cols_2`` only;
      * 'ensemble' -- model 1 everywhere, except on "special" rows where the
        larger of the two predictions is used. A row is special when
        'pre_nonreplacement_holiday', 'triple_sell' or 'double_sell' is true,
        or 'days_since_price_chg_ann' >= 0, or 'credit_request_coeff' > 0, or
        'days_from_easter' >= 0.

    Args:
        rerun_model: when true each needed model is fitted and stored at its
            pickle path; otherwise it is loaded from that path.

    Returns:
        The predictions for the rows of ``df_test_reg``, in row order.

    Raises:
        ValueError: if ``regression_mode`` is not one of the three modes
            (checked before any model is fitted or loaded).
    """
    if regression_mode not in ('model_1', 'model_2', 'ensemble'):
        raise ValueError(
            "regression_mode must be 'model_1', 'model_2' or 'ensemble', "
            "got {!r}".format(regression_mode))

    y_pred_1 = y_pred_2 = None
    if regression_mode != 'model_2':
        y_pred_1 = _keep_cols_model_forecast(
            df_train_reg, df_test_reg, y_col,
            reg_keep_cols_1, rerun_model, model_pickle_path_1)
    if regression_mode != 'model_1':
        y_pred_2 = _keep_cols_model_forecast(
            df_train_reg, df_test_reg, y_col,
            reg_keep_cols_2, rerun_model, model_pickle_path_2)

    if regression_mode == 'model_1':
        return y_pred_1
    if regression_mode == 'model_2':
        return y_pred_2

    # ensemble
    temp = df_test_reg.reset_index()
    assert (temp['pre_nonreplacement_holiday'].dtype == 'bool'
            and temp['triple_sell'].dtype == 'bool'
            and temp['double_sell'].dtype == 'bool')
    special_days_mask = (temp['pre_nonreplacement_holiday']
                         | temp['triple_sell']
                         | temp['double_sell'])
    special_days_mask = (special_days_mask
                         | (temp['days_since_price_chg_ann'] >= 0)
                         | (temp['credit_request_coeff'] > 0)
                         | (temp['days_from_easter'] >= 0))

    forecast_1 = df_test_reg.reset_index()[['invoice_date', 'pos_code']]
    forecast_1['predicted_loading_1'] = y_pred_1
    forecast_2 = df_test_reg.reset_index()[['invoice_date', 'pos_code']]
    forecast_2['predicted_loading_2'] = y_pred_2

    forecast = forecast_1.rename(
        columns={'predicted_loading_1': 'predicted_loading'})
    # on special days take the more generous of the two model outputs
    forecast.loc[special_days_mask, 'predicted_loading'] = \
        [max(item) for item in zip(forecast_1.loc[special_days_mask, 'predicted_loading_1'],
                                   forecast_2.loc[special_days_mask, 'predicted_loading_2'])]
    assert len(forecast) == len(y_pred_1) == len(y_pred_2)

    return forecast['predicted_loading'].to_numpy()


def _fit_or_reuse(run_step, retrain_now: bool):
    """Run a modelling step, retraining when asked or when no stored model exists.

    ``run_step`` is called with a single bool (the rerun flag). With
    ``retrain_now`` false it is first tried with ``False`` (reuse the stored
    model) and, on ``FileNotFoundError``, called again with ``True``.
    """
    if retrain_now:
        return run_step(True)
    try:
        return run_step(False)
    except FileNotFoundError:
        return run_step(True)


def model(df_train: pd.DataFrame,
          df_test: pd.DataFrame,
          y_col: str,
          is_pred_date_first_in_period: bool,
          extra_test_features_introduced: list,
          model_period: str,
          current_product: str,
          so_code: str,
          model_pickle_path: str,
          run_log_path: str) -> tuple:
    """Two-stage forecast: classify non-zero shipments, then regress quantities.

    Stage 1 marks each test row with 'pred_is_nonzero_shipments'. Stage 2
    predicts a quantity for the visit-plan rows marked non-zero, training on
    the rows of ``df_train`` where 'is_zero_sale' is false. Rows without a
    regression prediction get a 'predicted_loading' of 0.

    Models are retrained (and stored under ``model_pickle_path``) when
    ``is_pred_date_first_in_period`` is true or when no stored model file is
    found; otherwise the stored model is reused.

    Reads ``clf_drop_cols`` and ``use_drop_reg_cols_instead_of_keep`` from
    module scope; the latter selects ``drop_cols_regression`` over
    ``multi_model_keep_cols_regression``.

    Returns:
        ``(df_test, forecast_result)``: the classified test frame and the
        visit-plan rows with 'shipments' renamed to 'known_shipment' plus the
        'predicted_loading' column.
    """
    clf_model_pickle_path = os.path.join(
        model_pickle_path,
        'model_clf_{}_{}_{}.jpkl'.format(model_period, so_code, current_product))

    def _classify(rerun_model: bool) -> pd.DataFrame:
        return nonzero_shipment_classification(
            df_train,
            df_test,
            y_col,
            clf_drop_cols,
            extra_test_features_introduced,
            current_product,
            rerun_model,
            clf_model_pickle_path,
            run_log_path
        )

    df_test = _fit_or_reuse(_classify, is_pred_date_first_in_period)

    # non-zero marked shipments forecast using regression
    df_train_reg = df_train.loc[~df_train['is_zero_sale']].copy()
    # Take only visit plan wise
    df_test_reg = df_test.loc[(df_test['isVisitPlan']
                               & df_test['pred_is_nonzero_shipments'])].copy()
    if len(df_test_reg) == 0:
        # nothing to regress: every visit-plan row is forecast as zero
        forecast_result = df_test.loc[
            df_test['isVisitPlan']].rename(columns={'shipments': 'known_shipment'})
        forecast_result = forecast_result.reset_index()
        forecast_result['predicted_loading'] = 0
        return df_test, forecast_result

    if use_drop_reg_cols_instead_of_keep:
        reg_model_pickle_path = os.path.join(
            model_pickle_path,
            'model_reg_{}_{}_{}.jpkl'.format(model_period, so_code, current_product))

        def _regress(rerun_model: bool):
            return drop_cols_regression(
                df_train_reg,
                df_test_reg,
                y_col,
                extra_test_features_introduced,
                rerun_model,
                reg_model_pickle_path
            )
    else:
        reg_model_pickle_path_1 = os.path.join(
            model_pickle_path,
            'model_reg_{}_{}_{}_1.jpkl'.format(model_period, so_code, current_product))
        reg_model_pickle_path_2 = os.path.join(
            model_pickle_path,
            'model_reg_{}_{}_{}_2.jpkl'.format(model_period, so_code, current_product))

        def _regress(rerun_model: bool):
            return multi_model_keep_cols_regression(
                df_train_reg,
                df_test_reg,
                y_col,
                rerun_model,
                reg_model_pickle_path_1,
                reg_model_pickle_path_2
            )

    y_pred = _fit_or_reuse(_regress, is_pred_date_first_in_period)

    forecast = df_test_reg.reset_index()[['invoice_date', 'pos_code']]
    forecast['predicted_loading'] = y_pred
    forecast_result = df_test.loc[
        df_test['isVisitPlan']].rename(columns={'shipments': 'known_shipment'})
    forecast_result = forecast_result.merge(forecast,
                                            how='left',
                                            on=['invoice_date', 'pos_code'])
    # visit-plan rows classified as zero shipment have no regression output
    forecast_result['predicted_loading'] = forecast_result['predicted_loading'].fillna(0)

    return df_test, forecast_result


def high_level_post_model_operations(agent_forecasts: pd.DataFrame,
                                     train_data_with_agent: pd.DataFrame,
                                     past_test_data_with_agent: pd.DataFrame,
                                     inflated_demand_marker_data: pd.DataFrame,
                                     model_type: str,
                                     product_code: str,
                                     hyperparams_dict: dict,
                                     int_result_dump_path: str,
                                     run_log_path: str):
    """Post-process one agent's forecasts with that agent's stored hyperparameters.

    ``agent_forecasts`` must hold exactly one distinct ``agent_code`` and one
    distinct ``visit_date`` (both enforced by assertions).  The agent's entry
    in ``hyperparams_dict`` supplies ``agg_buffer_pct``, ``force_upper_limit``,
    ``force_inflation_factor`` and ``std_adjust_factor``; these are forwarded,
    together with the data frames, to ``__high_level_std_inflate_agg_wrapper``.
    Afterwards the parameters that were used are appended to the text log at
    ``run_log_path``.

    Returns the value produced by ``__high_level_std_inflate_agg_wrapper``.
    """
    assert agent_forecasts['agent_code'].nunique() == 1
    assert agent_forecasts['visit_date'].nunique() == 1
    pred_date = agent_forecasts['visit_date'].iloc[0]
    agent = agent_forecasts['agent_code'].iloc[0]
    # The value is not used below; the lookup is kept so that a frame without
    # a 'product_cat' column still fails at this point.
    _ = agent_forecasts['product_cat'].iloc[0]

    params: dict = hyperparams_dict[agent]
    agg_buffer_pct = params['agg_buffer_pct']
    force_upper_limit = params['force_upper_limit']
    force_inflation_factor = params['force_inflation_factor']
    std_adjust_factor = params['std_adjust_factor']

    agg_forecast_df = __high_level_std_inflate_agg_wrapper(
        agent_forecasts, train_data_with_agent, past_test_data_with_agent,
        inflated_demand_marker_data, pred_date, model_type, product_code,
        agent, int_result_dump_path, agg_buffer_pct, force_upper_limit,
        force_inflation_factor, std_adjust_factor)

    # Append (never truncate) so the log accumulates one entry per call.
    with open(run_log_path, 'a') as model_log:
        model_log.writelines([
            'hyperparams loaded from storage:\n',
            'final params used for {}, {}: \n'.format(agent, pred_date),
            'agg buffer pct: {}\n'.format(agg_buffer_pct),
            'force upper limit: {}\n'.format(force_upper_limit),
            'force inflation factor: {}\n'.format(force_inflation_factor),
            'std adjust factor: {}\n'.format(std_adjust_factor),
        ])

    return agg_forecast_df


def _flag_problematic_pos(train_data: pd.DataFrame,
                          test_data: pd.DataFrame) -> tuple:
    """Attach per-POS training statistics to the test rows and flag erratic POS.

    Both inputs are copied first, so the callers' frames are not modified.
    The module-level thresholds ``high_std_thr``, ``low_density_thr`` and
    ``low_vp_thr`` (defined elsewhere in the notebook) are read.

    Returns a 3-tuple:
      * a Series with the unique ``pos_code`` values whose shipment std
        (over non-zero rows, or over all rows) is at or above the
        ``high_std_thr`` quantile AND whose shipment density or visit-plan
        density is at or below its threshold;
      * the test frame left-merged on ``pos_code`` with the per-POS
        statistics, plus three derived ratio columns;
      * the list of column names that were added to the test frame.
    """
    def _flat_names(prefix, columns):
        # ('shipments', 'std') -> '<prefix>shipments_std'
        return [prefix + '_'.join(parts) for parts in columns.values]

    train_df = train_data.copy()
    test_df = test_data.copy()

    # Statistics over every training row (zero shipments included).
    overall_stats = train_df.groupby('pos_code').agg(
        {'shipments': ['std', 'median', 'mean', 'count']})
    overall_stats.columns = _flat_names('train_', overall_stats.columns)
    visit_plan_rows = train_df[train_df['isVisitPlan']]
    overall_stats['train_shipments_vp_count'] = visit_plan_rows.groupby(
        'pos_code').agg({'shipments': 'count'})
    test_df = test_df.merge(overall_stats, how='left', on='pos_code')

    # Distinct index values of the training frame, counted before the
    # zero-shipment rows are dropped (presumably dates -- confirm with caller).
    train_dates_count = len(set(train_df.index))
    train_df['shipments'] = train_df['shipments'].replace(0, np.nan)
    train_df = train_df.dropna(subset=['shipments'])

    # Same statistics restricted to rows with a non-zero shipment.
    nonzero_stats = train_df.groupby('pos_code').agg(
        {'shipments': ['std', 'median', 'mean', 'count'],
         'isVisitPlan': 'sum'})
    nonzero_stats.columns = _flat_names('train_nonzero_', nonzero_stats.columns)
    nonzero_visit_plan_rows = train_df[train_df['isVisitPlan']]
    nonzero_stats['train_nonzero_shipments_vp_count'] = \
        nonzero_visit_plan_rows.groupby('pos_code').agg({'shipments': 'count'})
    # TODO: train_nonzero_shipments_vp_count is same as train_nonzero_isVisitPlan_sum
    test_df = test_df.merge(nonzero_stats, how='left', on='pos_code')

    test_df['Shipments_train_density'] = (
        test_df['train_nonzero_shipments_count'] / train_dates_count)
    test_df['Shipments_train_sparsity'] = 1 - (
        test_df['train_nonzero_shipments_vp_count']
        / test_df['train_shipments_vp_count'])
    test_df['isVisitPlan_train_density'] = (
        test_df['train_nonzero_isVisitPlan_sum']
        / test_df['train_nonzero_shipments_count'])

    nonzero_std_cutoff = test_df['train_nonzero_shipments_std'].quantile(high_std_thr)
    overall_std_cutoff = test_df['train_shipments_std'].quantile(high_std_thr)
    has_high_std = ((test_df['train_nonzero_shipments_std'] >= nonzero_std_cutoff)
                    | (test_df['train_shipments_std'] >= overall_std_cutoff))
    has_low_density = ((test_df['Shipments_train_density'] <= low_density_thr)
                       | (test_df['isVisitPlan_train_density'] <= low_vp_thr))
    flagged_rows = has_high_std & has_low_density
    problematic_pos = pd.Series(test_df.loc[flagged_rows, 'pos_code'].unique())

    extra_test_features_introduced = [
        'train_nonzero_shipments_std',
        'train_nonzero_shipments_median',
        'train_nonzero_shipments_mean',
        'train_nonzero_shipments_count',
        'train_nonzero_isVisitPlan_sum',
        'train_nonzero_shipments_vp_count',
        'train_shipments_std',
        'train_shipments_mean',
        'train_shipments_median',
        'train_shipments_count',
        'train_shipments_vp_count',
        'Shipments_train_density',
        'Shipments_train_sparsity',
        'isVisitPlan_train_density',
    ]

    return problematic_pos, test_df, extra_test_features_introduced


# Adjustments based on std
def _post_model_std_based_adjust(forecast_result: pd.DataFrame,
                                 std_adjustment_factor: float) -> pd.DataFrame:
    """Add a std-proportional uplift to ``predicted_loading`` for selected rows.

    ``forecast_result`` is modified in place and also returned.  A column
    ``predicted_loading_std_adj_amt`` is always created (initialised to 0).
    When ``std_adjustment_factor`` is positive, rows that satisfy all of

      * non-zero-shipment std or overall shipment std at or above its
        ``high_std_thr`` quantile,
      * ``Shipments_train_density`` <= ``low_density_thr``,
      * ``isVisitPlan_train_density`` > ``low_vp_thr``

    get ``train_nonzero_shipments_std * std_adjustment_factor`` (NaN treated
    as 0) added to ``predicted_loading`` and recorded in the new column.
    The three thresholds are module-level names defined elsewhere.
    """
    forecast_result.loc[:, 'predicted_loading_std_adj_amt'] = 0
    nonzero_std = forecast_result['train_nonzero_shipments_std']
    overall_std = forecast_result['train_shipments_std']
    nonzero_std_cutoff = nonzero_std.quantile(high_std_thr)
    overall_std_cutoff = overall_std.quantile(high_std_thr)

    if not (std_adjustment_factor > 0):
        return forecast_result

    has_high_std = ((nonzero_std >= nonzero_std_cutoff)
                    | (overall_std >= overall_std_cutoff))
    has_low_density = forecast_result['Shipments_train_density'] <= low_density_thr
    above_vp_threshold = forecast_result['isVisitPlan_train_density'] > low_vp_thr
    rows_to_adjust = has_high_std & has_low_density & above_vp_threshold

    uplift = (forecast_result.loc[rows_to_adjust, 'train_nonzero_shipments_std']
              * std_adjustment_factor)
    uplift = uplift.fillna(0)

    current_prediction = forecast_result.loc[rows_to_adjust, 'predicted_loading']
    forecast_result.loc[rows_to_adjust, 'predicted_loading'] = current_prediction + uplift
    forecast_result.loc[rows_to_adjust, 'predicted_loading_std_adj_amt'] = uplift

    return forecast_result


# double/triple sell inflation before aggregation
def _post_model_result_inflation(forecast_result: pd.DataFrame,
                                 inflation_factor: float,
                                 int_result_dump_path: str,
                                 pred_date: pd.Timestamp,
                                 model_type: str,
                                 product_code: str,
                                 agent_code = None) -> pd.DataFrame:
    def __inflation_levels_list(row, result_column_name):
        _inflation_factor = inflation_factor
        if row['triple_sell'] or row['credit_request_type'] == 'TRIPLE':
            _inflation_factor = _inflation_factor + 0.2

        if not row['pred_is_nonzero_shipments']:
            return [row[result_column_name] * _inflation_factor, 0]
        else:
            return [row[result_column_name] * _inflation_factor,
                    (row[result_column_name]
                     + row['train_nonzero_shipments_mean'] * (_inflation_factor - 1))]

    forecast_result.loc[:, 'predicted_loading'] = \
        forecast_result.apply(lambda row:
                              np.nanmax(__inflation_levels_list(row, 'predicted_loading'))
                              if (row['triple_sell']
                                  or row['double_sell']
                                  or (row['credit_request_type'] == 'TRIPLE')
                                  or (row['credit_request_type'] == 'DOUBLE')
                                  or (row['days_from_easter'] >= 0)
                                  or row['pre_nonreplacement_holiday'])

                              else np.nanmean(__inflation_levels_list(row, 'predicted_loading'))
                              if (row['days_since_price_chg_ann'] > 0
                                  and row['days_from_price_chg'] > 0)

                              else row['predicted_loading'],
                              axis=1)

    if agent_code:
        forecast_result.to_csv(
            os.path.join(int_result_dump_path,
                         'result_with_zeroes_std_adjust_n_inflated_{}_{}_{}_{}_{}.csv'.format(
                             inflation_factor, model_type, agent_code, product_code, pred_date.date()))
        )
    else:
        forecast_result.to_csv(
            os.path.join(int_result_dump_path,
                         'result_with_zeroes_std_adjust_n_inflated_{}_{}_{}_{}.csv'.format(
                             inflation_factor, model_type, product_code, pred_date.date()))
        )
    return forecast_result


def __high_level_std_inflate_agg_wrapper(agent_forecasts: pd.DataFrame,
                                         train_data_with_agent: pd.DataFrame,
                                         past_test_data_with_agent: pd.DataFrame,
                                         inflated_demand_marker_data: pd.DataFrame,
                                         pred_date: pd.Timestamp,
                                         model_type: str,
                                         product_code: str,
                                         agent_code: str,
                                         int_result_dump_path: str,
                                         agg_buffer_pct: float,
                                         force_upper_limit: bool,
                                         inflation_factor: float,
                                         std_adjustment_factor: float) -> pd.DataFrame:
    """Run the post-model stages in order: std adjustment, inflation, aggregation.

    The frame returned by ``_post_model_std_based_adjust`` is passed to
    ``_post_model_result_inflation`` and that result is passed to
    ``__high_level_agg``, whose return value is returned unchanged.
    """
    std_adjusted = _post_model_std_based_adjust(agent_forecasts,
                                                std_adjustment_factor)
    inflated = _post_model_result_inflation(std_adjusted,
                                            inflation_factor,
                                            int_result_dump_path,
                                            pred_date,
                                            model_type,
                                            product_code,
                                            agent_code)
    return __high_level_agg(inflated,
                            train_data_with_agent,
                            past_test_data_with_agent,
                            inflated_demand_marker_data,
                            product_code,
                            agg_buffer_pct,
                            force_upper_limit)


def __high_level_agg(agent_forecasts: pd.DataFrame,
                     train_data_with_agent: pd.DataFrame,
                     past_test_data_with_agent: pd.DataFrame,
                     inflated_demand_marker_data: pd.DataFrame,
                     product_code: str,
                     agg_buffer_pct: float,
                     force_upper_limit: bool) -> pd.DataFrame:
    """Aggregate the forecasts and tag the result with ``product_code``.

    Delegates to ``high_level_post_model_aggregator`` (defined elsewhere in
    the notebook) and stores ``product_code`` in a ``product_code`` column of
    the frame it returns.
    """
    aggregated = high_level_post_model_aggregator(agent_forecasts,
                                                  train_data_with_agent,
                                                  past_test_data_with_agent,
                                                  inflated_demand_marker_data,
                                                  agg_buffer_pct,
                                                  force_upper_limit)
    aggregated['product_code'] = product_code
    return aggregated


def __form_tuning_result_df(keys: list, values: list) -> pd.DataFrame:
    assert len(keys) == len(values)
    assert isinstance(values[0], list)
    df_len = len(values[0])
    if df_len > 1:
        return pd.DataFrame(dict(zip(keys, values)))
    elif df_len == 1:
        return pd.DataFrame(dict(zip(keys, values)), index=[0])

In [61]:
# Scratch cell: the same setup statements as the body of
# `high_level_model_handler`, run at notebook top level so a single
# prediction date can be tried by hand in the following cells.
# NOTE(review): relies on `pro_code`, `broadcasted_df`, `output_path`,
# `pred_start_date` and `so_code` already existing in the kernel; `pro_code`
# is not assigned anywhere in this cell.

# Unpack the broadcast payload: each entry is indexed as [i][1] to get the frame.
shipment_data = broadcasted_df.value[0][1]
present_agent_map_data = broadcasted_df.value[1][1].copy()
prepared_product_df = broadcasted_df.value[2][1].copy()
inflated_demand_marker_data = broadcasted_df.value[3][1].copy()
shipment_split_data = broadcasted_df.value[4][1].copy()
sr_loading_data = broadcasted_df.value[5][1].copy()
sr_unloading_data = broadcasted_df.value[6][1].copy()
present_agent_map_data = present_agent_map_data.reset_index()

# Keep only the rows of the selected product, indexed by (product_code, invoice_date).
prepared_product_data = prepared_product_df[prepared_product_df.product_code.isin([pro_code])].copy()
prepared_product_data = prepared_product_data.set_index(['product_code', 'invoice_date']).sort_index()

# Overwritten further down with the value read back from the index.
current_product = pro_code

model_pickle_path = 'model_pickles'
os.makedirs(model_pickle_path, exist_ok=True)
# Model pickles are only reused within the full run for one product, so at
# the start of a run for a new product any existing model pickle file can be
# discarded to save disk space.
exisiting_model_pickle_files = os.listdir(model_pickle_path)
if len(exisiting_model_pickle_files) > 0:
    for model_pickle in exisiting_model_pickle_files:
        file_path = os.path.join(model_pickle_path, model_pickle)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            # Best-effort cleanup: any failure to delete a file is ignored silently.
            pass
int_result_dump_path = os.path.join(output_path, 'intermediate_dumps')
os.makedirs(int_result_dump_path, exist_ok=True)  # succeeds even if directory exists.
hyperparams_storage_path = 'hyperparams'
os.makedirs(hyperparams_storage_path, exist_ok=True)

# First value of index level 0 (product_code); raises IndexError if the
# filtered frame has no rows.
current_product = prepared_product_data.index.get_level_values(0)[0]
# print("current_product", current_product)
# Drop the product_code level so the frame is indexed by invoice_date only.
product_df =prepared_product_data.copy()
product_df.index = product_df.index.droplevel()
product_df = product_df.sort_index()
dates_in_data: pd.DatetimeIndex = product_df.index
# Distinct index values on or after `pred_start_date`.
pred_dates = dates_in_data[dates_in_data >= pred_start_date].unique().array
daywise_forecast_df_list: list = []
daywise_forecast_raw_annual_model_list: list = []
daywise_forecast_raw_monthly_model_list: list = []
daywise_forecast_raw_weekly_model_list: list = []

# (invoice_date, pos_code, agent_code) rows of the shipment data for this product.
past_agent_map =shipment_data.loc[shipment_data['product_code'] == current_product]
past_agent_map = past_agent_map.reset_index()
past_agent_map = past_agent_map[['invoice_date', 'pos_code', 'agent_code']]

# Hyperparameters are read from hyperparams/<so_code>_<product>_hyperparams.json;
# the file must already exist.
with open(os.path.join(
        hyperparams_storage_path,
        '{}_{}_hyperparams.json'.format(so_code, current_product)), 'r') as fp:
    hyperparams_dict: dict = json.load(fp)


In [64]:
# Pick the second entry of `pred_dates` for a single-date trial of the
# daywise handler in the next cell, and display it.
pred_date = pred_dates[1]
pred_date

Timestamp('2018-01-04 00:00:00')

In [76]:
# Single-date trial: run the daywise handler for the `pred_date` chosen above.
# The result is either None or a 4-tuple (combined forecast frame plus the raw
# annual, monthly and weekly model frames).
# NOTE(review): `remove_first_month_from_training`, `y_col` and `run_log_path`
# must already be defined in the kernel; they are not set in this cell.
daywise_forecasts = high_level_daywise_model_handler(
            product_df,
            inflated_demand_marker_data,
            present_agent_map_data,
            past_agent_map,
            pred_date,
            remove_first_month_from_training,
            y_col,
            hyperparams_dict,
            current_product,
            so_code,
            run_log_path,
            model_pickle_path,
            int_result_dump_path
        )

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


# High Level Handlers

In [55]:
def high_level_model_handler(pro_code):
    """Forecast every prediction date of one product and score the outcome.

    Reads the notebook-level names ``broadcasted_df``, ``output_path``,
    ``pred_start_date``, ``remove_first_month_from_training``, ``y_col``,
    ``so_code`` and ``run_log_path`` (all defined elsewhere).

    Steps:
      1. Unpack the broadcast frames and keep the rows of ``pro_code``.
      2. Empty the ``model_pickles`` directory (best effort) and make sure the
         intermediate-dump and hyperparameter directories exist.
      3. Load ``hyperparams/<so_code>_<product>_hyperparams.json``.
      4. Call ``high_level_daywise_model_handler`` for every distinct index
         date on or after ``pred_start_date`` and collect non-``None`` results.
      5. Dump the raw annual/monthly/weekly model frames to CSV, build the
         comparison frame, append the performance figures to the run log and
         write the final CSV under ``output_path``.

    Parameters
    ----------
    pro_code
        Product code used to filter ``prepared_product_df.product_code``.

    Returns
    -------
    The comparison frame from ``high_level_post_agg_comparison_prep``, or
    ``None`` when the product has no rows or no date produced a forecast.
    """
    # Each broadcast entry is indexed as [i][1] to obtain the frame.
    shipment_data = broadcasted_df.value[0][1]
    present_agent_map_data = broadcasted_df.value[1][1].copy()
    prepared_product_df = broadcasted_df.value[2][1].copy()
    inflated_demand_marker_data = broadcasted_df.value[3][1].copy()
    shipment_split_data = broadcasted_df.value[4][1].copy()
    sr_loading_data = broadcasted_df.value[5][1].copy()
    sr_unloading_data = broadcasted_df.value[6][1].copy()
    present_agent_map_data = present_agent_map_data.reset_index()

    prepared_product_data = prepared_product_df[
        prepared_product_df.product_code.isin([pro_code])].copy()
    prepared_product_data = prepared_product_data.set_index(
        ['product_code', 'invoice_date']).sort_index()

    model_pickle_path = 'model_pickles'
    os.makedirs(model_pickle_path, exist_ok=True)
    # Model pickles are only reused within the full run for one product, so at
    # the start of a run for a new product any existing model pickle file can
    # be discarded to save disk space.
    for model_pickle in os.listdir(model_pickle_path):
        file_path = os.path.join(model_pickle_path, model_pickle)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except OSError as cleanup_error:
            # Still best effort (a leftover pickle must not abort the run),
            # but report the failure instead of hiding it.
            print('could not remove stale model pickle {}: {}'.format(
                file_path, cleanup_error))

    int_result_dump_path = os.path.join(output_path, 'intermediate_dumps')
    os.makedirs(int_result_dump_path, exist_ok=True)  # succeeds even if directory exists.
    hyperparams_storage_path = 'hyperparams'
    os.makedirs(hyperparams_storage_path, exist_ok=True)

    if len(prepared_product_data) == 0:
        # No rows for this product: there are no prediction dates, so the
        # outcome is the same as the "no forecasts" path below.  Returning
        # here avoids an IndexError from the [0] lookup.
        return None
    current_product = prepared_product_data.index.get_level_values(0)[0]

    # Drop the product_code level so the frame is indexed by invoice_date only.
    product_df = prepared_product_data.copy()
    product_df.index = product_df.index.droplevel()
    product_df = product_df.sort_index()
    dates_in_data: pd.DatetimeIndex = product_df.index
    pred_dates = dates_in_data[dates_in_data >= pred_start_date].unique().array

    past_agent_map = shipment_data.loc[shipment_data['product_code'] == current_product]
    past_agent_map = past_agent_map.reset_index()
    past_agent_map = past_agent_map[['invoice_date', 'pos_code', 'agent_code']]

    with open(os.path.join(
            hyperparams_storage_path,
            '{}_{}_hyperparams.json'.format(so_code, current_product)), 'r') as fp:
        hyperparams_dict: dict = json.load(fp)

    model_periods = ('annual', 'monthly', 'weekly')
    daywise_forecast_df_list: list = []
    daywise_raw_forecasts: dict = {period: [] for period in model_periods}

    # TODO: record date wise runtime
    for pred_date in tqdm(pred_dates):
        daywise_forecasts = high_level_daywise_model_handler(
            product_df,
            inflated_demand_marker_data,
            present_agent_map_data,
            past_agent_map,
            pred_date,
            remove_first_month_from_training,
            y_col,
            hyperparams_dict,
            current_product,
            so_code,
            run_log_path,
            model_pickle_path,
            int_result_dump_path
        )
        if daywise_forecasts is None:
            # The daywise handler returns None when a model gave no forecast
            # or when it logged an error; skip that date.
            continue
        curr_forecast_df, curr_raw_annual, curr_raw_monthly, curr_raw_weekly = daywise_forecasts
        daywise_forecast_df_list.append(curr_forecast_df)
        daywise_raw_forecasts['annual'].append(curr_raw_annual)
        daywise_raw_forecasts['monthly'].append(curr_raw_monthly)
        daywise_raw_forecasts['weekly'].append(curr_raw_weekly)

    if len(daywise_forecast_df_list) == 0:
        return None

    forecast_df = pd.concat(daywise_forecast_df_list, sort=False)
    # One CSV per model period: result_with_zeroes_<period>_model_<product>.csv
    for period in model_periods:
        pd.concat(daywise_raw_forecasts[period], sort=False).to_csv(
            os.path.join(int_result_dump_path,
                         'result_with_zeroes_{}_model_{}.csv'.format(period, current_product))
        )

    forecast_comparison_df = high_level_post_agg_comparison_prep(
        forecast_df,
        current_product,
        shipment_data,
        shipment_split_data,
        sr_loading_data,
        sr_unloading_data
    )
    asl_unload_pct, asl_oos_pct, jti_unload_pct, jti_oos_pct = get_forecast_performance_figures(
        forecast_comparison_df
    )
    asl_cost = asl_unload_pct + asl_oos_pct
    jti_cost = jti_unload_pct + jti_oos_pct
    with open(run_log_path, 'a') as model_log:
        # Header on its own line and fields separated, so the entry is
        # readable (previously the values ran straight into the next label).
        model_log.write('product {} final result:\n'.format(current_product))
        model_log.write('asl_unload_pct: {}, asl_oos_pct: {}, asl_cost: {}\n'.format(
            asl_unload_pct, asl_oos_pct, asl_cost))
        model_log.write('jti_unload_pct: {}, jti_oos_pct: {}, jti_cost: {}\n'.format(
            jti_unload_pct, jti_oos_pct, jti_cost))
    forecast_comparison_df.to_csv(os.path.join(
        output_path,
        'final_output_rf_high_no_pos_removed_{}.csv'.format(current_product)))

    return forecast_comparison_df


def high_level_daywise_model_handler(product_data: pd.DataFrame,
                                     inflated_demand_marker_data: pd.DataFrame,
                                     present_agent_map_data: pd.DataFrame,
                                     past_agent_map_data: pd.DataFrame,
                                     pred_date: pd.Timestamp,
                                     remove_first_month_from_training: bool,
                                     y_col: str,
                                     hyperparams_dict: dict,
                                     product_code: str,
                                     so_code: str,
                                     run_log_path: str,
                                     model_pickle_path: str,
                                     int_result_dump_path: str):
    """Forecast one prediction date with the annual, monthly and weekly models.

    ``single_date_split_model_forecast`` is called once per model period, in
    the order annual, monthly, weekly.  If any call returns ``None`` the
    function returns ``None`` immediately and the remaining periods are not
    run.  Otherwise the three ``predicted_loading`` columns are attached
    positionally to the annual model's frame and their row-wise mean (NaN
    skipped, rounded to one decimal) becomes ``predicted_loading``.

    Returns a 4-tuple ``(forecast_df, raw_annual, raw_monthly, raw_weekly)``.
    Any exception is caught: a short message is printed, the traceback is
    appended to ``run_log_path`` and ``None`` is returned implicitly.

    NOTE(review): this notebook contains another cell that defines a function
    with this same name; whichever cell is executed last is the one in effect.
    """
    model_columns = (
        ('annual', 'predicted_loading_annual_model'),
        ('monthly', 'predicted_loading_monthly_model'),
        ('weekly', 'predicted_loading_weekly_model'),
    )
    try:
        raw_by_period = {}
        adjusted_by_period = {}
        for model_period, _ in model_columns:
            period_forecasts = single_date_split_model_forecast(
                product_data,
                inflated_demand_marker_data,
                present_agent_map_data,
                past_agent_map_data,
                pred_date,
                remove_first_month_from_training,
                y_col,
                model_period,
                hyperparams_dict,
                product_code,
                so_code,
                model_pickle_path,
                run_log_path,
                int_result_dump_path
            )
            if period_forecasts is None:
                return None
            raw_by_period[model_period], adjusted_by_period[model_period] = period_forecasts

        assert (len(adjusted_by_period['annual'])
                == len(adjusted_by_period['monthly'])
                == len(adjusted_by_period['weekly']))
        # check if some other columns need to be dropped
        forecast_df = adjusted_by_period['annual'].drop(columns=['predicted_loading'])
        for model_period, column_name in model_columns:
            # `.array` drops the index, so values are aligned by row position.
            forecast_df.loc[:, column_name] = \
                adjusted_by_period[model_period]['predicted_loading'].array
        per_model_predictions = forecast_df[[column_name for _, column_name in model_columns]]
        forecast_df['predicted_loading'] = per_model_predictions.mean(axis=1, skipna=True)
        forecast_df['predicted_loading'] = forecast_df['predicted_loading'].round(1)

        return (forecast_df,
                raw_by_period['annual'],
                raw_by_period['monthly'],
                raw_by_period['weekly'])
    except Exception:
        print('error in {}: check log'.format(product_code))
        with open(run_log_path, 'a') as model_log:
            model_log.write(product_code + ' ' + str(pred_date) + ' error encountered: \n')
            model_log.write(traceback.format_exc())

# Modelers

In [45]:
def high_level_daywise_model_handler(product_data: pd.DataFrame,
                                     inflated_demand_marker_data: pd.DataFrame,
                                     present_agent_map_data: pd.DataFrame,
                                     past_agent_map_data: pd.DataFrame,
                                     pred_date: pd.Timestamp,
                                     remove_first_month_from_training: bool,
                                     y_col: str,
                                     hyperparams_dict: dict,
                                     product_code: str,
                                     so_code: str,
                                     run_log_path: str,
                                     model_pickle_path: str,
                                     int_result_dump_path: str):
    """Forecast one prediction date with the annual, monthly and weekly models.

    ``single_date_split_model_forecast`` is called once per model period, in
    the order annual, monthly, weekly.  If any call returns ``None`` the
    function returns ``None`` immediately and the remaining periods are not
    run.  Otherwise the three ``predicted_loading`` columns are attached
    positionally to the annual model's frame and their row-wise mean (NaN
    skipped, rounded to one decimal) becomes ``predicted_loading``.

    Returns a 4-tuple ``(forecast_df, raw_annual, raw_monthly, raw_weekly)``.
    Any exception is caught: a short message is printed, the traceback is
    appended to ``run_log_path`` and ``None`` is returned implicitly.

    NOTE(review): this notebook contains another cell that defines a function
    with this same name; whichever cell is executed last is the one in effect.
    """
    model_columns = (
        ('annual', 'predicted_loading_annual_model'),
        ('monthly', 'predicted_loading_monthly_model'),
        ('weekly', 'predicted_loading_weekly_model'),
    )
    try:
        raw_by_period = {}
        adjusted_by_period = {}
        for model_period, _ in model_columns:
            period_forecasts = single_date_split_model_forecast(
                product_data,
                inflated_demand_marker_data,
                present_agent_map_data,
                past_agent_map_data,
                pred_date,
                remove_first_month_from_training,
                y_col,
                model_period,
                hyperparams_dict,
                product_code,
                so_code,
                model_pickle_path,
                run_log_path,
                int_result_dump_path
            )
            if period_forecasts is None:
                return None
            raw_by_period[model_period], adjusted_by_period[model_period] = period_forecasts

        assert (len(adjusted_by_period['annual'])
                == len(adjusted_by_period['monthly'])
                == len(adjusted_by_period['weekly']))
        # check if some other columns need to be dropped
        forecast_df = adjusted_by_period['annual'].drop(columns=['predicted_loading'])
        for model_period, column_name in model_columns:
            # `.array` drops the index, so values are aligned by row position.
            forecast_df.loc[:, column_name] = \
                adjusted_by_period[model_period]['predicted_loading'].array
        per_model_predictions = forecast_df[[column_name for _, column_name in model_columns]]
        forecast_df['predicted_loading'] = per_model_predictions.mean(axis=1, skipna=True)
        forecast_df['predicted_loading'] = forecast_df['predicted_loading'].round(1)

        return (forecast_df,
                raw_by_period['annual'],
                raw_by_period['monthly'],
                raw_by_period['weekly'])
    except Exception:
        print('error in {}: check log'.format(product_code))
        with open(run_log_path, 'a') as model_log:
            model_log.write(product_code + ' ' + str(pred_date) + ' error encountered: \n')
            model_log.write(traceback.format_exc())

# ML Preprocess

In [33]:

# Convert strings into categorical for training data
def _categorize_train(x_train_df: pd.DataFrame,
                      y_train_series = pd.Series(),
                      x_skip_cols = None) -> dict:
    x_train_df = x_train_df.copy()
    y_train_series = y_train_series.copy()
    if x_skip_cols is None:
        x_skip_cols = []

    for col, series in x_train_df.items():
        if col not in x_skip_cols:
            if is_string_dtype(series):
                x_train_df[col] = series.astype('category').cat.as_ordered()

    categorize_train_dict = {'x_train_df': x_train_df,
                             'y_train_series': y_train_series,
                             'x_skip_cols': x_skip_cols}

    return categorize_train_dict


# Convert strings into categorical for validation data
def _categorize_val(categorize_train_dict: dict,
                    x_val_df: pd.DataFrame,
                    y_val_series = None) -> tuple:
    x_val_df = x_val_df.copy()
    if y_val_series is None:
        y_val_series = pd.Series()
    else:
        y_val_series = y_val_series.copy()

    x_train_df = categorize_train_dict['x_train_df']
    y_train_series = categorize_train_dict['y_train_series']
    x_skip_cols = categorize_train_dict['x_skip_cols']

    for col, series in x_val_df.items():
        if (col in x_train_df.columns) and (col not in x_skip_cols):
            if x_train_df[col].dtype.name == 'category':
                x_val_df[col] = pd.Categorical(series,
                                               categories=x_train_df[col].cat.categories,
                                               ordered=True)

    return x_val_df, y_val_series


# Common non-text preprocessing for training data
def _data_preproc_train(x_train_df: pd.DataFrame,
                        y_train_series = pd.Series(),
                        x_skip_cols = None,
                        max_cat_count: int = 0) -> dict:
    x_train_df = x_train_df.copy()
    y_train_series = y_train_series.copy()
    if x_skip_cols is None:
        x_skip_cols = []

    x_train_df.drop(x_skip_cols, axis=1, inplace=True)

    for col, series in x_train_df.items():
        if (not is_numeric_dtype(series)
                and series.nunique() > max_cat_count):
            x_train_df[col] = series.cat.codes + 1
    x_train_df = pd.get_dummies(x_train_df, dummy_na=True)

    if len(y_train_series) != 0:
        if not is_numeric_dtype(y_train_series):
            y_train_series = y_train_series.cat.codes

    data_preproc_train_dict = {'x_train_df': x_train_df,
                               'y_train_series': y_train_series,
                               'x_skip_cols': x_skip_cols,
                               'max_n_cat': max_cat_count}

    return data_preproc_train_dict


# Ensure validation data has the same columns as the training data
def _handle_missing_cols(x_train_df: pd.DataFrame,
                         x_val_df: pd.DataFrame,
                         fill_value = np.nan) -> pd.DataFrame:
    missing_cols = set(x_train_df.columns) - set(x_val_df.columns)
    for col in missing_cols:
        x_val_df[col] = fill_value
    return x_val_df[x_train_df.columns]


# Common non-text preprocessing for validation data
def _data_preproc_val(data_preproc_train_dict: dict,
                      x_val_df: pd.DataFrame,
                      y_val_series = None) -> list:
    """Apply the training-time numeric preprocessing to validation data.

    Steps, applied to copies of the inputs:

    1. Drop the skip columns recorded at training time.
    2. Label encode (category codes plus one) every non-numeric column that
       the processed training frame still carries under the same name.
       A column keeps its name only when training label encoded it, so the
       choice between label and one-hot encoding follows what training did
       rather than how many distinct values the validation sample happens
       to contain. These columns must be categorical (``.cat`` is used).
    3. One-hot encode the remaining non-numeric columns.
    4. Align the columns with the training frame and fill gaps with -1.
    5. Replace a non-empty, non-numeric target by its category codes.

    Parameters
    ----------
    data_preproc_train_dict : dict
        Output of ``_data_preproc_train``; ``'x_train_df'`` and
        ``'x_skip_cols'`` are read.
    x_val_df : pd.DataFrame
        Validation features.
    y_val_series : pd.Series, optional
        Validation target. An empty Series is used when it is omitted.

    Returns
    -------
    list
        ``[x_val_df, y_val_series]``.
    """
    x_val_df = x_val_df.copy()
    if y_val_series is None:
        y_val_series = pd.Series()
    else:
        y_val_series = y_val_series.copy()

    x_train_df = data_preproc_train_dict['x_train_df']
    x_skip_cols = data_preproc_train_dict['x_skip_cols']

    x_val_df = x_val_df.drop(x_skip_cols, axis=1)

    train_cols = set(x_train_df.columns)
    for col_name, series in x_val_df.items():
        if not is_numeric_dtype(series) and col_name in train_cols:
            # Same encoding as in training: missing (-1) becomes 0.
            x_val_df[col_name] = series.cat.codes + 1
    x_val_df = pd.get_dummies(x_val_df, dummy_na=True)
    x_val_df = _handle_missing_cols(x_train_df, x_val_df)
    x_val_df = x_val_df.fillna(-1)

    if len(y_val_series) != 0 and not is_numeric_dtype(y_val_series):
        # Category codes already use -1 for missing values, so no further
        # filling is needed.
        y_val_series = y_val_series.cat.codes

    return [x_val_df, y_val_series]

# Modelers

In [32]:
def high_level_rf_clf_modeler(x_train: pd.DataFrame,
                              y_train_df: pd.Series,
                              x_test: pd.DataFrame,
                              y_test_df: pd.Series) -> tuple:
    """Prepare boolean targets and delegate to ``rf_classifier``.

    Both targets are cast to bool and flattened to one-dimensional arrays.
    The share of ``True`` labels in each split is computed before the
    classifier is called, so an empty target raises ``ZeroDivisionError``.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame
        Feature matrices, forwarded untouched to ``rf_classifier``.
    y_train_df, y_test_df : pd.Series
        Targets for the two splits.

    Returns
    -------
    tuple
        ``(model_clf, clf_train_true_pct, clf_test_true_pct,
        training_score, acc_clf, clf_report, ck_score)``. All entries except
        the two percentages are the five values returned by
        ``rf_classifier``, which is defined outside this cell.
    """
    train_labels = y_train_df.astype(bool).values.reshape(1, -1)[0]
    test_labels = y_test_df.astype(bool).values.reshape(1, -1)[0]

    # Fraction of positive labels per split.
    clf_train_true_pct = sum(train_labels) / len(train_labels)
    clf_test_true_pct = sum(test_labels) / len(test_labels)

    classifier_output = rf_classifier(x_train, train_labels,
                                      x_test, test_labels)
    model_clf, training_score, acc_clf, clf_report, ck_score = classifier_output

    return (model_clf, clf_train_true_pct, clf_test_true_pct,
            training_score, acc_clf, clf_report, ck_score)


def high_level_rf_reg_modeler(x_train: pd.DataFrame,
                              y_train_df: pd.Series,
                              x_test: pd.DataFrame,
                              y_test_df: pd.Series) -> tuple:
    """Flatten the targets and delegate to ``rf_regressor``.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame
        Feature matrices, forwarded untouched to ``rf_regressor``.
    y_train_df, y_test_df : pd.Series
        Targets for the two splits; their values are flattened to
        one-dimensional arrays before the call.

    Returns
    -------
    tuple
        ``(model_rf, trn_score_rf, acc_rf)``, the three values returned by
        ``rf_regressor``, which is defined outside this cell.
    """
    train_target = y_train_df.values.reshape(1, -1)[0]
    test_target = y_test_df.values.reshape(1, -1)[0]

    regressor_output = rf_regressor(x_train, train_target,
                                    x_test, test_target)
    model_rf, trn_score_rf, acc_rf = regressor_output

    return model_rf, trn_score_rf, acc_rf


def high_level_event_product_rough_estimate_modeler(event_product_forecast_comparison_data: pd.DataFrame,
                                                    train_data_with_agent: pd.DataFrame,
                                                    train_start_date: pd.Timestamp,
                                                    train_end_date: pd.Timestamp):
    """Build a rough per-agent loading estimate from historical shipments.

    For each ``agent_code`` group of ``event_product_forecast_comparison_data``
    the agent's training shipments are summed per day, laid out on the full
    daily range ``train_start_date``..``train_end_date`` (days without
    shipments count as 0) and log-transformed after clipping at 0.001.
    Each row to predict then receives ``exp`` of the mean log-shipment of
    its week number, raised to at least ``exp`` of the maximum log-shipment
    of its month. The estimate is multiplied by 1.3 when
    ``NationalDoubleSell`` is set or ``pre_nonreplacement_holiday`` > 0,
    and by 1.5 when ``NationalTripleSell`` is set; both can apply.

    Parameters
    ----------
    event_product_forecast_comparison_data : pd.DataFrame
        Rows to predict. Must contain ``agent_code``, ``visit_date``,
        ``product_code``, ``pre_nonreplacement_holiday``, ``week``,
        ``month``, ``year``, ``NationalDoubleSell``, ``NationalTripleSell``
        and ``predicted_event_normal``.
    train_data_with_agent : pd.DataFrame
        Historical data with ``agent_code``, ``visit_date`` and
        ``shipments``.
    train_start_date, train_end_date : pd.Timestamp
        Bounds of the daily training range.

    Returns
    -------
    The result of ``groupby('agent_code').apply(...)``: the per-agent frames
    (indexed by ``visit_date``, with the added ``predicted_loading``
    column) combined by pandas. Agents without training rows contribute
    nothing.
    """
    # The nested function's return annotation is evaluated each time this
    # function runs; the notebook's import cell only imports `Any` from
    # typing, so bring `Optional` into scope here.
    from typing import Optional

    def agentwise_rough_estimate(product_agent_test_df: pd.DataFrame,
                                 train_data_agent: pd.DataFrame) -> Optional[pd.DataFrame]:
        if 'agent_code' in product_agent_test_df.columns:
            current_agent = product_agent_test_df['agent_code'].iloc[0]
        else:
            # NOTE(review): newer pandas releases may not hand the grouping
            # column to the applied function; fall back to the group key
            # that groupby.apply pins on the group as `.name` - confirm.
            current_agent = product_agent_test_df.name
        keep_in_test_df = ['visit_date', 'product_code', 'pre_nonreplacement_holiday',
                           'week', 'month', 'year',
                           'NationalDoubleSell', 'NationalTripleSell',
                           'predicted_event_normal']
        test_df = product_agent_test_df.loc[:, keep_in_test_df].copy()
        train_df = train_data_agent.loc[
            train_data_agent['agent_code'] == current_agent]
        if len(train_df) == 0:
            # no training data
            return None

        # Daily shipment totals on the complete training calendar.
        full_train_index = pd.date_range(train_start_date, train_end_date)
        daily_shipments = train_df.groupby('visit_date')['shipments'].sum()
        model_train_df = pd.DataFrame(
            {'shipments': daily_shipments.reindex(full_train_index).fillna(0)})

        train_dates = model_train_df.index
        if hasattr(train_dates, 'isocalendar'):
            # `weekofyear` is gone from recent pandas; the ISO calendar week
            # is its documented replacement.
            model_train_df['week'] = train_dates.isocalendar().week.astype('int64').to_numpy()
        else:
            model_train_df['week'] = train_dates.weekofyear
        model_train_df['month'] = train_dates.month
        # Clip before the log so that zero-shipment days stay finite.
        model_train_df['shipments_log'] = np.log(model_train_df['shipments'].clip(0.001, None))

        weekly_mean = model_train_df.groupby('week')['shipments_log'].mean()
        weekly_mean = weekly_mean.rename('shipments_log_weekly_mean').reset_index()
        test_df = pd.merge(test_df, weekly_mean, how='left', on='week')

        monthly_max = model_train_df.groupby('month')['shipments_log'].max()
        monthly_max = monthly_max.rename('shipments_log_monthly_max').reset_index()
        test_df = pd.merge(test_df, monthly_max, how='left', on='month')

        # NOTE(review): the monthly maximum is used as a LOWER bound here,
        # exactly as in the original code - confirm this is intended.
        test_df['predicted_loading'] = np.exp(test_df['shipments_log_weekly_mean']).clip(
            np.exp(test_df['shipments_log_monthly_max']), None)

        double_sell_like = (test_df['NationalDoubleSell']
                            | (test_df['pre_nonreplacement_holiday'] > 0))
        test_df.loc[double_sell_like, 'predicted_loading'] *= 1.3
        test_df.loc[test_df['NationalTripleSell'], 'predicted_loading'] *= 1.5
        return test_df.set_index(['visit_date'])

    rough_estimates = event_product_forecast_comparison_data.groupby(
        'agent_code').apply(agentwise_rough_estimate, train_data_with_agent)

    return rough_estimates


def post_model_sparse_modeler(date_agent_product_data: pd.DataFrame,
                              prepared_ts_data: pd.DataFrame):
    """Combine regular forecasts with a fallback forecast for sparse entries.

    Rows of ``date_agent_product_data`` whose ``predicted_loading`` is
    missing are treated as "sparse" entries, all others as "normal" ones.
    For every sparse entry the history of ``<pos_code>_<product_code>`` is
    read from ``prepared_ts_data`` up to the visit date. Histories whose mean
    non-zero quantity is at least 10 are set aside and only reported in
    ``behavior_changed_sparse``. The remaining histories are summed per
    date, and the maximum of that daily total is the sparse forecast.

    Parameters
    ----------
    date_agent_product_data : pd.DataFrame
        Rows to combine. ``double_sell``, ``triple_sell`` and
        ``weekday_holiday`` must be boolean columns (asserted). Also read:
        ``pos_product``, ``characteristics``, ``forecast_type`` (dropped),
        ``predicted_loading``, ``price_chg_effect``, ``product_code``,
        ``visit_date``, ``pos_code`` and ``known_shipment``. The product
        code and visit date are taken from the first sparse row.
    prepared_ts_data : pd.DataFrame
        Time series with a two-level index, first level pos_product key and
        second level date, holding at least ``quantity``, ``pos_code``,
        ``double_sell``, ``triple_sell``, ``weekday_holiday`` and
        ``price_chg_effect``.

    Returns
    -------
    pd.DataFrame or None
        ``None`` when the first row is flagged ``weekday_holiday``.
        Otherwise a one-row frame (index named ``'discard'``) holding the
        first row's ``double_sell``, ``triple_sell`` and
        ``price_chg_effect`` plus ``predicted_loading`` (normal + sparse
        forecast), ``norm_model_forecast``, ``sparse_model_forecast``,
        ``known_shipment_norm``, ``known_shipment_sparse``,
        ``non_sparse_count``, ``sparse_count`` and
        ``behavior_changed_sparse`` (string form of the set-aside pos codes).
    """
    drop_cols = ['pos_product', 'characteristics', 'forecast_type']
    return_cols = ['predicted_loading', 'double_sell',
                   'triple_sell', 'price_chg_effect']
    train_cols = ['quantity', 'pos_code', 'double_sell',
                  'triple_sell', 'weekday_holiday', 'price_chg_effect']
    assert (date_agent_product_data['double_sell'].dtype == 'bool') and \
           (date_agent_product_data['triple_sell'].dtype == 'bool') and \
           (date_agent_product_data['weekday_holiday'].dtype == 'bool')

    if date_agent_product_data['weekday_holiday'].iloc[0]:
        return None
    dt_agent_prod_df = date_agent_product_data.drop(columns=drop_cols)

    missing_forecast = dt_agent_prod_df['predicted_loading'].isnull()
    sparse_model_entries = dt_agent_prod_df[missing_forecast]
    norm_model_entries = dt_agent_prod_df[~missing_forecast]

    sparse_model_forecast = 0
    discarded_sparse_list = []

    if len(sparse_model_entries) > 0:
        product_code = sparse_model_entries['product_code'].iloc[0]
        visit_date = sparse_model_entries['visit_date'].iloc[0]
        pos_product_list = [x + '_' + product_code for x in
                            sparse_model_entries['pos_code'].values]

        # Training data must end before the current visit_date; this is done
        # by slicing up to visit_date and dropping the final row.
        # NOTE(review): this assumes every history has a row at visit_date;
        # otherwise the last genuine observation is dropped - confirm.
        sparse_model_train_list = [
            prepared_ts_data.loc[([pos_product], slice(visit_date)),
                                 train_cols].iloc[:-1]
            for pos_product in pos_product_list]

        sparse_list = []
        for sparse_train_ts in sparse_model_train_list:
            # Drop the pos_product level so only the date index is left.
            sparse_train_ts.index = sparse_train_ts.index.droplevel()
            if sparse_train_ts['quantity'].replace(0, np.nan).mean() >= 10:
                # Not sparse on average: excluded from the sparse forecast
                # and reported to the caller instead.
                discarded_sparse_list.append(sparse_train_ts['pos_code'].iloc[0])
            else:
                sparse_list.append(sparse_train_ts[['quantity']])

        if len(sparse_list) > 0:
            sparse_model_train_data = pd.concat(sparse_list)
            # Total quantity per date over all retained histories.
            sparse_model_train_data = sparse_model_train_data.groupby(
                sparse_model_train_data.index).sum()
            sparse_model_forecast = sparse_model_train_data.max().values[0]

    # Explicit copy: columns are assigned below, and assigning into a slice
    # of a temporary frame triggers SettingWithCopyWarning.
    ret = dt_agent_prod_df[return_cols].iloc[:1].copy()
    norm_model_forecast = norm_model_entries['predicted_loading'].sum()
    ret['norm_model_forecast'] = norm_model_forecast
    ret['sparse_model_forecast'] = sparse_model_forecast
    ret['predicted_loading'] = norm_model_forecast + sparse_model_forecast
    ret['known_shipment_norm'] = norm_model_entries['known_shipment'].sum()
    ret['known_shipment_sparse'] = sparse_model_entries['known_shipment'].sum()
    ret['non_sparse_count'] = len(norm_model_entries)
    ret['sparse_count'] = len(dt_agent_prod_df) - ret['non_sparse_count']
    ret['behavior_changed_sparse'] = str(discarded_sparse_list)
    ret.index.name = 'discard'

    return ret