In [None]:
!pip install pmdarima

In [None]:
from pmdarima.arima import auto_arima,OCSBTest,KPSSTest,ADFTest
from tqdm import tqdm
from sklearn import metrics
import pandas as pd
import numpy as np
import os
# from stationarizer import simple_auto_stationarize
%matplotlib inline

In [None]:
# Read the five input tables from the current working directory, in this order.
# NOTE(review): the file names match the Kaggle M5 forecasting layout — confirm the data source.
calendar, sales_data, eval_data, sell_price, submissions = [
    pd.read_csv(csv_name)
    for csv_name in (
        "calendar.csv",
        "sales_train_validation.csv",
        "sales_train_evaluation.csv",
        "sell_prices.csv",
        "sample_submission.csv",
    )
]

In [None]:
# Preview the first ten rows of the evaluation table.
eval_data.iloc[:10]

In [None]:
# Prefix the category, department and row ids with their store id
# ("<store_id>_<original value>").
# NOTE: this cell rewrites `eval_data` in place and is not idempotent — running
# it twice prefixes the ids again and the column drop below raises KeyError.
store_prefix = eval_data['store_id'] + '_'
for id_col in ('cat_id', 'dept_id', 'id'):
    eval_data[id_col] = store_prefix + eval_data[id_col]

# Discard the oldest history: day columns d_1 .. d_1548 are removed.
days = [f'd_{day}' for day in range(1, 1549)]
eval_data = eval_data.drop(columns=days)

In [None]:
# Aggregation levels keyed by depth, listed from the coarsest (0) to the finest (4).
hierarchy = dict(enumerate(("state_id", "store_id", "cat_id", "dept_id", "id")))

In [None]:
## Build the hierarchical dataframe: one row per node of the hierarchy.
def generate_heirarchy(eval_data, levels=None, output_path="final_work.csv"):
    """Aggregate a wide table at every level of the hierarchy.

    For each level column the numeric columns of ``eval_data`` are summed per
    distinct key. The per-level results are stacked (coarsest level first) and
    a final ``'total'`` row holding the sum over the first level is appended.

    Changes compared with the previous implementation:

    * ``numeric_only=True`` is passed to the group-by sum, so the remaining
      string id columns are never "summed" (string concatenation on
      pandas >= 2.0) into the result.
    * The result keeps a numeric dtype instead of the object dtype that the
      transpose of a mixed string/number frame produced.
    * The ``'total'`` row is the sum over whatever keys exist at the first
      level rather than the hard-coded ``CA + TX + WI``.

    Parameters
    ----------
    eval_data : pandas.DataFrame
        Wide table containing the level columns plus the numeric columns to sum.
    levels : iterable of str, optional
        Level column names, coarsest first. Defaults to the values of the
        notebook-level ``hierarchy`` mapping.
    output_path : str or None, optional
        Where the result is written as CSV (default ``"final_work.csv"``,
        the same file on every call). Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Rows are hierarchy nodes followed by ``'total'``; columns are the
        numeric columns of ``eval_data``. The index is unnamed.

    Raises
    ------
    ValueError
        If ``levels`` is empty.
    """
    if levels is None:
        # Fall back to the notebook-level mapping, preserving its order.
        levels = list(hierarchy.values())
    else:
        levels = list(levels)
    if not levels:
        raise ValueError("at least one hierarchy level is required")

    # One frame per level: index = keys of that level, columns = numeric columns.
    level_frames = [
        eval_data.groupby(level_col).sum(numeric_only=True)
        for level_col in levels
    ]

    # Grand total = sum over all keys of the coarsest level.
    grand_total = level_frames[0].sum(axis=0).to_frame().T
    grand_total.index = ['total']

    final_work = pd.concat(level_frames + [grand_total])
    # Keep the index unnamed so the CSV header of the first column stays empty.
    final_work.index.name = None

    if output_path is not None:
        final_work.to_csv(output_path)
    return final_work

In [None]:
# Remove trend (ADF test) and seasonality (OCSB test) from a series by differencing.

def stationarize(one, counter=0, max_diffs=2):
    """Difference a series until the ADF test no longer asks for it.

    Each round runs ``ADFTest().should_diff`` on the current series; when the
    test says differencing is needed, the series is replaced by its first
    difference with the leading NaN dropped.

    This is best effort: if the test or the differencing raises an ordinary
    exception, the series obtained so far is returned unchanged. Only
    ``Exception`` is caught, so ``KeyboardInterrupt`` / ``SystemExit`` are no
    longer swallowed (the previous bare ``except:`` swallowed them).

    Parameters
    ----------
    one : pandas.Series
        Series to make trend-stationary.
    counter : int, optional
        Number of differences already applied (default 0).
    max_diffs : int, optional
        Upper bound on the total number of differences (default 2).

    Returns
    -------
    pandas.Series
        The input, differenced between 0 and ``max_diffs - counter`` times.
    """
    while counter < max_diffs:
        try:
            # should_diff returns a pair; element [1] is the "needs differencing" flag.
            if not ADFTest().should_diff(one)[1]:
                break
            one = one.diff().dropna()
        except Exception:
            # Keep whatever has been achieved so far instead of failing the caller.
            break
        counter += 1
    return one

def season_stationary(one, m=30):
    """Apply one seasonal difference when the OCSB test asks for it.

    ``OCSBTest(m=m).estimate_seasonal_differencing_term`` is evaluated on the
    series; if it returns a truthy value the series is replaced by its lag-``m``
    difference with the leading NaNs dropped.

    This is best effort: on an ordinary exception the input is returned
    unchanged. Only ``Exception`` is caught, so ``KeyboardInterrupt`` /
    ``SystemExit`` are no longer swallowed (the previous bare ``except:``
    swallowed them).

    Parameters
    ----------
    one : pandas.Series
        Series to de-seasonalise.
    m : int, optional
        Seasonal period used for both the test and the difference (default 30).
        NOTE(review): 30 presumably stands for a monthly cycle on daily data — confirm.

    Returns
    -------
    pandas.Series
        The seasonally differenced series, or the input unchanged.
    """
    try:
        seasonal_d = OCSBTest(m=m).estimate_seasonal_differencing_term(one)
        if seasonal_d:
            one = one.diff(m).dropna()
    except Exception:
        # Leave the series untouched when the test cannot be evaluated.
        pass
    return one
  


In [None]:
# Make predictions for the entire hierarchy, one model per row (node).

# The function is called on different row ranges on different machines at the same time.

def make_predictions(final_work, start=0, end=0):
    """Forecast rows ``start`` .. ``end - 1`` of ``final_work`` with auto-ARIMA.

    Each selected row is passed through ``stationarize`` and
    ``season_stationary``, fitted with ``auto_arima`` and forecast as many
    steps ahead as the notebook-level ``result_sales`` frame has columns
    (28 in this notebook). The forecast is stored in the row of ``result_sales``
    with the same label; ``result_sales`` must exist before the call.

    Fixes compared with the previous implementation:

    * Rows that received no forecast are really dropped before saving; the
      earlier ``result_sales.dropna()`` discarded its result.
    * Negative forecasts are really clipped to 0. ``result_sales`` is created
      without data (object dtype), so the previous ``_get_numeric_data()``
      selection was empty and nothing was clipped.
    * The forecast is stored positionally; a forecast returned as a Series
      would otherwise be aligned on the column labels and stored as NaN.
    * The loop variable no longer shadows the builtin ``id``.

    NOTE(review): the model is fitted on the differenced series and its
    forecast is stored as is — no inverse differencing is applied here.
    Confirm that this is intended.

    Parameters
    ----------
    final_work : pandas.DataFrame
        One series per row; the index holds the node labels.
    start : int, optional
        First row position to forecast (default 0).
    end : int, optional
        One past the last row position; 0 (default) means "through the last row".

    Returns
    -------
    pandas.DataFrame
        Float forecasts, clipped at 0, for every row of ``result_sales`` that
        holds a forecast. The same frame is written to ``eval_{start}.csv``.
    """
    if end == 0:
        end = final_work.shape[0]
    horizon = len(result_sales.columns)

    for i in tqdm(range(start, end), position=0, leave=True):
        series_id = final_work.index[i]
        one = final_work.iloc[i]
        one = stationarize(one)
        one = season_stationary(one)
        model_fit = auto_arima(one, m=1, suppress_warnings=True, error_action="ignore")
        pred = model_fit.predict(horizon)
        # Assign by position, never by label.
        result_sales.loc[series_id] = np.asarray(pred, dtype=float)

    # Keep only rows that were forecast, then floor the values at zero.
    forecasts = result_sales.dropna(how="all").astype(float).clip(lower=0)
    forecasts.to_csv(f'eval_{start}.csv')

    return forecasts


In [None]:
# Aggregate the history at every hierarchy level, then forecast every row.
# NOTE(review): this aggregates `sales_data`, while the store-prefixed ids and the
# trimmed history were prepared on `eval_data`, and the hierarchy tree built further
# down uses the `eval_data` ids — confirm which table is meant here.
final_work = generate_heirarchy(sales_data)

# Forecast columns F1 .. F28; one (initially empty) row per hierarchy node.
pred_days = [f'F{step}' for step in range(1, 29)]
result_sales = pd.DataFrame(index=final_work.index, columns=pred_days)

make_predictions(final_work)

In [None]:
# Load the combined base forecasts.
# NOTE: no visible cell writes 'eval_final.csv' (make_predictions writes
# 'eval_{start}.csv'), so this file has to be assembled outside the notebook.
total_pred = pd.read_csv(filepath_or_buffer='eval_final.csv')

In [None]:
states = eval_data.state_id.unique()
stores = eval_data.store_id.unique()
depts = eval_data.dept_id.unique()
cats = eval_data.cat_id.unique()
items = eval_data.id.unique()


def _children_by_parent(parent_col, child_col):
    """Map every distinct ``parent_col`` value to the distinct ``child_col`` values on its rows.

    Children are taken from the rows of ``eval_data`` instead of being guessed
    from string prefixes, so a key that happens to be a prefix of another key
    on the same level (e.g. ``X_1`` and ``X_10``) cannot claim its children.
    Keys and children keep their order of first appearance.
    """
    grouped = eval_data.groupby(parent_col, sort=False)[child_col].unique()
    return {parent: list(children) for parent, children in grouped.items()}


# Here we build the tree as a dictionary. Each node (key in the dict) maps to the
# list of its children; a child may itself be a key in the dict with children
# of its own.
# Note the naming: each dict is keyed by the level above its values
# (store_h: store -> categories, dept_h: category -> departments,
# item_h: department -> ids).

total = {'total': list(states)}
state_h = _children_by_parent('state_id', 'store_id')
store_h = _children_by_parent('store_id', 'cat_id')
dept_h = _children_by_parent('cat_id', 'dept_id')
item_h = _children_by_parent('dept_id', 'id')

In [None]:
# One flat parent -> children lookup covering every level of the tree.
hier_dict = {}
for level_map in (total, state_h, store_h, dept_h, item_h):
    hier_dict.update(level_map)

# Every node that has children, listed top-down starting with the root.
hier_total = ['total', *states, *stores, *cats, *depts]

In [None]:
# Name the label column 'id' and keep only rows whose id is not one of the
# aggregate nodes listed in `hier_total`.
total_pred = (
    total_pred
    .rename(columns={'Unnamed: 0': 'id'})
    .loc[lambda frame: ~frame['id'].isin(hier_total)]
)

In [None]:
# Take the first six columns of `eval_data` and join them to the forecasts on 'id'.
# NOTE(review): assumes those six columns are the id / hierarchy columns — confirm.
hr = eval_data.iloc[:, :6]
heir_pred_valid = hr.merge(total_pred, on='id')

In [None]:
## Rebuild the entire hierarchy from the predictions by summing them at each level.
## Note: this call also rewrites "final_work.csv" as a side effect of generate_heirarchy.

pred_final = generate_heirarchy(eval_data=heir_pred_valid)

In [None]:
# Persist the re-aggregated predictions (one row per hierarchy node).
pred_final.to_csv(path_or_buf='pred_all_sums.csv')

In [None]:
# Reload the base forecasts and the re-aggregated sums from disk.
all_predi, all_sum = (
    pd.read_csv(csv_name)
    for csv_name in ('eval_final.csv', 'pred_all_sums.csv')
)

In [None]:
# Floor negative forecasts at zero using public pandas API. The previous code
# mutated the result of the private `_get_numeric_data()` and relied on it being
# a writable view of `all_predi`, which pandas does not guarantee.
numeric_cols = all_predi.select_dtypes(include='number').columns
num = all_predi[numeric_cols].clip(lower=0)
if len(numeric_cols):
    all_predi[numeric_cols] = num

In [None]:
# Preview the first five rows of the re-aggregated sums.
all_sum.iloc[:5]

In [None]:
# Use the label column ('Unnamed: 0') as index, then transpose so that every
# hierarchy node becomes a column.
# NOTE: not idempotent — a second run fails because 'Unnamed: 0' is gone.
all_predi = all_predi.set_index('Unnamed: 0').T
all_sum = all_sum.set_index('Unnamed: 0').T


In [None]:
# hier_pred = [a for a in final_work.columns if a.endswith('validation')]

In [None]:
# Disaggregation proportions as mentioned by Hyndman: p = Y / S

def get_proportions(Y, S):
    """Return the element-wise ratio ``Y / S`` in float32, with 0 wherever ``S`` is 0.

    Parameters
    ----------
    Y : pandas.Series
        Predictions on a given level.
    S : pandas.Series
        Sum of the predictions of the levels below that level.

    Returns
    -------
    pandas.Series
        float32 ratios on a fresh default integer index (the index of the
        inputs is not carried over).
    """
    numerator = Y.to_numpy(dtype='float32')
    denominator = S.to_numpy(dtype='float32')
    # Start from zeros so positions with a zero denominator stay at 0.
    ratio = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return pd.Series(ratio)

In [None]:
# Seed the reconciled table with the forecast of the root node ('total').
extremely_final = pd.DataFrame({'total': all_predi['total']})

In [None]:
# Disaggregate top-down: walk the parents in `hier_total` order (root first), so a
# parent's reconciled column already exists when its children are scaled.

for parent in hier_total:
    # Share of the parent's reconciled forecast relative to the summed predictions below it.
    share = get_proportions(extremely_final[parent], all_sum[parent]).to_numpy(dtype='float32')
    for child in hier_dict[parent]:
        base_forecast = all_predi[child].to_numpy(dtype='float32')
        extremely_final[child] = (base_forecast * share).tolist()

In [None]:
# Remove the columns of all parent nodes, then transpose so that every
# remaining node becomes a row.
extremely_final = extremely_final.drop(columns=hier_total).T

In [None]:
# Add an 'id' column holding the row label without its first five characters.
# NOTE(review): presumably this strips the "<store_id>_" prefix added earlier, which
# is only correct when every store id is exactly four characters long — confirm.
extremely_final['id'] = extremely_final.index.str[5:]

In [None]:
# Write the reconciled forecasts to disk.
extremely_final.to_csv(path_or_buf='extremely_final.csv')