In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [2]:
J2_2019 = pd.read_excel('sales_movements_J2.xlsx', sheet_name = '2019')
J2_2018 = pd.read_excel('sales_movements_J2.xlsx', sheet_name = '2018')

In [6]:
len(J2_2018.StockCode.unique())

4226

In [4]:
lost_sales = pd.read_excel("Lost Sales.xlsx", sheet_name = "LostSalesAll")

In [9]:
def lossFunctionSingle(sales, loss_sales, wh, windowSize, sc):
    lost_sales_dict = {}
    """"
    
    sales - first-year sales dataset
    loss_sales - lost sales dataset
    wh - warehouse code (e.g. "J2")
    windowSize - the size used for the slidng window for calculating moving averages and standard deviations 
    sc - a given stock code
    
    returns a dictionary mapping a sales code to an array where index 0 is the 
    number of times a lost sale occurred and index 1 is the total of lost quantity
    windowSize: how many rows at once 
   
    """
    sales.loc[:, 'TrnQty'] = sales['TrnQty'].abs()
    sales = sales[sales["StockCode"] == sc]
    loss_sales = loss_sales[loss_sales["StockCode"] == sc]
    loss_sales = loss_sales[loss_sales["Warehouse"] == wh]
    for index,row in loss_sales.iterrows():
        code = row["StockCode"]
        if code in lost_sales_dict.keys(): 
            lost_sales_dict[code][0] += 1
            lost_sales_dict[code][1] += row["QuantityLost"] 
        else:
            lost_sales_dict[code] = []
            lost_sales_dict[code].append(1)
            lost_sales_dict[code].append(row["QuantityLost"])
    TrnQty_withLoss = []
    sales = sales.groupby(["StockCode","EntryDate"])['TrnQty'].sum().reset_index()
    for index, row in sales.iterrows():
        code = row["StockCode"]
        TrnQty = row['TrnQty']
        if code in lost_sales_dict.keys(): 
            lostSalesAvg = lost_sales_dict[code][1] / lost_sales_dict[code][0]
            TrnQty_withLoss.append(TrnQty + lostSalesAvg)
        else:
            TrnQty_withLoss.append(TrnQty)
            
    sales.loc[:, "TrnQty_withLoss"] = TrnQty_withLoss
    sales.loc[:, "count"] = sales.groupby('StockCode')['StockCode'].transform('count')
    sales.loc[:, "EntryDate"] = pd.to_datetime(sales['EntryDate'])
    sales.loc[:, "mean"] = sales.TrnQty_withLoss.rolling(windowSize, min_periods = 1).mean()
    sales.loc[:, "std"] = sales.TrnQty_withLoss.rolling(windowSize, min_periods = 1).std()
    return sales

    """
    returns a dataframe that contains the moving averages and moving standard deviations 
    for the turnover quantity with loss for each stock code
    
    """

In [7]:
J2_2018.head()

Unnamed: 0,StockCode,Warehouse,TrnYear,TrnMonth,EntryDate,TrnTime,MovementType,TrnQty,TrnValue,UnitCost,SalesOrder,Invoice,DocType,Customer,CostValue,Branch,Salesperson,SalesBin,CustomerPoNumber,ProductClass,DateInfoPulled
0,RDHY0060,J2,2018,1,2017-03-01,10154083,S,1,620.0,470.2936,OJ10282197,IJ10159889,I,JMID200,470.29,J1,J60,J2ROWC01,,RD,2019-10-11
1,RDMZ0260,J2,2018,1,2017-03-01,10154114,S,1,853.0,698.36306,OJ10282197,IJ10159889,I,JMID200,698.36,J1,J60,J2RECV,,RD,2019-10-11
2,WS4531-GG,J2,2018,1,2017-03-01,10165278,S,1,632.58,544.83158,OJ10282244,IJ10159890,I,JDEL300,544.83,J1,K23,BULK14,SINDY,WS,2019-10-11
3,159440,J2,2018,1,2017-03-01,10165356,S,24,1464.0,48.92001,OJ10282244,IJ10159890,I,JDEL300,1174.08,J1,K23,CAGE,SINDY,SIK,2019-10-11
4,RDTA0850,J2,2018,1,2017-03-01,10173083,S,2,3488.0,1137.77313,OJ10282227,IJ10159891,I,JGEM030,2275.55,J1,K11,J2ROWD02,1317AD,RD,2019-10-11


In [None]:
lossFunctionSingle(J2_2018, lost_sales, "J2", 10, "RDHY0060")

In [6]:
def calculateError(dwrw, nextYear, sd):
    
    """
    dwrw - the dataframe that function 'lossFunctionSingle' returns
    nextYear - second-year sales dataframe
    sd - the number of standard deviations that the upper bound is above the mean
    
    returns the mean squared error of turnover quantity of a specific stockcode on the same dates between two years
    """
    
    error = 0
    count = 0
    nNy = nextYear[nextYear["StockCode"] == dwrw.StockCode.unique()[0]]
    nNy.loc[:, "EntryDate"] = pd.to_datetime(nNy['EntryDate'])
    nNy = nNy.groupby(["StockCode","EntryDate"])['TrnQty'].sum().reset_index()
    nNy['month'] = [row['EntryDate'].month for index, row in nNy.iterrows()]
    nNy['day'] = [row['EntryDate'].day for index, row in nNy.iterrows()]
    for index, row in dwrw.iterrows():
        upperBound = row["mean"] + sd * row["std"]
        q = nNy[nNy["month"] == row["EntryDate"].month]
        q = q[q["day"]== row["EntryDate"].day]
        q = q["TrnQty"]
        if len(q) > 0 and not np.isnan(upperBound):
            idx = list(q.index.values)[0]
            qtyatIndex = q[idx]
            if qtyatIndex >= 0:
                error += (q[idx] - upperBound)**2
                count += 1
    if count > 0:
        return error/count
    else:
        return -1
        


In [15]:
def minError(salesdf, nextYear, sd, lossSales, wh, startingpos):
    
    """
    salesdf - first-year sales dataset
    nextYear - second-year sales dataset
    sd - the number of standard deviations that the upper bound is above the mean
    lossSales - lost sales dataframe
    wh - warehouse code (e.g. "J2")
    startingpos - the starting index used to start reading the data
    
    returns a dictionary mapping stock codes to the corresponding optimal 
    window sizes, which should, for each stock code, minimize the error of using the moving averages and moving 
    standard deviations to predict turnover quantity in next year 
    """
    
    salesdf = salesdf.groupby(["StockCode","EntryDate"])['TrnQty'].sum().reset_index()
    salesdf.loc[:, "count"] = salesdf.groupby('StockCode')['StockCode'].transform('count')
    salesdf = salesdf[salesdf["count"] > 9]
    unqStockCodes = salesdf.StockCode.unique()
    minerrordict = {}
    for x in tqdm(range(startingpos, len(unqStockCodes))):
        f = open("n.txt", "a+")
        sc = unqStockCodes[x]
        base = salesdf[salesdf["StockCode"] == sc].shape[0]
        rate = int(round(base * .7))
        dwrw = lossFunctionSingle(salesdf, lossSales, wh, base, sc)
        priorWindowSize = base
        ct = base + 1
        priorError = calculateError(dwrw, nextYear, sd)
        attemptedWindowSizes = []
        errorWindowSizes = []
        minerrordict[sc] = base
        while True:
            if ct == 0:
                break
            if rate < 1:
                break
            currWindowSize = priorWindowSize - rate
            if currWindowSize in attemptedWindowSizes or currWindowSize < 1:
                break
            if len(errorWindowSizes) > 0:
                if currWindowSize < min(errorWindowSizes):
                    break
            attemptedWindowSizes.append(currWindowSize)
            if currWindowSize < 1:
                break
            dwrw = lossFunctionSingle(salesdf, lossSales, wh, currWindowSize, sc)
            currError = calculateError(dwrw, nextYear, sd)
            if currError < 0:
                break
            if currError < priorError:
                rate = 2
                minerrordict[sc] = currWindowSize
                priorError = currError
                priorWindowSize = currWindowSize
            elif currError == priorError:
                rate = rate + int(round(.1 * base))
            else:
                errorWindowSizes.append(currWindowSize)
                rate = rate - int(round(.05 * base))
            ct-=1
        f.write(sc + ": " + str(minerrordict[sc]))
        f.write("\n")
        f.close()
    return minerrordict
        
        

In [16]:
minError(J2_2018, J2_2019, 0, lost_sales, "J2", 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


19
13
7
10



  0%|          | 1/1080 [01:27<26:13:39, 87.51s/it][A

29
19
update
17



  0%|          | 2/1080 [02:40<24:51:42, 83.03s/it][A

10
7
4
6



  0%|          | 3/1080 [04:04<24:57:12, 83.41s/it][A

KeyboardInterrupt: 