# Next Purchase Date with DL

In this use case, we use deep learning approach to know the model's performance if trained with the routine transactions between the user and the item. We assume that routine transactions are when a user buys a certain item at least once a month. To avoid overfitting for the too-frequent transactions (e.g., buying an item every day), we choose two interactions of user and item with the biggest average interval of transactions.

We also perform model training by some items with the highest number of transactions. Hopefully, we can achieve the best score because using the largest number of transactions compared to the others will make the model learn and perform better.

## Load and Preprocess Data

We use more than 500k transaction data between users and items from the EPM database. The raw data still has some returning transactions with a negative amount, but we are only looking for buying transactions. Each transaction has a timestamp record daily. Because a user can buy the same item multiple times on the same day, we consider it a single data aggregating the sales quantity column.

In [2]:
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load Dataset
df = pd.read_csv('/kaggle/input/epm-prep/EPM.csv')
df = df.drop(['Unnamed: 0', 'principal_code'], axis=1)
df.head()

Unnamed: 0,trx_date,customer_name,ship_to_id,branch_code,item_code,item_desc,principal_desc,gross_sales_amount,sales_qty
0,2021-08-20,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,KCFMB,CEFIXIME 100MG 50 KAPSUL,HEXPHARM (PHARMAMED),195000,3
1,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,CKCOA,KALCINOL N CREAM 5 GR,KALBE NIMITZ (PHARMAMED),28500,3
2,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TMFNB,METFORMIN HCL 200 TABLET,HEXPHARM (PHARMAMED),175000,5
3,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TBSVC,BRONSOLVAN 100 TABLET,HEXPHARM TSJ (PHARMAMED),35000,1
4,2021-08-20,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TALNF,AMLODIPINE BESILATE 10MG,HEXPHARM (PHARMAMED),255000,3


In [4]:
# Filter negative transactions
df = df[(df['sales_qty'] > 0) & (df['gross_sales_amount'] > 0)]
df['trx_date'] = pd.to_datetime(df['trx_date'])
df.head()

Unnamed: 0,trx_date,customer_name,ship_to_id,branch_code,item_code,item_desc,principal_desc,gross_sales_amount,sales_qty
0,2021-08-20,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,KCFMB,CEFIXIME 100MG 50 KAPSUL,HEXPHARM (PHARMAMED),195000,3
1,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,CKCOA,KALCINOL N CREAM 5 GR,KALBE NIMITZ (PHARMAMED),28500,3
2,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TMFNB,METFORMIN HCL 200 TABLET,HEXPHARM (PHARMAMED),175000,5
3,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TBSVC,BRONSOLVAN 100 TABLET,HEXPHARM TSJ (PHARMAMED),35000,1
4,2021-08-20,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TALNF,AMLODIPINE BESILATE 10MG,HEXPHARM (PHARMAMED),255000,3


In [5]:
# Drop duplicate transactions
temp = df[['ship_to_id', 'item_code', 'trx_date', 'sales_qty']].groupby(['ship_to_id', 'item_code', 'trx_date']).sum().reset_index(drop=True)
df = df.drop_duplicates(['ship_to_id', 'item_code', 'trx_date']).reset_index(drop=True)
df['sales_qty'] = temp
df.head()

Unnamed: 0,trx_date,customer_name,ship_to_id,branch_code,item_code,item_desc,principal_desc,gross_sales_amount,sales_qty
0,2021-08-20,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,KCFMB,CEFIXIME 100MG 50 KAPSUL,HEXPHARM (PHARMAMED),195000,1
1,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,CKCOA,KALCINOL N CREAM 5 GR,KALBE NIMITZ (PHARMAMED),28500,1
2,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TMFNB,METFORMIN HCL 200 TABLET,HEXPHARM (PHARMAMED),175000,1
3,2021-11-02,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TBSVC,BRONSOLVAN 100 TABLET,HEXPHARM TSJ (PHARMAMED),35000,1
4,2021-08-20,JK1-AP. DEVITA_GROUP_NA,EPM_34950,JK1,TALNF,AMLODIPINE BESILATE 10MG,HEXPHARM (PHARMAMED),255000,2


## Training and Evaluation Model

In [6]:
input_window = 3
output_window = 1
block_len = input_window + output_window

# Mean Average Error
criterion = nn.L1Loss() 
lr = 0.005

def create_input_sequences(input_data, input_window, output_window):
    input_seq = []
    L = len(input_data)
    block_num = L - block_len + 1

    for i in range(block_num):
        train_seq = input_data[i : i + input_window]
        train_label = input_data[i + output_window : i + input_window + output_window]
        input_seq.append((train_seq ,train_label))

    return torch.FloatTensor(np.array(input_seq))

def create_sliding_window(x):

    data = pd.DataFrame(x).to_numpy().reshape(-1, 1).reshape(-1)
    times = len(data)

    sampels = int(times * 0.8)
    train_data = data[:sampels]
    test_data = data[sampels:]
    test_data2 = test_data

    train_data = create_input_sequences(train_data, input_window, output_window).to('cpu')
    test_data = create_input_sequences(test_data, input_window, output_window).to('cpu')
    
    return train_data, test_data, test_data2

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()       
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = 1 / (10000 ** ((2 * np.arange(d_model)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term[0::2])
        pe[:, 1::2] = torch.cos(position * div_term[1::2])

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :].repeat(1,x.shape[1],1)
          
class Transformer(nn.Module):
    def __init__(self, feature_size=250, num_layers=1, dropout=0.1):
        super(Transformer, self).__init__()
        self.model_type = 'Transformer'
        self.input_embedding  = nn.Linear(1, feature_size)
        self.src_mask = None

        self.pos_encoder = PositionalEncoding(feature_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=10, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(feature_size, 1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1    
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.input_embedding(src)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

def get_batch(input_data, i, batch_size):

    batch_len = min(batch_size, len(input_data) - i)
    data = input_data[i:i + batch_len]
    src = torch.stack([item[0] for item in data]).view((batch_len,input_window,1))
    target = torch.stack([item[1] for item in data]).view((batch_len,input_window,1))
    return src, target

def train_one_epoch(epoch, train_data, model, optimizer):

    model.train()
    batch_size = len(train_data) // 5
    total_loss = 0
    start_time = time.time()

    for i in range(0, len(train_data), batch_size):
        data, targets = get_batch(train_data, i, batch_size)
        optimizer.zero_grad()
        output = model(data)

        loss = criterion(output, targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7)
        optimizer.step()

        total_loss += loss.item()
        log_interval = int(len(train_data) / batch_size / 5)
        log_interval = max(log_interval, 1)
        if (i // batch_size) % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} instances | {:5.2f} ms | loss {:5.5f}'.format(
                    epoch, i, len(train_data), elapsed * 1000, cur_loss))
            total_loss = 0
            start_time = time.time()
    
    return model

We use two usecases from the machine learning code (routine transactions and top items). After choosing a specific user and item, we put those data into the model. We perform 5 epochs for training the transformers model. The models are trained with 80% of the data and evaluated with the rest by the Mean Average Error and Mean Squared Error metrics.

### Routine Transactions

In [7]:
# Taken from ML - Routine Transactions code
lst = [('EPM_136080', 'TRGCA'), ('EPM_3041843', 'CKPXB')]
series = []
for l in lst:
    dfU = df[(df['ship_to_id'] == l[0]) & (df['item_code'] == l[1])].sort_values('trx_date').reset_index(drop=True)[['ship_to_id', 'item_code', 'trx_date', 'sales_qty']]
    dfU['trx_date'] = pd.to_datetime(dfU['trx_date'])
    dfU = dfU.drop_duplicates('trx_date').reset_index(drop=True)
    dfU['Period'] = dfU['trx_date'].diff().apply(lambda x: x.days)[1:].reset_index(drop=True)
    series.append(list(dfU['Period'][:-1]))

dfU.head() # We only use 'Period' column as the 'series' object

Unnamed: 0,ship_to_id,item_code,trx_date,sales_qty,Period
0,EPM_3041843,CKPXB,2021-01-20,1,30.0
1,EPM_3041843,CKPXB,2021-02-19,1,18.0
2,EPM_3041843,CKPXB,2021-03-09,2,10.0
3,EPM_3041843,CKPXB,2021-03-19,1,15.0
4,EPM_3041843,CKPXB,2021-04-03,5,30.0


In [8]:
# Convert series data to sliding window data
input_window = 3
output_window = 1
block_len = input_window + output_window
train_datas = []
test_datas = []

for s in series:
    train, test, x = create_sliding_window(s)
    train_datas.append(train)
    test_datas.append(test)

print("Size of the train and test data:", train_datas[-1].shape, test_datas[-1].shape)
print("The first three data:")
train_datas[-1][:3]

Size of the train and test data: torch.Size([28, 2, 3]) torch.Size([5, 2, 3])
The first three data:


tensor([[[30., 18., 10.],
         [18., 10., 15.]],

        [[18., 10., 15.],
         [10., 15., 30.]],

        [[10., 15., 30.],
         [15., 30., 15.]]])

Explanation of the sliding window data. Imagine we have time series data in the form of t1, t2, t3, t4, t5, ..., tn. Then the first data will be like this:

Input -> Target

[t1, t2, t3] -> [t2, t3, t4]

It means that the first three data will be learned by the model to predict the fourth data (t4). Then we apply sliding method so the second data will be like this:

Input -> Target

[t2, t3, t4] -> [t3, t4, t5]

It continues until we get the last three data that we can use to predict the next data.

In [9]:
# Training Model

models = []
for idx in range(len(train_datas)):

    print(lst[idx][0], lst[idx][1])
    print('-' * 89)

    # Mean Average Error
    criterion = nn.L1Loss() 
    lr = 0.005 
    model = Transformer().to('cpu')
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)

    for i in range(5):
        model = train_one_epoch(i + 1, train_datas[idx], model, optimizer)
        print('-' * 89)
        scheduler.step() 
    
    models.append(model)

EPM_136080 TRGCA
-----------------------------------------------------------------------------------------




| epoch   1 |     7/   37 instances | 158.81 ms | loss 17.24604
| epoch   1 |    14/   37 instances | 14.67 ms | loss 11.35866
| epoch   1 |    21/   37 instances | 16.65 ms | loss 11.82564
| epoch   1 |    28/   37 instances | 13.49 ms | loss 11.66547
| epoch   1 |    35/   37 instances | 11.56 ms | loss 15.73454
-----------------------------------------------------------------------------------------
| epoch   2 |     7/   37 instances | 27.74 ms | loss 16.38223
| epoch   2 |    14/   37 instances | 13.12 ms | loss 9.05232
| epoch   2 |    21/   37 instances | 12.89 ms | loss 11.45190
| epoch   2 |    28/   37 instances | 15.07 ms | loss 10.52405
| epoch   2 |    35/   37 instances | 11.33 ms | loss 14.13780
-----------------------------------------------------------------------------------------
| epoch   3 |     7/   37 instances | 26.85 ms | loss 19.90362
| epoch   3 |    14/   37 instances | 12.53 ms | loss 7.68657
| epoch   3 |    21/   37 instances | 14.09 ms | loss 10.92470
| 



| epoch   1 |     5/   28 instances | 34.81 ms | loss 29.30421
| epoch   1 |    10/   28 instances | 13.17 ms | loss 15.98377
| epoch   1 |    15/   28 instances | 14.45 ms | loss 14.77681
| epoch   1 |    20/   28 instances | 13.81 ms | loss 11.44895
| epoch   1 |    25/   28 instances | 12.42 ms | loss 8.29645
-----------------------------------------------------------------------------------------
| epoch   2 |     5/   28 instances | 26.49 ms | loss 15.70588
| epoch   2 |    10/   28 instances | 12.98 ms | loss 13.20625
| epoch   2 |    15/   28 instances | 12.28 ms | loss 9.94258
| epoch   2 |    20/   28 instances | 13.84 ms | loss 6.65999
| epoch   2 |    25/   28 instances | 12.12 ms | loss 6.69000
-----------------------------------------------------------------------------------------
| epoch   3 |     5/   28 instances | 27.44 ms | loss 17.78985
| epoch   3 |    10/   28 instances | 14.34 ms | loss 12.34952
| epoch   3 |    15/   28 instances | 12.46 ms | loss 9.66815
| epoc

In [10]:
# Evaluation
for idx in range(len(test_datas)):

    print(lst[idx][0], lst[idx][1])
    eval_batch_size = len(test_datas[idx])

    model = models[idx]
    model.eval()
    with torch.no_grad():
        data, targets = get_batch(test_datas[idx], 0, eval_batch_size)
        predictions = model(data)

        mae = mean_absolute_error(targets.view(3, eval_batch_size).numpy(), predictions.view(3, eval_batch_size).numpy())
        mse = mean_squared_error(targets.view(3, eval_batch_size).numpy(), predictions.view(3, eval_batch_size).numpy())

    print("MAE: {:<5} MSE: {:<5}".format(str(round(mae, 3)), str(round(mse, 3))))
    print('-' * 89)

EPM_136080 TRGCA
MAE: 10.046 MSE: 117.798
-----------------------------------------------------------------------------------------
EPM_3041843 CKPXB
MAE: 6.727 MSE: 58.512
-----------------------------------------------------------------------------------------


### Top Items

In [11]:
# Taken from ML - Top Items code
item = 'TALNE'
lst = ['EPM_35159', 'EPM_4334085', 'EPM_1807311', 'EPM_3564728', 'EPM_35002', 'EPM_34985', 'EPM_136080', 'EPM_1624002', 'EPM_3676050', 'EPM_34923']
series = []
for l in lst:
    dfU = df[(df['ship_to_id'] == l) & (df['item_code'] == item)].sort_values('trx_date').reset_index(drop=True)[['ship_to_id', 'item_code', 'trx_date', 'sales_qty']]
    dfU['trx_date'] = pd.to_datetime(dfU['trx_date'])
    dfU = dfU.drop_duplicates('trx_date').reset_index(drop=True)
    dfU['Period'] = dfU['trx_date'].diff().apply(lambda x: x.days)[1:].reset_index(drop=True)
    series.append(list(dfU['Period'][:-1]))

dfU.head() # We only use 'Period' column as the 'series' object

Unnamed: 0,ship_to_id,item_code,trx_date,sales_qty,Period
0,EPM_34923,TALNE,2021-01-20,2,3.0
1,EPM_34923,TALNE,2021-01-23,5,5.0
2,EPM_34923,TALNE,2021-01-28,10,20.0
3,EPM_34923,TALNE,2021-02-17,10,3.0
4,EPM_34923,TALNE,2021-02-20,5,5.0


In [12]:
# Convert series data to sliding window data
train_datas = []
test_datas = []

for s in series:
    train, test, x = create_sliding_window(s)
    train_datas.append(train)
    test_datas.append(test)

print("Size of the train and test data:", train_datas[-1].shape, test_datas[-1].shape)
print("The first three data:")
train_datas[-1][:3]

Size of the train and test data: torch.Size([95, 2, 3]) torch.Size([22, 2, 3])
The first three data:


tensor([[[ 3.,  5., 20.],
         [ 5., 20.,  3.]],

        [[ 5., 20.,  3.],
         [20.,  3.,  5.]],

        [[20.,  3.,  5.],
         [ 3.,  5., 16.]]])

In [13]:
# Training Model

models = []
for idx in range(len(train_datas)):

    print(lst[idx], item)
    print('-' * 89)
    # Mean Average Error
    criterion = nn.L1Loss() 
    lr = 0.005 
    model = Transformer().to('cpu')
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)

    for i in range(5):
        model = train_one_epoch(i + 1, train_datas[idx], model, optimizer)
        print('-' * 89)
        scheduler.step() 
    
    models.append(model)

EPM_35159 TALNE
-----------------------------------------------------------------------------------------
| epoch   1 |    27/  138 instances | 45.37 ms | loss 12.57479
| epoch   1 |    54/  138 instances | 22.50 ms | loss 4.72645
| epoch   1 |    81/  138 instances | 33.94 ms | loss 2.67864
| epoch   1 |   108/  138 instances | 29.78 ms | loss 2.35305
| epoch   1 |   135/  138 instances | 15.74 ms | loss 0.96102
-----------------------------------------------------------------------------------------




| epoch   2 |    27/  138 instances | 59.50 ms | loss 4.49643
| epoch   2 |    54/  138 instances | 27.76 ms | loss 3.40243
| epoch   2 |    81/  138 instances | 25.50 ms | loss 2.44292
| epoch   2 |   108/  138 instances | 24.88 ms | loss 2.41719
| epoch   2 |   135/  138 instances | 12.98 ms | loss 0.82932
-----------------------------------------------------------------------------------------
| epoch   3 |    27/  138 instances | 54.81 ms | loss 4.92077
| epoch   3 |    54/  138 instances | 26.89 ms | loss 2.98112
| epoch   3 |    81/  138 instances | 24.23 ms | loss 2.16505
| epoch   3 |   108/  138 instances | 31.15 ms | loss 2.26282
| epoch   3 |   135/  138 instances | 13.25 ms | loss 0.81894
-----------------------------------------------------------------------------------------
| epoch   4 |    27/  138 instances | 51.62 ms | loss 4.66377
| epoch   4 |    54/  138 instances | 25.14 ms | loss 3.12828
| epoch   4 |    81/  138 instances | 24.50 ms | loss 2.28695
| epoch   4 | 

In [14]:
# Evaluation
for idx in range(len(test_datas)):

    print(lst[idx], item)
    eval_batch_size = len(test_datas[idx])

    model = models[idx]
    model.eval()
    with torch.no_grad():
        data, targets = get_batch(test_datas[idx], 0, eval_batch_size)
        predictions = model(data)

        mae = mean_absolute_error(targets.view(3, eval_batch_size).numpy(), predictions.view(3, eval_batch_size).numpy())
        mse = mean_squared_error(targets.view(3, eval_batch_size).numpy(), predictions.view(3, eval_batch_size).numpy())

    print("MAE: {:<5} MSE: {:<5}".format(str(round(mae, 3)), str(round(mse, 3))))
    print('-' * 89)

EPM_35159 TALNE
MAE: 2.084 MSE: 7.252
-----------------------------------------------------------------------------------------
EPM_4334085 TALNE
MAE: 1.807 MSE: 7.506
-----------------------------------------------------------------------------------------
EPM_1807311 TALNE
MAE: 5.579 MSE: 216.754
-----------------------------------------------------------------------------------------
EPM_3564728 TALNE
MAE: 2.445 MSE: 8.904
-----------------------------------------------------------------------------------------
EPM_35002 TALNE
MAE: 2.605 MSE: 9.068
-----------------------------------------------------------------------------------------
EPM_34985 TALNE
MAE: 2.411 MSE: 11.862
-----------------------------------------------------------------------------------------
EPM_136080 TALNE
MAE: 4.995 MSE: 76.317
-----------------------------------------------------------------------------------------
EPM_1624002 TALNE
MAE: 2.688 MSE: 10.559
----------------------------------------------------