In [1]:
import os;os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import time
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn import metrics

import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [3]:
SEED = 20193575

In [4]:
np.random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
articles = pd.read_parquet('../data/articles.parquet')
customers = pd.read_parquet('../data/customers.parquet')
transactions = pd.read_parquet('../data/transactions_train.parquet')
sample_submission = pd.read_csv('../data/sample_submission.csv')

# Simple preprocessing

In [6]:
article_id_map = {original: (idx + 1) for idx, original in enumerate(articles["article_id"].unique())}
inverse_article_id_map = {(idx + 1): original for idx, original in enumerate(articles["article_id"].unique())}
articles["article_id_mapped"] = articles["article_id"].map(article_id_map)
transactions["article_id_mapped"] = transactions["article_id"].map(article_id_map)

In [7]:
articles.nunique()

article_id                      105542
product_code                     47224
prod_name                        45875
product_type_no                    132
product_type_name                  131
product_group_name                  19
graphical_appearance_no             30
graphical_appearance_name           30
colour_group_code                   50
colour_group_name                   50
perceived_colour_value_id            8
perceived_colour_value_name          8
perceived_colour_master_id          20
perceived_colour_master_name        20
department_no                      299
department_name                    250
index_code                          10
index_name                          10
index_group_no                       5
index_group_name                     5
section_no                          57
section_name                        56
garment_group_no                    21
garment_group_name                  21
detail_desc                      43405
article_id_mapped        

## Model definition

In [8]:
class LSTMRecommender(nn.Module):
    def __init__(self, embedding_dim, input_dim, hidden_dim, n_articles, num_layers=2, bidirectional=True, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_articles = n_articles
        self.n_directions = 2 if bidirectional else 1
        self.num_layers = num_layers

        # Embedding articles to a lower dimension
        self.embedding = nn.Embedding(n_articles, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_dim * num_layers, n_articles)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.n_directions * self.num_layers, x.size(0), self.hidden_dim, requires_grad=True, device=device)
        c0 = torch.zeros(self.n_directions * self.num_layers, x.size(0), self.hidden_dim, requires_grad=True, device=device)
        # Embed
        embedded_sequence = self.embedding(x)
        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(embedded_sequence, (h0.detach(), c0.detach()))
        # Dropout
        out = self.dropout(out)
        # Decode hidden state of last time step
        out = self.fc(out[:, -1, :])
        out = F.softmax(out, dim=1)
        return torch.max(out, dim=1)[1]

## Training

In [9]:
# Split 80/20 on customer id
train_customers, val_customers = train_test_split(transactions.customer_id.unique(), test_size=0.2, random_state=SEED)
training_transactions_df = transactions[transactions.customer_id.isin(train_customers)]
validation_transactions_df = transactions[transactions.customer_id.isin(val_customers)]

In [10]:
N_TRAINING_WEEKS = 5
MAX_WEEK = 105

In [11]:
def filter_transactions_by_weeks(transactions):
    _transactions = transactions["customer_id article_id_mapped week".split()]
    filtered_transactions = _transactions[_transactions.week.between(MAX_WEEK - N_TRAINING_WEEKS, MAX_WEEK - 1)]
    filtered_transactions = filtered_transactions.groupby("customer_id")["article_id_mapped"].apply(list).reset_index(name="history")
    return filtered_transactions

In [12]:
training_transactions = filter_transactions_by_weeks(training_transactions_df)

In [13]:
validation_transactions = filter_transactions_by_weeks(validation_transactions_df)

In [14]:
transactions.customer_id.value_counts().describe()

count    1.362281e+06
mean     2.333463e+01
std      3.924225e+01
min      1.000000e+00
25%      3.000000e+00
50%      9.000000e+00
75%      2.700000e+01
max      1.895000e+03
Name: count, dtype: float64

In [15]:
training_transactions_df.customer_id.value_counts().median()

9.0

In [16]:
EMBEDDING_DIM = 64
HIDDEN_DIM = 100

BATCH_SIZE = 128

# Padding article added to map, actual articles start at 1
article_id_map[-1] = 0
PADDING_ARTICLE = 0

NUM_ARTICLES_IN_SEQUENCE = 12
N_ARTICLES = articles.article_id.nunique()

model = LSTMRecommender(
    input_dim=NUM_ARTICLES_IN_SEQUENCE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    # Output dim is only the number of articles while n_articles is for the embedding and has to include the padding
    n_articles=N_ARTICLES+1,
    bidirectional=False,
    num_layers=1,
    dropout=0.2
    )

loss_fn = nn.MSELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [17]:
articles.article_id.describe()

count    1.055420e+05
mean     6.984246e+08
std      1.284624e+08
min      1.087750e+08
25%      6.169925e+08
50%      7.022130e+08
75%      7.967030e+08
max      9.594610e+08
Name: article_id, dtype: float64

In [18]:
articles.article_id.max() - articles.article_id.min()

850685986

In [19]:
class SequenceDataset(Dataset):
   def __init__(self, sequences, targets):
       self.sequences = sequences
       self.targets = targets

   def __len__(self):
       return len(self.sequences)

   def __getitem__(self, idx):
       return self.sequences[idx], self.targets[idx]


def combine_sequences(user_transactions):
    combined_sequence_batch = []
    combined_target_batch = []
    
    for idx, (customer, history) in user_transactions.iterrows():
        history_batch, target_batch = create_batch(history)
        if history_batch is None or target_batch is None:
            continue
        combined_sequence_batch.extend(history_batch)
        combined_target_batch.extend(target_batch)
    
    sequence_dataset = SequenceDataset(combined_sequence_batch, combined_target_batch)
    dataloader = DataLoader(sequence_dataset, batch_size=BATCH_SIZE, shuffle=True)
    return dataloader


def create_batch(history):
    # Create batch of sequences
    if len(history) <= 1:
        return None, None
    history_batch = []
    target_batch = []
    for i in range(1, len(history)):
        if i < 12:
            # Add padding to the beginning of the sequence
            history_batch.append(torch.tensor([PADDING_ARTICLE] * (NUM_ARTICLES_IN_SEQUENCE - i) + history[:i], dtype=torch.int32))
        else:
            history_batch.append(torch.tensor(history[i-12:i], dtype=torch.int32))
        target_batch.append(torch.tensor(history[i], dtype=torch.float32, requires_grad=True))
    return history_batch, target_batch

In [20]:
training_transactions.head()

Unnamed: 0,customer_id,history
0,116809474287335,"[103797, 105274, 81604, 97235, 87175, 102762]"
1,200292573348128,[102421]
2,329094189075899,"[85869, 85869]"
3,690285180337957,"[103798, 103796, 103798]"
4,745180086074610,"[94675, 95031, 80903, 74854, 18611, 85677, 102..."


In [21]:
def run_validation():
    y_true, y_pred = [], []
    criterion = nn.MSELoss()
    val_loss = 0.0
    
    with torch.no_grad():
        dataloader = combine_sequences(validation_transactions)
        for sequences, targets in dataloader:
            sequence = sequences.to(device)
            target = targets.to(device)
            
            # Predict
            out = model(sequence)
            loss = criterion(out.to(torch.float32).clone().detach(), target)
            val_loss += loss.item()
            y_pred.extend(out.cpu().numpy())
            y_true.extend(target.cpu().numpy())
        
    val_loss /= len(dataloader)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    return accuracy, val_loss

In [22]:
EPOCHS = 50

print(len(training_transactions))

start_time = time.perf_counter()
print("Training start:", time.asctime(time.localtime()))

header_printed = False
col_widths = []
table_seperator = ""

running_loss = 0.0
val_losses = []
train_losses = []

model.to(device)
dataloader = combine_sequences(training_transactions)
len(dataloader)
for epoch in range(EPOCHS):
    epoch_start_time = time.perf_counter()
    
    training_accuracy = 0
    epoch_loss = 0.0
    for idx, (sequence, target) in enumerate(dataloader):
        sequence = sequence.to(device)
        target = target.to(device)
        # Predict
        out = model(sequence)
        # Calculate loss
        loss = loss_fn(
            # torch.tensor(out, dtype=torch.float32, requires_grad=True), torch.tensor(target[0], dtype=torch.float32, requires_grad=True)
            out.to(torch.float32).clone().detach().requires_grad_(True), target
        )
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Statistics
        epoch_loss += loss.item()
        training_accuracy += (out == target).sum().item()

    epoch_loss /= len(dataloader)
    running_loss += epoch_loss
    training_accuracy /= len(dataloader.dataset)
    
    val_accuracy, val_loss = run_validation()
    
    train_losses.append(epoch_loss)
    val_losses.append(float(val_loss))
    
    if not header_printed:
        header_printed = True
        header_text = "| Epoch | Validation Accuracy | Training Accuracy |     Validation Loss     |     Epoch Loss     |     Running Loss     | Epoch Time |"
        col_widths = [len(s)-2 for s in header_text.split("|")[1:-1]]
        table_seperator = f"+{'+'.join(['-' * (x + 2) for x in col_widths])}+"
        print(table_seperator)
        print(header_text)
        print(table_seperator)
    
    print(f"| {str(epoch + 1):<{col_widths[0]}} | "
              f"{f'{val_accuracy:.4%}':<{col_widths[1]}} | "
              f"{f'{training_accuracy:.4%}':<{col_widths[2]}} | "
              f"{f'{val_loss:.4f}':<{col_widths[3]}} | "
              f"{f'{epoch_loss:.4f}':<{col_widths[4]}} | "
              f"{f'{running_loss:.4f}':<{col_widths[5]}} | "
              f"{f'{time.perf_counter() - epoch_start_time:.2f}s':<{col_widths[6]}} |")
    
    torch.save(model.state_dict(), f"./models/LSTM_Model_Epoch_{epoch + 1}.pt")

end_time = time.perf_counter()
print(f"Training time: {end_time - start_time:.2f}s")

218069
Training start: Wed Nov  8 02:42:16 2023
+-------+---------------------+-------------------+-------------------------+--------------------+----------------------+------------+
| Epoch | Validation Accuracy | Training Accuracy |     Validation Loss     |     Epoch Loss     |     Running Loss     | Epoch Time |
+-------+---------------------+-------------------+-------------------------+--------------------+----------------------+------------+
| 1     | 0.0010%             | 0.0007%           | 2450971860.7291         | 2443925412.3400    | 2443925412.3400      | 26.53s     |
| 2     | 0.0010%             | 0.0009%           | 2454524028.5320         | 2449663098.4400    | 4893588510.7800      | 27.64s     |
| 3     | 0.0014%             | 0.0007%           | 2453551624.2759         | 2445347007.5600    | 7338935518.3400      | 28.39s     |
| 4     | 0.0014%             | 0.0007%           | 2448871336.3547         | 2446907938.6800    | 9785843457.0200      | 28.46s     |
| 5    

# Generate submission

In [23]:
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

In [24]:
sub = pd.read_csv('../data/sample_submission.csv')

In [25]:
model = LSTMRecommender(
    input_dim=NUM_ARTICLES_IN_SEQUENCE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    # Output dim is only the number of articles while n_articles is for the embedding and has to include the padding
    n_articles=N_ARTICLES+1,
    bidirectional=True,
    dropout=0.2
    )

In [27]:
model.load_state_dict(torch.load("./models/LSTM_Model_Epoch_50.pt"))
model.to(device)

LSTMRecommender(
  (embedding): Embedding(105543, 64)
  (lstm): LSTM(64, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=105543, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [28]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,article_id_mapped
25784,2018-09-20,1728846800780188,519773001,0.028458,2,0,8050
25785,2018-09-20,1728846800780188,578472001,0.032525,2,0,18291
5389,2018-09-20,2076973761519164,661795002,0.167797,2,0,39546
5390,2018-09-20,2076973761519164,684080003,0.101678,2,0,45748
47429,2018-09-20,2918879973994241,662980001,0.033881,1,0,39897


In [29]:
class HistoryDataset(Dataset):
    def __init__(self, history):
        self.histories = history

    def __len__(self):
        return len(self.histories)

    def __getitem__(self, idx):
        history = self.histories[idx]
        if len(history) < 12:
            history = [PADDING_ARTICLE] * (NUM_ARTICLES_IN_SEQUENCE - len(history)) + history
        return torch.tensor(history[-12:], dtype=torch.int32)

In [30]:
# Sort the dataframe by "customer_id" and "t_dat" in descending order
df = transactions.sort_values(by=['customer_id', 't_dat'], ascending=[True, True])

# Group by "customer_id" and get the last 12 transactions for each customer
df_grouped = df.groupby('customer_id')['article_id_mapped'].apply(list)
transactions_filtered = pd.DataFrame({"customer_id": df_grouped.index, "sequence": df_grouped.apply(lambda x: x[-12:])})

In [31]:
# Convert customer ids to integers
customer_ids = sub.customer_id.apply(hex_id_to_int).tolist()

In [32]:
missing_customer_ids = list(set(customer_ids).difference(set(transactions_filtered.customer_id)))

In [33]:
# Create a new dataframe with the missing customer_ids and an empty list as the sequence
df_missing = pd.DataFrame({
  'customer_id': missing_customer_ids,
  'sequence': [[] for _ in range(len(missing_customer_ids))]
})

# Concatenate df_result and df_missing
transactions_filtered = pd.concat([transactions_filtered, df_missing])

In [34]:
# Sorting based on customer_ids in submission
transactions_df = transactions_filtered.copy()
transactions_df['customer_id'] = pd.Categorical(transactions_df['customer_id'], categories=customer_ids, ordered=True)
transactions_df_sorted = transactions_df.sort_values("customer_id")

In [35]:
transactions_df_sorted

Unnamed: 0,customer_id,sequence
6.883939e+18,6883939031699146327,"[79279, 79279, 76504, 49479, 76591, 83623, 900..."
1.124633e+19,11246327431398957306,"[83344, 83344, 83344, 59459, 13741, 22665, 834..."
1.843990e+19,18439897732908966680,"[1470, 60254, 60260, 93586, 91842, 67275, 6727..."
1.835267e+19,18352672461570950206,"[64526, 61176]"
1.816278e+19,18162778555210377306,"[43443, 51125, 54462, 2182, 2182, 20518, 87477..."
...,...,...
7.551062e+18,7551062398649767985,"[86089, 22653, 69116, 86099, 45799, 86106, 564..."
9.305342e+18,9305341941720086711,"[7193, 89051, 92636, 92643, 81335, 41294, 4645..."
1.016043e+19,10160427316885688932,"[69226, 53894, 84735, 16535, 16535, 32744, 889..."
2.551401e+18,2551401172826382186,"[85873, 81752, 56695, 95507, 89865, 95507, 99150]"


In [36]:
history_dataset = HistoryDataset(transactions_df_sorted.sequence.tolist())
history_dataloader = DataLoader(history_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [37]:
preds = []
counter = 0
history_batches = []
history_batch = []

for idx, batch in enumerate(history_dataloader):
    batch = batch.to(device)
    
    for i in range(12):
        # Pass padded batches to the model
        with torch.no_grad():
            out = model(batch[:, -12:]).unsqueeze(1)

        # Append model's output to each transaction in the batch
        batch = torch.cat((batch, out), dim=1)
    for i in range(batch.shape[0]):
        preds.append(batch[i, -12:].tolist())
    if idx % 100 == 0:
        print(idx)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700


In [None]:
preds = []
counter = 0
for c_id in customer_hex_id_to_int(sub.customer_id):
    history = transactions[transactions.customer_id==c_id]["article_id_mapped"].tolist()

    for i in range(12):
        if len(history) < 12:
            history = [PADDING_ARTICLE] * (NUM_ARTICLES_IN_SEQUENCE - len(history)) + history
        with torch.no_grad():
            out = model(torch.tensor([history[-12:]], dtype=torch.int32).to(device)).item()
        history.append(out)

    preds.append(history[-12:])
    counter+=1
    if counter % 100 == 0:
        print(counter)

In [None]:
import pickle
with open("rec_list.bin", "wb") as f:
    pickle.dump(preds, f)

In [None]:
with open("rec_list.bin", "rb") as f:
    some_list = pickle.load(f)

In [None]:
print(len(preds), len(some_list))

In [None]:
transactions["article_id"].value_counts()

In [36]:
_preds = [' '.join(['0' + str(inverse_article_id_map.get(p, 706016001)) for p in ps]) for ps in preds]
sub.prediction = _preds

In [37]:
sub_name = 'lstm_model_submission_e21'
sub.to_csv(f'{sub_name}.csv.gz', index=False)