In [21]:
import pandas as pd
import matplotlib.pyplot as plt  # `plt` must name the pyplot module; the bare package has no plotting functions

In [22]:
import os

# Folder holding the raw CSV files. Set the HNM_DATA_DIR environment variable to
# run the notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get("HNM_DATA_DIR", "/Users/gursanjjam/Documents/hnm-recsys/kaggle_data")
transactions = pd.read_csv(os.path.join(DATA_DIR, "transactions_train.csv"))

In [23]:
# Sanity check: heading, blank line, then the first rows of the raw transaction log.
print("TRANSACTIONS\n", transactions.head(), sep="\n")

TRANSACTIONS

        t_dat                                        customer_id  article_id  \
0  2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1  2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
3  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
4  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   

      price  sales_channel_id  
0  0.050831                 2  
1  0.030492                 2  
2  0.015237                 2  
3  0.016932                 2  
4  0.016932                 2  


In [24]:
# Keep only the columns needed to build baskets: purchase date, customer and article.
baskets = transactions.loc[:, ["t_dat", "customer_id", "article_id"]]

In [25]:
# Build one basket (list of article IDs) per customer and purchase date.
# Grouping `transactions` directly, instead of the `baskets` name this cell rebinds,
# keeps the cell safe to re-run: the grouped result has no `article_id` column, so
# feeding it back into the same groupby would raise a KeyError.
baskets = (
    transactions.groupby(['customer_id', 't_dat'])['article_id']  # keep t_dat for the time split
                .apply(list)
                .reset_index(name='products')
)
baskets.head()

Unnamed: 0,customer_id,t_dat,products
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,"[625548001, 176209023, 627759010]"
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-02,[697138006]
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,"[568601006, 568601006]"
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-07-25,"[607642008, 745232001]"
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-09-18,[656719005]


In [26]:
# Preview the first rows of the per-customer, per-date baskets.
baskets.head()

Unnamed: 0,customer_id,t_dat,products
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,"[625548001, 176209023, 627759010]"
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-02,[697138006]
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,"[568601006, 568601006]"
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-07-25,"[607642008, 745232001]"
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-09-18,[656719005]


In [27]:
## Time-based split: partition the baskets chronologically into train / validation / test

In [28]:
# Parse t_dat into datetimes, then look up the first and last purchase day.
baskets['t_dat'] = pd.to_datetime(baskets['t_dat'])
purchase_dates = baskets['t_dat']
min_date, max_date = purchase_dates.min(), purchase_dates.max()
print("Latest date:", max_date)
print("Oldest date:", min_date)

Latest date: 2020-09-22 00:00:00
Oldest date: 2018-09-20 00:00:00


In [29]:
total_days = (max_date - min_date).days

# Chronological cut points, counted in whole days from the oldest date:
# the first 70% of the span is train, the next 15% validation, the rest test.
train_end = min_date + pd.Timedelta(days=int(0.70 * total_days))
val_end = min_date + pd.Timedelta(days=int(0.85 * total_days))

# Boundaries are inclusive on the upper side, so every basket lands in exactly one split.
train = baskets.loc[baskets['t_dat'].le(train_end)]
val = baskets.loc[baskets['t_dat'].gt(train_end) & baskets['t_dat'].le(val_end)]
test = baskets.loc[baskets['t_dat'].gt(val_end)]

# Report the date range that ended up in each split.
for label, split in (("Train:", train), ("Val:  ", val), ("Test: ", test)):
    print(label, split['t_dat'].min(), "→", split['t_dat'].max())

Train: 2018-09-20 00:00:00 → 2020-02-15 00:00:00
Val:   2020-02-16 00:00:00 → 2020-06-04 00:00:00
Test:  2020-06-05 00:00:00 → 2020-09-22 00:00:00


In [30]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,customer_id,t_dat,products
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,"[625548001, 176209023, 627759010]"
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-02,[697138006]
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,"[568601006, 568601006]"
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-07-25,"[607642008, 745232001]"
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-09-18,[656719005]


In [31]:
# Flatten each split: concatenate every customer's baskets, in date order, into one item sequence.

In [32]:
def build_sequences(split):
    """Collapse one split's baskets into a single item sequence per customer.

    Args:
        split: DataFrame with ``customer_id``, ``t_dat`` and ``products`` columns,
            where ``products`` holds one list of items per row.

    Returns:
        DataFrame with one row per customer and two columns: ``customer_id`` and
        ``sequence``, the customer's product lists concatenated in ``t_dat`` order.
    """
    return (
        split.sort_values(["customer_id", "t_dat"])  # chronological order within each customer
             .groupby("customer_id")["products"]
             .apply(lambda lists: [item for sublist in lists for item in sublist])
             .reset_index(name="sequence")
    )


# The same transformation is applied to every split, so it lives in one helper.
train_seq = build_sequences(train)
val_seq = build_sequences(val)
test_seq = build_sequences(test)

In [33]:
import numpy as np

# Collect all unique product IDs across train/val/test.
# Growing one set sequence by sequence still works when a split has no rows
# (np.concatenate raises on an empty input) and avoids building a large
# temporary array per split.
# NOTE(review): the vocabulary also covers items that occur only in val/test — confirm this is intended.
all_products = set()
for seq_frame in (train_seq, val_seq, test_seq):
    for sequence in seq_frame["sequence"]:
        all_products.update(sequence)

# Map product → index. Indices start at 1 so that 0 stays free for padding.
product2idx = {product: index for index, product in enumerate(sorted(all_products), start=1)}
# Reverse lookup: index → product.
idx2product = {index: product for product, index in product2idx.items()}

# Encode product-ID sequences as integer indices
def encode_sequence(seq, mapping):
    """Translate the items of ``seq`` through ``mapping``, preserving their order.

    Items that are not keys of ``mapping`` are skipped silently, so the result
    can be shorter than ``seq``.

    Args:
        seq: iterable of items to encode.
        mapping: dict-like lookup from item to encoded value.

    Returns:
        A new list with the encoded value of every item found in ``mapping``.
    """
    encoded = []
    for product in seq:
        if product in mapping:
            encoded.append(mapping[product])
    return encoded

# Add an "encoded" column to every split, using the shared product → index vocabulary.
for seq_frame in (train_seq, val_seq, test_seq):
    seq_frame["encoded"] = seq_frame["sequence"].apply(encode_sequence, mapping=product2idx)

In [34]:
import torch
# Fixed length that every encoded sequence is padded or truncated to.
MAX_LEN = 50  # or choose based on a quantile of the sequence lengths

def pad_fixed_length(sequences, max_len, padding_value=0):
    """Bring every sequence to exactly ``max_len`` items and stack them into one tensor.

    Sequences longer than ``max_len`` keep only their last ``max_len`` items.
    Shorter ones are filled at the front with ``padding_value``.

    Args:
        sequences: iterable of 1-D integer sequences, given as tensors or plain lists.
        max_len: target length of every row; must be non-negative.
        padding_value: value written into the positions in front of short sequences.

    Returns:
        A ``torch.long`` tensor of shape ``(len(sequences), max_len)``. An empty
        ``sequences`` yields a tensor of shape ``(0, max_len)``.

    Raises:
        ValueError: if ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    sequences = list(sequences)
    # Allocate the result once, already filled with padding; rows are then
    # overwritten from the right, which avoids a cat per row and a final stack.
    out = torch.full((len(sequences), max_len), padding_value, dtype=torch.long)
    if max_len == 0:
        # seq[-0:] would select the whole sequence, so there is nothing to copy here.
        return out

    for row, seq in enumerate(sequences):
        tail = torch.as_tensor(seq, dtype=torch.long)[-max_len:]  # keep the most recent items
        if tail.numel():
            out[row, max_len - tail.numel():] = tail
    return out

In [35]:
# Turn every encoded sequence into a 1-D long tensor, one list of tensors per split.
train_tensors, val_tensors, test_tensors = (
    [torch.tensor(seq, dtype=torch.long) for seq in seq_frame["encoded"]]
    for seq_frame in (train_seq, val_seq, test_seq)
)

# Pad or truncate each split to MAX_LEN, giving one 2-D tensor per split.
train_padded, val_padded, test_padded = (
    pad_fixed_length(tensors, MAX_LEN)
    for tensors in (train_tensors, val_tensors, test_tensors)
)

In [36]:
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    """Dataset wrapper that serves items of an indexable collection of sequences."""

    def __init__(self, sequences):
        """Store ``sequences`` as given; it must support ``len()`` and indexing."""
        self.sequences = sequences

    def __getitem__(self, idx):
        """Return whatever ``sequences`` holds at position ``idx``."""
        return self.sequences[idx]

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

# Wrap each padded tensor in a dataset.
train_dataset, val_dataset, test_dataset = map(
    SequenceDataset, (train_padded, val_padded, test_padded)
)

# Only the training loader shuffles; validation and test keep their stored order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=64) for dataset in (val_dataset, test_dataset)
)

In [37]:
import os

# torch.save does not create missing parent folders, so make sure the output
# directory exists before writing the padded tensors.
os.makedirs("data/splits", exist_ok=True)
torch.save(train_padded, "data/splits/train_padded.pt")
torch.save(val_padded, "data/splits/val_padded.pt")
torch.save(test_padded, "data/splits/test_padded.pt")