In [2]:
# Models.py
from typing import Optional
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import Linear
from torch.nn import functional as F


def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):
    """Accuracy of the arg-max prediction, restricted to positions where ``mask`` is True.

    Args:
        y_pred: Scores; the predicted class is the index of the maximum along dim 1.
        y_true: Target class indices, one per row of ``y_pred``.
        mask: Boolean tensor selecting which positions are scored.

    Returns:
        A scalar float64 tensor holding the fraction of selected positions that
        were predicted correctly (NaN when ``mask`` selects nothing, because the
        mean of an empty tensor is NaN).
    """
    top_class = y_pred.max(dim=1).indices

    selected_truth = y_true.masked_select(mask)
    selected_pred = top_class.masked_select(mask)

    hits = selected_truth == selected_pred
    return hits.double().mean()


def masked_ce(y_pred, y_true, mask):
    """Mean cross-entropy over the positions selected by ``mask``.

    The per-position cross-entropy is computed for every row, zeroed where
    ``mask`` is False, summed, and divided by the number of selected positions.
    A small epsilon (1e-8) in the denominator keeps the result finite (zero)
    when ``mask`` selects nothing.

    Args:
        y_pred: Unnormalised class scores, one row per position.
        y_true: Target class index for each row.
        mask: Boolean (or 0/1) tensor with one entry per row.

    Returns:
        Scalar tensor with the masked mean loss.
    """
    per_position = F.cross_entropy(y_pred, y_true, reduction="none")

    masked_total = (per_position * mask).sum()
    n_selected = mask.sum() + 1e-8

    return masked_total / n_selected


class Recommender(pl.LightningModule):
    """Transformer-encoder sequence model trained to recover masked items.

    Each input position holds an item id. Item and learned position embeddings
    are summed, passed through a 6-layer / 4-head ``TransformerEncoder`` and
    projected to one score per vocabulary entry at every position. The loss and
    accuracy are only computed at positions whose *input* id equals ``mask``.

    Args:
        vocab_size: Number of distinct ids (size of the embedding table and of
            the output layer).
        channels: Embedding / model width.
        cap: Stored on the instance; not used by any method in this class.
        mask: Id that marks a masked input position.
        dropout: Dropout probability used inside the encoder layers.
        lr: Learning rate for Adam.
    """

    def __init__(
        self,
        vocab_size,
        channels=128,
        cap=0,
        mask=1,
        dropout=0.4,
        lr=1e-4,
    ):
        super().__init__()

        self.cap = cap
        self.mask = mask

        self.lr = lr
        self.dropout = dropout
        self.vocab_size = vocab_size

        # NOTE: sub-module names and creation order are kept stable so that
        # existing checkpoints (state_dict keys) keep loading.
        self.item_embeddings = torch.nn.Embedding(
            self.vocab_size, embedding_dim=channels
        )

        # Learned positional embedding; sequences longer than 512 positions
        # would index past this table.
        self.input_pos_embedding = torch.nn.Embedding(512, embedding_dim=channels)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=channels, nhead=4, dropout=self.dropout
        )

        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=6)

        self.linear_out = Linear(channels, self.vocab_size)

        # Not applied anywhere in this class; kept for compatibility.
        self.do = nn.Dropout(p=self.dropout)

    def encode_src(self, src_items):
        """Embed ``src_items`` (batch, seq) and run them through the encoder.

        Returns a tensor laid out as (batch, seq, channels).
        """
        item_emb = self.item_embeddings(src_items)

        batch_size, in_sequence_len = item_emb.size(0), item_emb.size(1)
        positions = (
            torch.arange(0, in_sequence_len, device=item_emb.device)
            .unsqueeze(0)
            .repeat(batch_size, 1)
        )

        # Out-of-place add: avoids mutating the embedding output in place.
        src = item_emb + self.input_pos_embedding(positions)

        # The encoder is built with the default sequence-first layout, so swap
        # the batch and sequence axes on the way in and out.
        src = src.permute(1, 0, 2)
        src = self.encoder(src)

        return src.permute(1, 0, 2)

    def forward(self, src_items):
        """Return per-position vocabulary scores for ``src_items``."""
        src = self.encode_src(src_items)

        out = self.linear_out(src)

        return out

    def _shared_step(self, batch, stage):
        """Compute masked loss/accuracy for one batch and log them as ``<stage>_*``."""
        src_items, y_true = batch

        y_pred = self(src_items)

        # Flatten (batch, seq, vocab) -> (batch * seq, vocab) so every position
        # is scored independently.
        y_pred = y_pred.view(-1, y_pred.size(2))
        y_true = y_true.view(-1)

        # Only positions that were masked in the input contribute.
        src_items = src_items.view(-1)
        mask = src_items == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log(f"{stage}_loss", loss)
        self.log(f"{stage}_accuracy", accuracy)

        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_accuracy`` and return the loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log ``valid_loss`` / ``valid_accuracy`` and return the loss."""
        return self._shared_step(batch, "valid")

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` / ``test_accuracy`` and return the loss."""
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau schedule driven by ``valid_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "valid_loss",
        }


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
import random

import numpy as np
import pandas as pd

# Reserved token ids. Real products are mapped starting at 2 (see map_column),
# so these two values never collide with a product id.
PAD = 0  # filler used by pad_list to bring sequences up to a fixed length
MASK = 1  # placeholder substituted for an item the model has to predict


def map_column(df: pd.DataFrame, col_name: str):
    """Encode the distinct values of ``df[col_name]`` as consecutive integers.

    The sorted distinct values are numbered from 2 upwards (ids 0 and 1 are
    left free). The encoded values are written to a new column named
    ``<col_name>_mapped`` on ``df`` itself (the frame is modified in place).

    Returns:
        ``(df, mapping, inverse_mapping)`` where ``mapping`` goes from original
        value to integer id and ``inverse_mapping`` goes back.
    """
    distinct_values = sorted(df[col_name].unique())

    mapping = {}
    inverse_mapping = {}
    for code, value in enumerate(distinct_values, start=2):
        mapping[value] = code
        inverse_mapping[code] = value

    df[col_name + "_mapped"] = df[col_name].map(mapping)

    return df, mapping, inverse_mapping


def get_context(df: pd.DataFrame, split: str, context_size: int = 120, val_context_size: int = 5):
    """Return a window of at most ``context_size`` consecutive rows of ``df``.

    * ``"train"``: the window ends at a random row index drawn from
      ``[10, len(df) - val_context_size]``; when the frame is too short for
      that range the end index falls back to 10.
    * ``"val"`` / ``"test"``: the window ends at the last row.

    Raises:
        ValueError: if ``split`` is none of ``"train"``, ``"val"``, ``"test"``.
    """
    if split not in ("train", "val", "test"):
        raise ValueError

    n_rows = df.shape[0]

    if split == "train":
        too_short = val_context_size >= n_rows - 10
        end_index = 10 if too_short else random.randint(10, n_rows - val_context_size)
    else:
        end_index = n_rows

    start_index = end_index - context_size
    if start_index < 0:
        start_index = 0

    return df[start_index:end_index]


def pad_arr(arr: np.ndarray, expected_size: int = 30):
    """Left-pad a 2-D array along axis 0 up to ``expected_size`` rows.

    Missing rows are added in front of the data by repeating the first row
    (``mode="edge"``). An array that already has ``expected_size`` rows or more
    is returned with its contents unchanged instead of raising, mirroring how
    ``pad_list`` treats inputs that are already long enough.

    Args:
        arr: 2-D array with at least one row (edge padding cannot extend an
            empty axis).
        expected_size: Minimum number of rows in the result.

    Returns:
        A new array with ``max(arr.shape[0], expected_size)`` rows.
    """
    # np.pad rejects negative widths, so clamp when arr is already long enough.
    n_missing = max(expected_size - arr.shape[0], 0)
    arr = np.pad(arr, [(n_missing, 0), (0, 0)], mode="edge")
    return arr


def pad_list(list_integers, history_size: int, pad_val: int = PAD, mode="left"):
    """Pad ``list_integers`` with ``pad_val`` until it has ``history_size`` entries.

    With ``mode="left"`` the padding goes in front of the data; any other value
    of ``mode`` appends it. A list that is already ``history_size`` long or
    longer is returned as-is (same object, no truncation).
    """
    n_missing = history_size - len(list_integers)
    if n_missing <= 0:
        return list_integers

    filler = [pad_val] * n_missing
    if mode == "left":
        return filler + list_integers
    return list_integers + filler


def df_to_np(df, expected_size=30):
    """Convert ``df`` to a NumPy array and pad it to ``expected_size`` rows via ``pad_arr``."""
    return pad_arr(np.array(df), expected_size=expected_size)



In [4]:
# training.py
import random
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader
from pathlib import Path
import numpy as np

def mask_last_elements_list(l1, val_context_size: int = 5):
    """Randomly mask items in the last ``val_context_size`` positions of ``l1``.

    The head of the list is kept untouched; each item of the tail is passed
    through ``mask_list`` with a keep-probability of 0.5. Returns a new list.
    """
    head = l1[:-val_context_size]
    tail = l1[-val_context_size:]

    return head + mask_list(tail, p=0.5)

def mask_list(l1, p=0.8):
    """Return a copy of ``l1`` in which each item is kept with probability ``p``.

    Items that are not kept are replaced by ``MASK``. One random draw is made
    per item, in order.
    """
    masked = []
    for item in l1:
        keep = random.random() < p
        masked.append(item if keep else MASK)

    return masked

class Dataset(torch.utils.data.Dataset):
    """One sample per group key: a (masked input, target) pair of item-id sequences.

    Args:
        groups: List of group keys; ``len(groups)`` is the dataset length.
        grp_by: Group-by object queried with ``get_group(key)``.
        split: ``"train"`` masks items anywhere in the window; any other value
            only masks within the last items (see ``mask_last_elements_list``).
        history_size: Window length; both sequences are padded up to it.
    """

    def __init__(self, groups, grp_by, split, history_size=120):
        self.groups = groups
        self.grp_by = grp_by
        self.split = split
        self.history_size = history_size

    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        """Build the ``(src_items, trg_items)`` long tensors for group ``idx``."""
        history = self.grp_by.get_group(self.groups[idx])

        window = get_context(history, split=self.split, context_size=self.history_size)

        targets = window["product_id_mapped"].tolist()

        # The input is the target sequence with some items replaced by MASK.
        if self.split == "train":
            inputs = mask_list(targets)
        else:
            inputs = mask_last_elements_list(targets)

        # Padding side is chosen at random per sample and shared by both sequences.
        pad_mode = "right" if random.random() >= 0.5 else "left"
        targets = pad_list(targets, history_size=self.history_size, mode=pad_mode)
        inputs = pad_list(inputs, history_size=self.history_size, mode=pad_mode)

        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

In [9]:
def train(
    data_csv_path: Path,
    log_dir: str = "/kaggle/working/recommender_logs",
    model_dir: str = "/kaggle/working/recommender_models",
    batch_size: int = 32,
    epochs: int = 2000,
    history_size: int = 120,
):
    """Train a ``Recommender`` on the interaction log stored in ``data_csv_path``.

    The CSV must provide ``review_date``, ``product_id`` and ``customer_id``
    columns. Rows are ordered by review time, product ids are integer-encoded,
    and one training sample is built per customer. The training and validation
    datasets are built from the same customers and differ only in ``split``.

    Args:
        data_csv_path: CSV file to read.
        log_dir: Directory for TensorBoard logs.
        model_dir: Directory where the best checkpoint (lowest ``valid_loss``)
            is written.
        batch_size: Batch size of both data loaders.
        epochs: Maximum number of training epochs.
        history_size: Sequence length fed to the model.

    Returns:
        Dict with ``val_loss`` (test-loop loss on the validation loader) and
        ``best_model_path``.
    """
    data = pd.read_csv(data_csv_path)

    # Review dates -> integer seconds, used only to order each customer's history.
    data["review_date"] = pd.to_datetime(data["review_date"])
    data["timestamp"] = data["review_date"].astype(np.int64) // 10**9
    data = data.sort_values(by="timestamp")

    data, mapping, _ = map_column(data, col_name="product_id")

    grp_by_train = data.groupby(by="customer_id")
    groups = list(grp_by_train.groups)

    dataset_kwargs = dict(groups=groups, grp_by=grp_by_train, history_size=history_size)
    train_data = Dataset(split="train", **dataset_kwargs)
    val_data = Dataset(split="val", **dataset_kwargs)

    print("len(train_data)", len(train_data))
    print("len(val_data)", len(val_data))

    loader_kwargs = dict(batch_size=batch_size, num_workers=10)
    train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_data, shuffle=False, **loader_kwargs)

    # +2 leaves room for the two ids below the first mapped product id.
    model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )

    logger = TensorBoardLogger(save_dir=log_dir)

    checkpoint_callback = ModelCheckpoint(
        monitor="valid_loss",
        mode="min",
        dirpath=model_dir,
        filename="recommender-jewellery",
    )

    trainer = pl.Trainer(
        max_epochs=epochs,
        logger=logger,
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model, train_loader, val_loader)

    result_val = trainer.test(dataloaders=val_loader)

    output_json = {
        "val_loss": result_val[0]["test_loss"],
        "best_model_path": checkpoint_callback.best_model_path,
    }

    print(output_json)

    return output_json

In [None]:
# Launch training on the augmented jewellery CSV: 150 epochs, batch size 32,
# default log/checkpoint directories and history size.
train(
        data_csv_path=Path(r"/kaggle/input/augmented-dataset/Augmented_Jewelry.csv"),
        batch_size = 32,
        epochs=150,
    )

len(train_data) 97
len(val_data) 97


  data['review_date'] = pd.to_datetime(data['review_date'])
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [6]:
# Reload the training CSV at notebook level (train() only reads it into a
# local variable) and check its dimensions.
data = pd.read_csv("/kaggle/input/augmented-dataset/Augmented_Jewelry.csv")
data.shape

(1194, 6)

In [33]:
# NOTE: dropna() returns a new frame that is only displayed here; the result is
# not assigned, so `data` itself still contains any rows with missing values.
data.dropna()

Unnamed: 0,customer_id,review_id,product_id,product_title,product_category,rating,review_headline,review_body,review_date
0,24371595,R27ZP1F1CD0C3Y,B004LLIL5A,Amazon eGift Card - Celebrate,Gift Card,5,Five Stars,Great birthday gift for a young adult.,31-08-2015
1,42489718,RJ7RSBCHUDNNE,B004LLIKVU,Amazon.com eGift Cards,Gift Card,5,Gift card for the greatest selection of items ...,It's an Amazon gift card and with over 9823983...,31-08-2015
2,861463,R1HVYBSKLQJI5S,B00IX1I3G6,Amazon.com Gift Card Balance Reload,Gift Card,5,Five Stars,Good,31-08-2015
3,25283295,R2HAXF0IIYQBIR,B00IX1I3G6,Amazon.com Gift Card Balance Reload,Gift Card,1,One Star,Fair,31-08-2015
4,397970,RNYLPX611NB7Q,B005ESMGV4,"Amazon.com Gift Cards, Pack of 3 (Various Desi...",Gift Card,5,Five Stars,I can't believe how quickly Amazon can get the...,31-08-2015
...,...,...,...,...,...,...,...,...,...
995,9033254,R1TS3MW501CTP6,B00B2TFURQ,Amazon Gift Card - Print - Thank You (Note),Gift Card,5,Five Stars,Awesome gift,31-08-2015
996,29132432,R1Y3I0RJ1JZD53,BT00CTOY20,Amazon.com Gift Card in a Greeting Card (Vario...,Gift Card,5,Works like a charm,Good as money on this site!,31-08-2015
997,18985804,R1EDV9IZ0628IG,B00PG40CO4,Amazon eGift Card - Happy Birthday (Doughnuts),Gift Card,5,Five Stars,great,31-08-2015
998,30984333,R3ATXW3TX9TQM6,B00BWDH3VS,Amazon.com eGift Cards,Gift Card,5,Great Choice.,Suited the person I sent it to well. Graphics...,31-08-2015


In [34]:
# NOTE(review): the recorded output (1000, 9) does not match the (1194, 6)
# reported right after loading Augmented_Jewelry.csv, so `data` was bound to a
# different frame when this cell ran — re-run the notebook top to bottom.
data.shape

(1000, 9)

In [35]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

# Inference inputs. Interactions and product metadata are read from the same
# CSV file (both paths are identical).
data_csv_path = "/kaggle/input/d/geetansher/grid-prs-amazon-dataset/Apparel.csv"
products_path = "/kaggle/input/d/geetansher/grid-prs-amazon-dataset/Apparel.csv"

# NOTE(review): train() above writes its checkpoint as "recommender-jewellery";
# this file name differs, so it presumably comes from another run — confirm.
model_path = "/kaggle/working/recommender_models/recommender-apparal.ckpt"



In [36]:
# Rebinds `data` (previously the jewellery frame) to the apparel interactions;
# `products` is a second, independent copy read from the same file.
data = pd.read_csv(data_csv_path)
products = pd.read_csv(products_path)

In [37]:
# Same preprocessing as inside train(): parse dates, derive a seconds timestamp,
# sort chronologically (in place), integer-encode product ids, group by customer.
# NOTE(review): `mapping` must match the one used when the checkpoint was
# trained, otherwise ids and vocab_size will not line up — confirm.
data['review_date'] = pd.to_datetime(data['review_date'])
data['timestamp'] = data['review_date'].astype(np.int64) // 10**9
data.sort_values(by="timestamp", inplace=True)
data, mapping, inverse_mapping = map_column(data, col_name="product_id")
grp_by_train = data.groupby(by="customer_id")

  data['review_date'] = pd.to_datetime(data['review_date'])


In [38]:
# Peek at 10 randomly chosen customer ids (unseeded, so the sample changes on
# every run).
random.sample(list(grp_by_train.groups), k=10)

[45912412,
 46726859,
 4105777,
 2457386,
 22745771,
 48785098,
 11003977,
 935737,
 511888,
 45510794]

In [39]:
# Rebuild the network with the same hyper-parameters used in train() and load
# the trained weights. vocab_size = number of mapped products + the two ids
# below the first mapped product id.
model = Recommender(
    vocab_size=len(mapping) + 2,
    lr=1e-4,
    dropout=0.3,
)
# Inference mode: disables dropout inside the encoder.
model.eval()
# map_location="cpu" lets a checkpoint written on a GPU machine load on a
# CPU-only kernel; predict()/predict_id() feed CPU tensors and call .numpy().
checkpoint = torch.load(model_path, map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [40]:
# Title -> mapped id, for every product row whose product_id has a mapping.
# Titles are dict keys, so when several product ids share a title only the
# last one encountered is kept.
product_to_idx = {a: mapping[b] for a, b in zip(products.product_title.tolist(), products.product_id.tolist()) if b in mapping}
# Mapped id -> title (inverse of the dict above; ids dropped there are absent here).
idx_to_product = {v: k for k, v in product_to_idx.items()}

In [41]:
def predict(list_products, model, product_to_idx, idx_to_product):
    """Recommend up to 30 product titles to follow the given purchase history.

    The history is left-padded with ``PAD`` to 120 positions (the sequence
    length used at training time), a ``MASK`` token is appended as the last
    position, and the model's scores at that position are ranked.

    Args:
        list_products: Product titles in chronological order. Only the 119
            most recent ones are used so the input never exceeds 120 positions.
        model: Callable returning scores of shape (1, seq, vocab) for a
            (1, seq) long tensor.
        product_to_idx: Title -> mapped id. Every title in ``list_products``
            must be present (``KeyError`` otherwise).
        idx_to_product: Mapped id -> title.

    Returns:
        Titles of the highest-scoring ids, best first, excluding anything
        already in the input and any id that has no title.
    """
    history_size = 120
    top_k = 30

    # Keep room for the trailing MASK token.
    recent = list(list_products)[-(history_size - 1):]
    history_ids = [product_to_idx[a] for a in recent]

    ids = [PAD] * (history_size - len(history_ids) - 1) + history_ids + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)

    # Scores at the last position, i.e. at the MASK token.
    masked_pred = prediction[0, -1].numpy()

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]

    # Set lookup instead of scanning the id list once per vocabulary entry;
    # PAD/MASK are never valid recommendations.
    excluded = set(ids) | {PAD, MASK}

    # Drop ids without a title *before* cutting to top_k, so a missing title
    # does not shrink the result.
    ranked_titles = [
        idx_to_product[a]
        for a in sorted_predicted_ids
        if a not in excluded and a in idx_to_product
    ]

    return ranked_titles[:top_k]


In [42]:
# Same as predict(), but takes and returns mapped product ids instead of titles
def predict_id(list_products_id, model, product_to_idx, idx_to_product):
    """Recommend up to 30 mapped product ids to follow the given history.

    The history is left-padded with ``PAD`` to 120 positions (the sequence
    length used at training time), a ``MASK`` token is appended as the last
    position, and the model's scores at that position are ranked.

    Args:
        list_products_id: Mapped product ids in chronological order. Only the
            119 most recent ones are used so the input never exceeds 120
            positions.
        model: Callable returning scores of shape (1, seq, vocab) for a
            (1, seq) long tensor.
        product_to_idx: Unused; kept so the signature matches ``predict``.
        idx_to_product: Unused; kept so the signature matches ``predict``.

    Returns:
        The highest-scoring ids, best first, excluding ids already in the
        input as well as ``PAD`` and ``MASK``.
    """
    history_size = 120
    top_k = 30

    # Keep room for the trailing MASK token.
    recent = list(list_products_id)[-(history_size - 1):]

    ids = [PAD] * (history_size - len(recent) - 1) + recent + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)

    # Scores at the last position, i.e. at the MASK token.
    masked_pred = prediction[0, -1].numpy()

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]

    # Set lookup instead of scanning the id list once per vocabulary entry.
    # PAD is excluded explicitly: it is absent from `ids` when the history
    # fills the whole window, and must never be returned as a product.
    excluded = set(ids) | {PAD, MASK}

    return [a for a in sorted_predicted_ids if a not in excluded][:top_k]

In [43]:
# Example query: a short purchase history given as product titles. Every title
# must be a key of product_to_idx, otherwise predict() raises KeyError.
list_products = ["Levi's Boys' 514 Straight Fit Jeans",
               "Jockey Women's Underwear Supersoft Brief - 3 Pack",
               "Jerzees Men's Super Sweats Crew Neck Sweatshirt",
               "SEOBEAN Mens Low Rise Sexy Sport Swimwear Trunk Boxer Brief Bikini Swimsuit"]

# Titles recommended as the next item, best first.
top_products = predict(list_products, model, product_to_idx, idx_to_product)
top_products

['ANGVNS Ladies Women Sexy Floor Length Strapless Long Dress for Party',
 'Women Blue and Black Dresses under $25 for all Occasion',
 "The North Face Women's ThermoBall Hybrid Hoodie - Magic Magenta",
 "Luouse Women Vintage 1950's Style 3/4 Sleeves Garden Floral Print Windbreaker Dress",
 "Norfolk Branded Men's Cushioned Running / Jogging Ankle Sports Socks - Owens",
 'SweetBridal Sweetheart Sleeveless Halter Evening Dress',
 "Harriton Men's Barbados Textured Camp Shirt",
 'Robes King RK Classical Sleepwear Mens Broadcloth Woven Pajama Set',
 'Womens Summer Open Shoulder Chiffon Shift Dress',
 "JAEDEN Women's Beaded Spaghetti Straps Sexy Long Formal Prom Evening Dresses",
 "Glamorise Women's #1006 Full-Figure Sports Bra",
 'New Womens Long Sleeve Blouse Tops Round Neck Plaid Checked Loose Shirt',
 'Amdirect 100W Manicure Pedicure Paraffin Warmer Waxing 400ml 220V Wax Heater Salon Spa',
 "Match Men's Athletic Fit Straight Leg Casual Pants",
 'VIV Collection Best Selling Printed Brushed 