# script to do experiments described in paper: Deep Interest Evolution Network for Click-Through Rate Prediction

## how to run

1. Please run prepare_neg.ipynb first.

In [8]:
SEQ_MAX_LEN = 100 # maximum sequence length
BATCH_SIZE = 128
EMBEDDING_DIM = 18
DNN_HIDDEN_SIZE = [200, 80]
DNN_DROPOUT = 0.0
TEST_RUN = False
EPOCH = 2
SEED = 10

In [2]:
%matplotlib inline

import itertools
from collections import Counter, OrderedDict

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

from prediction_flow.features import Number, Category, Sequence, Features
from prediction_flow.transformers.column import (
    StandardScaler, CategoryEncoder, SequenceEncoder)

from prediction_flow.pytorch.data import Dataset
from prediction_flow.pytorch import WideDeep, DeepFM, DNN, DIN, DIEN, AttentionGroup

from prediction_flow.pytorch.functions import fit, predict, create_dataloader_fn

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1265ba750>

In [4]:
train_df = pd.read_csv(
    "./local_train.csv", sep='\t')

valid_df = pd.read_csv(
    "./local_test.csv", sep='\t')

In [9]:
if TEST_RUN:
    train_df = train_df.sample(1000)
    valid_df = valid_df.sample(1000)

In [10]:
train_df.head()

Unnamed: 0,label,uid,mid,cat,hist_mids,hist_cats,neg_hist_mids,neg_hist_cats
488228,0,A263AXFZ228DM,0373695349,Books,03303135090374528535B005DT9R02031238066616...,BooksBooksLiterature & FictionBooksBooksB...,158430275515966833330786934352038549084407...,BooksBooksBooksBooksBooksBooksBooksBook...
796910,0,AY1FEZRH8DUO6,B00IAFE8E6,Literature & Fiction,0505527766193332063X193332077X193332080XB0...,BooksBooksBooksBooksLiterature & FictionL...,B005GPDKNI14141154660312354622006101338200...,AnthologiesBooksBooksBooksBooksBooksBook...
849192,0,A2PHAQ041SSKO8,0985906103,Books,156882131X09759229470765324024144956869603...,BooksBooksBooksBooksBooksBooksBooksBook...,030771827106715226631904902979B00GZTG4PUB0...,BooksBooksBooksLiterature & FictionParenti...
653157,1,A332QQ6M9SF0GX,0802122647,Books,039538992507475104310316176486039306447614...,BooksBooksBooksBooksBooks,079351181X00616723940226791459B005TD2FW419...,BooksBooksBooksLiterature & FictionBooks
816903,1,A34W6ACJ456AGC,111848147X,Books,078521411904466774770671027034044656740X07...,BooksBooksBooksBooksBooksBooksBooksBook...,07611673580811863360B0025VKZWU019992803706...,BooksBooksAction & AdventureBooksBooksBoo...


In [11]:
valid_df.head()

Unnamed: 0,label,uid,mid,cat,hist_mids,hist_cats,neg_hist_mids,neg_hist_cats
17198,0,A2WK7Q3UG59GGY,0451409450,Books,0596517742014101787202978480031591843626,BooksBooksBooksBooks,0634003143007180069703000870711771013397,BooksBooksBooksBooks
84428,0,AP4QL9E0S7NTX,0385742118,Books,08969378521576737330160096074X030727725915...,BooksBooksBooksBooksBooksBooksBooksBook...,188022661816203270821416989536B00BMAQY7014...,BooksBooksBooksDrawingBooksBooksBooksGe...
59295,1,A3B8HDDUN7WA0H,B00GTT1JL0,Old Testament,083074531914016039980892216948084991962203...,BooksBooksBooksBooksBooksBooksBooksBook...,B00HTQNIH0B006TJDJKEB00FKEYK1Q0758213174B0...,Mysteries & DetectivesMetaphysicalLiterature...
100466,0,A3C52KSMXXGPF9,0758227175,Books,075132781604714180480521553326093334693X04...,BooksBooksBooksBooksBooksBooksBooksBook...,0140437428B005ZI32JS0380789337157189250803...,BooksSleepBooksBooksBooksBooksBooksBook...
41105,1,A3T590XHG6IIMX,1482689928,Books,0375838813045104711706154210321493605852B0...,BooksBooksBooksBooksHorrorBooks,03161989351577316509B007K7ZGZU079225391405...,BooksBooksLiterature & FictionBooksBooksE...


# define features

In [12]:
cat_enc = SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN)

In [13]:
cat_enc.fit(train_df.hist_cats.values)

<prediction_flow.transformers.column.sequence_encoder.SequenceEncoder at 0x15dbaddd8>

In [14]:
cat_word2idx, cat_idx2word = cat_enc.word2idx, cat_enc.idx2word

In [15]:
print(len(cat_word2idx))

299


In [16]:
mid_enc = SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN)

In [17]:
mid_enc.fit(np.vstack([train_df.mid.values, train_df.hist_mids.values]))

<prediction_flow.transformers.column.sequence_encoder.SequenceEncoder at 0x15dbc3748>

In [18]:
mid_word2idx, mid_idx2word = mid_enc.word2idx, mid_enc.idx2word

In [19]:
print(len(mid_word2idx))

13570


In [20]:
number_features = []

category_features = [
    Category('mid',
             CategoryEncoder(min_cnt=1, word2idx=mid_word2idx, idx2word=mid_idx2word),
             embedding_name='mid'),
    Category('cat',
             CategoryEncoder(min_cnt=1, word2idx=cat_word2idx, idx2word=cat_idx2word),
             embedding_name='cat'),
]

sequence_features = [
    Sequence('hist_mids',
             SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN,
                             word2idx=mid_word2idx, idx2word=mid_idx2word),
             embedding_name='mid'),
    Sequence('hist_cats',
             SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN,
                             word2idx=cat_word2idx, idx2word=cat_idx2word),
             embedding_name='cat')
]

features, train_loader, valid_loader = create_dataloader_fn(
    number_features, category_features, sequence_features, BATCH_SIZE, train_df, 'label', valid_df, 4)

In [21]:
def evaluation(model, df, dataloader):
    preds = predict(model, dataloader)
    return roc_auc_score(df['label'], preds.ravel())

In [38]:
import pytorch_lightning as pl

class CoolModel(pl.LightningModule):
    def __init__(self):
        super(CoolModel, self).__init__()
        self.model = DNN(
            features,
            2,
            EMBEDDING_DIM,
            DNN_HIDDEN_SIZE,
            final_activation='sigmoid',
            dropout=DNN_DROPOUT)
    
    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_nb):
        # REQUIRED
        y = batch['label']
        y_hat = self.forward(batch)
        loss = F.binary_cross_entropy(y_hat, y)
        return {
            'loss': loss,
            'progress_bar':
            {'training_loss': loss}}

    def validation_step(self, batch, batch_nb):
        # OPTIONAL
        y = batch['label']
        y_hat = self.forward(batch)
        loss = F.binary_cross_entropy(y_hat, y)
        return {'val_loss': loss}

    def validation_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'progress_bar': {'val_loss': avg_loss}}

    def configure_optimizers(self):
        # REQUIRED
        return torch.optim.Adam(self.parameters(), lr=0.003)

    @pl.data_loader
    def train_dataloader(self):
        return train_loader

    @pl.data_loader
    def val_dataloader(self):
        # OPTIONAL
        # can also return a list of val dataloaders
        return valid_loader

In [41]:
from pytorch_lightning import Trainer

model = CoolModel()

# most basic trainer, uses good defaults
trainer = Trainer(max_nb_epochs=5)    

In [42]:
trainer.fit(model)

  0%|          | 0/5 [00:00<?, ?it/s]

                                 Name         Type Params
0                               model          DNN  280 K
1                 model.embedding:mid    Embedding  244 K
2                 model.embedding:cat    Embedding    5 K
3             model.pooling:hist_mids   MaxPooling    0  
4             model.pooling:hist_cats   MaxPooling    0  
5                           model.mlp          MLP   31 K
6               model.mlp._sequential   Sequential   31 K
7        model.mlp._sequential.dense0       Linear   14 K
8    model.mlp._sequential.batchnorm0  BatchNorm1d  400  
9   model.mlp._sequential.activation0         ReLU    0  
10       model.mlp._sequential.dense1       Linear   16 K
11   model.mlp._sequential.batchnorm1  BatchNorm1d  160  
12  model.mlp._sequential.activation1         ReLU    0  
13                  model.final_layer       Linear   81  


100%|██████████| 16/16 [00:00<00:00, 37.40it/s, batch_nb=7, epoch=19, loss=0.046, training_loss=0.00538, v_nb=7, val_loss=1.39]

1

100%|██████████| 16/16 [00:14<00:00, 37.40it/s, batch_nb=7, epoch=19, loss=0.046, training_loss=0.00538, v_nb=7, val_loss=1.39]

In [44]:
score = evaluation(model, valid_df, valid_loader)

In [46]:
print(f'auc: {score}')

auc: 0.5041570364663586
