# Script to run the experiments described in the paper "Deep Interest Evolution Network for Click-Through Rate Prediction"

## How to run

1. Run `prepare_neg.ipynb` first.

In [1]:
# --- Reproducibility / run mode ---
SEED = 10  # seed handed to random, numpy and torch
TEST_RUN = False  # True: work on a 1000-row sample of each split

# --- Data ---
SEQ_MAX_LEN = 100  # maximum sequence length
BATCH_SIZE = 128

# --- Model / training ---
EMBEDDING_DIM = 18
DNN_HIDDEN_SIZE = [200, 80]
DNN_DROPOUT = 0.0
EPOCH = 2  # passed to the trainer as the maximum number of epochs

In [2]:
%matplotlib inline

import itertools
from collections import Counter, OrderedDict

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

from prediction_flow.features import Number, Category, Sequence, Features
from prediction_flow.transformers.column import (
    StandardScaler, CategoryEncoder, SequenceEncoder)

from prediction_flow.pytorch.data import Dataset
from prediction_flow.pytorch import WideDeep, DeepFM, DNN, DIN, DIEN, AttentionGroup

from prediction_flow.pytorch.functions import fit, predict, create_dataloader_fn

In [3]:
# Seed every random number generator used in this notebook (Python's
# `random`, NumPy's global RNG and PyTorch) with the same SEED.
random.seed(SEED)
np.random.seed(SEED)
# Last expression of the cell: its return value is what the cell displays.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f052fcd4150>

In [4]:
# Both splits are tab-separated files read from the working directory.
train_df = pd.read_csv("./local_train.csv", sep='\t')
valid_df = pd.read_csv("./local_test.csv", sep='\t')

In [5]:
if TEST_RUN:
    # Smoke-test mode: shrink both splits to at most 1000 rows.
    # `min(...)` keeps `sample` from raising when a split has fewer than
    # 1000 rows, and `random_state=SEED` makes the draw independent of how
    # much of the global NumPy RNG stream has already been consumed.
    train_df = train_df.sample(n=min(1000, len(train_df)), random_state=SEED)
    valid_df = valid_df.sample(n=min(1000, len(valid_df)), random_state=SEED)

In [6]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,label,uid,mid,cat,hist_mids,hist_cats,neg_hist_mids,neg_hist_cats
0,0,AZPJ9LUT0FEPY,B00AMNNTIA,Literature & Fiction,030774443400622483910470530707097892462215...,BooksBooksBooksBooksBooks,07868904870618539069B001IDZJO0160342154803...,BooksBooksBooksBooksBooks
1,1,AZPJ9LUT0FEPY,0800731603,Books,030774443400622483910470530707097892462215...,BooksBooksBooksBooksBooks,B00BEFIHOG14022452700670031747061578518214...,LiteraryBooksBooksBooksBooks
2,0,A2NRV79GKAU726,B003NNV10O,Russian,0814472869007146207415839423000812538366B0...,BooksBooksBooksBooksBakingBooksBooks,B00LQABRTG087830178X0991543009071533154X03...,NeuropsychologyBooksBooksBooksBooksBooks...
3,1,A2NRV79GKAU726,B000UWJ91O,Books,0814472869007146207415839423000812538366B0...,BooksBooksBooksBooksBakingBooksBooks,159532814915917978100451233018037377135514...,BooksBooksBooksBooksBooksBooksContempora...
4,0,A2GEQVDX2LL4V3,0321334094,Books,0743596870037428099114391406340976475731,BooksBooksBooksBooks,0316159735156718359X07868124000062506110,BooksBooksBooksBooks


In [7]:
# Preview the first rows of the validation split.
valid_df.head()

Unnamed: 0,label,uid,mid,cat,hist_mids,hist_cats,neg_hist_mids,neg_hist_cats
0,0,A3BI7R43VUZ1TY,B00JNHU0T2,Literature & Fiction,0989464105B00B01691C14778097321608442845,BooksLiterature & FictionBooksBooks,0899576168B0056ATROO04466004740615209459,BooksSleepBooksBooks
1,1,A3BI7R43VUZ1TY,0989464121,Books,0989464105B00B01691C14778097321608442845,BooksLiterature & FictionBooksBooks,037352772109818545240470404159B00BWKBSOY,BooksBooksBooksLiterature & Fiction
2,0,A2Z3AHJPXG3ZNP,B0072YSPJ0,Literature & Fiction,147831096014922314521477603425B00FRKLA6Q,BooksBooksBooksUrban,B00EQAEA60B007D64VX6188547766X1590172477,Literature & FictionQuranBooksBooks
3,1,A2Z3AHJPXG3ZNP,B00G4I4I5U,Urban,147831096014922314521477603425B00FRKLA6Q,BooksBooksBooksUrban,1583942475158567860015701992210312373090,BooksBooksBooksBooks
4,0,A2KDDPJUNWC5CA,0316228532,Books,0141326085031026622X0316077046098864917914...,BooksBooksBooksBooksBooks,B0077FOPFC1594744106B00DFGN1DE0972259112B0...,GhostsBooksEroticaBooksSoups & Stews


# Define features

In [8]:
# Encoder for category sequences: tokens are split on the "\x02" control
# character and sequences are limited to SEQ_MAX_LEN.
cat_enc = SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN)

In [9]:
# Build the category vocabulary from the training history column.
# NOTE(review): only `hist_cats` is used here, while the item-id encoder is
# fitted on `mid` and `hist_mids` together; the resulting vocabulary is later
# reused for the target `cat` feature — confirm that omitting `cat` is intended.
cat_enc.fit(train_df.hist_cats.values)

<prediction_flow.transformers.column.sequence_encoder.SequenceEncoder at 0x7f04ba916080>

In [10]:
# Keep the fitted category vocabulary mappings so they can be shared by
# several feature encoders below.
cat_word2idx, cat_idx2word = cat_enc.word2idx, cat_enc.idx2word

In [11]:
# Number of entries in the category vocabulary.
print(len(cat_word2idx))

1602


In [12]:
# Encoder for item-id sequences, configured like the category encoder:
# "\x02"-separated tokens, sequences limited to SEQ_MAX_LEN.
mid_enc = SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN)

In [13]:
# Fit one item-id vocabulary on both the target ids (`mid`) and the history
# ids (`hist_mids`); np.vstack stacks the two equally long columns into a
# single 2 x N array that is handed to the encoder.
mid_enc.fit(np.vstack([train_df.mid.values, train_df.hist_mids.values]))

<prediction_flow.transformers.column.sequence_encoder.SequenceEncoder at 0x7f04ba92a7b8>

In [14]:
# Keep the fitted item-id vocabulary mappings so they can be shared by
# several feature encoders below.
mid_word2idx, mid_idx2word = mid_enc.word2idx, mid_enc.idx2word

In [15]:
# Number of entries in the item-id vocabulary.
print(len(mid_word2idx))

367984


In [16]:
# No numeric features are used in this experiment.
number_features = []

# Target item and target category. Each encoder is handed the vocabulary
# fitted above, and `embedding_name` matches the one used by the
# corresponding history sequence below ('mid' / 'cat').
category_features = [
    Category('mid',
             CategoryEncoder(min_cnt=1, word2idx=mid_word2idx, idx2word=mid_idx2word),
             embedding_name='mid'),
    Category('cat',
             CategoryEncoder(min_cnt=1, word2idx=cat_word2idx, idx2word=cat_idx2word),
             embedding_name='cat'),
]

# Behaviour history sequences, encoded with the same vocabularies as the
# target features and limited to SEQ_MAX_LEN.
sequence_features = [
    Sequence('hist_mids',
             SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN,
                             word2idx=mid_word2idx, idx2word=mid_idx2word),
             embedding_name='mid'),
    Sequence('hist_cats',
             SequenceEncoder(sep="\x02", min_cnt=1, max_len=SEQ_MAX_LEN,
                             word2idx=cat_word2idx, idx2word=cat_idx2word),
             embedding_name='cat')
]

# Build the feature container and the train/validation data loaders with
# 'label' as the target column.
# NOTE(review): the trailing positional `4` is presumably the number of
# loader worker processes — confirm against create_dataloader_fn's signature.
features, train_loader, valid_loader = create_dataloader_fn(
    number_features, category_features, sequence_features, BATCH_SIZE, train_df, 'label', valid_df, 4)

In [17]:
def evaluation(model, df, dataloader):
    """Return the ROC AUC of ``model`` on ``dataloader``.

    Predictions are obtained with ``predict(model, dataloader)``, flattened
    to one dimension and scored against the ``'label'`` column of ``df``.
    ``df`` must therefore list its rows in the same order in which
    ``dataloader`` yields them.
    """
    scores = predict(model, dataloader).ravel()
    labels = df['label']
    return roc_auc_score(labels, scores)

In [18]:
import pytorch_lightning as pl

class CoolModel(pl.LightningModule):
    """Lightning module wrapping a prediction_flow ``DNN`` click model.

    The wrapped network is built from the notebook-level ``features`` and
    hyper-parameters, ends in a sigmoid, and is trained with binary
    cross-entropy against ``batch['label']``.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): the positional `2` is presumably the number of
        # classes expected by DNN — confirm against prediction_flow.
        self.model = DNN(
            features, 2, EMBEDDING_DIM, DNN_HIDDEN_SIZE,
            final_activation='sigmoid', dropout=DNN_DROPOUT)

    def forward(self, x):
        """Delegate to the wrapped network."""
        return self.model(x)

    def _batch_loss(self, batch):
        """Binary cross-entropy between the model output and ``batch['label']``."""
        target = batch['label']
        prediction = self.forward(batch)
        return F.binary_cross_entropy(prediction, target)

    def training_step(self, batch, batch_nb):
        """Required hook: return the loss for one training batch.

        The same loss is also reported to the progress bar as
        ``training_loss``.
        """
        batch_loss = self._batch_loss(batch)
        progress = {'training_loss': batch_loss}
        return {'loss': batch_loss, 'progress_bar': progress}

    def validation_step(self, batch, batch_nb):
        """Optional hook: return the loss for one validation batch."""
        return {'val_loss': self._batch_loss(batch)}

    def validation_end(self, outputs):
        """Optional hook: average the per-batch validation losses.

        ``outputs`` is the list of dicts returned by ``validation_step``;
        the mean ``val_loss`` is reported to the progress bar.
        """
        step_losses = [step['val_loss'] for step in outputs]
        mean_loss = torch.stack(step_losses).mean()
        return {'progress_bar': {'val_loss': mean_loss}}

    def configure_optimizers(self):
        """Required hook: Adam over all parameters with learning rate 0.003."""
        return torch.optim.Adam(self.parameters(), lr=0.003)

    @pl.data_loader
    def train_dataloader(self):
        """Return the notebook-level training loader."""
        return train_loader

    @pl.data_loader
    def val_dataloader(self):
        """Optional hook: return the notebook-level validation loader.

        A list of validation loaders could be returned here instead.
        """
        return valid_loader

In [22]:
from pytorch_lightning import Trainer

model = CoolModel()

# Basic trainer limited to EPOCH epochs. Request one GPU only when CUDA is
# actually available; otherwise pass None so the notebook still runs on a
# CPU-only machine instead of failing at trainer construction.
trainer = Trainer(
    max_nb_epochs=EPOCH,
    gpus=1 if torch.cuda.is_available() else None)

gpu available: True, used: True
VISIBLE GPUS: 0


In [23]:
# Train the model; the data loaders are supplied by the module's own
# train_dataloader / val_dataloader hooks.
trainer.fit(model)

  0%|          | 0/5 [00:00<?, ?it/s]

                                 Name         Type Params
0                               model          DNN    6 M
1                 model.embedding:mid    Embedding    6 M
2                 model.embedding:cat    Embedding   28 K
3             model.pooling:hist_mids   MaxPooling    0  
4             model.pooling:hist_cats   MaxPooling    0  
5                           model.mlp          MLP   31 K
6               model.mlp._sequential   Sequential   31 K
7        model.mlp._sequential.dense0       Linear   14 K
8    model.mlp._sequential.batchnorm0  BatchNorm1d  400  
9   model.mlp._sequential.activation0         ReLU    0  
10       model.mlp._sequential.dense1       Linear   16 K
11   model.mlp._sequential.batchnorm1  BatchNorm1d  160  
12  model.mlp._sequential.activation1         ReLU    0  
13                  model.final_layer       Linear   81  


100%|██████████| 9433/9433 [01:21<00:00, 317.80it/s, batch_nb=8485, epoch=1, gpu=0, loss=0.596, training_loss=0.571, v_nb=1, val_loss=0.625]

1

100%|██████████| 9433/9433 [01:40<00:00, 317.80it/s, batch_nb=8485, epoch=1, gpu=0, loss=0.596, training_loss=0.571, v_nb=1, val_loss=0.625]

In [24]:
# ROC AUC of the trained model on the validation split.
# NOTE(review): the network contains BatchNorm layers, so this assumes
# predict() puts the model into eval mode — confirm.
score = evaluation(model, valid_df, valid_loader)

In [25]:
# Report the validation AUC.
print(f'auc: {score}')

auc: 0.7025202354666744
