In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import TensorDataset, DataLoader

import visdom

In [146]:
vis = visdom.Visdom()

Setting up a new session...


In [2]:
profile = pd.read_csv("data/profile.csv")
portfolio = pd.read_csv("data/portfolio-dummy.csv")
train_set = pd.read_csv("data/train.csv")
test_set = pd.read_csv("data/test.csv")

In [3]:
def encode_dataset(df, shuffle=False):
    df = df.merge(profile, left_on='person', right_on='id') \
        .merge(portfolio, left_on='offer_id', right_on="id")
    if shuffle:
        df = df.sample(frac=1)
    df.index = df[['person', 'offer_id']]
    y = df['offer viewed']
    df = df.drop(columns=['person', 'offer_id', 'id_x', 'id_y', 'index', 'offer viewed'])
    return df, y

X_train, y_train = encode_dataset(train_set)
X_test, y_test = encode_dataset(test_set)

In [4]:
user_to_index = {id: i for i, id in enumerate(profile.id)}
portfolio_to_index = {id: i for i, id in enumerate(portfolio.id)}

In [5]:
# user_to_index
portfolio_to_index

{'0b1e1539f2cc45b7b9fa7c272da2e1d7': 0,
 '2298d6c36e964ae4a3e7e9706d1fb8c2': 1,
 '2906b810c7d4411798c6938adc9daaa5': 2,
 '3f207df678b143eea3cee63160fa8bed': 3,
 '4d5c57ea9a6940dd891ad53e9dbe8da0': 4,
 '5a8bc65990b245e5a138643cd4eb9837': 5,
 '9b98b8c7a33c4b65b9aebfe6a799e6d9': 6,
 'ae264e3637204a6fb9bb56bc8210ddfd': 7,
 'f19421c1d4aa40978ebb69ca19b0e20d': 8,
 'fafdcd668e3743c1bb461111dcafc2a4': 9}

In [6]:
X_train.columns.shape[0]

18

In [21]:
    
def get_user_offers(index):
    users = [user_to_index[x[0]] for x in index]
    offers = [portfolio_to_index[x[1]] for x in index]

    return users, offers
    
class UserOfferClickDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        
        users, offers = get_user_offers(X.index)
        self.users = torch.tensor(users, dtype=torch.int64)
        self.offers = torch.tensor(offers, dtype=torch.int64)
        self.X = torch.tensor(X.to_numpy(), dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.length = len(X)
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, i):
        return self.X[i], self.users[i], self.offers[i], self.y[i]

In [132]:
train_set = UserOfferClickDataset(X_train, y_train)
test_set = UserOfferClickDataset(X_test, y_test)

In [37]:
dev = 'cuda'

In [194]:
class LinearModel(nn.Module):
    
    def __init__(self, feature_dim=18, user_emb_dim=16, offer_emb_dim=4, fc_size=64):
        super().__init__()
        
        self.user_emb = nn.Embedding(len(user_to_index), embedding_dim=user_emb_dim)
        self.offer_emb = nn.Embedding(len(portfolio_to_index), embedding_dim=offer_emb_dim)
        
        input_size = feature_dim + user_emb_dim + offer_emb_dim
        self.bnX = torch.nn.BatchNorm1d(feature_dim)
        # 18 + user_emb_dim + offer_emb_dim
        self.nn = nn.Sequential(
            nn.Linear(input_size, fc_size, bias=True),
            nn.LeakyReLU(),
            nn.Linear(fc_size, 1, bias=True),
            nn.Sigmoid()
        )
    
    def forward(self, users, offers, X):      
        users = self.user_emb(users)
        offers = self.offer_emb(offers)
        X = self.bnX(X)
        
        X = torch.cat([users, offers, X], dim=1)
        y = self.nn(X)
        return y

In [199]:
model = LinearModel()

In [202]:

def iterate(model, loader, dev, train):
    if train:
        model.train()
        opt = torch.optim.Adam(model.parameters(), lr=0.001)
        
    else:
        model.eval()
        
    total_loss = 0
    
    for X, users, offers, y in loader:
        X = X.to(dev)
        users = users.to(dev)
        offers = offers.to(dev)
        y = y.view(-1, 1).to(dev)

        y_pred = model(users, offers, X)
        loss = F.binary_cross_entropy(y_pred, y)
        
        total_loss += loss.item()

        if train:
            opt.zero_grad()
            loss.backward()
            opt.step()
    
    return total_loss


def train_test(vis, name, model, loader, val_loader, epochs=100, dev='cuda'):
    
    train_losses = []
    val_losses = []
    
    model.train()
    train_win = "train loss"
    val_win = "val loss"
    
    for i in range(epochs):
        
        train_loss = iterate(model, loader, dev, True)
        val_loss = iterate(model, val_loader, dev, False)
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        
        X = list(range(i + 1))
        
        train_win = vis.line(Y=train_losses, X=X, win=train_win, name="train")
        val_win = vis.line(Y=val_losses, X=X, win=val_win, name="val")

    return train_losses, val_losses

In [203]:
dev = 'cuda'

In [204]:
torch.cuda.empty_cache()
train_loader = DataLoader(train_set, batch_size=4096, shuffle=True)
test_loader = DataLoader(test_set, batch_size=1024, shuffle=True)

model = model.to(dev)
train_test(vis, "default", model, train_loader, test_loader, epochs=30, dev=dev)

([8.43303495645523,
  7.364043235778809,
  6.579260945320129,
  6.000435322523117,
  5.569187134504318,
  5.30958417057991,
  5.134786069393158,
  4.984393209218979,
  4.898108035326004,
  4.769923001527786,
  4.725673705339432,
  4.699775367975235,
  4.669571131467819,
  4.645609587430954,
  4.635508120059967,
  4.597072273492813,
  4.550907701253891,
  4.5831411480903625,
  4.526882886886597,
  4.555614650249481,
  4.515269547700882,
  4.522534161806107,
  4.502939343452454,
  4.504444390535355,
  4.475520640611649,
  4.4278139770030975,
  4.453568607568741,
  4.4231283366680145,
  4.3459406197071075,
  4.375648409128189],
 [27.391486883163452,
  24.199343621730804,
  14.253266870975494,
  11.887653857469559,
  11.429375499486923,
  11.090698778629303,
  10.768241733312607,
  10.50392472743988,
  10.299353301525116,
  10.194581925868988,
  10.124081254005432,
  10.099911212921143,
  10.07095181941986,
  10.056556552648544,
  10.06528753042221,
  10.041175901889801,
  10.0522409379482

In [207]:

def evaluate(name, model, loader):
    model.eval().cpu()
    
    pred = []
    real = []
    
    for X, users, offers, y in loader:
        y_pred = model(users, offers, X).squeeze().round().tolist()
        pred.extend(y_pred)
        real.extend(y.tolist())
        
    print("Classification Report for", name)
    print("Accuracy: ", np.sum([x == y for x, y in zip(pred, real)]) / len(pred))
    print(classification_report(pred, real))
        

In [208]:
evaluate("model train", model, train_loader)
evaluate("model test", model, test_loader)

Classification Report for model train
Accuracy:  0.8342038567493113
              precision    recall  f1-score   support

         0.0       0.52      0.69      0.59      7876
         1.0       0.93      0.87      0.90     37499

    accuracy                           0.83     45375
   macro avg       0.72      0.78      0.74     45375
weighted avg       0.86      0.83      0.84     45375

Classification Report for model test
Accuracy:  0.8018433179723502
              precision    recall  f1-score   support

         0.0       0.44      0.65      0.52      4079
         1.0       0.92      0.83      0.87     20442

    accuracy                           0.80     24521
   macro avg       0.68      0.74      0.70     24521
weighted avg       0.84      0.80      0.82     24521

