In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import torch
from torch import optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
from sklearn.metrics import roc_auc_score

In [4]:
# Restrict this process to one GPU: only device index "3" is exposed to CUDA.
# The variable has to be set before the first CUDA call for it to take effect.
import os
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [5]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
cuda_ready = torch.cuda.is_available()
device = "cuda" if cuda_ready else "cpu"

# Fix the RNG seeds so weight initialisation and batch shuffling are repeatable.
# The CUDA generators get their own seed (912), separate from the default one (315).
torch.manual_seed(315)
if cuda_ready:
    torch.cuda.manual_seed_all(912)

device

'cuda'

### Load Data

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that were produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs for the deep (embedding) part and the wide (linear) part of the model.
df_deep = _load_pickle('df_deep.pkl')
df_wide = _load_pickle('df_wide.pkl')

데이터 변환
MovieLens 데이터에는 unwatched(미시청) 데이터가 존재하지 않으므로,  
평점 1, 2, 3 은 비선호(0), 평점 4, 5 는 선호(1)로 변환한다.

In [7]:
# Take the target from the wide frame's "rating" column as a NumPy array ...
Y = np.array(df_wide["rating"])

# ... then strip that column from both frames so it is not fed to the model
# as an input feature.
df_deep, df_wide = (
    frame.drop(columns=["rating"]) for frame in (df_deep, df_wide)
)

In [8]:
# Collapse the star ratings into a binary preference label:
#   1, 2, 3 -> 0 (not preferred)      4, 5 -> 1 (preferred)
# Any value outside 1-5 is passed through unchanged.
disliked = np.isin(Y, (1, 2, 3))
liked = np.isin(Y, (4, 5))
Y_new = np.where(liked, 1, np.where(disliked, 0, Y))

In [9]:
# Class balance of the binarized labels, printed as one [label, count] row per class.
unique, counts = np.unique(Y_new, return_counts=True)
print(np.column_stack((unique, counts)))

[[     0 424928]
 [     1 575281]]


In [10]:
from sklearn.model_selection import train_test_split

# Split the deep features, wide features and labels in ONE call.
# train_test_split applies the same shuffled row indices to every array it is
# given, so the three outputs are guaranteed to stay row-aligned (and a length
# mismatch between the inputs raises immediately). Calling it three times with
# the same random_state only stays aligned by coincidence of equal lengths.
# 70% train / 30% test, fixed seed for reproducibility.
(X_train_deep, X_test_deep,
 X_train_wide, X_test_wide,
 Y_train, Y_test) = train_test_split(
    df_deep.values, df_wide.values, Y_new,
    test_size=0.3, random_state=1981,
)

In [11]:
class RatingDataset(Dataset):
    """Row-aligned (wide features, deep features, label) dataset.

    Subclasses ``torch.utils.data.Dataset`` so it can be fed to a ``DataLoader``.

    Args:
        X_wide_tensor: tensor of wide-part features, one row per sample.
        X_deep_tensor: tensor of deep-part (embedding index) features,
            one row per sample.
        y_tensor: tensor of labels, one entry per sample.

    Raises:
        ValueError: if the three tensors do not have the same number of rows.
    """
    def __init__(self, X_wide_tensor, X_deep_tensor, y_tensor):
        # __len__ is derived from the wide tensor only, so a mismatch would
        # otherwise surface late as an IndexError or silently drop rows.
        n_wide = X_wide_tensor.size(0)
        n_deep = X_deep_tensor.size(0)
        n_y = y_tensor.size(0)
        if not (n_wide == n_deep == n_y):
            raise ValueError(
                "RatingDataset inputs must have the same number of rows, "
                "got wide={}, deep={}, y={}".format(n_wide, n_deep, n_y)
            )

        self.X_wide_tensor = X_wide_tensor
        self.X_deep_tensor = X_deep_tensor
        self.y_tensor = y_tensor

    def __len__(self):
        """Number of samples (rows) in the dataset."""
        return self.X_wide_tensor.size(0)

    def __getitem__(self, index):
        """Return the ``(wide, deep, label)`` triple stored at ``index``."""
        return self.X_wide_tensor[index], self.X_deep_tensor[index], self.y_tensor[index]

In [12]:
# Convert the training arrays to tensors: float for the wide features and the
# labels, long (integer indices) for the deep features that feed the embeddings.
train_wide_tensor = torch.FloatTensor(X_train_wide)
train_deep_tensor = torch.LongTensor(X_train_deep)
train_label_tensor = torch.FloatTensor(Y_train)

train_dataset = RatingDataset(train_wide_tensor, train_deep_tensor, train_label_tensor)

In [13]:
# Held-out split as plain tensors (not wrapped in a Dataset); same dtypes as the
# training tensors: float wide features, long deep indices, float labels.
test_wide_tensor, test_deep_tensor, test_tensor = (
    torch.FloatTensor(X_test_wide),
    torch.LongTensor(X_test_deep),
    torch.FloatTensor(Y_test),
)

### Create Model

In [14]:
# TensorBoard logger; the training loop writes its per-epoch scalars through it.
from tensorboardX import SummaryWriter

# The log directory is SummaryWriter's first argument (``logdir``).
writer = SummaryWriter("runs/Wide_Deep")

In [15]:
class wide_deep(nn.Module):
    """Wide & Deep binary classifier.

    The deep part embeds six categorical columns, concatenates the embeddings
    and passes them through a 3-layer ReLU MLP (-> 64 -> 32 -> 16). Its 16-dim
    output is concatenated with the wide features and mapped by a single linear
    layer + sigmoid to a probability.

    Args:
        n_users, n_movies, n_genres, n_genders, n_ages, n_occupations:
            number of rows in the corresponding embedding table, i.e. every
            index fed to that table must be strictly smaller than this value.
        wide_dim: number of wide features per sample (columns of ``X_w``).

    The defaults reproduce the originally hard-coded sizes, so ``wide_deep()``
    builds exactly the same architecture as before (final linear layer has
    227 + 16 = 243 inputs).
    """

    def __init__(self, n_users=6041, n_movies=3953, n_genres=18, n_genders=2,
                 n_ages=7, n_occupations=21, wide_dim=227):
        super(wide_deep, self).__init__()

        id_dim = 32        # embedding width for user / movie ids
        side_dim = 8       # embedding width for the other categorical columns
        deep_out_dim = 16  # width of the deep tower's output

        # deep model
        # (layers are created in the same order as before so that a fixed
        #  torch seed yields the same initial weights)
        self.embed_user = nn.Embedding(num_embeddings=n_users, embedding_dim=id_dim)
        self.embed_movie = nn.Embedding(num_embeddings=n_movies, embedding_dim=id_dim)
        self.embed_genre = nn.Embedding(num_embeddings=n_genres, embedding_dim=side_dim)
        self.embed_gender = nn.Embedding(num_embeddings=n_genders, embedding_dim=side_dim)
        self.embed_age = nn.Embedding(num_embeddings=n_ages, embedding_dim=side_dim)
        self.embed_occupation = nn.Embedding(num_embeddings=n_occupations, embedding_dim=side_dim)

        # 2 id embeddings + 4 side embeddings -> 2*32 + 4*8 = 96 with the defaults
        deep_in_dim = 2 * id_dim + 4 * side_dim
        self.linear_1 = nn.Linear(in_features=deep_in_dim, out_features=64)
        self.linear_2 = nn.Linear(in_features=64, out_features=32)
        self.linear_3 = nn.Linear(in_features=32, out_features=deep_out_dim)

        # wide deep model: wide features concatenated with the deep output
        self.linear = nn.Linear(in_features=wide_dim + deep_out_dim, out_features=1)
        self.logistic = nn.Sigmoid()

    def forward(self, X_w, X_d):
        """Return the predicted probability for each sample.

        Args:
            X_w: float tensor of shape ``(batch, wide_dim)``.
            X_d: integer (long) tensor with at least 7 columns; columns
                0, 1, 2, 4, 5, 6 are read as user, movie, genre, gender, age
                and occupation indices.
                NOTE(review): column 3 is never read -- confirm against the
                column layout of the deep frame that this is intentional.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        # deep model
        embedded = torch.cat(
            [
                self.embed_user(X_d[:, 0]),
                self.embed_movie(X_d[:, 1]),
                self.embed_genre(X_d[:, 2]),
                self.embed_gender(X_d[:, 4]),
                self.embed_age(X_d[:, 5]),
                self.embed_occupation(X_d[:, 6]),
            ],
            dim=-1,
        )

        # torch.relu is stateless, so no nn.ReLU module is allocated per call
        hidden = torch.relu(self.linear_1(embedded))
        hidden = torch.relu(self.linear_2(hidden))
        deep_out = torch.relu(self.linear_3(hidden))

        # integrated
        wide_deep_input = torch.cat([X_w, deep_out], dim=1)
        logits = self.linear(wide_deep_input)
        out = self.logistic(logits)

        return out

    def init_weight(self):
        """Placeholder for custom initialisation; currently a no-op, so the
        layers keep PyTorch's default initial weights."""
        pass
        

In [16]:
# Place the model on the device selected earlier ("cuda" when available,
# otherwise "cpu") instead of unconditionally calling .cuda(), which raises on
# machines without a GPU even though a CPU fallback was computed.
model = wide_deep().to(device)

# Adagrad over all model parameters; BCELoss matches the model's sigmoid output
# (it expects probabilities, not raw logits).
optimizer = optim.Adagrad(model.parameters(), lr=0.001)
loss_function = nn.BCELoss()

# Training hyper-parameters.
batch_size = 64
n_epochs = 100

In [17]:
# Send every batch to wherever the model's parameters live, so this loop works
# on both GPU and CPU without hard-coded .cuda() calls.
model_device = next(model.parameters()).device

# Loop-invariant setup, done once instead of once per epoch:
#  * the DataLoader (shuffle=True still draws a fresh permutation each epoch)
#  * the held-out tensors on the model's device and the labels as a NumPy array
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_wide_on_device = test_wide_tensor.to(model_device)
test_deep_on_device = test_deep_tensor.to(model_device)
test_labels = test_tensor.cpu().numpy()

for epoch_id in range(n_epochs):
    # ---- train for one epoch ----
    model.train()
    total_loss = 0.0  # sum of the per-batch mean losses over the epoch
    for X_w, X_d, y in train_loader:
        X_w, X_d, y = X_w.to(model_device), X_d.to(model_device), y.to(model_device)

        optimizer.zero_grad()
        y_pred = model(X_w, X_d)
        loss = loss_function(y_pred.view(-1), y)
        loss.backward()
        optimizer.step()
        # .item() extracts a Python float; adding the tensor itself would keep
        # autograd history alive across the whole epoch and grow memory.
        total_loss += loss.item()

    # ---- evaluate on the held-out split ----
    model.eval()
    # No gradients are needed for scoring; no_grad avoids building a graph
    # for the full test set in a single forward pass.
    with torch.no_grad():
        pred = model(test_wide_on_device, test_deep_on_device)
    auc = roc_auc_score(test_labels, pred.cpu().numpy().ravel())

    writer.add_scalar("loss/ Train_loss", total_loss, epoch_id)
    # NOTE: the tag spelling "performace" is kept as-is so new points land on
    # the same TensorBoard chart as previously logged runs.
    writer.add_scalar("performace/AUC", auc, epoch_id)

    print('Epoch {} of {}, training Loss: {:.4f}, auc score: {:.4f}'.format(epoch_id + 1, n_epochs, total_loss, auc))

Epoch 1 of 100, training Loss: 7323.4688, auc score: 0.6136
Epoch 2 of 100, training Loss: 7221.3262, auc score: 0.6266
Epoch 3 of 100, training Loss: 7171.6807, auc score: 0.6351
Epoch 4 of 100, training Loss: 7134.5322, auc score: 0.6417
Epoch 5 of 100, training Loss: 7102.7769, auc score: 0.6473
Epoch 6 of 100, training Loss: 7073.9858, auc score: 0.6521


KeyboardInterrupt: 