<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#K-Core-Pruning" data-toc-modified-id="K-Core-Pruning-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>K-Core Pruning</a></span></li><li><span><a href="#Spliting-Dataset" data-toc-modified-id="Spliting-Dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Spliting Dataset</a></span></li><li><span><a href="#Modeling" data-toc-modified-id="Modeling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Modeling</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Regularization-적용하기" data-toc-modified-id="Regularization-적용하기-3.0.1"><span class="toc-item-num">3.0.1&nbsp;&nbsp;</span>Regularization 적용하기</a></span></li></ul></li><li><span><a href="#학습-데이터-구성하기" data-toc-modified-id="학습-데이터-구성하기-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>학습 데이터 구성하기</a></span><ul class="toc-item"><li><span><a href="#학습-데이터-구성하기" data-toc-modified-id="학습-데이터-구성하기-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>학습 데이터 구성하기</a></span></li><li><span><a href="#모델-학습하기" data-toc-modified-id="모델-학습하기-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>모델 학습하기</a></span></li></ul></li></ul></li></ul></div>

In [1]:
import os
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn.model_selection import train_test_split
import copy
import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Concatenate, Add, Dot, Multiply, Reshape, Activation, BatchNormalization, SimpleRNNCell, RNN, SimpleRNN, LSTM, Embedding, Bidirectional, TimeDistributed, Conv1D, Conv2D, MaxPool1D, MaxPool2D, GlobalMaxPool1D, GlobalMaxPool2D, AveragePooling1D, AveragePooling2D, GlobalAveragePooling1D, GlobalAveragePooling2D, ZeroPadding2D
from tensorflow.keras.optimizers import SGD, Adam, Adagrad
from tensorflow.keras.metrics import RootMeanSquaredError, BinaryCrossentropy, SparseCategoricalAccuracy
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import linear, sigmoid, relu

# Register tqdm's `progress_apply` on pandas objects; a later cell calls
# `X_te.progress_apply(...)`, so this must run first.
tqdm.pandas()
# Show NumPy floats with 3 digits after the decimal point when printed.
np.set_printoptions(precision=3)
# Use matplotlib's built-in dark theme for any figures drawn in this notebook.
plt.style.use("dark_background")

  from pandas import Panel


In [2]:
# Load the three MovieLens 100k tables (users, movies, ratings) from local CSV files.
users, movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Datasets/MovieLens 100k/100k_users.csv",
        "Datasets/MovieLens 100k/100k_movies.csv",
        "Datasets/MovieLens 100k/100k_ratings.csv",
    )
)

# K-Core Pruning

In [3]:
# Iteratively drop users and items that have `thr` or fewer ratings.
# Removing a user can push an item below the threshold (and vice versa), so the
# filter is repeated until a pass removes nothing.
thr = 5
while True:
    len_prev = len(ratings)
    print(f"len(ratings): {len(ratings):,}")

    # Rating counts per user / per item on the current (already pruned) frame.
    user_n_ratings = ratings["user_id"].value_counts()
    item_n_ratings = ratings["item_id"].value_counts()

    # Ids that strictly exceed the threshold survive this pass.
    users_ = user_n_ratings.loc[user_n_ratings > thr].index
    items_ = item_n_ratings.loc[item_n_ratings > thr].index

    keep_user = ratings["user_id"].isin(users_)
    keep_item = ratings["item_id"].isin(items_)
    ratings = ratings[keep_user & keep_item]

    len_next = len(ratings)
    if len_next == len_prev:
        # Fixed point reached: nothing was removed in this pass.
        break
print("Finished!")

len(ratings): 99,991
len(ratings): 99,023
Finished!


# Spliting Dataset
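- 사용자별로 평점이 기록된 순서를 유지한 채(`shuffle=False`) 각 사용자의 마지막 10%를 Test set으로 나누겠습니다. 이 분할이 실제 시간 순서가 되려면 `ratings`가 시간순으로 정렬되어 있어야 합니다.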

In [4]:
# Per-user hold-out split: for every user, the first 90% of their rows (in stored
# order, because shuffle=False) go to train and the last 10% go to test.
# NOTE(review): this is only a chronological split if `ratings` is already sorted
# by time within each user — confirm against the source CSV.
tr_parts = []
te_parts = []
for _, group in tqdm(ratings.groupby(["user_id"])):
    tr, te = train_test_split(group, test_size=0.1, shuffle=False)
    tr_parts.append(tr)
    te_parts.append(te)

# Concatenate once at the end; growing a DataFrame with pd.concat inside the loop
# re-copies all accumulated rows on every iteration (quadratic in the number of users).
ratings_tr = pd.concat(tr_parts, axis=0) if tr_parts else pd.DataFrame()
ratings_te = pd.concat(te_parts, axis=0) if te_parts else pd.DataFrame()

print(f"len(ratings_tr): {len(ratings_tr):,}")
print(f"len(ratings_te): {len(ratings_te):,}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=943.0), HTML(value='')))


len(ratings_tr): 88,681
len(ratings_te): 10,342


In [5]:
# Number of ratings each movie received, most popular first; used as sampling weights.
movie_n_ratings = ratings.groupby(["item_id"]).size().sort_values(ascending=False)

X_te = ratings_te.copy()
y_te = ratings_te[["rating"]]

# Every movie each user has rated, taken over the full `ratings` frame (train + test),
# so that held-out positives are never offered as negatives.
user_movies = ratings.groupby(["user_id"])["item_id"].apply(frozenset)
X_te["items"] = X_te["user_id"].map(user_movies)
X_te = X_te.drop(["rating"], axis=1)

# Per-user pool of movies the user has NOT rated, with their popularity counts.
# Built once per user instead of once per test row.
unseen_pool = {
    user_id: movie_n_ratings[~movie_n_ratings.index.isin(list(seen))]
    for user_id, seen in user_movies.items()
}


def sample_unseen_items(user_id, k=100):
    """Draw `k` distinct movies the user has not rated, weighted by popularity.

    Parameters
    ----------
    user_id : key of `user_movies` / `unseen_pool`.
    k : number of negatives to draw (default 100).

    Returns
    -------
    list of item ids, length `k`, without duplicates.

    Raises
    ------
    ValueError
        If the user has fewer than `k` unseen movies (raised by `Series.sample`
        because sampling is done without replacement).
    """
    pool = unseen_pool[user_id]
    return pool.sample(n=k, replace=False, weights=pool).index.tolist()


# `item_id`: the one movie the user actually rated (the positive).
# `items_100`: 100 movies the user has not rated (the negatives).
# The previous implementation sampled from `x["items"] - {x["item_id"]}`, i.e. from
# movies the user HAD rated, and with replacement, which contradicts the definition above.
X_te["items_100"] = X_te["user_id"].progress_apply(sample_unseen_items)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10342.0), HTML(value='')))




# Modeling

In [6]:
# Three scalar id inputs per training example: the user, an item used as the
# positive, and an item used as the negative.
input_user = Input(shape=(), name="Input_user")
input_pos = Input(shape=(), name="Input_pos")
input_neg = Input(shape=(), name="Input_neg")
inputs = [input_user, input_pos, input_neg]

# Embedding tables are sized by (largest id + 1) so raw ids index rows directly.
n_users = ratings["user_id"].max() + 1
n_items = ratings["item_id"].max() + 1
dim = 30
# NOTE(review): output_dim is dim + 1. The notes in the following cell describe using
# the extra slot as an item bias, but nothing here pins the user's last component
# to 1, so as written it acts as one more latent factor — confirm intent.
embedding_user = Embedding(input_dim=n_users, output_dim=dim + 1, name="Embedding_user")
embedding_item = Embedding(input_dim=n_items, output_dim=dim + 1, name="Embedding_item")

# The positive and negative items go through the SAME item embedding table.
z1 = embedding_user(input_user)
z2 = embedding_item(input_pos)
z3 = embedding_item(input_neg)

# Score = dot product between the user vector and each item vector.
pos_score = Dot(axes=(1, 1))([z1, z2])
neg_score = Dot(axes=(1, 1))([z1, z3])
# The model outputs sigmoid(pos_score - neg_score): how strongly the positive item
# is ranked above the negative one for this user.
diff = pos_score - neg_score
outputs = sigmoid(diff)

model = Model(inputs=inputs, outputs=outputs)

In [7]:
# In the user embedding we need to append a 1 as the last element. This is so that the
# last element of the item embedding acts as a bias term. The user and item embeddings
# are formed as shown below.
# The code is written this way so that the Dot operation also performs the bias addition.
# U = [u1, u2, u3, ..., u60, 1]
# I = [i1, i2, i3, ..., i60, i_bias]
# NOTE(review): these notes use 60 components, while the model cell sets dim = 30 — confirm.

# z1 = embedding_user(input_user)
# NOTE(review): `user_emb` is not defined in the visible cells; presumably it refers to `z1` — confirm.
# one_emb = tf.ones_like(user_emb[:, -1:])

In [8]:
# Matrix factorization overfits easily, i.e. it fits the training data too closely.
# One of the most basic remedies is weight decay: keeping the weights from growing
# too large. Adding the loss term below makes the model learn in a direction that
# shrinks the weights somewhat. (Currently disabled.)
# l2_user = z1**2
# l2_pos_item = z2**2
# l2_neg_item = z3**2
# l2_reg = 0.0001

# weight_decay = l2_reg*tf.reduce_sum(l2_user + l2_pos_item + l2_neg_item)

# model.add_loss(weight_decay)

# Binary cross-entropy on the sigmoid of the score difference; Adagrad with learning rate 1.
model.compile(
    optimizer=Adagrad(learning_rate=1),
    loss="binary_crossentropy",
    metrics=["acc"],
)

In [9]:
# Candidate negatives per user: every movie that appears in the training set minus
# the movies that user has rated. Stored as a list so random.choice can index it.
all_movies = set(ratings_tr["item_id"])
user_not_movies = user_movies.map(lambda seen: list(all_movies - seen))

def get_bpr_dataset(ratings_tr, user_not_movies):
    """Build one epoch of (user, positive item, negative item) training triples.

    Parameters
    ----------
    ratings_tr : DataFrame with at least `user_id` and `item_id` columns. Not modified.
    user_not_movies : mapping indexable by user id (e.g. a Series) whose values are
        non-empty sequences of candidate negative item ids for that user.

    Returns
    -------
    x : list of three 1-D arrays `[user_ids, pos_item_ids, neg_item_ids]`, one entry
        per row of `ratings_tr`, in a freshly shuffled order.
    y : array of ones with shape `(len(ratings_tr), 1)`.
    """
    # .sample(frac=1) already returns a new, shuffled frame, so no prior deep copy
    # of the input is needed.
    shuffled = ratings_tr.sample(frac=1)

    # Draw one negative per row by mapping over the user_id column only; a row-wise
    # DataFrame.apply(axis=1) would build a full Series object for every row.
    neg_items = shuffled["user_id"].map(
        lambda user_id: random.choice(user_not_movies[user_id])
    )

    x = [shuffled["user_id"].values, shuffled["item_id"].values, neg_items.values]
    y = np.ones(shape=(len(shuffled), 1))

    return x, y

In [11]:
# Train for n_epochs passes, rebuilding the training triples before every pass so
# that each positive pair is matched with a freshly sampled negative item.
n_epochs = 10
for i in range(1, n_epochs + 1):
    print(f"epoch: {i:>3d}")
    # Re-shuffle the training rows and resample one negative per row.
    X, y = get_bpr_dataset(ratings_tr, user_not_movies)
    # `y` is all ones, so binary cross-entropy on sigmoid(pos - neg) reduces to
    # -log(sigmoid(pos - neg)). fit() is called without `epochs`, so each call
    # is a single pass over X.
    model.fit(x=X, y=y, batch_size=64, verbose=2)

epoch:   1
1386/1386 - 2s - loss: 0.6919 - acc: 0.5740 - 2s/epoch - 1ms/step
epoch:   2
1386/1386 - 1s - loss: 0.5779 - acc: 0.8036 - 1s/epoch - 879us/step
epoch:   3
1386/1386 - 2s - loss: 0.4185 - acc: 0.8212 - 2s/epoch - 1ms/step
epoch:   4
1386/1386 - 1s - loss: 0.3616 - acc: 0.8485 - 1s/epoch - 902us/step
epoch:   5
1386/1386 - 1s - loss: 0.3099 - acc: 0.8729 - 1s/epoch - 942us/step
epoch:   6
1386/1386 - 1s - loss: 0.2777 - acc: 0.8848 - 1s/epoch - 894us/step
epoch:   7
1386/1386 - 1s - loss: 0.2514 - acc: 0.8985 - 1s/epoch - 947us/step
epoch:   8
1386/1386 - 1s - loss: 0.2346 - acc: 0.9049 - 1s/epoch - 879us/step
epoch:   9
1386/1386 - 1s - loss: 0.2194 - acc: 0.9105 - 1s/epoch - 929us/step
epoch:  10
1386/1386 - 1s - loss: 0.2036 - acc: 0.9179 - 1s/epoch - 915us/step


In [None]:
# Re-wraps the existing `inputs` / `outputs` tensors in a new Model object and rebinds `model`.
# NOTE(review): this replaces the `model` that was compiled and trained above; presumably
# compile() must be called again before fit()/evaluate() on the new object — confirm.
model = Model(inputs=inputs, outputs=outputs)