In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import HeNormal, GlorotNormal
import tensorflow as tf

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/패캠강의/fastcampus_recsys_2301/Fastcampus_Codes/03_DL_based_algorithms/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# 무비렌즈 데이터 세트 호출
data_path = '../data/ml-latest-small/'
df = pd.read_csv(data_path+'ratings.csv')

In [4]:
# binary implicit feedback
# rating >= 3.0일 경우 positive로 간주
df['rating'] = df['rating'].apply(lambda x: 1 if x >= 3.0 else 0)

# 전체 유저 수와 영화 수
n_users = df.userId.nunique()
n_movies = df.movieId.nunique()

# 데이터를 train/valid로 나눔
train, validation = train_test_split(df, test_size=0.2, random_state=42)

# 유저 아이디 indexing
unique_user_ids = df["userId"].unique()
userId_dict = {id: i for i, id in enumerate(unique_user_ids)}


# 영화 아이디 indexing
unique_movie_ids = df["movieId"].unique()
movieId_dict = {id: i for i, id in enumerate(unique_movie_ids)}

# 인덱싱 된 딕셔너리를 사용하여 ID 맵핑
train["userId"] = train["userId"].map(userId_dict)
train["movieId"] = train["movieId"].map(movieId_dict)

validation["userId"] = validation["userId"].map(userId_dict)
validation["movieId"] = validation["movieId"].map(movieId_dict)

# 전체 유니크 유저 수 
num_unique_users = df['userId'].nunique() + 1
# 전체 유니크 영화 수
num_unique_movies = df['movieId'].nunique() + 1

In [5]:
# 유저/아이템 입력 레이어
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))

# 임베딩 차원
n_latent_factors = 20

# 임베딩 레이어 정의
user_embedding = Embedding(num_unique_users, n_latent_factors, name='user_embedding')(user_input)
movie_embedding = Embedding(num_unique_movies, n_latent_factors, name='movie_embedding')(movie_input)

# 임베딩 flatten
user_vector = Flatten()(user_embedding)
movie_vector = Flatten()(movie_embedding)

# 유저 및 영화 임베딩 벡터 컨캣
concat = Concatenate()([user_vector, movie_vector])

In [6]:
# 결과 저장 테이블
results_df = pd.DataFrame(columns=['initializer', 'activation', 'last_loss', 'last_val_loss'])

# Initialization, activation 옵션
initializers = [HeNormal(), HeNormal(), GlorotNormal(), GlorotNormal()]
activations = ['relu', 'sigmoid', 'relu', 'sigmoid']

for init, act in zip(initializers, activations):
    
    print("\n")
    print("initializer: {}".format(init.__class__.__name__))
    print("activation: {}".format(act))
    print("\n")

    def create_mlp(hidden_layers, output):
        for i in range(len(hidden_layers)):
            if i == 0:
                hidden = Dense(hidden_layers[i], activation=act, kernel_initializer=init)(output)
            else:
                hidden = Dense(hidden_layers[i], activation=act, kernel_initializer=init)(hidden)
        output = Dense(1, activation='sigmoid', kernel_initializer=init)(hidden)
        return output

    output = create_mlp([64, 64], concat)

    model = Model(inputs=[user_input, movie_input], outputs=output)

    auc = tf.keras.metrics.AUC()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', auc])

    history = model.fit([train.userId, train.movieId], train.rating, epochs=3, verbose=2, validation_data=([validation.userId, validation.movieId], validation.rating))

    # DataFrame에 결과 append 
    results_df = results_df.append({'initializer': init.__class__.__name__,
                                    'activation': act,
                                    'last_loss': history.history['loss'][-1],
                                    'last_val_loss': history.history['val_loss'][-1]}, 
                                   ignore_index=True)



initializer: HeNormal
activation: relu






Epoch 1/3
2521/2521 - 18s - loss: 0.4077 - accuracy: 0.8262 - auc: 0.7702 - val_loss: 0.3929 - val_accuracy: 0.8314 - val_auc: 0.7976 - 18s/epoch - 7ms/step
Epoch 2/3
2521/2521 - 18s - loss: 0.3478 - accuracy: 0.8519 - auc: 0.8466 - val_loss: 0.4071 - val_accuracy: 0.8315 - val_auc: 0.7971 - 18s/epoch - 7ms/step
Epoch 3/3
2521/2521 - 15s - loss: 0.3144 - accuracy: 0.8682 - auc: 0.8766 - val_loss: 0.4097 - val_accuracy: 0.8252 - val_auc: 0.7858 - 15s/epoch - 6ms/step


initializer: HeNormal
activation: sigmoid


Epoch 1/3


  results_df = results_df.append({'initializer': init.__class__.__name__,


2521/2521 - 22s - loss: 0.3577 - accuracy: 0.8495 - auc_1: 0.8350 - val_loss: 0.4077 - val_accuracy: 0.8290 - val_auc_1: 0.7900 - 22s/epoch - 9ms/step
Epoch 2/3
2521/2521 - 24s - loss: 0.3325 - accuracy: 0.8592 - auc_1: 0.8613 - val_loss: 0.4047 - val_accuracy: 0.8279 - val_auc_1: 0.7914 - 24s/epoch - 10ms/step
Epoch 3/3
2521/2521 - 18s - loss: 0.3171 - accuracy: 0.8654 - auc_1: 0.8747 - val_loss: 0.4192 - val_accuracy: 0.8227 - val_auc_1: 0.7895 - 18s/epoch - 7ms/step


initializer: GlorotNormal
activation: relu


Epoch 1/3


  results_df = results_df.append({'initializer': init.__class__.__name__,


2521/2521 - 17s - loss: 0.2990 - accuracy: 0.8741 - auc_2: 0.8900 - val_loss: 0.4238 - val_accuracy: 0.8237 - val_auc_2: 0.7900 - 17s/epoch - 7ms/step
Epoch 2/3
2521/2521 - 17s - loss: 0.2765 - accuracy: 0.8847 - auc_2: 0.9069 - val_loss: 0.4415 - val_accuracy: 0.8200 - val_auc_2: 0.7791 - 17s/epoch - 7ms/step
Epoch 3/3
2521/2521 - 16s - loss: 0.2610 - accuracy: 0.8910 - auc_2: 0.9173 - val_loss: 0.4592 - val_accuracy: 0.8096 - val_auc_2: 0.7813 - 16s/epoch - 6ms/step


initializer: GlorotNormal
activation: sigmoid


Epoch 1/3


  results_df = results_df.append({'initializer': init.__class__.__name__,


2521/2521 - 22s - loss: 0.3372 - accuracy: 0.8595 - auc_3: 0.8552 - val_loss: 0.4148 - val_accuracy: 0.8277 - val_auc_3: 0.7854 - 22s/epoch - 9ms/step
Epoch 2/3
2521/2521 - 16s - loss: 0.3139 - accuracy: 0.8676 - auc_3: 0.8779 - val_loss: 0.4234 - val_accuracy: 0.8294 - val_auc_3: 0.7870 - 16s/epoch - 6ms/step
Epoch 3/3
2521/2521 - 15s - loss: 0.2979 - accuracy: 0.8731 - auc_3: 0.8899 - val_loss: 0.4313 - val_accuracy: 0.8302 - val_auc_3: 0.7868 - 15s/epoch - 6ms/step


  results_df = results_df.append({'initializer': init.__class__.__name__,


In [7]:
history

<keras.callbacks.History at 0x7f7791aa6170>

In [12]:
# Print the DataFrame
display(results_df)

Unnamed: 0,initializer,activation,last_loss,last_val_loss
0,HeNormal,relu,0.314416,0.40965
1,HeNormal,sigmoid,0.317065,0.419164
2,GlorotNormal,relu,0.261019,0.459178
3,GlorotNormal,sigmoid,0.297875,0.431285
