In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.utils import plot_model


import numpy as np
import pandas as pd


print(tf.__version__)
print(keras.__version__)

2.16.2
3.6.0


# Load Data


In [2]:
rating = pd.read_csv("data/ratings.csv")
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
rating.describe() # movieId가 sequential 하지않아 시퀀스로 바꿈

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [4]:
movieId_to_seqId = {}
seqId_to_movieId = {}
for sId, id in enumerate(rating["movieId"].unique()):
    seqId_to_movieId[sId] = id
    movieId_to_seqId[id] = sId + 1
    
def return_movieId_to_seqId(row):
  return movieId_to_seqId[row.movieId]

rating["new_movieId"] = rating.apply(return_movieId_to_seqId, axis=1)

rating.describe() # 신규로 생성된 new_movieId 는 시퀀셜하게 되어있는것을 확인

Unnamed: 0,userId,movieId,rating,timestamp,new_movieId
count,100004.0,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0,1661.778349
std,195.163838,26369.198969,1.058064,191685800.0,1893.955817
min,1.0,1.0,0.5,789652000.0,1.0
25%,182.0,1028.0,3.0,965847800.0,328.0
50%,367.0,2406.5,4.0,1110422000.0,874.0
75%,520.0,5418.0,4.0,1296192000.0,2345.0
max,671.0,163949.0,5.0,1476641000.0,9066.0


In [5]:
rating.info() # 널값은 없음

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userId       100004 non-null  int64  
 1   movieId      100004 non-null  int64  
 2   rating       100004 non-null  float64
 3   timestamp    100004 non-null  int64  
 4   new_movieId  100004 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 3.8 MB


In [33]:
from sklearn.model_selection import train_test_split

# get number of unique users and movies 
num_users = len(rating.userId.unique())
num_movies = len(rating.new_movieId.unique())

# Create rating embedding path 
rating_train_full, rating_test = train_test_split(rating, train_size= 0.9)
rating_train, rating_valid = train_test_split(rating_train_full, train_size= 0.9)

print(f"rating_train: {rating_train.shape}")
print(f"rating_test: {rating_test.shape}")
print(f"rating_valid: {rating_valid.shape}")

rating_train: (81002, 5)
rating_test: (10001, 5)
rating_valid: (9001, 5)


### GMF Implementation


In [34]:
"""from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.constraints import non_neg

# this could be optimized for item and user
latent_dim = 10


# Movie Input
movie_input = Input(shape=[1],name='movie-input')
movie_embedding = Embedding(num_movies+1, latent_dim, name='movie-embedding', embeddings_constraint= non_neg())(movie_input)
movie_vec = Flatten(name='movie-flatten')(movie_embedding)

# User Input
user_input = Input(shape=[1],name='user-input')
user_embedding = Embedding(num_users+1, latent_dim, name='user-embedding', embeddings_constraint= non_neg())(user_input)
user_vec = Flatten(name='user-flatten')(user_embedding)

# Matrix Factorization Layer
prod = Dot(axes=1, name='dot-product')([movie_vec, user_vec])

model = Model([user_input, movie_input], prod)
model.compile('adam', 'mean_squared_error')



print(rating_train.userId.values.shape)         
print(rating_train.new_movieId.values.shape)    
print(rating_train.rating.values.shape)        

history = model.fit([rating_train.userId.values, rating_train.new_movieId.values], rating_train.rating.values, epochs=10)
  # 예: (num_samples,)
  
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
"""

"from keras.models import Model, Sequential\nfrom keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate\nfrom keras.constraints import non_neg\n\n# this could be optimized for item and user\nlatent_dim = 10\n\n\n# Movie Input\nmovie_input = Input(shape=[1],name='movie-input')\nmovie_embedding = Embedding(num_movies+1, latent_dim, name='movie-embedding', embeddings_constraint= non_neg())(movie_input)\nmovie_vec = Flatten(name='movie-flatten')(movie_embedding)\n\n# User Input\nuser_input = Input(shape=[1],name='user-input')\nuser_embedding = Embedding(num_users+1, latent_dim, name='user-embedding', embeddings_constraint= non_neg())(user_input)\nuser_vec = Flatten(name='user-flatten')(user_embedding)\n\n# Matrix Factorization Layer\nprod = Dot(axes=1, name='dot-product')([movie_vec, user_vec])\n\nmodel = Model([user_input, movie_input], prod)\nmodel.compile('adam', 'mean_squared_error')\n\n\n\nprint(rating_train.userId.values.shape)         \nprint(rating_train.new_movieI

In [36]:
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dot, Dense
from keras.constraints import non_neg



import os

latent_dim = 30

# Define inputs
movie_input = Input(shape=[1],name='movie-input')

user_input = Input(shape=[1], name='user-input')

# MF Embeddings
movie_embedding_mf = Embedding(num_movies+1, latent_dim, name='movie-embedding-mf', embeddings_constraint= non_neg())(movie_input)
movie_vec_mf = Flatten(name='flatten-movie-mf')(movie_embedding_mf)

user_embedding_mf = Embedding(num_users+1, latent_dim, name='user-embedding-mf', embeddings_constraint= non_neg())(user_input)
user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)

# MF Layers
pred_mf = Dot(axes=1, name='dot-product')([movie_vec_mf, user_vec_mf])



# MLP Embeddings
movie_embedding_mlp = Embedding(num_movies+1, latent_dim, name='movie-embedding-mlp')(movie_input)
movie_vec_mlp = Flatten(name='flatten-movie-mlp')(movie_embedding_mlp)

user_embedding_mlp = Embedding(num_users+1, latent_dim, name='user-embedding-mlp')(user_input)
user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

# MLP layers
concat = tf.keras.layers.concatenate([movie_vec_mlp, user_vec_mlp],name='combine-mlp')

batchNorm_layer1 = tf.keras.layers.BatchNormalization()(concat)
drop_layer1 = tf.keras.layers.Dropout(rate = 0.5)(batchNorm_layer1)
fc_1 = Dense(100, name='fc-1', activation='relu',
             kernel_initializer="he_normal", kernel_regularizer=tf.keras.regularizers.l2(0.00))(drop_layer1)

batchNorm_layer2 = tf.keras.layers.BatchNormalization()(fc_1)
drop_layer2 = tf.keras.layers.Dropout(rate = 0.5)(batchNorm_layer2)
fc_2 = Dense(100, name='fc-2', activation='relu',
             kernel_initializer="he_normal", kernel_regularizer=tf.keras.regularizers.l2(0.00))(drop_layer2)

batchNorm_layer3 = tf.keras.layers.BatchNormalization()(fc_2)
drop_layer3 = tf.keras.layers.Dropout(rate = 0.5)(batchNorm_layer3)

# Prediction from both layers
pred_mlp = Dense(10, name='pred-mlp', activation='relu',
                 kernel_initializer="he_normal", kernel_regularizer=tf.keras.regularizers.l2(0.00))(drop_layer3)

combine_mlp_mf = tf.keras.layers.concatenate([pred_mf, pred_mlp],name='combine-mlp-mf')
 

# Final prediction
result = Dense(1, name='result', activation='relu')(combine_mlp_mf)

model = Model([user_input, movie_input], result)

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

def root_mean_squared_error(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y_true, y_pred))))

model.compile(loss = root_mean_squared_error, optimizer=keras.optimizers.Adam(learning_rate = 0.001, beta_1=0.9, beta_2=0.999))


#로그 기록용
def get_run_logdir(root_logdir = "./my_logs"):
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

run_logdir = get_run_logdir()

## 콜백 함수 지정
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)
checkpoint_cb = keras.callbacks.ModelCheckpoint("My DNCF Model.keras", save_best_only=True)
early_stoping_cb = keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)


history = model.fit([rating_train.userId.values, rating_train.new_movieId.values], rating_train.rating.values,epochs= 100,
                    validation_data=((rating_valid.userId.values, rating_valid.new_movieId.values), rating_valid.rating.values),
                    callbacks = [checkpoint_cb, early_stoping_cb, tensorboard_cb])


Epoch 1/100




[1m2532/2532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 1.6535 - val_loss: 0.9176
Epoch 2/100
[1m2532/2532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.9299 - val_loss: 0.9017
Epoch 3/100
[1m2532/2532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.8802 - val_loss: 0.8988
Epoch 4/100
[1m2532/2532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.8428 - val_loss: 0.8954
Epoch 5/100
[1m2532/2532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.8070 - val_loss: 0.8958
Epoch 6/100
[1m2532/2532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.7496 - val_loss: 0.9126
Epoch 7/100
[1m1830/2532[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m1s[0m 2ms/step - loss: 0.6846

KeyboardInterrupt: 